Revert "[sparse] Add i8i8->i32 support for cuSPARSELt (#110499)"

This reverts commit 33da6c89516d9d9067f7181826826224a4cf5afe.

Reverted https://github.com/pytorch/pytorch/pull/110499 on behalf of https://github.com/jcaip because cslt v0.5.0 requires a newer linker, and v0.4.0 will be used as the base version ([comment](https://github.com/pytorch/pytorch/pull/110499#issuecomment-1758039953))
diff --git a/aten/src/ATen/native/sparse/cuda/cuSPARSELtOps.cpp b/aten/src/ATen/native/sparse/cuda/cuSPARSELtOps.cpp
index 252e919..09462fd 100644
--- a/aten/src/ATen/native/sparse/cuda/cuSPARSELtOps.cpp
+++ b/aten/src/ATen/native/sparse/cuda/cuSPARSELtOps.cpp
@@ -119,7 +119,6 @@
   cudaDataType output_type;
   cusparseComputeType compute_type;
   auto compression_factor = 9;
-  c10::ScalarType pytorch_output_type;
 
 
   switch(compressed_A.scalar_type())
@@ -158,13 +157,6 @@
     {
         output_type = CUDA_R_16F;
         mixed_dtype_mode = true;
-        pytorch_output_type = out_dtype;
-    }
-    else if (input_type == CUDA_R_8I and out_dtype == at::ScalarType::Int)
-    {
-        output_type = CUDA_R_32I;
-        mixed_dtype_mode = true;
-        pytorch_output_type = out_dtype;
     }
     else
     {
@@ -205,8 +197,8 @@
   at::Tensor res;
   if (mixed_dtype_mode)
   {
-      res = (transpose_result) ? at::empty({n, m}, c10::TensorOptions().dtype(pytorch_output_type).device(dense_B.device()))
-                               : at::empty({m, n}, c10::TensorOptions().dtype(pytorch_output_type).device(dense_B.device()));
+      res = (transpose_result) ? at::empty({n, m}, c10::TensorOptions().dtype(c10::kHalf).device(dense_B.device()))
+                               : at::empty({m, n}, c10::TensorOptions().dtype(c10::kHalf).device(dense_B.device()));
   }
   else
   {
diff --git a/test/test_sparse_semi_structured.py b/test/test_sparse_semi_structured.py
index 8cfd6dc..1f0cf17 100644
--- a/test/test_sparse_semi_structured.py
+++ b/test/test_sparse_semi_structured.py
@@ -166,8 +166,7 @@
             # test transpose
             # NOTE: CUTLASS and cuSPARSELt have slightly different int8 behavior.
             # CUTLASS will output to an int32 tensor while cuSPARSELt will output to a int8 tensor
-            dense_result = torch.mm(A.cpu().to(torch.int64), B.t().cpu().to(torch.int64))
-            dense_result = dense_result.to(device, dtype=torch.int32 if backend == "cutlass" else torch.int8)
+            dense_result = torch.mm(A.cpu(), B.t().cpu()).to(device, dtype=torch.int32 if backend == "cutlass" else torch.int8)
             sparse_result = torch.mm(A_sparse, B.t())
             assert torch.allclose(dense_result, sparse_result, rtol=1e-3, atol=1e-3)
         else:
@@ -211,8 +210,7 @@
 
         # Currently we don't support int matmul on GPU, so evaluate on CPU and copy over
         if dtype is torch.int8:
-            dense_result = torch.mm(A.cpu().to(torch.int64), B.t().cpu().to(torch.int64))
-            dense_result = dense_result.to(device, dtype=torch.int32 if backend == "cutlass" else torch.int8)
+            dense_result = torch.mm(A.cpu(), B.t().cpu()).to(device, dtype=torch.int32 if backend == "cutlass" else torch.int8)
             sparse_result = torch.mm(A, B_sparse.t())
         else:
             dense_result = torch.mm(A, B.t())
@@ -222,7 +220,7 @@
 
     def test_cslt_sparse_mm_int8_in_fp16_out(self, device):
         """
-        Test sparse mam with int8 input with fp16 output for cuSPARSELt
+        This test is only needed for cuSPARSELt
         """
         if "cusparselt" in SEMI_STRUCTURED_SUPPORTED_BACKENDS:
             SparseSemiStructuredTensor._FORCE_CUTLASS = False
@@ -235,21 +233,6 @@
             sparse_result = torch._cslt_sparse_mm(A_sparse.compressed_tensor_cusparselt, B.t(), out_dtype=torch.float16)
             assert torch.allclose(dense_result, sparse_result, rtol=1e-3, atol=1e-3)
 
-    def test_cslt_sparse_mm_int8_in_int32_out(self, device):
-        """
-        Test sparse mam with int8 input with int32 output for cuSPARSELt
-        """
-        if "cusparselt" in SEMI_STRUCTURED_SUPPORTED_BACKENDS:
-            SparseSemiStructuredTensor._FORCE_CUTLASS = False
-            A = rand_sparse_semi_structured_mask(128, 128, dtype=torch.int8)
-            A_sparse = to_sparse_semi_structured(A)
-
-            B = torch.rand((128, 128), device=A_sparse.device).to(torch.int8)
-
-            dense_result = torch.mm(A.cpu().to(torch.int64), B.t().cpu().to(torch.int64)).to(device, dtype=torch.int32)
-            sparse_result = torch._cslt_sparse_mm(A_sparse.compressed_tensor_cusparselt, B.t(), out_dtype=torch.int32)
-            assert torch.allclose(dense_result, sparse_result, rtol=1e-3, atol=1e-3)
-
     @dtypes(*SEMI_STRUCTURED_SUPPORTED_DTYPES)
     @parametrize("backend", SEMI_STRUCTURED_SUPPORTED_BACKENDS)
     def test_mm_sparse_second_NT(self, dtype, device, backend):