#include <ATen/ATen.h>
#include <ATen/Config.h>
#include <ATen/NativeFunctions.h>
#include <ATen/native/mkldnn/Matmul.h>
#if !AT_MKLDNN_ENABLED()
namespace at {
namespace native {
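// Fallback stubs for builds without MKLDNN: the gemm hook simply reports
// "not handled" and the matmul entry point raises an error.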
void mkldnn_matmul(
const Tensor &mat1,
const Tensor &mat2,
const Tensor &result,
float beta,
float alpha) {
TORCH_CHECK(false, "mkldnn_matmul: ATen not compiled with MKLDNN support");
}
bool use_mkldnn_bf16_matmul(
const Tensor& mat1,
const Tensor& mat2,
const Tensor& result_opt){
return false;
}
bool mkldnn_bf16_gemm(
TransposeType transa, TransposeType transb,
int64_t m, int64_t n, int64_t k,
float alpha,
const c10::BFloat16 *a, int64_t lda,
const c10::BFloat16 *b, int64_t ldb,
float beta,
c10::BFloat16 *c, int64_t ldc) {
return false;
}
} // namespace native
} // namespace at
#else // AT_MKLDNN_ENABLED
#include <ATen/native/mkldnn/MKLDNNCommon.h>
#include <ATen/native/mkldnn/Utils.h>
namespace at {
namespace native {
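// Gate for the bf16 fast path: requires both the user-level mkldnn switch
// and CPU support for bf16 (see mkldnn_bf16_device_check()).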
static bool use_mkldnn_bf16_matmul() {
return (
at::globalContext().userEnabledMkldnn() &&
mkldnn_bf16_device_check());
}
bool mkldnn_bf16_gemm(
TransposeType transa, TransposeType transb,
int64_t m, int64_t n, int64_t k,
float alpha,
const c10::BFloat16 *a_data, int64_t lda,
const c10::BFloat16 *b_data, int64_t ldb,
float beta,
c10::BFloat16 *c_data, int64_t ldc) {
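// Bail out (and let the caller use its fallback) when the fast path is
// disabled, the problem is too small to benefit, or alpha == 0.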
if (!use_mkldnn_bf16_matmul() ||
(m * n * k <= 16 * 16 * 16) ||
(alpha == 0.0f)) {
return false;
}
ideep::attr_t op_attr;
// Use mkldnn post ops to perform the add.
if (beta != 0.0f) {
op_attr = ideep::attr_t::fuse_sum();
}
// NOTE: View as c-contiguous to avoid extra reordering in mkldnn
// Use identity: C = AB <=> C^T = B^T A^T
ideep::tensor::dims a_strides{{lda, 1}}, b_strides{{ldb, 1}}, c_strides{{ldc, 1}};
if (transa != TransposeType::NoTranspose) {
std::swap(a_strides[0], a_strides[1]);
}
if (transb != TransposeType::NoTranspose) {
std::swap(b_strides[0], b_strides[1]);
}
ideep::tensor a({
/*sizes=*/{k, m},
ideep::tensor::data_type::bf16,
/*strides=*/a_strides},
const_cast<c10::BFloat16*>(a_data));
ideep::tensor b({
/*sizes=*/{n, k},
ideep::tensor::data_type::bf16,
/*strides=*/b_strides},
const_cast<c10::BFloat16*>(b_data));
ideep::tensor c({
/*sizes=*/{n, m},
ideep::tensor::data_type::bf16,
/*strides=*/c_strides},
c_data);
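// Compute C^T = B^T * A^T (note the swapped operand order), with the
// beta accumulation fused as a sum post-op when requested above.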
ideep::matmul_forward::compute(
b, a, c, alpha, beta,
ideep::scale_t(), ideep::scale_t(), ideep::scale_t(), op_attr);
if (c.get_data_handle() != c_data){
// ideep will query onednn for the expected output format;
// if the given output buffer is not in that format, ideep re-initializes an output buffer.
// In that case we need to copy the re-initialized buffer back into the given buffer.
ideep::tensor real_output({
/*sizes=*/{n, m},
ideep::tensor::data_type::bf16,
/*strides=*/c_strides},
c_data);
c.reorder_to(real_output);
}
return true;
}
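// Tensor-level entry point: computes result = beta * result + alpha * (mat1 @ mat2)
// for the addmm/bmm/baddbmm/mv/dot shapes listed in the checks below, using
// ideep tensor views over the ATen storage.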
void mkldnn_matmul(
const Tensor &mat1,
const Tensor &mat2,
const Tensor &result,
float beta,
float alpha) {
TORCH_CHECK((mat1.dim() == 2 && mat2.dim() == 2) || // aten::addmm
(mat1.dim() == 3 && mat2.dim() == 3) || // aten::bmm, aten::baddbmm
(mat1.dim() == 2 && mat2.dim() == 1) || // aten::mv
(mat1.dim() == 1 && mat2.dim() == 1), // aten::dot
"mkldnn_matmul: unsupported dims for mat and mat2");
TORCH_CHECK(mkldnn_bf16_device_check(),
"mkldnn_matmul: mkldnn_matmul bf16 path needs the cpu support avx512bw, avx512vl and avx512dq, or AWS Graviton3");
#if defined(__aarch64__)
if (mkldnn_bf16_device_check_arm()) {
// onednn fastmath mode can leverage bf16 HW even for fp32 inputs, e.g. on Arm Neoverse V1,
// so don't restrict mkldnn_matmul to bf16 inputs only; allow float as well.
TORCH_CHECK((mat1.scalar_type() == mat2.scalar_type()) && (mat1.scalar_type() == result.scalar_type()) &&
((mat1.scalar_type() == at::kFloat) || (mat1.scalar_type() == at::kBFloat16)),
"mkldnn_matmul: only enabled for fp32 and bf16 path");
} else
#endif
{
TORCH_CHECK(mat1.scalar_type() == at::kBFloat16 &&
mat2.scalar_type() == at::kBFloat16 &&
result.scalar_type() == at::kBFloat16, "mkldnn_matmul: only enabled for bf16 path");
}
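// View 1-D operands as 2-D so a single matmul primitive also covers
// aten::mv and aten::dot; the dot result is squeezed back at the end.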
auto mat1_unsqueezed = mat1.dim() == 1 ? mat1.unsqueeze(0) : mat1;
auto mat2_unsqueezed = mat2.dim() == 1 ? mat2.unsqueeze(1) : mat2;
auto result_unsqueezed = result.dim() == 1 ? result.unsqueeze(1) : result;
ideep::attr_t op_attr;
// "addmm", "addbmm" "baddbmm" in pytorch allow bias to be 2-D or 3-D tensor
// but mkldnn matmul primitive only support bias be 1-D tensors
// to address their differences, we use mkldnn post ops to perform a fused "add" after matrix multiplication is over
if (beta != 0.0f) op_attr = ideep::attr_t::fuse_sum();
// If alpha = 0, dose not need actually do gemm computation
if (alpha == 0)
return;
auto is_mkldnn_optimized_format = [&](const Tensor& t) {
if (t.is_contiguous()) return true;
const auto sizes = t.sizes();
const auto strides = t.strides();
if (t.dim() == 2){
return strides[0] == 1 && strides[1] == sizes[0];
} else {
// dim = 3
return strides[0] == sizes[1] * sizes[2] && strides[1] == 1 && strides[2] == sizes[1];
}
};
// Mkldnn is currently only optimized for the contiguous or transposed (last 2 dims transposed for a 3-D tensor) formats.
// Remove this "contiguous" fallback once mkldnn fully supports other layouts.
Tensor mat1_ = is_mkldnn_optimized_format(mat1_unsqueezed) ? mat1_unsqueezed : mat1_unsqueezed.contiguous();
Tensor mat2_ = is_mkldnn_optimized_format(mat2_unsqueezed) ? mat2_unsqueezed : mat2_unsqueezed.contiguous();
// mkldnn_matmul only processes CPU tensors
const ideep::tensor x = itensor_view_from_dense(mat1_);
const ideep::tensor w = itensor_view_from_dense(mat2_);
ideep::tensor y = itensor_view_from_dense(result_unsqueezed);
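// x, w and y alias the ATen storage directly; onednn writes into result in
// place unless it picks a different output format (handled below).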
ideep::matmul_forward::compute(x, w, y, alpha, beta,
ideep::scale_t(), ideep::scale_t(), ideep::scale_t(), op_attr);
if (y.get_data_handle() != result.data_ptr()){
// ideep will query onednn for the expected output format;
// if the given output buffer is not in that format, ideep re-initializes an output buffer.
// In that case we need to copy the re-initialized buffer back into the given buffer.
ideep::tensor public_y = itensor_view_from_dense(result);
y.reorder_to(public_y);
}
if (mat1.dim() == 1 && mat2.dim() == 1){
// aten::dot
result.squeeze_();
}
}
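// Size heuristic: only hand the gemm to onednn when the problem is large
// enough to benefit from its optimized kernel (see the per-case comments below).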
inline bool checksize(const Tensor& mat1, const Tensor& mat2){
// if dim == 2, mat1's size is (m, n) and mat2's size is (n, k)
// else if dim == 3, mat1's size is (b, m, n) and mat2's size is (b, n, k)
// else we were called from aten::mv, where mat1's size is (m, n) and mat2's size is (n)
// only when m * n * k * b (if present) is large enough do we benefit from the mkldnn-optimized gemm kernel
static const int64_t mkldnn_gemm_min_size = 16 * 16 * 16;
if (mat1.dim() == 1 && mat2.dim() == 1) {
// aten::dot
return mat1.size(0) > mkldnn_gemm_min_size;
} else if (mat1.dim() == 2 && mat2.dim() == 1) {
// aten::mv
return mat1.size(0) * mat1.size(1) > mkldnn_gemm_min_size;
} else if (mat1.dim() == 2 && mat2.dim() == 2) {
// aten::addmm
return mat1.size(0) * mat1.size(1) * mat2.size(1) > mkldnn_gemm_min_size;
} else {
// aten::bmm, aten::baddbmm
return mat1.size(0) * mat1.size(1) * mat1.size(2) * mat2.size(2) > mkldnn_gemm_min_size;
}
}
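// Dispatch predicate for callers deciding whether to take the mkldnn path:
// requires the fast path to be enabled, supported and matching dtypes,
// non-empty inputs, and sizes above the heuristic threshold in checksize().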
bool use_mkldnn_bf16_matmul(
const Tensor& mat1,
const Tensor& mat2,
const Tensor& result) {
#if defined(__aarch64__)
if (mkldnn_bf16_device_check_arm()) {
// onednn fastmath mode can leverage bf16 HW even for fp32 inputs, e.g. on Arm Neoverse V1,
// so don't restrict mkldnn_matmul to bf16 inputs only; allow float as well.
return (
use_mkldnn_bf16_matmul() &&
(mat1.scalar_type() == mat2.scalar_type()) && (!result.defined() || (mat1.scalar_type() == result.scalar_type())) &&
((mat1.scalar_type() == kFloat) || (mat1.scalar_type() == kBFloat16)) &&
mat1.numel() != 0 &&
mat2.numel() != 0 &&
checksize(mat1, mat2));
} else
#endif
{
return (
use_mkldnn_bf16_matmul() &&
mat1.scalar_type() == kBFloat16 &&
mat2.scalar_type() == kBFloat16 &&
(!result.defined() || result.scalar_type() == kBFloat16) &&
mat1.numel() != 0 &&
mat2.numel() != 0 &&
checksize(mat1, mat2));
}
}
} // namespace native
} // namespace at
#endif // AT_MKLDNN_ENABLED