blob: e3b5bb69db8dcaa7c7e6a808397b5a7683ceb787 [file] [log] [blame]
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// See docs in ../ops/math_ops.cc.
// This file uses MKL CBLAS xGEMM for acceleration of TF Matrix-Matrix
// Multiplication (MatMul) operations.
// We currently register this kernel only for MKL-supported data types
// (float, double, complex64, complex128); when INTEL_MKL_DNN_ONLY is defined,
// only float is registered. The macro INTEL_MKL is defined by the build system
// only when MKL is chosen as an option at the configure stage; when it is
// undefined at build time, this file becomes an empty compilation unit.
#if defined(INTEL_MKL)
#include <limits>

#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/fill_functor.h"
#include "tensorflow/core/util/mkl_util.h"
// This header file is part of MKL ML, need equivalent file in MKL DNN
#ifndef INTEL_MKL_DNN_ONLY
#include "mkl_cblas.h"
#else
#include "mkldnn.h"
#endif
namespace tensorflow {
// Shorthand for the Eigen thread-pool device type used when registering the
// CPU kernel in this file.
using CPUDevice = Eigen::ThreadPoolDevice;
// Kernel that multiplies two rank-2 tensors, optionally transposing either
// operand, by forwarding to a GEMM routine (MKL CBLAS, or MKL-DNN SGEMM when
// INTEL_MKL_DNN_ONLY is defined).
//
// Template parameters:
//   Device      Device type handed to SetZeroFunctor and eigen_device<>().
//   T           Element type; a matching MklBlasGemm overload must exist.
//   USE_CUBLAS  Not referenced anywhere inside this class.
//
// Attributes read at construction: "transpose_a", "transpose_b".
template <typename Device, typename T, bool USE_CUBLAS>
class MklMatMulOp : public OpKernel {
 public:
  // Reads the two transpose attributes; a failed lookup is reported through
  // `ctx` by OP_REQUIRES_OK.
  explicit MklMatMulOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
    OP_REQUIRES_OK(ctx, ctx->GetAttr("transpose_a", &transpose_a_));
    OP_REQUIRES_OK(ctx, ctx->GetAttr("transpose_b", &transpose_b_));
  }

  // Computes out = op(a) * op(b), where op(x) is x^T if the corresponding
  // transpose attribute is set and x otherwise.
  //
  // Reports InvalidArgument through `ctx` when either input is not a matrix,
  // when the contracted dimensions differ, or when a dimension does not fit
  // in the `int` arguments taken by the GEMM routines.
  void Compute(OpKernelContext* ctx) override {
    const Tensor& a = ctx->input(0);
    const Tensor& b = ctx->input(1);

    // Check that the dimensions of the two matrices are valid.
    OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(a.shape()),
                errors::InvalidArgument("In[0] is not a matrix"));
    OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(b.shape()),
                errors::InvalidArgument("In[1] is not a matrix"));

    // Axis of each operand that is contracted (the "inner" dimension), and the
    // remaining axis that survives into the output.
    const int a_inner_axis = transpose_a_ ? 0 : 1;
    const int b_inner_axis = transpose_b_ ? 1 : 0;
    const int a_outer_axis = 1 - a_inner_axis;
    const int b_outer_axis = 1 - b_inner_axis;

    OP_REQUIRES(
        ctx, a.dim_size(a_inner_axis) == b.dim_size(b_inner_axis),
        errors::InvalidArgument(
            "Matrix size-incompatible: In[0]: ", a.shape().DebugString(),
            ", In[1]: ", b.shape().DebugString()));

    TensorShape out_shape(
        {a.dim_size(a_outer_axis), b.dim_size(b_outer_axis)});
    Tensor* out = nullptr;
    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out));

    if (out->NumElements() == 0) {
      // If a has shape [0, x] or b has shape [x, 0], the output shape
      // is a 0-element matrix, so there is nothing to do.
      return;
    }

    if (a.NumElements() == 0 && b.NumElements() == 0) {
      // If a has shape [x, 0] and b has shape [0, y], the
      // output shape is [x, y] where x and y are non-zero, so we fill
      // the output with zeros. (The output is non-empty and the inner
      // dimensions match, so either both inputs are empty or neither is.)
      functor::SetZeroFunctor<Device, T> f;
      f(ctx->eigen_device<Device>(), out->flat<T>());
      return;
    }

    const auto m_dim = a.dim_size(a_outer_axis);
    const auto k_dim = a.dim_size(a_inner_axis);
    const auto n_dim = b.dim_size(b_outer_axis);

    // The GEMM entry points below take `int` sizes and strides. Reject
    // anything larger instead of silently truncating it, which would make the
    // GEMM call operate on the wrong extents.
    constexpr int kMaxGemmDim = std::numeric_limits<int>::max();
    OP_REQUIRES(
        ctx,
        m_dim <= kMaxGemmDim && k_dim <= kMaxGemmDim && n_dim <= kMaxGemmDim,
        errors::InvalidArgument(
            "Matrix dimensions exceed the range supported by MKL GEMM: "
            "In[0]: ",
            a.shape().DebugString(), ", In[1]: ", b.shape().DebugString()));

    const int m = static_cast<int>(m_dim);
    const int k = static_cast<int>(k_dim);
    const int n = static_cast<int>(n_dim);

    auto a_ptr = (a.template flat<T>().data());
    auto b_ptr = (b.template flat<T>().data());
    auto c_ptr = (out->template flat<T>().data());

    // Leading dimensions are the row strides of the tensors as stored:
    // a is [m, k] (or [k, m] when transposed), b is [k, n] (or [n, k] when
    // transposed), and the output is always [m, n].
    MklBlasGemm(transpose_a_, transpose_b_, m, n, k, a_ptr,
                transpose_a_ ? m : k, b_ptr, transpose_b_ ? k : n, c_ptr, n);
  }

 private:
  bool transpose_a_ = false;
  bool transpose_b_ = false;

  // --------------------------------------------------------------------------
  //
  // @brief Matrix-Matrix Multiplication with FP32 tensors, a, b, c using CBLAS
  // interface. c = op(a) * op(b)
  //
  // @param transa  Specifies the form of op(a) used in MatMul. If transa is
  // true, then op(a) = a^T, otherwise op(a) = a
  //
  // @param transb  Specifies the form of op(b) used in MatMul. If transb is
  // true, then op(b) = b^T, otherwise op(b) = b
  //
  // @param m       Specifies the number of rows of the matrix op(a) and of the
  // matrix c. The value of m must be at least zero.
  //
  // @param n       Specifies the number of columns of the matrix op(b) and the
  // number of columns of the matrix c. The value of n must be at least zero.
  //
  // @param k       Specifies the number of columns of the matrix op(a) and the
  // number of rows of the matrix op(b)
  //
  // @param a       Address of matrix a
  //
  // @param lda     Leading dimension of 'a' matrix. This is set at calling site
  // depending on transa parameter. Since TF uses row-major
  // layout, leading dimension is the stride between consecutive rows
  // lda = max(1,k) when transa is false, otherwise lda = max(1,m)
  //
  // @param b       Address of matrix b
  //
  // @param ldb     Leading dimension of 'b' matrix. This is set at calling site
  // depending on transb parameter. Since TF uses row-major
  // layout, leading dimension is the stride between consecutive rows
  // ldb = max(1,n) when transb is false, otherwise ldb = max(1,k)
  //
  // @param c       Address of matrix c
  //
  // @param ldc     Leading dimension of 'c' matrix. Since TF uses row-major
  // layout, leading dimension is the stride between consecutive rows, max(1,n)
  //
  // --------------------------------------------------------------------------
  void MklBlasGemm(bool transa, bool transb, const int m, const int n,
                   const int k, const float* a, const int lda, const float* b,
                   const int ldb, float* c, const int ldc) {
    // BLAS GEMM API defines Matrix Multiplication as c = alpha * op(a) * op(b)
    // + beta * c.
    // Since TF MatMul does not have parameters for alpha, beta, we set them to
    // 1.0 and 0.0 respectively.
    const float alpha = 1.0f;
    const float beta = 0.0f;
#if defined(INTEL_MKL_DNN_ONLY)
    const char* const ftrans[] = {"N", "T", "C"};
    int index_transa = transa ? 1 : 0;
    int index_transb = transb ? 1 : 0;
    VLOG(2) << "MKL DNN SGEMM called";
    // MKL DNN only supports the Fortran api and requires column major while
    // Tensorflow uses row major so we reverse the order A and B
    // NOTE(review): any status returned by mkldnn_sgemm is not inspected
    // here -- confirm against the MKL-DNN API whether it should be checked.
    mkldnn_sgemm(ftrans[index_transb], ftrans[index_transa], &n, &m, &k, &alpha,
                 b, &ldb, a, &lda, &beta, c, &ldc);
#else
    // MKL ML binary uses CBLAS API
    cblas_sgemm(CblasRowMajor, transa ? CblasTrans : CblasNoTrans,
                transb ? CblasTrans : CblasNoTrans, m, n, k, alpha, a, lda, b,
                ldb, beta, c, ldc);
#endif
  }

  // MKLDNN only supports SGEMM
#ifndef INTEL_MKL_DNN_ONLY

  // Matrix-Matrix Multiplication with FP64 tensors. For detailed info about
  // parameters, look at FP32 function description.
  void MklBlasGemm(bool transa, bool transb, const int m, const int n,
                   const int k, const double* a, const int lda, const double* b,
                   const int ldb, double* c, const int ldc) {
    const double alpha = 1.0;
    const double beta = 0.0;
    cblas_dgemm(CblasRowMajor, transa ? CblasTrans : CblasNoTrans,
                transb ? CblasTrans : CblasNoTrans, m, n, k, alpha, a, lda, b,
                ldb, beta, c, ldc);
  }

  // Matrix-Matrix Multiplication with Complex64 (std::complex<float>) tensors.
  // For detailed info about parameters, look at FP32 function description.
  // A set transpose flag maps to CblasTrans (plain transpose), not to
  // CblasConjTrans.
  void MklBlasGemm(bool transa, bool transb, const int m, const int n,
                   const int k, const complex64* a, const int lda,
                   const complex64* b, const int ldb, complex64* c,
                   const int ldc) {
    const MKL_Complex8 alpha = {1.0f, 0.0f};
    const MKL_Complex8 beta = {0.0f, 0.0f};
    cblas_cgemm(CblasRowMajor, transa ? CblasTrans : CblasNoTrans,
                transb ? CblasTrans : CblasNoTrans, m, n, k, &alpha,
                reinterpret_cast<const MKL_Complex8*>(a), lda,
                reinterpret_cast<const MKL_Complex8*>(b), ldb, &beta,
                reinterpret_cast<MKL_Complex8*>(c), ldc);
  }

  // Matrix-Matrix Multiplication with Complex128 (std::complex<double>)
  // tensors. For detailed info about parameters, look at FP32 function
  // description. A set transpose flag maps to CblasTrans (plain transpose),
  // not to CblasConjTrans.
  void MklBlasGemm(bool transa, bool transb, const int m, const int n,
                   const int k, const complex128* a, const int lda,
                   const complex128* b, const int ldb, complex128* c,
                   const int ldc) {
    const MKL_Complex16 alpha = {1.0, 0.0};
    const MKL_Complex16 beta = {0.0, 0.0};
    cblas_zgemm(CblasRowMajor, transa ? CblasTrans : CblasNoTrans,
                transb ? CblasTrans : CblasNoTrans, m, n, k, &alpha,
                reinterpret_cast<const MKL_Complex16*>(a), lda,
                reinterpret_cast<const MKL_Complex16*>(b), ldb, &beta,
                reinterpret_cast<MKL_Complex16*>(c), ldc);
  }
#endif  // !INTEL_MKL_DNN_ONLY
};
// Registers MklMatMulOp<CPUDevice, T, false> as the DEVICE_CPU kernel for the
// "_MklMatMul" op, constrained to element type T and tagged with the
// mkl_op_registry::kMklNameChangeOpLabel label.
#define REGISTER_CPU(T) \
REGISTER_KERNEL_BUILDER( \
Name("_MklMatMul") \
.Device(DEVICE_CPU) \
.TypeConstraint<T>("T") \
.Label(mkl_op_registry::kMklNameChangeOpLabel), \
MklMatMulOp<CPUDevice, T, false /* cublas, ignored for CPU */>);
// Kernel registrations, emitted only when ENABLE_MKL is defined. float is
// always registered; double, complex64 and complex128 are registered only
// when INTEL_MKL_DNN_ONLY is not defined, matching the set of MklBlasGemm
// overloads that MklMatMulOp provides in each configuration.
#ifdef ENABLE_MKL
// TODO(inteltf) Consider template specialization when adding/removing
// additional types
TF_CALL_float(REGISTER_CPU);
#ifndef INTEL_MKL_DNN_ONLY
TF_CALL_double(REGISTER_CPU);
TF_CALL_complex64(REGISTER_CPU);
TF_CALL_complex128(REGISTER_CPU);
#endif // !INTEL_MKL_DNN_ONLY
#endif // ENABLE_MKL
} // namespace tensorflow
#endif // INTEL_MKL