| /* Copyright 2019 The TensorFlow Authors. All Rights Reserved. |
| |
| Licensed under the Apache License, Version 2.0 (the "License"); |
| you may not use this file except in compliance with the License. |
| You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| ==============================================================================*/ |
| |
// Implements a quantized eight-bit version of the matmul operation with bias,
// relu and requantization fusion support utilizing the MKL-DNN u8s8s32
// inner-product API. Right now, this version supports
// - Input: quantized as uint8 via either MIN_FIRST or SCALE mode.
//          SCALE mode is selected when the input is guaranteed to be
//          non-negative, e.g., when the MatMul is fed by Relu. Otherwise,
//          MIN_FIRST is selected.
// - Weight: quantized to int8 via SCALE mode.
// - Bias: float32/int32. For int32, it is quantized according to input and
//         filter min-max values.
// Other input combinations are not supported yet.
// When the input is quantized to uint8 via MIN_FIRST, the bias needs
// compensation. The detailed algorithm is illustrated below:
| // |
// Af32 is the original fp32 activation 2D tensor.
// Min(Af32) is the minimum scalar value of Af32.
// Max(Af32) is the maximum scalar value of Af32.
// Qa is the quantization scale for activation.
// Au8 is the quantized unsigned int8 activation tensor.
// With SCALE quantization (used for non-negative Af32), Qa and Au8 can be
// calculated as below:
// Qa = 255.0 / Max(Af32)
// Au8 = round(Qa * Af32).
// With MIN_FIRST quantization, Q'a and A'u8 can be calculated as below:
// Q'a = 255.0 / (Max(Af32) - Min(Af32))
// A'u8 = round(Q'a * (Af32 - Min(Af32) * ones(Af32))),
// where ones(.) is a tensor of all 1s with the same shape as its argument and
// round(.) rounds a number to its nearest integer.
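//
// As an illustrative example (with made-up numbers), suppose Min(Af32) = -1.0
// and Max(Af32) = 3.0. MIN_FIRST quantization then gives
// Q'a = 255.0 / (3.0 - (-1.0)) = 63.75,
// so an activation value of 0.5 maps to
// A'u8 = round(63.75 * (0.5 - (-1.0))) = round(95.625) = 96.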
| // |
// Wf32 is the original fp32 2D weight tensor.
// MaxAbs(Wf32) is the maximum absolute scalar value of Wf32.
// Qw is the quantization scale of the weight.
// Ws8 is the quantized signed int8 weight tensor.
// Qw and Ws8 can be calculated as below:
// Qw = 127.0 / MaxAbs(Wf32)
// Ws8 = round(Qw * Wf32).
| // |
// Bf32 is the original fp32 1D bias tensor matching the innermost dim of Wf32.
// With SCALE quantization of activation, the scaled bias, Bs32, is calculated
// as below:
// Bs32 = Qa * Qw * Bf32.
// With MIN_FIRST quantization of activation, the scaled bias tensor with
// compensation, B's32, is calculated as below:
// B's32 = Q'a * Qw * Bf32 + Q'a * Qw * Min(Af32) * 1 * Wf32
//       = Q'a * Qw * Bf32 + Q'a * Min(Af32) * 1 * Ws8,
// where 1 denotes a row vector matching the outermost dim of Wf32.
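//
// In other words, 1 * Ws8 is the per-output-channel column sum of the
// quantized weight matrix, so the compensation term adds Q'a * Min(Af32)
// times that column sum to every bias element. This is exactly what the
// MIN_FIRST path of GetBiasHandle() below computes.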
| // |
// The QuantizedMatMulWithBias op calculates the 32-bit integer output as
// below:
// - with SCALE activation quantization:
// Xs32 = Au8 * Ws8 + 1' * Bs32
//      = Qa * Qw * Af32 * Wf32 + Qa * Qw * 1' * Bf32
//      = Qa * Qw * (Af32 * Wf32 + 1' * Bf32) = Qa * Qw * Xf32,
// where 1' denotes a column vector matching the outermost dim of Af32 and
// Xf32 represents the output of the original fp32 MatMul with BiasAdd fusion.
//
// - with MIN_FIRST activation quantization:
// Xs32 = A'u8 * Ws8 + 1' * B's32
//      = Q'a * (Af32 - Min(Af32) * ones(Af32)) * Qw * Wf32 +
//        Q'a * Qw * 1' * Bf32 + Q'a * Qw * Min(Af32) * 1' * 1 * Wf32
//      = Q'a * Qw * (Af32 * Wf32 + 1' * Bf32)
//      = Q'a * Qw * Xf32.
// Note that 1' * 1 = ones(Af32).
| // |
// The QuantizedMatMulWithBiasAndRelu op does the same calculation as above
// except that it applies the relu function to the 32-bit integer output.
//
// The QuantizedMatMulWithBiasAndReluAndRequantize op does one more
// requantization step on top of the above. Since the fusion ends with a Relu,
// the activation Xf32 at the Relu in the original fp32 graph is guaranteed to
// be non-negative. The requantization scale Qr is calculated from offline
// calibration:
// Qr = 255.0 / Max(Xf32)
// Xu8 = round(Qr * Xf32).
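//
// As an illustrative example (with a made-up calibration value), if
// Max(Xf32) = 6.0, then Qr = 255.0 / 6.0 = 42.5, and an fp32 output of 2.0
// requantizes to Xu8 = round(42.5 * 2.0) = 85.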
| // |
// More information about this implementation can be found in:
| // https://software.intel.com/en-us/articles/lower-numerical-precision-deep-learning-inference-and-training |
| #ifdef INTEL_MKL |
| |
| #include "mkldnn.h" |
| #include "tensorflow/core/framework/op.h" |
| #include "tensorflow/core/framework/op_kernel.h" |
| #include "tensorflow/core/framework/register_types.h" |
| #include "tensorflow/core/kernels/fill_functor.h" |
| #include "tensorflow/core/kernels/mkl_quantized_conv_ops.h" |
| #include "tensorflow/core/kernels/no_op.h" |
| #include "tensorflow/core/lib/core/errors.h" |
| #include "tensorflow/core/util/mkl_util.h" |
| |
| using mkldnn::inner_product_forward; |
| using mkldnn::prop_kind; |
| using mkldnn::stream; |
| |
| namespace { |
| enum { |
| QUANTIZE_MODE_MIN_FIRST, |
| QUANTIZE_MODE_SCALED, |
| }; |
| } // namespace |
| |
| namespace tensorflow { |
| |
| // This structure aggregates multiple inputs to MklDnnMatMul* methods. |
| struct MklDnnMatMulFwdParams { |
| memory::dims src_dims; |
| memory::dims weight_dims; |
| memory::dims bias_dims; |
| memory::dims dst_dims; |
| string dtypes = string(""); |
| struct PostOpParam { |
| string name; |
| std::vector<float> param; |
| }; |
| std::vector<PostOpParam> post_op_params; |
| |
| MklDnnMatMulFwdParams(memory::dims src_dims, memory::dims weight_dims, |
| memory::dims bias_dims, memory::dims dst_dims) |
| : src_dims(src_dims), |
| weight_dims(weight_dims), |
| bias_dims(bias_dims), |
| dst_dims(dst_dims) {} |
| }; |
| |
| // With quantization, input, weight, bias, and output can have different types. |
| // So we use different template parameters for each type. |
| // TODO(intel-tf): The template type "T" is currently used to match the |
| // templatized class MklPrimitiveFactory (tensorflow/core/util/mkl_util.h). |
// In the future, once "T" is removed from MklPrimitiveFactory, this class
// should drop "T" as well.
| template <typename T, typename Tinput, typename Tweight, typename Tbias, |
| typename Toutput> |
| class MklDnnMatMulFwdPrimitive : public MklPrimitive { |
| public: |
| explicit MklDnnMatMulFwdPrimitive( |
| const MklDnnMatMulFwdParams& matmulFwdParams) |
| : cpu_engine_(engine::cpu, 0) { |
| context_.fwd_stream.reset(new stream(stream::kind::eager)); |
| // Create matmul primitive |
| if (context_.matmul_fwd == nullptr) { |
| Setup(matmulFwdParams); |
| } |
| } |
| |
| ~MklDnnMatMulFwdPrimitive() {} |
| |
| // Inner-product forward execute with bias: |
| // - src_data: input data buffer of src |
| // - weight_data: input data buffer of weight |
| // - bias_data: input data buffer of bias |
| // - dst_data: output data buffer of dst |
| void Execute(const Tinput* src_data, const Tweight* weight_data, |
| const Tbias* bias_data, Toutput* dst_data) { |
| context_.src_mem->set_data_handle( |
| static_cast<void*>(const_cast<Tinput*>(src_data))); |
| context_.weight_mem->set_data_handle( |
| static_cast<void*>(const_cast<Tweight*>(weight_data))); |
| context_.bias_mem->set_data_handle( |
| static_cast<void*>(const_cast<Tbias*>(bias_data))); |
| context_.dst_mem->set_data_handle(static_cast<void*>(dst_data)); |
| context_.fwd_stream->submit(context_.fwd_primitives); |
| |
| // After execution, set data handle back |
| context_.src_mem->set_data_handle(DummyData); |
| context_.weight_mem->set_data_handle(DummyData); |
| context_.bias_mem->set_data_handle(DummyData); |
| context_.dst_mem->set_data_handle(DummyData); |
| } |
| |
| memory::format GetSrcMemoryFormat() const { return context_.src_fmt; } |
  memory::format GetWeightMemoryFormat() const { return context_.weight_fmt; }
| std::shared_ptr<mkldnn::inner_product_forward::primitive_desc> |
| GetPrimitiveDesc() const { |
| return context_.fwd_pd; |
| } |
| |
| private: |
| // Primitive reuse context for inner-product Fwd op |
| struct MklDnnMatMulFwdContext { |
| // Expected memory format for this primitive instance |
| memory::format src_fmt; |
| memory::format weight_fmt; |
| |
| // MKL-DNN memory |
| std::shared_ptr<mkldnn::memory> src_mem; |
| std::shared_ptr<mkldnn::memory> weight_mem; |
| std::shared_ptr<mkldnn::memory> bias_mem; |
| std::shared_ptr<mkldnn::memory> dst_mem; |
| |
| // Descriptor and primitive-descriptor for forward inner-product |
| std::shared_ptr<mkldnn::inner_product_forward::desc> fwd_desc; |
| std::shared_ptr<mkldnn::inner_product_forward::primitive_desc> fwd_pd; |
| |
| // Memory descriptors |
| std::shared_ptr<mkldnn::memory::desc> src_md; |
| std::shared_ptr<mkldnn::memory::desc> weight_md; |
| std::shared_ptr<mkldnn::memory::desc> bias_md; |
| std::shared_ptr<mkldnn::memory::desc> dst_md; |
| |
| // Inner-product primitive |
| std::shared_ptr<mkldnn::primitive> matmul_fwd; |
| std::shared_ptr<mkldnn::stream> fwd_stream; |
| std::vector<mkldnn::primitive> fwd_primitives; |
| |
| MklDnnMatMulFwdContext() |
| : src_fmt(memory::format::any), |
| weight_fmt(memory::format::any), |
| src_mem(nullptr), |
| weight_mem(nullptr), |
| bias_mem(nullptr), |
| dst_mem(nullptr), |
| fwd_desc(nullptr), |
| fwd_pd(nullptr), |
| src_md(nullptr), |
| weight_md(nullptr), |
| bias_md(nullptr), |
| matmul_fwd(nullptr), |
| fwd_stream(nullptr) {} |
| }; |
| |
| void Setup(const MklDnnMatMulFwdParams& matmul_fwd_params) { |
| // Create memory descriptors for inner-product data with no specified format |
| context_.src_md.reset(new memory::desc({matmul_fwd_params.src_dims}, |
| MklDnnType<Tinput>(), |
| memory::format::any)); |
| |
| context_.weight_md.reset(new memory::desc({matmul_fwd_params.weight_dims}, |
| MklDnnType<Tweight>(), |
| memory::format::any)); |
| |
| context_.dst_md.reset(new memory::desc({matmul_fwd_params.dst_dims}, |
| MklDnnType<Toutput>(), |
| memory::format::any)); |
| |
| context_.bias_md.reset(new memory::desc({matmul_fwd_params.bias_dims}, |
| MklDnnType<Tbias>(), |
| memory::format::any)); |
    // Create an inner-product forward descriptor and primitive descriptor.
| context_.fwd_desc.reset(new inner_product_forward::desc( |
| prop_kind::forward_inference, *context_.src_md, *context_.weight_md, |
| *context_.bias_md, *context_.dst_md)); |
| context_.fwd_pd.reset(new inner_product_forward::primitive_desc( |
| *context_.fwd_desc, cpu_engine_)); |
| // Check if there is any fusion as post-ops |
| auto const& post_op_params = matmul_fwd_params.post_op_params; |
| mkldnn::primitive_attr post_ops_attr; |
| mkldnn::post_ops post_ops; |
| if (!post_op_params.empty()) { |
| for (auto const& post_op_param : post_op_params) { |
| if (post_op_param.name == "relu") { |
| DCHECK_EQ(post_op_param.param.size(), 3); |
| float op_scale = post_op_param.param[0]; |
| float op_alpha = post_op_param.param[1]; |
| float op_beta = post_op_param.param[2]; |
| post_ops.append_eltwise(op_scale, mkldnn::eltwise_relu, op_alpha, |
| op_beta); |
| } else if (post_op_param.name == "output_scale") { |
| DCHECK_EQ(post_op_param.param.size(), 1); |
| std::vector<float> scales; |
| scales.push_back(post_op_param.param[0]); |
| post_ops_attr.set_output_scales(0, scales); |
| } else { |
| DCHECK((post_op_param.name == "relu") || |
| (post_op_param.name == "output_scale")); |
| } |
| } |
| post_ops_attr.set_post_ops(post_ops); |
| context_.fwd_pd.reset(new inner_product_forward::primitive_desc( |
| *context_.fwd_desc, post_ops_attr, cpu_engine_)); |
| } else { |
| context_.fwd_pd.reset(new inner_product_forward::primitive_desc( |
| *context_.fwd_desc, cpu_engine_)); |
| } |
| |
| // Store the expected memory format |
| context_.src_fmt = static_cast<mkldnn::memory::format>( |
| context_.fwd_pd.get()->src_primitive_desc().desc().data.format); |
| |
| context_.weight_fmt = static_cast<mkldnn::memory::format>( |
| context_.fwd_pd.get()->weights_primitive_desc().desc().data.format); |
| |
| // Create memory primitive based on dummy data |
| context_.src_mem.reset( |
| new memory(context_.fwd_pd.get()->src_primitive_desc(), DummyData)); |
| context_.weight_mem.reset( |
| new memory(context_.fwd_pd.get()->weights_primitive_desc(), DummyData)); |
| context_.dst_mem.reset( |
| new memory(context_.fwd_pd.get()->dst_primitive_desc(), DummyData)); |
| context_.bias_mem.reset(new memory({{{matmul_fwd_params.bias_dims}, |
| MklDnnType<Tbias>(), |
| memory::format::x}, |
| cpu_engine_}, |
| DummyData)); |
| |
| // Create inner-product primitive |
| context_.matmul_fwd.reset(new inner_product_forward( |
| *context_.fwd_pd, *context_.src_mem, *context_.weight_mem, |
| *context_.bias_mem, *context_.dst_mem)); |
| |
| context_.fwd_primitives.push_back(*context_.matmul_fwd); |
| return; |
| } |
| |
| struct MklDnnMatMulFwdContext context_; |
| engine cpu_engine_; |
| }; |
| |
| template <typename T, typename Tinput, typename Tweight, typename Tbias, |
| typename Toutput> |
| class MklDnnMatMulFwdPrimitiveFactory : public MklPrimitiveFactory<T> { |
| public: |
| static MklDnnMatMulFwdPrimitive<T, Tinput, Tweight, Tbias, Toutput>* Get( |
| const MklDnnMatMulFwdParams& mkldnn_matmul_fwd_dims, bool do_not_cache) { |
| MklDnnMatMulFwdPrimitive<T, Tinput, Tweight, Tbias, Toutput>* matmul_fwd = |
| nullptr; |
| |
| if (do_not_cache) { |
| // Always create new primitive |
| matmul_fwd = |
| new MklDnnMatMulFwdPrimitive<T, Tinput, Tweight, Tbias, Toutput>( |
| mkldnn_matmul_fwd_dims); |
| } else { |
      // Try to find a suitable primitive in the cached pool.
| matmul_fwd = dynamic_cast< |
| MklDnnMatMulFwdPrimitive<T, Tinput, Tweight, Tbias, Toutput>*>( |
| MklDnnMatMulFwdPrimitiveFactory<T, Tinput, Tweight, Tbias, |
| Toutput>::GetInstance() |
| .GetMklDnnMatMulFwd(mkldnn_matmul_fwd_dims)); |
| if (matmul_fwd == nullptr) { |
| matmul_fwd = |
| new MklDnnMatMulFwdPrimitive<T, Tinput, Tweight, Tbias, Toutput>( |
| mkldnn_matmul_fwd_dims); |
| MklDnnMatMulFwdPrimitiveFactory<T, Tinput, Tweight, Tbias, |
| Toutput>::GetInstance() |
| .SetMklDnnMatMulFwd(mkldnn_matmul_fwd_dims, matmul_fwd); |
| } |
| } |
| return matmul_fwd; |
| } |
| |
| private: |
| MklDnnMatMulFwdPrimitiveFactory() {} |
| ~MklDnnMatMulFwdPrimitiveFactory() {} |
| |
| static MklDnnMatMulFwdPrimitiveFactory& GetInstance() { |
| static MklDnnMatMulFwdPrimitiveFactory instance_; |
| return instance_; |
| } |
| |
| static string CreateKey(const MklDnnMatMulFwdParams& mkldnn_matmul_fwd_dims) { |
| string prefix = "matmul_fwd_"; |
| FactoryKeyCreator key_creator; |
| key_creator.AddAsKey(prefix); |
| key_creator.AddAsKey(mkldnn_matmul_fwd_dims.src_dims); |
| key_creator.AddAsKey(mkldnn_matmul_fwd_dims.weight_dims); |
| key_creator.AddAsKey(mkldnn_matmul_fwd_dims.bias_dims); |
| key_creator.AddAsKey(mkldnn_matmul_fwd_dims.dst_dims); |
| key_creator.AddAsKey(mkldnn_matmul_fwd_dims.dtypes); |
| |
| // Generate keys for post-ops |
| for (auto const& post_op_param : mkldnn_matmul_fwd_dims.post_op_params) { |
| if (post_op_param.name == "relu") { |
| DCHECK_EQ(post_op_param.param.size(), 3); |
| key_creator.AddAsKey(post_op_param.name); |
| key_creator.AddAsKey(post_op_param.param[0]); |
| key_creator.AddAsKey(post_op_param.param[1]); |
| key_creator.AddAsKey(post_op_param.param[2]); |
| } else if (post_op_param.name == "output_scale") { |
| DCHECK_EQ(post_op_param.param.size(), 1); |
| key_creator.AddAsKey(post_op_param.name); |
| key_creator.AddAsKey(post_op_param.param[0]); |
| } else { |
| return string("not_a_key"); |
| } |
| } |
| return key_creator.GetKey(); |
| } |
| |
| MklPrimitive* GetMklDnnMatMulFwd( |
| const MklDnnMatMulFwdParams& mkldnn_matmul_fwd_dims) { |
| string key = CreateKey(mkldnn_matmul_fwd_dims); |
| return this->GetOp(key); |
| } |
| |
| void SetMklDnnMatMulFwd(const MklDnnMatMulFwdParams& mkldnn_matmul_fwd_dims, |
| MklPrimitive* op) { |
| string key = CreateKey(mkldnn_matmul_fwd_dims); |
| this->SetOp(key, op); |
| } |
| }; |
| |
| typedef Eigen::ThreadPoolDevice CPUDevice; |
| |
| template <typename Device, typename Tinput, typename Tweight, typename Tbias, |
| typename Toutput> |
| class MklDnnQuantizedMatMulOp : public OpKernel { |
| public: |
| virtual ~MklDnnQuantizedMatMulOp() { |
| if (this->input_bias_ != nullptr) { |
| delete this->input_bias_; |
| input_bias_ = nullptr; |
| } |
| if (this->scaled_bias_ != nullptr) { |
| delete this->scaled_bias_; |
| scaled_bias_ = nullptr; |
| } |
| if (this->comp_bias_ != nullptr) { |
      delete[] this->comp_bias_;
| comp_bias_ = nullptr; |
| } |
| } |
| |
| float* GetCompBiasBuffer(int size) { |
| if (comp_bias_ == nullptr) { |
| comp_bias_ = new float[size]; |
| } |
| return comp_bias_; |
| } |
| |
| explicit MklDnnQuantizedMatMulOp(OpKernelConstruction* context) |
| : OpKernel(context) { |
| string mode_string; |
| OP_REQUIRES_OK(context, context->GetAttr("input_quant_mode", &mode_string)); |
| if (mode_string == "MIN_FIRST") { |
| mode_ = QUANTIZE_MODE_MIN_FIRST; |
| } else if (mode_string == "SCALED") { |
| mode_ = QUANTIZE_MODE_SCALED; |
| } else { |
| context->CtxFailure(errors::InvalidArgument( |
| "Quantization mode must be either MIN_FIRST or SCALED, but received ", |
| mode_string)); |
| } |
| } |
| |
| void Compute(OpKernelContext* context) override { |
| try { |
| // Input tensors |
| const Tensor& src_tensor = MklGetInput(context, kInputIndexSrc); |
| const Tensor& weight_tensor = MklGetInput(context, kInputIndexWeight); |
| const Tensor& bias_tensor = MklGetInput(context, kInputIndexBias); |
| |
| MklDnnShape src_mkl_shape, weight_mkl_shape; |
| GetMklShape(context, kInputIndexSrc, &src_mkl_shape); |
| GetMklShape(context, kInputIndexWeight, &weight_mkl_shape); |
| OP_REQUIRES(context, !weight_mkl_shape.IsMklTensor(), |
| errors::InvalidArgument("Weight should not be in " |
| "MKL Layout")); |
| |
| MklDnnData<Tinput> src(&cpu_engine_); |
| MklDnnData<Tweight> weight(&cpu_engine_); |
| |
| memory::dims src_dims, weight_dims; |
| memory::dims dst_dims_tf_order, dst_dims_mkl_order; |
| |
| // Get shapes of input tensors in MKL-DNN order |
| auto src_tf_shape = src_mkl_shape.IsMklTensor() |
| ? src_mkl_shape.GetTfShape() |
| : src_tensor.shape(); |
| auto weight_tf_shape = weight_mkl_shape.IsMklTensor() |
| ? weight_mkl_shape.GetTfShape() |
| : weight_tensor.shape(); |
| |
| src_dims = TFShapeToMklDnnDims(src_tf_shape); |
| weight_dims = TFShapeToMklDnnDims(weight_tf_shape); |
| dst_dims_mkl_order = {static_cast<int>(src_tf_shape.dim_size(0)), |
| static_cast<int>(weight_tf_shape.dim_size(1))}; |
| |
| // Weight dims need to be reversed to create inner-product forward |
| // descriptor |
| weight_dims = {static_cast<int>(weight_tf_shape.dim_size(1)), |
| static_cast<int>(weight_tf_shape.dim_size(0))}; |
| |
| // Create memory for user data. |
      // Describe what the inputs and outputs of the inner-product look like.
      // Also specify buffers containing the actual input and output data.
| Tensor* dst_tensor = nullptr; |
| auto input_output_fmt = memory::format::nc; |
| |
      // If the input is in MKL layout, simply take that layout; otherwise,
      // construct a TF layout. For the TF layout, although the required input
      // shape (src_dims) is in MKL-DNN order, the layout itself is
      // TensorFlow's layout depending on the data format.
| auto src_md = |
| src_mkl_shape.IsMklTensor() |
| ? src_mkl_shape.GetMklLayout() |
| : memory::desc(src_dims, MklDnnType<Tinput>(), input_output_fmt); |
| src.SetUsrMem(src_md, &src_tensor); |
| |
      // Although the required weight shape (weight_dims) is in MKL-DNN order,
      // the layout is TensorFlow's layout.
| auto weight_md = weight_mkl_shape.IsMklTensor() |
| ? weight_mkl_shape.GetMklLayout() |
| : memory::desc(weight_dims, MklDnnType<Tweight>(), |
| memory::format::io); |
| weight.SetUsrMem(weight_md, &weight_tensor); |
| |
| MklDnnMatMulFwdPrimitive<float, Tinput, Tweight, Tbias, Toutput>* |
| matmul_fwd = nullptr; |
| memory::dims bias_dims = {static_cast<int>(bias_tensor.dim_size(0))}; |
| |
| MklDnnMatMulFwdParams matmul_fwd_dims(src_dims, weight_dims, bias_dims, |
| dst_dims_mkl_order); |
| |
| // Extend the basic parameters for data types and fusions. |
| this->ExtendMklDnnMatMulFwdParams(context, matmul_fwd_dims); |
| |
| // Get a MatMul fwd from primitive pool. |
| matmul_fwd = |
| MklDnnMatMulFwdPrimitiveFactory<float, Tinput, Tweight, Tbias, |
| Toutput>::Get(matmul_fwd_dims, 0); |
| |
| // Allocate output Tensor. |
| std::shared_ptr<mkldnn::inner_product_forward::primitive_desc> |
| matmul_fwd_pd = matmul_fwd->GetPrimitiveDesc(); |
| AllocateOutputTensor(context, *matmul_fwd_pd, dst_dims_mkl_order, |
| input_output_fmt, &dst_tensor); |
| |
| Toutput* dst_data = |
| reinterpret_cast<Toutput*>(dst_tensor->flat<Toutput>().data()); |
| |
| // Check if src and weight data need to be reordered. |
| Tinput* src_data = nullptr; |
| if (src_md.data.format != matmul_fwd->GetSrcMemoryFormat()) { |
| src.SetUsrMem(src_md, &src_tensor); |
| src.CheckReorderToOpMem(matmul_fwd_pd.get()->src_primitive_desc()); |
| src_data = static_cast<Tinput*>(src.GetOpMem().get_data_handle()); |
| } else { |
| src_data = static_cast<Tinput*>( |
| const_cast<Tinput*>(src_tensor.flat<Tinput>().data())); |
| } |
| Tweight* weight_data = nullptr; |
      if (weight_md.data.format != matmul_fwd->GetWeightMemoryFormat()) {
| weight.SetUsrMem(weight_md, &weight_tensor); |
| weight.CheckReorderToOpMem( |
| matmul_fwd_pd.get()->weights_primitive_desc()); |
| weight_data = |
| static_cast<Tweight*>(weight.GetOpMem().get_data_handle()); |
| } else { |
| weight_data = static_cast<Tweight*>( |
| const_cast<Tweight*>(weight_tensor.flat<Tweight>().data())); |
| } |
| |
| // Execute inner-product |
| Tbias* bias_data = this->GetBiasHandle(context, matmul_fwd_pd, |
| bias_tensor, weight_tensor); |
| matmul_fwd->Execute(src_data, weight_data, bias_data, dst_data); |
| } catch (mkldnn::error& e) { |
| string error_msg = tensorflow::strings::StrCat( |
| "Status: ", e.status, ", message: ", string(e.message), ", in file ", |
| __FILE__, ":", __LINE__); |
| OP_REQUIRES_OK( |
| context, |
| errors::Aborted("Operation received an exception:", error_msg)); |
| } |
| float min_output_value; |
| float max_output_value; |
| if (std::is_same<Toutput, quint8>::value || |
| std::is_same<Toutput, qint8>::value) { |
      // This is the case where the inner-product and requantization are
      // fused. "min_freezed_output" and "max_freezed_output" are the
      // requested range for the output.
| min_output_value = context->input(7).flat<float>()(0); |
| max_output_value = context->input(8).flat<float>()(0); |
| } else { |
| ComputeOutputRangeForInt32(context, &min_output_value, &max_output_value); |
| } |
| |
| Tensor* output_min = nullptr; |
| Tensor* output_max = nullptr; |
| MklDnnShape output_min_mkl_shape, output_max_mkl_shape; |
| output_min_mkl_shape.SetMklTensor(false); |
| output_max_mkl_shape.SetMklTensor(false); |
| AllocateOutputSetMklShape(context, 1, &output_min, {}, |
| output_min_mkl_shape); |
| AllocateOutputSetMklShape(context, 2, &output_max, {}, |
| output_max_mkl_shape); |
| output_min->flat<float>()(0) = min_output_value; |
| output_max->flat<float>()(0) = max_output_value; |
| } |
| |
| protected: |
| void ComputeOutputRangeForInt32(OpKernelContext* context, |
| float* min_output_value, |
| float* max_output_value) { |
| const float min_input = context->input(3).flat<float>()(0); |
| const float max_input = context->input(4).flat<float>()(0); |
| const float min_weight = context->input(5).flat<float>()(0); |
| const float max_weight = context->input(6).flat<float>()(0); |
| MklQuantizationRangeForMultiplication<quint8, qint8, qint32>( |
| min_input, max_input, min_weight, max_weight, min_output_value, |
| max_output_value); |
| } |
| |
| virtual void ExtendMklDnnMatMulFwdParams(OpKernelContext* context, |
| MklDnnMatMulFwdParams& params) { |
| // Append data type names of input, weight, bias, and output. |
| params.dtypes.append(typeid(Tinput).name()); |
| params.dtypes.append(typeid(Tweight).name()); |
| params.dtypes.append(typeid(Tbias).name()); |
| params.dtypes.append(typeid(Toutput).name()); |
| |
    // When the output type is quint8 or qint8, the qint32 result is
    // requantized into that 8-bit type. A post_op "output_scale" is added to
    // do the conversion.
| if (std::is_same<Toutput, quint8>::value || |
| std::is_same<Toutput, qint8>::value) { |
| float min_output_value; |
| float max_output_value; |
| ComputeOutputRangeForInt32(context, &min_output_value, &max_output_value); |
| float scale_int32 = |
| std::max(std::abs(min_output_value), std::abs(max_output_value)); |
| const float min_freezed_output = context->input(7).flat<float>()(0); |
| const float max_freezed_output = context->input(8).flat<float>()(0); |
| float scale_eightbit = |
| std::max(std::abs(min_freezed_output), std::abs(max_freezed_output)); |
| float scale = 1.0; |
| if (std::is_same<Toutput, quint8>::value) |
| scale = scale_int32 / scale_eightbit / static_cast<float>(1 << 23); |
| else |
| scale = scale_int32 / scale_eightbit / static_cast<float>(1 << 24); |
| |
| std::vector<float> output_scale; |
| output_scale.push_back(scale); |
| params.post_op_params.push_back({"output_scale", output_scale}); |
| } |
| } |
| |
  // This function handles bias conversion and compensation for MIN_FIRST and
  // SCALED modes. If the input is quantized via MIN_FIRST,
  //   B's32 = Q'a * Qw * Bf32 + Q'a * Qw * Min(Af32) * 1 * Wf32.
  // If the input is quantized via SCALED,
  //   Bs32 = Qa * Qw * Bf32.
| Tbias* GetBiasHandle( |
| OpKernelContext* context, |
| std::shared_ptr<mkldnn::inner_product_forward::primitive_desc>& |
| mkldnn_matmul_fwd_pd, |
| const Tensor& bias_tensor, const Tensor& weight_tensor) { |
    // If the bias is qint32, it means the bias was already converted offline
    // and it can be added to the matmul output directly.
| if (std::is_same<Tbias, qint32>::value) { |
| return static_cast<Tbias*>( |
| const_cast<Tbias*>(bias_tensor.flat<Tbias>().data())); |
| } else { |
      // If the bias is fp32, it needs to be scaled (and, for MIN_FIRST,
      // compensated) here.
| const float min_input = context->input(3).flat<float>()(0); |
| const float max_input = context->input(4).flat<float>()(0); |
| const float min_weight = context->input(5).flat<float>()(0); |
| const float max_weight = context->input(6).flat<float>()(0); |
| |
| std::vector<mkldnn::primitive> net; |
| float out_scale; |
      // If the bias is float and the input quantization is MIN_FIRST, the
      // bias has to be compensated with
      // B's32 = Q'a * Qw * Bf32 + Q'a * Qw * Min(Af32) * 1 * Wf32.
| if (mode_ == QUANTIZE_MODE_MIN_FIRST) { |
| int k = weight_tensor.dim_size(0); |
| int n = weight_tensor.dim_size(1); |
| float* comp_bias = GetCompBiasBuffer(n); |
| |
| qint8* wt_buf = static_cast<qint8*>( |
| const_cast<qint8*>(weight_tensor.flat<qint8>().data())); |
| |
| const float* bias_buf = static_cast<float*>( |
| const_cast<float*>(bias_tensor.flat<float>().data())); |
| |
| float qa_amin = 255 * min_input / (max_input - min_input); |
| |
| out_scale = (255.0 * 127.0) / |
| ((max_input - min_input) * |
| std::max(std::abs(max_weight), std::abs(min_weight))); |
| |
| #pragma omp parallel for schedule(static) |
| for (int j = 0; j < n; ++j) { |
| int x = 0; |
| for (int i = 0; i < k; ++i) { |
| x += wt_buf[i * n + j]; |
| } |
| comp_bias[j] = |
| ((bias_buf[j] * out_scale) + static_cast<float>(x * qa_amin)); |
| } |
| |
| return reinterpret_cast<Tbias*>(comp_bias_); |
| |
      } else if (mode_ == QUANTIZE_MODE_SCALED) {
        // If the bias is float and the input quantization is SCALED, the bias
        // only has to be scaled with Bs32 = Qa * Qw * Bf32.
        out_scale = 255.0 * 127.0 /
                    (max_input *
                     std::max(std::abs(max_weight), std::abs(min_weight)));
| |
| std::vector<float> scales; |
| scales.push_back(out_scale); |
| mkldnn::primitive_attr bias_attr; |
| bias_attr.set_output_scales(0, scales); |
| |
| void* bias_buf = static_cast<void*>( |
| const_cast<Tbias*>(bias_tensor.flat<Tbias>().data())); |
| input_bias_ = |
| new memory(mkldnn_matmul_fwd_pd->bias_primitive_desc(), bias_buf); |
| scaled_bias_ = new memory(mkldnn_matmul_fwd_pd->bias_primitive_desc()); |
| auto reorder_desc = mkldnn::reorder::primitive_desc( |
| input_bias_->get_primitive_desc(), |
| scaled_bias_->get_primitive_desc(), bias_attr); |
| net.push_back( |
| mkldnn::reorder(reorder_desc, *input_bias_, *scaled_bias_)); |
| stream(stream::kind::eager).submit(net).wait(); |
| return reinterpret_cast<Tbias*>(scaled_bias_->get_data_handle()); |
| } else { |
        context->CtxFailure(
            errors::InvalidArgument("Quantization mode must be "
                                    "either MIN_FIRST or SCALED."));
| return nullptr; |
| } |
| } |
| } |
| |
| // Allocate output tensor. |
| virtual void AllocateOutputTensor( |
| OpKernelContext* context, |
| const inner_product_forward::primitive_desc& mkldnn_matmul_prim_desc, |
| const memory::dims& output_dims_mkl_order, |
| memory::format output_tf_format, Tensor** output_tensor) { |
| DCHECK(output_tensor); |
| auto dst_pd = mkldnn_matmul_prim_desc.dst_primitive_desc(); |
| |
| MklDnnShape output_mkl_shape; |
| output_mkl_shape.SetMklTensor(true); |
| output_mkl_shape.SetMklLayout(&dst_pd); |
| output_mkl_shape.SetElemType(MklDnnType<Toutput>()); |
| output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(), |
| output_dims_mkl_order, output_tf_format, true); |
| |
| TensorShape output_tf_shape; |
| output_tf_shape.AddDim((dst_pd.get_size() / sizeof(Toutput))); |
| |
| // Allocate Output Tensor |
| AllocateOutputSetMklShape(context, kOutputIndexDst, output_tensor, |
| output_tf_shape, output_mkl_shape); |
| } |
| |
| engine cpu_engine_ = engine(engine::cpu, 0); |
| |
| private: |
| memory* input_bias_ = nullptr; |
| memory* scaled_bias_ = nullptr; |
| |
| // Buffer to save the compensated bias |
| float* comp_bias_ = nullptr; |
| |
| const int kInputIndexSrc = 0; |
| const int kInputIndexWeight = 1; |
| const int kInputIndexBias = 2; |
| const int kOutputIndexDst = 0; |
| |
| int mode_; |
| }; |
| |
| template <typename Device, typename Tinput, typename Tweight, typename Tbias, |
| typename Toutput> |
| class MklDnnQuantizedMatMulReluOp |
| : public MklDnnQuantizedMatMulOp<Device, Tinput, Tweight, Tbias, Toutput> { |
| public: |
| virtual ~MklDnnQuantizedMatMulReluOp() {} |
| |
| explicit MklDnnQuantizedMatMulReluOp(OpKernelConstruction* context) |
| : MklDnnQuantizedMatMulOp<Device, Tinput, Tweight, Tbias, Toutput>( |
| context) {} |
| |
| protected: |
| void ExtendMklDnnMatMulFwdParams(OpKernelContext* context, |
| MklDnnMatMulFwdParams& params) override { |
    MklDnnQuantizedMatMulOp<Device, Tinput, Tweight, Tbias,
                            Toutput>::ExtendMklDnnMatMulFwdParams(context,
                                                                  params);
| params.post_op_params.push_back({"relu", {1.0, 0.0, 0.0}}); |
| } |
| }; |
| |
// Register a NoOp kernel for QuantizedMatMulWithBias to get a python
// interface. This kernel will be replaced by an MKL kernel during the
// graph-optimization pass.
| REGISTER_KERNEL_BUILDER(Name("QuantizedMatMulWithBias") |
| .Device(DEVICE_CPU) |
| .TypeConstraint<quint8>("T1") |
| .TypeConstraint<qint8>("T2") |
| .TypeConstraint<qint32>("Toutput"), |
| NoOp); |
| |
| REGISTER_KERNEL_BUILDER( |
| Name("_MklQuantizedMatMulWithBias") |
| .Device(DEVICE_CPU) |
| .TypeConstraint<quint8>("T1") |
| .TypeConstraint<qint8>("T2") |
| .TypeConstraint<float>("Tbias") |
| .TypeConstraint<qint32>("Toutput") |
| .Label(mkl_op_registry::kMklQuantizedOpLabel), |
| MklDnnQuantizedMatMulOp<CPUDevice, quint8, qint8, float, qint32>); |
| REGISTER_KERNEL_BUILDER( |
| Name("_MklQuantizedMatMulWithBias") |
| .Device(DEVICE_CPU) |
| .TypeConstraint<quint8>("T1") |
| .TypeConstraint<qint8>("T2") |
| .TypeConstraint<qint32>("Tbias") |
| .TypeConstraint<qint32>("Toutput") |
| .Label(mkl_op_registry::kMklQuantizedOpLabel), |
| MklDnnQuantizedMatMulOp<CPUDevice, quint8, qint8, qint32, qint32>); |
| |
// Register a NoOp kernel for QuantizedMatMulWithBiasAndRelu to get a python
// interface. This kernel will be replaced by an MKL kernel during the
// graph-optimization pass.
| REGISTER_KERNEL_BUILDER(Name("QuantizedMatMulWithBiasAndRelu") |
| .Device(DEVICE_CPU) |
| .TypeConstraint<quint8>("T1") |
| .TypeConstraint<qint8>("T2") |
| .TypeConstraint<qint32>("Toutput"), |
| NoOp); |
// Register a NoOp kernel for QuantizedMatMulWithBiasAndReluAndRequantize to
// get a python interface. This kernel will be replaced by an MKL kernel
// during the graph-optimization pass.
| REGISTER_KERNEL_BUILDER(Name("QuantizedMatMulWithBiasAndReluAndRequantize") |
| .Device(DEVICE_CPU) |
| .TypeConstraint<quint8>("T1") |
| .TypeConstraint<qint8>("T2") |
| .TypeConstraint("Tbias", {DT_QINT32, DT_FLOAT}) |
| .TypeConstraint<quint8>("Toutput"), |
| NoOp); |
| |
| // Register a templatized implementation of _MklQuantizedMatMulWithBiasAndRelu. |
| REGISTER_KERNEL_BUILDER( |
| Name("_MklQuantizedMatMulWithBiasAndRelu") |
| .Device(DEVICE_CPU) |
| .TypeConstraint<quint8>("T1") |
| .TypeConstraint<qint8>("T2") |
| .TypeConstraint<qint32>("Toutput") |
| .Label(mkl_op_registry::kMklQuantizedOpLabel), |
| MklDnnQuantizedMatMulReluOp<CPUDevice, quint8, qint8, float, qint32>); |
| // Register a templatized implementation of |
| // _MklQuantizedMatMulWithBiasAndReluAndRequantize. |
| REGISTER_KERNEL_BUILDER( |
| Name("_MklQuantizedMatMulWithBiasAndReluAndRequantize") |
| .Device(DEVICE_CPU) |
| .TypeConstraint<quint8>("T1") |
| .TypeConstraint<qint8>("T2") |
| .TypeConstraint<qint32>("Tbias") |
| .TypeConstraint<quint8>("Toutput") |
| .Label(mkl_op_registry::kMklQuantizedOpLabel), |
| MklDnnQuantizedMatMulReluOp<CPUDevice, quint8, qint8, qint32, quint8>); |
| REGISTER_KERNEL_BUILDER( |
| Name("_MklQuantizedMatMulWithBiasAndReluAndRequantize") |
| .Device(DEVICE_CPU) |
| .TypeConstraint<quint8>("T1") |
| .TypeConstraint<qint8>("T2") |
| .TypeConstraint<float>("Tbias") |
| .TypeConstraint<quint8>("Toutput") |
| .Label(mkl_op_registry::kMklQuantizedOpLabel), |
| MklDnnQuantizedMatMulReluOp<CPUDevice, quint8, qint8, float, quint8>); |
| |
| } // namespace tensorflow |
| |
| #endif // INTEL_MKL |