/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/core/grappler/costs/op_level_cost_estimator.h"
#include "third_party/eigen3/Eigen/Core"
#include "tensorflow/core/framework/attr_value.pb.h"
#include "tensorflow/core/framework/attr_value_util.h"
#include "tensorflow/core/framework/tensor.pb.h"
#include "tensorflow/core/framework/tensor_shape.pb.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/grappler/clusters/utils.h"
#include "tensorflow/core/grappler/costs/utils.h"
namespace tensorflow {
namespace grappler {
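// A multiply-accumulate (MAC) counts as two ops: one multiply and one add.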
constexpr int kOpsPerMac = 2;
constexpr char kGuaranteeConst[] = "GuaranteeConst";
constexpr char kConv2d[] = "Conv2D";
constexpr char kConv2dBackpropFilter[] = "Conv2DBackpropFilter";
constexpr char kConv2dBackpropInput[] = "Conv2DBackpropInput";
constexpr char kFusedConv2dBiasActivation[] = "FusedConv2DBiasActivation";
constexpr char kDepthwiseConv2dNative[] = "DepthwiseConv2dNative";
constexpr char kDepthwiseConv2dNativeBackpropFilter[] =
"DepthwiseConv2dNativeBackpropFilter";
constexpr char kDepthwiseConv2dNativeBackpropInput[] =
"DepthwiseConv2dNativeBackpropInput";
constexpr char kMatMul[] = "MatMul";
constexpr char kXlaEinsum[] = "XlaEinsum";
constexpr char kEinsum[] = "Einsum";
constexpr char kSparseMatMul[] = "SparseMatMul";
constexpr char kSparseTensorDenseMatMul[] = "SparseTensorDenseMatMul";
constexpr char kPlaceholder[] = "Placeholder";
constexpr char kIdentity[] = "Identity";
constexpr char kIdentityN[] = "IdentityN";
constexpr char kRefIdentity[] = "RefIdentity";
constexpr char kNoOp[] = "NoOp";
constexpr char kReshape[] = "Reshape";
constexpr char kSqueeze[] = "Squeeze";
constexpr char kRecv[] = "_Recv";
constexpr char kSend[] = "_Send";
constexpr char kBatchMatMul[] = "BatchMatMul";
constexpr char kRank[] = "Rank";
constexpr char kShape[] = "Shape";
constexpr char kShapeN[] = "ShapeN";
constexpr char kSize[] = "Size";
constexpr char kStopGradient[] = "StopGradient";
constexpr char kPreventGradient[] = "PreventGradient";
constexpr char kGather[] = "Gather";
constexpr char kGatherV2[] = "GatherV2";
constexpr char kScatterAdd[] = "ScatterAdd";
constexpr char kScatterDiv[] = "ScatterDiv";
constexpr char kScatterMax[] = "ScatterMax";
constexpr char kScatterMin[] = "ScatterMin";
constexpr char kScatterMul[] = "ScatterMul";
constexpr char kScatterSub[] = "ScatterSub";
constexpr char kScatterUpdate[] = "ScatterUpdate";
constexpr char kSlice[] = "Slice";
constexpr char kMaxPool[] = "MaxPool";
constexpr char kMaxPoolGrad[] = "MaxPoolGrad";
constexpr char kAvgPool[] = "AvgPool";
constexpr char kAvgPoolGrad[] = "AvgPoolGrad";
constexpr char kFusedBatchNorm[] = "FusedBatchNorm";
constexpr char kFusedBatchNormGrad[] = "FusedBatchNormGrad";
constexpr char kQuantizedMatMul[] = "QuantizedMatMul";
constexpr char kQuantizedMatMulV2[] = "QuantizedMatMulV2";
// Dynamic control flow ops.
constexpr char kSwitch[] = "Switch";
constexpr char kMerge[] = "Merge";
constexpr char kEnter[] = "Enter";
constexpr char kExit[] = "Exit";
constexpr char kNextIteration[] = "NextIteration";
// Persistent ops.
constexpr char kConst[] = "Const";
constexpr char kVariable[] = "Variable";
constexpr char kVariableV2[] = "VariableV2";
constexpr char kAutoReloadVariable[] = "AutoReloadVariable";
constexpr char kVarHandleOp[] = "VarHandleOp";
constexpr char kVarHandlesOp[] = "_VarHandlesOp";
constexpr char kReadVariableOp[] = "ReadVariableOp";
constexpr char kReadVariablesOp[] = "_ReadVariablesOp";
constexpr char kAssignVariableOp[] = "AssignVariableOp";
constexpr char kAssignAddVariableOp[] = "AssignAddVariableOp";
constexpr char kAssignSubVariableOp[] = "AssignSubVariableOp";
static const Costs::Duration kMinComputeTime(1);
namespace {
string GetDataFormat(const OpInfo& op_info) {
string data_format = "NHWC"; // Default format.
if (op_info.attr().find("data_format") != op_info.attr().end()) {
data_format = op_info.attr().at("data_format").s();
}
return data_format;
}
string GetFilterFormat(const OpInfo& op_info) {
string filter_format = "HWIO"; // Default format.
if (op_info.attr().find("filter_format") != op_info.attr().end()) {
filter_format = op_info.attr().at("filter_format").s();
}
return filter_format;
}
Padding GetPadding(const OpInfo& op_info) {
if (op_info.attr().find("padding") != op_info.attr().end() &&
op_info.attr().at("padding").s() == "VALID") {
return Padding::VALID;
}
return Padding::SAME; // Default padding.
}
bool IsTraining(const OpInfo& op_info) {
if (op_info.attr().find("is_training") != op_info.attr().end() &&
op_info.attr().at("is_training").b()) {
return true;
}
return false;
}
// TODO(dyoon): support non-4D tensors in the cost functions of convolution
// related ops (Conv, Pool, BatchNorm, and their backprops) and the related
// helper functions.
std::vector<int64> GetStrides(const OpInfo& op_info) {
if (op_info.attr().find("strides") != op_info.attr().end()) {
const auto strides = op_info.attr().at("strides").list().i();
DCHECK(strides.size() == 4)
<< "Attr strides is not a length-4 vector: " << op_info.DebugString();
if (strides.size() != 4) return {1, 1, 1, 1};
return {strides[0], strides[1], strides[2], strides[3]};
}
return {1, 1, 1, 1};
}
std::vector<int64> GetKernelSize(const OpInfo& op_info) {
if (op_info.attr().find("ksize") != op_info.attr().end()) {
const auto ksize = op_info.attr().at("ksize").list().i();
DCHECK(ksize.size() == 4)
<< "Attr ksize is not a length-4 vector: " << op_info.DebugString();
if (ksize.size() != 4) return {1, 1, 1, 1};
return {ksize[0], ksize[1], ksize[2], ksize[3]};
}
// Note that FusedBatchNorm doesn't have ksize attr, but GetKernelSize returns
// {1, 1, 1, 1} in that case.
return {1, 1, 1, 1};
}
int64 GetOutputSize(const int64 input, const int64 filter, const int64 stride,
const Padding& padding) {
// Logic for calculating output shape is from GetWindowedOutputSizeVerbose()
// function in third_party/tensorflow/core/framework/common_shape_fns.cc.
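// For example, with input = 7, filter = 3, and stride = 2:
// VALID: (7 - 3 + 2) / 2 = 3; SAME: (7 + 2 - 1) / 2 = 4.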
if (padding == Padding::VALID) {
return (input - filter + stride) / stride;
} else { // SAME.
return (input + stride - 1) / stride;
}
}
// Return the output element count of a binary element-wise op considering
// broadcasting.
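// For example, input shapes [4, 1, 8] and [4, 6, 1] broadcast to an output
// shape of [4, 6, 8], i.e. 192 elements.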
int64 CwiseOutputElementCount(const TensorShapeProto& input_shape_1,
const TensorShapeProto& input_shape_2) {
bool found_unknown_shapes;
int rank = std::max(1, input_shape_1.dim_size());
TensorShapeProto output_shape =
MaybeGetMinimumShape(input_shape_1, rank, &found_unknown_shapes);
if (input_shape_1.dim_size() == input_shape_2.dim_size()) {
auto shape_1 =
MaybeGetMinimumShape(input_shape_1, rank, &found_unknown_shapes);
auto shape_2 =
MaybeGetMinimumShape(input_shape_2, rank, &found_unknown_shapes);
if (shape_1.dim_size() == shape_2.dim_size()) {
for (int i = 0; i < shape_1.dim_size(); i++) {
output_shape.mutable_dim(i)->set_size(
std::max(shape_1.dim(i).size(), shape_2.dim(i).size()));
}
}
}
int64 count = 1;
for (int i = 0; i < output_shape.dim_size(); i++) {
count *= output_shape.dim(i).size();
}
return count;
}
// Helper function for determining whether there are repeated indices in the
// input Einsum equation.
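// For example, "ijj" has a repeated 'j' and returns true, while "ijk" returns
// false.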
bool CheckRepeatedDimensions(const string& dim_str) {
int str_size = dim_str.size();
for (int idx = 0; idx < str_size - 1; idx++) {
if (dim_str.find(dim_str[idx], idx + 1) != std::string::npos) {
return true;
}
}
return false;
}
} // namespace
// Return a minimum shape if the shape is unknown. If known, return the original
// shape.
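// For example, a shape of unknown rank with rank = 4 becomes [1, 1, 1, 1], and
// a partially known shape [-1, 32] with rank = 2 becomes [1, 32].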
TensorShapeProto MaybeGetMinimumShape(const TensorShapeProto& original_shape,
int rank, bool* found_unknown_shapes) {
auto shape = original_shape;
bool is_scalar = !shape.unknown_rank() && shape.dim_size() == 0;
if (shape.unknown_rank() || (!is_scalar && shape.dim_size() < rank)) {
*found_unknown_shapes = true;
VLOG(2) << "Use minimum shape because the rank is unknown.";
// The size of each dimension is at least 1, if unknown.
for (int i = shape.dim_size(); i < rank; i++) {
shape.add_dim()->set_size(1);
}
} else if (is_scalar) {
for (int i = 0; i < rank; i++) {
shape.add_dim()->set_size(1);
}
} else if (shape.dim_size() > rank) {
*found_unknown_shapes = true;
shape.clear_dim();
for (int i = 0; i < rank; i++) {
shape.add_dim()->set_size(original_shape.dim(i).size());
}
} else {
for (int i = 0; i < shape.dim_size(); i++) {
if (shape.dim(i).size() < 0) {
*found_unknown_shapes = true;
VLOG(2) << "Use minimum dim size 1 because the shape is unknown.";
// The size of each dimension is at least 1, if unknown.
shape.mutable_dim(i)->set_size(1);
}
}
}
return shape;
}
OpLevelCostEstimator::OpLevelCostEstimator() {
// Syntactic sugar to build and return a lambda that takes an OpContext and
// returns a cost.
typedef Costs (OpLevelCostEstimator::*CostImpl)(const OpContext& op_context)
const;
auto wrap = [this](CostImpl impl) -> std::function<Costs(const OpContext&)> {
return [this, impl](const OpContext& op_context) {
return (this->*impl)(op_context);
};
};
device_cost_impl_.emplace(kConv2d,
wrap(&OpLevelCostEstimator::PredictConv2D));
device_cost_impl_.emplace(
kConv2dBackpropFilter,
wrap(&OpLevelCostEstimator::PredictConv2DBackpropFilter));
device_cost_impl_.emplace(
kConv2dBackpropInput,
wrap(&OpLevelCostEstimator::PredictConv2DBackpropInput));
device_cost_impl_.emplace(
kFusedConv2dBiasActivation,
wrap(&OpLevelCostEstimator::PredictFusedConv2DBiasActivation));
// Reuse Conv2D for DepthwiseConv2dNative because the calculation is the
// same, although the actual meanings of the parameters are different. See
// comments in PredictConv2D and related functions.
device_cost_impl_.emplace(kDepthwiseConv2dNative,
wrap(&OpLevelCostEstimator::PredictConv2D));
device_cost_impl_.emplace(
kDepthwiseConv2dNativeBackpropFilter,
wrap(&OpLevelCostEstimator::PredictConv2DBackpropFilter));
device_cost_impl_.emplace(
kDepthwiseConv2dNativeBackpropInput,
wrap(&OpLevelCostEstimator::PredictConv2DBackpropInput));
device_cost_impl_.emplace(kMatMul,
wrap(&OpLevelCostEstimator::PredictMatMul));
device_cost_impl_.emplace(kSparseMatMul,
wrap(&OpLevelCostEstimator::PredictMatMul));
device_cost_impl_.emplace(
kSparseTensorDenseMatMul,
wrap(&OpLevelCostEstimator::PredictSparseTensorDenseMatMul));
device_cost_impl_.emplace(kBatchMatMul,
wrap(&OpLevelCostEstimator::PredictBatchMatMul));
device_cost_impl_.emplace(kQuantizedMatMul,
wrap(&OpLevelCostEstimator::PredictMatMul));
device_cost_impl_.emplace(kQuantizedMatMulV2,
wrap(&OpLevelCostEstimator::PredictMatMul));
device_cost_impl_.emplace(kXlaEinsum,
wrap(&OpLevelCostEstimator::PredictEinsum));
device_cost_impl_.emplace(kEinsum,
wrap(&OpLevelCostEstimator::PredictEinsum));
device_cost_impl_.emplace(kNoOp, wrap(&OpLevelCostEstimator::PredictNoOp));
device_cost_impl_.emplace(kGuaranteeConst,
wrap(&OpLevelCostEstimator::PredictNoOp));
device_cost_impl_.emplace(kGather,
wrap(&OpLevelCostEstimator::PredictGatherOrSlice));
device_cost_impl_.emplace(kGatherV2,
wrap(&OpLevelCostEstimator::PredictGatherOrSlice));
device_cost_impl_.emplace(kScatterAdd,
wrap(&OpLevelCostEstimator::PredictScatter));
device_cost_impl_.emplace(kScatterDiv,
wrap(&OpLevelCostEstimator::PredictScatter));
device_cost_impl_.emplace(kScatterMax,
wrap(&OpLevelCostEstimator::PredictScatter));
device_cost_impl_.emplace(kScatterMin,
wrap(&OpLevelCostEstimator::PredictScatter));
device_cost_impl_.emplace(kScatterMul,
wrap(&OpLevelCostEstimator::PredictScatter));
device_cost_impl_.emplace(kScatterSub,
wrap(&OpLevelCostEstimator::PredictScatter));
device_cost_impl_.emplace(kScatterUpdate,
wrap(&OpLevelCostEstimator::PredictScatter));
device_cost_impl_.emplace(kSlice,
wrap(&OpLevelCostEstimator::PredictGatherOrSlice));
device_cost_impl_.emplace(kPlaceholder,
wrap(&OpLevelCostEstimator::PredictIdentity));
device_cost_impl_.emplace(kIdentity,
wrap(&OpLevelCostEstimator::PredictIdentity));
device_cost_impl_.emplace(kIdentityN,
wrap(&OpLevelCostEstimator::PredictIdentity));
device_cost_impl_.emplace(kRefIdentity,
wrap(&OpLevelCostEstimator::PredictIdentity));
device_cost_impl_.emplace(kStopGradient,
wrap(&OpLevelCostEstimator::PredictIdentity));
device_cost_impl_.emplace(kPreventGradient,
wrap(&OpLevelCostEstimator::PredictIdentity));
device_cost_impl_.emplace(kReshape,
wrap(&OpLevelCostEstimator::PredictIdentity));
device_cost_impl_.emplace(kSqueeze,
wrap(&OpLevelCostEstimator::PredictIdentity));
device_cost_impl_.emplace(kRecv,
wrap(&OpLevelCostEstimator::PredictIdentity));
device_cost_impl_.emplace(kSend,
wrap(&OpLevelCostEstimator::PredictIdentity));
device_cost_impl_.emplace(kSwitch,
wrap(&OpLevelCostEstimator::PredictIdentity));
device_cost_impl_.emplace(kMerge,
wrap(&OpLevelCostEstimator::PredictIdentity));
device_cost_impl_.emplace(kEnter,
wrap(&OpLevelCostEstimator::PredictIdentity));
device_cost_impl_.emplace(kExit,
wrap(&OpLevelCostEstimator::PredictIdentity));
device_cost_impl_.emplace(kNextIteration,
wrap(&OpLevelCostEstimator::PredictIdentity));
device_cost_impl_.emplace(kRank,
wrap(&OpLevelCostEstimator::PredictMetadata));
device_cost_impl_.emplace(kShape,
wrap(&OpLevelCostEstimator::PredictMetadata));
device_cost_impl_.emplace(kShapeN,
wrap(&OpLevelCostEstimator::PredictMetadata));
device_cost_impl_.emplace(kSize,
wrap(&OpLevelCostEstimator::PredictMetadata));
device_cost_impl_.emplace(kMaxPool,
wrap(&OpLevelCostEstimator::PredictMaxPool));
device_cost_impl_.emplace(kMaxPoolGrad,
wrap(&OpLevelCostEstimator::PredictMaxPoolGrad));
device_cost_impl_.emplace(kAvgPool,
wrap(&OpLevelCostEstimator::PredictAvgPool));
device_cost_impl_.emplace(kAvgPoolGrad,
wrap(&OpLevelCostEstimator::PredictAvgPoolGrad));
device_cost_impl_.emplace(kFusedBatchNorm,
wrap(&OpLevelCostEstimator::PredictFusedBatchNorm));
device_cost_impl_.emplace(
kFusedBatchNormGrad,
wrap(&OpLevelCostEstimator::PredictFusedBatchNormGrad));
device_cost_impl_.emplace(
kAssignVariableOp, wrap(&OpLevelCostEstimator::PredictAssignVariableOps));
device_cost_impl_.emplace(
kAssignAddVariableOp,
wrap(&OpLevelCostEstimator::PredictAssignVariableOps));
device_cost_impl_.emplace(
kAssignSubVariableOp,
wrap(&OpLevelCostEstimator::PredictAssignVariableOps));
persistent_ops_ = {
kConst, kVariable, kVariableV2, kAutoReloadVariable,
kVarHandleOp, kReadVariableOp, kVarHandlesOp, kReadVariablesOp};
#define EIGEN_COST(X) Eigen::internal::functor_traits<Eigen::internal::X>::Cost
// Quantize = apply min and max bounds, multiply by scale factor and round.
const int quantize_v2_cost =
EIGEN_COST(scalar_product_op<float>) + EIGEN_COST(scalar_max_op<float>) +
EIGEN_COST(scalar_min_op<float>) + EIGEN_COST(scalar_round_op<float>);
const int quantize_and_dequantize_v2_cost =
quantize_v2_cost + EIGEN_COST(scalar_product_op<float>);
// Unary ops alphabetically sorted
elementwise_ops_.emplace("Acos", EIGEN_COST(scalar_acos_op<float>));
elementwise_ops_.emplace("Asin", EIGEN_COST(scalar_asin_op<float>));
elementwise_ops_.emplace("Atan", EIGEN_COST(scalar_atan_op<float>));
elementwise_ops_.emplace("Atan2", EIGEN_COST(scalar_quotient_op<float>) +
EIGEN_COST(scalar_atan_op<float>));
// For now, we use the Eigen cost model for a float-to-int16 cast as an example
// case; the Eigen cost model is zero when src and dst types are identical,
// and it uses AddCost (1) when they differ. We may implement separate
// cost functions for cast ops, using the actual input and output types.
elementwise_ops_.emplace(
"Cast", Eigen::internal::functor_traits<
Eigen::internal::scalar_cast_op<float, int16>>::Cost);
elementwise_ops_.emplace("Ceil", EIGEN_COST(scalar_ceil_op<float>));
elementwise_ops_.emplace("Cos", EIGEN_COST(scalar_cos_op<float>));
elementwise_ops_.emplace("Dequantize", EIGEN_COST(scalar_product_op<float>));
elementwise_ops_.emplace("Erf", 1);
elementwise_ops_.emplace("Erfc", 1);
elementwise_ops_.emplace("Exp", EIGEN_COST(scalar_exp_op<float>));
elementwise_ops_.emplace("Expm1", EIGEN_COST(scalar_expm1_op<float>));
elementwise_ops_.emplace("Floor", EIGEN_COST(scalar_floor_op<float>));
elementwise_ops_.emplace("Inv", EIGEN_COST(scalar_inverse_op<float>));
elementwise_ops_.emplace("InvGrad", 1);
elementwise_ops_.emplace("Lgamma", 1);
elementwise_ops_.emplace("Log", EIGEN_COST(scalar_log_op<float>));
elementwise_ops_.emplace("Log1p", EIGEN_COST(scalar_log1p_op<float>));
elementwise_ops_.emplace("Neg", EIGEN_COST(scalar_opposite_op<float>));
elementwise_ops_.emplace("QuantizeAndDequantizeV2",
quantize_and_dequantize_v2_cost);
elementwise_ops_.emplace("QuantizeV2", quantize_v2_cost);
elementwise_ops_.emplace("Reciprocal", EIGEN_COST(scalar_inverse_op<float>));
elementwise_ops_.emplace("Rint", 1);
elementwise_ops_.emplace("Round", EIGEN_COST(scalar_round_op<float>));
elementwise_ops_.emplace("Rsqrt", EIGEN_COST(scalar_rsqrt_op<float>));
elementwise_ops_.emplace("Sqrt", EIGEN_COST(scalar_sqrt_op<float>));
elementwise_ops_.emplace("Square", EIGEN_COST(scalar_square_op<float>));
elementwise_ops_.emplace("Tanh", EIGEN_COST(scalar_tanh_op<float>));
elementwise_ops_.emplace("Relu", EIGEN_COST(scalar_max_op<float>));
elementwise_ops_.emplace("Sigmoid", EIGEN_COST(scalar_logistic_op<float>));
elementwise_ops_.emplace("QuantizedSigmoid",
EIGEN_COST(scalar_logistic_op<float>));
elementwise_ops_.emplace("Sign", EIGEN_COST(scalar_sign_op<float>));
elementwise_ops_.emplace("Sin", EIGEN_COST(scalar_sin_op<float>));
elementwise_ops_.emplace("Tan", EIGEN_COST(scalar_tan_op<float>));
// Binary ops alphabetically sorted
elementwise_ops_.emplace("Add", EIGEN_COST(scalar_sum_op<float>));
elementwise_ops_.emplace("AddV2", EIGEN_COST(scalar_sum_op<float>));
elementwise_ops_.emplace("ApproximateEqual", 1);
elementwise_ops_.emplace("BiasAdd", EIGEN_COST(scalar_sum_op<float>));
elementwise_ops_.emplace("QuantizedBiasAdd",
EIGEN_COST(scalar_sum_op<float>));
elementwise_ops_.emplace("Div", EIGEN_COST(scalar_quotient_op<float>));
elementwise_ops_.emplace("Equal", 1);
elementwise_ops_.emplace("FloorDiv", EIGEN_COST(scalar_quotient_op<float>));
elementwise_ops_.emplace("FloorMod", EIGEN_COST(scalar_mod_op<float>));
elementwise_ops_.emplace("Greater", 1);
elementwise_ops_.emplace("GreaterEqual", 1);
elementwise_ops_.emplace("Less", 1);
elementwise_ops_.emplace("LessEqual", 1);
elementwise_ops_.emplace("LogicalAnd", EIGEN_COST(scalar_boolean_and_op));
elementwise_ops_.emplace("LogicalNot", 1);
elementwise_ops_.emplace("LogicalOr", EIGEN_COST(scalar_boolean_or_op));
elementwise_ops_.emplace("Maximum", EIGEN_COST(scalar_max_op<float>));
elementwise_ops_.emplace("Minimum", EIGEN_COST(scalar_min_op<float>));
elementwise_ops_.emplace("Mod", EIGEN_COST(scalar_mod_op<float>));
elementwise_ops_.emplace("Mul", EIGEN_COST(scalar_product_op<float>));
elementwise_ops_.emplace("NotEqual", 1);
elementwise_ops_.emplace("QuantizedAdd", EIGEN_COST(scalar_sum_op<float>));
elementwise_ops_.emplace("QuantizedMul",
EIGEN_COST(scalar_product_op<float>));
elementwise_ops_.emplace("RealDiv", EIGEN_COST(scalar_quotient_op<float>));
elementwise_ops_.emplace("ReluGrad", EIGEN_COST(scalar_max_op<float>));
elementwise_ops_.emplace("SquareDifference", 1);
elementwise_ops_.emplace("Sub", EIGEN_COST(scalar_difference_op<float>));
elementwise_ops_.emplace("TruncateDiv",
EIGEN_COST(scalar_quotient_op<float>));
elementwise_ops_.emplace("TruncateMod", EIGEN_COST(scalar_mod_op<float>));
#undef EIGEN_COST
// By default, use sum of memory_time and compute_time for execution_time.
compute_memory_overlap_ = false;
}
Costs OpLevelCostEstimator::PredictCosts(const OpContext& op_context) const {
const auto& op_info = op_context.op_info;
auto it = device_cost_impl_.find(op_info.op());
if (it != device_cost_impl_.end()) {
std::function<Costs(const OpContext&)> estimator = it->second;
Costs costs = estimator(op_context);
VLOG(1) << "Operation " << op_info.op() << " takes "
<< costs.execution_time.count() << " ns.";
return costs;
}
if (persistent_ops_.find(op_info.op()) != persistent_ops_.end()) {
return PredictVariable(op_context);
}
if (elementwise_ops_.find(op_info.op()) != elementwise_ops_.end()) {
return PredictCwiseOp(op_context);
}
VLOG(1) << "Missing accurate estimator for op: " << op_info.op();
return PredictCostOfAnUnknownOp(op_context);
}
DeviceInfo OpLevelCostEstimator::GetDeviceInfo(
const DeviceProperties& device) const {
double gflops = -1;
double gb_per_sec = -1;
if (device.type() == "CPU") {
// Check if vector instructions are available, and refine performance
// prediction based on this.
// Frequencies are stored in MHz in the DeviceProperties.
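// For example, a device reporting 8 cores at 2000 MHz yields
// 8 * 2000 * 1e-3 = 16 GFLOPS under this formula.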
gflops = device.num_cores() * device.frequency() * 1e-3;
if (gb_per_sec < 0) {
if (device.bandwidth() > 0) {
gb_per_sec = device.bandwidth() / 1e6;
} else {
gb_per_sec = 32;
}
}
} else if (device.type() == "GPU") {
const string architecture = device.environment().at("architecture");
int cores_per_multiprocessor;
if (architecture < "3") {
// Fermi
cores_per_multiprocessor = 32;
} else if (architecture < "4") {
// Kepler
cores_per_multiprocessor = 192;
} else if (architecture < "6") {
// Maxwell
cores_per_multiprocessor = 128;
} else {
// Pascal (compute capability version 6) and Volta (compute capability
// version 7)
cores_per_multiprocessor = 64;
}
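// For example, a Pascal-class device reporting 56 multiprocessors at 1480 MHz
// gives 56 * 1480 * 1e-3 * 64 * 2 ~= 10,609 GFLOPS (about 10.6 TFLOPS).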
gflops = device.num_cores() * device.frequency() * 1e-3 *
cores_per_multiprocessor * kOpsPerMac;
if (device.bandwidth() > 0) {
gb_per_sec = device.bandwidth() / 1e6;
} else {
gb_per_sec = 100;
}
}
VLOG(1) << "Device: " << device.type() << " gflops: " << gflops
<< " gb_per_sec: " << gb_per_sec;
DCHECK_LT(0, gflops) << device.DebugString();
DCHECK_LT(0, gb_per_sec) << device.DebugString();
return DeviceInfo(gflops, gb_per_sec);
}
Costs OpLevelCostEstimator::PredictCwiseOp(const OpContext& op_context) const {
const auto& op_info = op_context.op_info;
bool found_unknown_shapes = false;
// For unary or binary element-wise operations, op count is the element count
// of any input. We use the count for the largest input here to be more robust
// in case the shape of the other input is unknown or only partially known.
int64 op_count = CalculateLargestInputCount(op_info, &found_unknown_shapes);
// If the output shape is available, try to use the element count calculated
// from it.
if (op_info.outputs_size() > 0) {
op_count = std::max(
op_count,
CalculateTensorElementCount(op_info.outputs(0), &found_unknown_shapes));
}
// For binary ops, calculate the output shape possibly resulting from
// broadcasting.
if (op_info.inputs_size() >= 2) {
op_count =
std::max(op_count, CwiseOutputElementCount(op_info.inputs(0).shape(),
op_info.inputs(1).shape()));
}
int op_cost = 1;
bool is_known_elementwise_op = false;
auto it = elementwise_ops_.find(op_info.op());
if (it != elementwise_ops_.end()) {
op_cost = it->second;
is_known_elementwise_op = true;
} else {
LOG(WARNING) << "Not a cwise op: " << op_info.op();
}
Costs costs = PredictOpCountBasedCost(op_count * op_cost, op_info);
if (found_unknown_shapes || !is_known_elementwise_op) {
costs.inaccurate = true;
}
costs.num_ops_with_unknown_shapes = found_unknown_shapes;
return costs;
}
Costs OpLevelCostEstimator::PredictCostOfAnUnknownOp(
const OpContext& op_context) const {
// Don't assume the operation is cwise; return a cost based on input/output
// size and admit that it is inaccurate...
auto costs = PredictOpCountBasedCost(0, op_context.op_info);
costs.inaccurate = true;
return costs;
}
Costs OpLevelCostEstimator::PredictOpCountBasedCost(
double operations, const OpInfo& op_info) const {
bool unknown_shapes = false;
const double input_size = CalculateInputSize(op_info, &unknown_shapes);
const double output_size = CalculateOutputSize(op_info, &unknown_shapes);
Costs costs =
PredictOpCountBasedCost(operations, input_size, output_size, op_info);
costs.inaccurate = unknown_shapes;
costs.num_ops_with_unknown_shapes = unknown_shapes;
costs.max_memory = output_size;
return costs;
}
Costs OpLevelCostEstimator::PredictOpCountBasedCost(
double operations, double input_io_bytes, double output_io_bytes,
const OpInfo& op_info) const {
double total_io_bytes = input_io_bytes + output_io_bytes;
const DeviceInfo device_info = GetDeviceInfo(op_info.device());
if (device_info.gigaops <= 0 || device_info.gb_per_sec <= 0 ||
device_info.intermediate_read_gb_per_sec <= 0 ||
device_info.intermediate_write_gb_per_sec <= 0) {
VLOG(1) << "BAD DEVICE. Op:" << op_info.op()
<< " device type:" << op_info.device().type()
<< " device model:" << op_info.device().model();
}
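// device_info.gigaops is 1e9 ops/sec, i.e. ops per nanosecond, so dividing the
// op count by it yields nanoseconds. For example, 2e9 operations on a
// 100-gigaops device take ceil(2e9 / 100) = 2e7 ns (20 ms).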
Costs::NanoSeconds compute_cost(std::ceil(operations / device_info.gigaops));
VLOG(1) << "Op:" << op_info.op() << " GOps:" << operations / 1e9
<< " Compute Time (ns):" << compute_cost.count();
Costs::NanoSeconds memory_cost(
std::ceil(total_io_bytes / device_info.gb_per_sec));
VLOG(1) << "Op:" << op_info.op() << " Size (KB):" << (total_io_bytes) / 1e3
<< " Memory Time (ns):" << memory_cost.count();
// Check if bytes > 0. If not, and the bandwidth is set to infinity,
// then the result would be undefined.
double intermediate_read_time =
(input_io_bytes > 0)
? std::ceil(input_io_bytes / device_info.intermediate_read_gb_per_sec)
: 0;
double intermediate_write_time =
(output_io_bytes > 0)
? std::ceil(output_io_bytes /
device_info.intermediate_write_gb_per_sec)
: 0;
Costs::NanoSeconds intermediate_memory_cost =
compute_memory_overlap_
? std::max(intermediate_read_time, intermediate_write_time)
: (intermediate_read_time + intermediate_write_time);
VLOG(1) << "Op:" << op_info.op() << " Size (KB):" << (total_io_bytes) / 1e3
<< " Intermediate Memory Time (ns):"
<< intermediate_memory_cost.count();
Costs costs;
costs.compute_time = compute_cost;
costs.memory_time = memory_cost;
costs.intermediate_memory_time = intermediate_memory_cost;
costs.intermediate_memory_read_time =
Costs::NanoSeconds(intermediate_read_time);
costs.intermediate_memory_write_time =
Costs::NanoSeconds(intermediate_write_time);
CombineCostsAndUpdateExecutionTime(compute_memory_overlap_, &costs);
return costs;
}
int64 OpLevelCostEstimator::CountConv2DOperations(const OpInfo& op_info,
bool* found_unknown_shapes) {
return CountConv2DOperations(op_info, nullptr, found_unknown_shapes);
}
// Helper to translate the positional arguments into named fields.
/* static */
OpLevelCostEstimator::ConvolutionDimensions
OpLevelCostEstimator::ConvolutionDimensionsFromInputs(
const TensorShapeProto& original_image_shape,
const TensorShapeProto& original_filter_shape, const OpInfo& op_info,
bool* found_unknown_shapes) {
VLOG(2) << "op features: " << op_info.DebugString();
VLOG(2) << "Original image shape: " << original_image_shape.DebugString();
VLOG(2) << "Original filter shape: " << original_filter_shape.DebugString();
auto image_shape =
MaybeGetMinimumShape(original_image_shape, 4, found_unknown_shapes);
auto filter_shape =
MaybeGetMinimumShape(original_filter_shape, 4, found_unknown_shapes);
VLOG(2) << "Image shape: " << image_shape.DebugString();
VLOG(2) << "Filter shape: " << filter_shape.DebugString();
int x_index, y_index, channel_index;
const string& data_format = GetDataFormat(op_info);
if (data_format == "NCHW") {
channel_index = 1;
y_index = 2;
x_index = 3;
} else {
// Use NHWC.
y_index = 1;
x_index = 2;
channel_index = 3;
}
const string& filter_format = GetFilterFormat(op_info);
int filter_x_index, filter_y_index, in_channel_index, out_channel_index;
if (filter_format == "HWIO") {
filter_y_index = 0;
filter_x_index = 1;
in_channel_index = 2;
out_channel_index = 3;
} else {
// Use OIHW
out_channel_index = 0;
in_channel_index = 1;
filter_y_index = 2;
filter_x_index = 3;
}
int64 batch = image_shape.dim(0).size();
int64 ix = image_shape.dim(x_index).size();
int64 iy = image_shape.dim(y_index).size();
int64 iz = image_shape.dim(channel_index).size();
int64 kx = filter_shape.dim(filter_x_index).size();
int64 ky = filter_shape.dim(filter_y_index).size();
int64 kz = filter_shape.dim(in_channel_index).size();
std::vector<int64> strides = GetStrides(op_info);
const auto padding = GetPadding(op_info);
int64 sx = strides[x_index];
int64 sy = strides[y_index];
int64 ox = GetOutputSize(ix, kx, sx, padding);
int64 oy = GetOutputSize(iy, ky, sy, padding);
int64 oz = filter_shape.dim(out_channel_index).size();
// Only check equality when both sizes are known (in other words, when
// neither is set to a minimum dimension size of 1).
if (iz != 1 && kz != 1) {
DCHECK_EQ(iz % kz, 0) << "Input channel " << iz
<< " is not a multiple of filter channel " << kz
<< ".";
if (iz % kz) {
*found_unknown_shapes = true;
}
} else {
iz = kz = std::max<int64>(iz, kz);
}
OpLevelCostEstimator::ConvolutionDimensions conv_dims = {
batch, ix, iy, iz, kx, ky, kz, oz, ox, oy, sx, sy, padding};
VLOG(1) << "Batch Size:" << batch;
VLOG(1) << "Image Dims:" << ix << "," << iy;
VLOG(1) << "Input Depth:" << iz;
VLOG(1) << "Kernel Dims:" << kx << "," << ky;
VLOG(1) << "Kernel Depth:" << kz;
VLOG(1) << "Output Dims:" << ox << "," << oy;
VLOG(1) << "Output Depth:" << oz;
VLOG(1) << "Strides:" << sx << "," << sy;
VLOG(1) << "Padding:" << (padding == Padding::VALID ? "VALID" : "SAME");
return conv_dims;
}
int64 OpLevelCostEstimator::CountConv2DOperations(
const OpInfo& op_info, ConvolutionDimensions* conv_info,
bool* found_unknown_shapes) {
DCHECK(op_info.op() == kConv2d || op_info.op() == kDepthwiseConv2dNative)
<< "Invalid Operation: not Conv2D nor DepthwiseConv2dNative";
if (op_info.inputs_size() < 2) { // Unexpected inputs.
*found_unknown_shapes = true;
return 0;
}
ConvolutionDimensions conv_dims = ConvolutionDimensionsFromInputs(
op_info.inputs(0).shape(), op_info.inputs(1).shape(), op_info,
found_unknown_shapes);
// In DepthwiseConv2dNative, conv_dims.oz is actually the channel depth
// multiplier; the effective output channel depth oz_effective is
// conv_dims.iz * conv_dims.oz, thus # ops = N x H x W x oz_effective x 2RS.
// Compare to Conv2D, where # ops = N x H x W x kz x oz x 2RS; if
// oz = oz_effective, then Conv2D_ops / DepthwiseConv2dNative_ops = kz.
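// For example, with illustrative Conv2D shapes: a 32x28x28x64 (NHWC) input and
// a 3x3x64x128 (HWIO) filter at stride 1 with SAME padding give
// 32 * 28 * 28 * 3 * 3 * 64 * 128 * 2 ~= 3.7e9 ops.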
int64 ops = conv_dims.batch;
ops *= conv_dims.ox * conv_dims.oy;
ops *= conv_dims.kx * conv_dims.ky;
if (op_info.op() == kConv2d) {
ops *= conv_dims.kz * conv_dims.oz;
} else {
// Ensure the output tensor dims are correct for DepthwiseConv2dNative,
// even though the op count is computed the same way as for Conv2D.
conv_dims.oz *= conv_dims.iz;
ops *= conv_dims.oz;
}
ops *= kOpsPerMac;
if (conv_info != nullptr) {
*conv_info = conv_dims;
}
return ops;
}
int64 OpLevelCostEstimator::CountMatMulOperations(const OpInfo& op_info,
bool* found_unknown_shapes) {
return CountMatMulOperations(op_info, nullptr, found_unknown_shapes);
}
// TODO(nishantpatil): Create separate estimator for Sparse Matmul
int64 OpLevelCostEstimator::CountMatMulOperations(const OpInfo& op_info,
MatMulDimensions* mat_mul,
bool* found_unknown_shapes) {
double ops = 0;
if (op_info.inputs_size() < 2) {
LOG(ERROR) << "Need 2 inputs but got " << op_info.inputs_size();
// TODO(pcma): Try to separate invalid inputs from unknown shapes
*found_unknown_shapes = true;
return 0;
}
auto& a_matrix = op_info.inputs(0);
auto& b_matrix = op_info.inputs(1);
bool transpose_a = false;
bool transpose_b = false;
double m_dim, n_dim, k_dim, k_dim_b = 0;
for (const auto& item : op_info.attr()) {
VLOG(1) << "Key:" << item.first
<< " Value:" << SummarizeAttrValue(item.second);
if (item.first == "transpose_a" && item.second.b() == true)
transpose_a = true;
if (item.first == "transpose_b" && item.second.b() == true)
transpose_b = true;
}
VLOG(1) << "transpose_a:" << transpose_a;
VLOG(1) << "transpose_b:" << transpose_b;
auto a_matrix_shape =
MaybeGetMinimumShape(a_matrix.shape(), 2, found_unknown_shapes);
auto b_matrix_shape =
MaybeGetMinimumShape(b_matrix.shape(), 2, found_unknown_shapes);
if (transpose_a) {
m_dim = a_matrix_shape.dim(1).size();
k_dim = a_matrix_shape.dim(0).size();
} else {
m_dim = a_matrix_shape.dim(0).size();
k_dim = a_matrix_shape.dim(1).size();
}
if (transpose_b) {
k_dim_b = b_matrix_shape.dim(1).size();
n_dim = b_matrix_shape.dim(0).size();
} else {
k_dim_b = b_matrix_shape.dim(0).size();
n_dim = b_matrix_shape.dim(1).size();
}
VLOG(1) << "M, N, K: " << m_dim << "," << n_dim << "," << k_dim;
// Only check equality when both sizes are known (in other words, when
// neither is set to a minimum dimension size of 1).
if (k_dim_b != 1 && k_dim != 1 && k_dim_b != k_dim) {
LOG(ERROR) << "Incompatible Matrix dimensions";
return ops;
} else {
// One of k_dim and k_dim_b might be 1 (minimum dimension size).
k_dim = std::max(k_dim, k_dim_b);
}
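// Each of the m * n output elements requires k multiply-adds (2k ops), so the
// total is 2 * m * n * k. For example, m = n = k = 1024 gives ~2.1e9 ops.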
ops = m_dim * n_dim * k_dim * 2;
VLOG(1) << "Operations for Matmul: " << ops;
if (mat_mul != nullptr) {
mat_mul->m = m_dim;
mat_mul->n = n_dim;
mat_mul->k = k_dim;
}
return ops;
}
int64 OpLevelCostEstimator::CountBatchMatMulOperations(
const OpInfo& op_info, bool* found_unknown_shapes) {
return CountBatchMatMulOperations(op_info, nullptr, found_unknown_shapes);
}
int64 OpLevelCostEstimator::CountBatchMatMulOperations(
const OpInfo& op_info, BatchMatMulDimensions* batch_mat_mul,
bool* found_unknown_shapes) {
if (op_info.op() != kBatchMatMul) {
LOG(ERROR) << "Invalid Operation: " << op_info.op();
// TODO(pcma): Try to separate invalid inputs from unknown shapes
*found_unknown_shapes = true;
return 0;
}
if (op_info.inputs_size() != 2) {
LOG(ERROR) << "Expected 2 inputs but got " << op_info.inputs_size();
// TODO(pcma): Try to separate invalid inputs from unknown shapes
*found_unknown_shapes = true;
return 0;
}
double ops = 0;
const auto& a_input = op_info.inputs(0);
const auto& b_input = op_info.inputs(1);
// BatchMatMul requires inputs of at least matrix shape (rank 2).
// The two innermost dimensions of each input form the matrices that
// need to be multiplied together. The other dimensions determine
// the number of such MatMuls. For example, if the BatchMatMul has
// inputs of shape:
// a_input_shape = [2, 3, 4, 5]
// b_input_shape = [2, 3, 5, 6]
// then there are 2*3 = 6 MatMuls of dimensions m = 4, k = 5, n = 6
// in this BatchMatMul.
const int matrix_rank = 2;
bool a_input_shape_unknown = false;
bool b_input_shape_unknown = false;
TensorShapeProto a_input_shape = MaybeGetMinimumShape(
a_input.shape(), std::max(matrix_rank, a_input.shape().dim_size()),
&a_input_shape_unknown);
TensorShapeProto b_input_shape = MaybeGetMinimumShape(
b_input.shape(), std::max(matrix_rank, b_input.shape().dim_size()),
&b_input_shape_unknown);
*found_unknown_shapes = a_input_shape_unknown || b_input_shape_unknown ||
(a_input.shape().dim_size() < matrix_rank) ||
(b_input.shape().dim_size() < matrix_rank);
// Compute the number of matmuls by taking, at each batch dimension, the max
// size indicated by either input. Note that the shapes do not have to have
// the same rank due to incompleteness.
TensorShapeProto* bigger_rank_shape = &a_input_shape;
TensorShapeProto* smaller_rank_shape = &b_input_shape;
if (b_input_shape.dim_size() > a_input_shape.dim_size()) {
bigger_rank_shape = &b_input_shape;
smaller_rank_shape = &a_input_shape;
}
int num_matmuls = 1;
for (int b_i = 0,
s_i = smaller_rank_shape->dim_size() - bigger_rank_shape->dim_size();
b_i < bigger_rank_shape->dim_size() - matrix_rank; ++b_i, ++s_i) {
int b_dim = bigger_rank_shape->dim(b_i).size();
int s_dim = 1;
if (s_i >= 0) {
s_dim = smaller_rank_shape->dim(s_i).size();
}
if (batch_mat_mul != nullptr) {
batch_mat_mul->batch_dims.push_back(s_dim);
}
num_matmuls *= std::max(b_dim, s_dim);
}
// Build the MatMul. Note that values are ignored here since we are just
// counting ops (i.e. only shapes matter).
OpInfo matmul_op_info;
matmul_op_info.set_op("MatMul");
AttrValue transpose_a;
transpose_a.set_b(false);
if (op_info.attr().find("adj_x") != op_info.attr().end()) {
transpose_a.set_b(op_info.attr().at("adj_x").b());
}
(*matmul_op_info.mutable_attr())["transpose_a"] = transpose_a;
AttrValue transpose_b;
transpose_b.set_b(false);
if (op_info.attr().find("adj_y") != op_info.attr().end()) {
transpose_b.set_b(op_info.attr().at("adj_y").b());
}
(*matmul_op_info.mutable_attr())["transpose_b"] = transpose_b;
OpInfo::TensorProperties* a_matrix = matmul_op_info.add_inputs();
a_matrix->set_dtype(a_input.dtype());
TensorShapeProto* a_matrix_shape = a_matrix->mutable_shape();
for (int i = std::max(0, a_input_shape.dim_size() - matrix_rank);
i < a_input_shape.dim_size(); ++i) {
*(a_matrix_shape->add_dim()) = a_input_shape.dim(i);
}
OpInfo::TensorProperties* b_matrix = matmul_op_info.add_inputs();
b_matrix->set_dtype(b_input.dtype());
TensorShapeProto* b_matrix_shape = b_matrix->mutable_shape();
for (int i = std::max(0, b_input_shape.dim_size() - matrix_rank);
i < b_input_shape.dim_size(); ++i) {
*(b_matrix_shape->add_dim()) = b_input_shape.dim(i);
}
if (batch_mat_mul != nullptr) {
batch_mat_mul->matmul_dims.m = (transpose_a.b())
? a_matrix_shape->dim(1).size()
: a_matrix_shape->dim(0).size();
batch_mat_mul->matmul_dims.k = (transpose_a.b())
? a_matrix_shape->dim(0).size()
: a_matrix_shape->dim(1).size();
batch_mat_mul->matmul_dims.n = (transpose_b.b())
? b_matrix_shape->dim(0).size()
: b_matrix_shape->dim(1).size();
}
for (int i = 0; i < num_matmuls; ++i) {
bool matmul_unknown_shapes = false;
ops += CountMatMulOperations(matmul_op_info, &matmul_unknown_shapes);
*found_unknown_shapes |= matmul_unknown_shapes;
}
return ops;
}
bool GetTensorShapeProtoFromTensorProto(const TensorProto& tensor_proto,
TensorShapeProto* tensor_shape_proto) {
tensor_shape_proto->Clear();
// First convert the TensorProto into the Tensor class so that it correctly
// parses the data values within the TensorProto (whether they are in int_val,
// int64_val, tensor_content, or any other field).
Tensor tensor(tensor_proto.dtype());
if (!tensor.FromProto(tensor_proto)) {
LOG(WARNING) << "GetTensorShapeProtoFromTensorProto() -- "
<< "failed to parse TensorProto: "
<< tensor_proto.DebugString();
return false;
}
if (tensor.dims() != 1) {
LOG(WARNING) << "GetTensorShapeProtoFromTensorProto() -- "
<< "tensor is not 1D: " << tensor.dims();
return false;
}
// Then, convert it back to TensorProto using AsProtoField, which makes sure
// the data is in int_val, int64_val, or such repeated data fields, not in
// tensor_content.
TensorProto temp_tensor;
tensor.AsProtoField(&temp_tensor);
#define TENSOR_VALUES_TO_TENSOR_SHAPE_PROTO(type) \
do { \
for (const auto& value : temp_tensor.type##_val()) { \
tensor_shape_proto->add_dim()->set_size(value); \
} \
} while (0)
if (tensor.dtype() == DT_INT32 || tensor.dtype() == DT_INT16 ||
tensor.dtype() == DT_INT8 || tensor.dtype() == DT_UINT8) {
TENSOR_VALUES_TO_TENSOR_SHAPE_PROTO(int);
} else if (tensor.dtype() == DT_INT64) {
TENSOR_VALUES_TO_TENSOR_SHAPE_PROTO(int64);
} else if (tensor.dtype() == DT_UINT32) {
TENSOR_VALUES_TO_TENSOR_SHAPE_PROTO(uint32);
} else if (tensor.dtype() == DT_UINT64) {
TENSOR_VALUES_TO_TENSOR_SHAPE_PROTO(uint64);
} else {
LOG(WARNING) << "GetTensorShapeProtoFromTensorProto() -- "
<< "Unsupported dtype: " << tensor.dtype();
return false;
}
#undef TENSOR_VALUES_TO_TENSOR_SHAPE_PROTO
return true;
}
// TODO(cliffy): Dedup this method and CountConv2DBackpropFilterOperations.
int64 OpLevelCostEstimator::CountConv2DBackpropInputOperations(
const OpInfo& op_info, ConvolutionDimensions* returned_conv_dims,
bool* found_unknown_shapes) {
int64 ops = 0;
DCHECK(op_info.op() == kConv2dBackpropInput ||
op_info.op() == kDepthwiseConv2dNativeBackpropInput)
<< "Invalid Operation: not kConv2dBackpropInput nor"
"kDepthwiseConv2dNativeBackpropInput";
if (op_info.inputs_size() < 2) {
// TODO(pcma): Try to separate invalid inputs from unknown shapes
*found_unknown_shapes = true;
return ops;
}
TensorShapeProto input_shape;
bool shape_found = false;
if (op_info.inputs(0).has_value()) {
const TensorProto& value = op_info.inputs(0).value();
shape_found = GetTensorShapeProtoFromTensorProto(value, &input_shape);
}
if (!shape_found && op_info.outputs_size() == 1) {
input_shape = op_info.outputs(0).shape();
shape_found = true;
}
if (!shape_found) {
// Set the minimum input size that's feasible.
input_shape.Clear();
for (int i = 0; i < 4; ++i) {
input_shape.add_dim()->set_size(1);
}
*found_unknown_shapes = true;
}
ConvolutionDimensions conv_dims = ConvolutionDimensionsFromInputs(
input_shape, op_info.inputs(1).shape(), op_info, found_unknown_shapes);
ops = conv_dims.batch;
ops *= conv_dims.ox * conv_dims.oy;
ops *= conv_dims.kx * conv_dims.ky;
if (op_info.op() == kConv2dBackpropInput) {
ops *= conv_dims.kz * conv_dims.oz;
} else {
// conv_dims always uses the forward-path definition regardless.
conv_dims.oz *= conv_dims.iz;
ops *= conv_dims.oz;
}
ops *= kOpsPerMac;
VLOG(1) << "Operations for" << op_info.op() << " " << ops;
if (returned_conv_dims != nullptr) {
*returned_conv_dims = conv_dims;
}
return ops;
}
int64 OpLevelCostEstimator::CountConv2DBackpropFilterOperations(
const OpInfo& op_info, ConvolutionDimensions* returned_conv_dims,
bool* found_unknown_shapes) {
int64 ops = 0;
DCHECK(op_info.op() == kConv2dBackpropFilter ||
op_info.op() == kDepthwiseConv2dNativeBackpropFilter)
<< "Invalid Operation: not kConv2dBackpropFilter nor"
"kDepthwiseConv2dNativeBackpropFilter";
TensorShapeProto filter_shape;
bool shape_found = false;
if (op_info.inputs_size() >= 2 && op_info.inputs(1).has_value()) {
const TensorProto& value = op_info.inputs(1).value();
shape_found = GetTensorShapeProtoFromTensorProto(value, &filter_shape);
}
if (!shape_found && op_info.outputs_size() == 1) {
filter_shape = op_info.outputs(0).shape();
shape_found = true;
}
if (!shape_found) {
// Set the minimum filter size that's feasible.
filter_shape.Clear();
for (int i = 0; i < 4; ++i) {
filter_shape.add_dim()->set_size(1);
}
*found_unknown_shapes = true;
}
if (op_info.inputs_size() < 1) {
// TODO(pcma): Try to separate invalid inputs from unknown shapes
*found_unknown_shapes = true;
return ops;
}
ConvolutionDimensions conv_dims = ConvolutionDimensionsFromInputs(
op_info.inputs(0).shape(), filter_shape, op_info, found_unknown_shapes);
ops = conv_dims.batch;
ops *= conv_dims.ox * conv_dims.oy;
ops *= conv_dims.kx * conv_dims.ky;
if (op_info.op() == kConv2dBackpropFilter) {
ops *= conv_dims.kz * conv_dims.oz;
} else {
// conv_dims always uses the forward-path definition regardless.
conv_dims.oz *= conv_dims.iz;
ops *= conv_dims.oz;
}
ops *= kOpsPerMac;
VLOG(1) << "Operations for" << op_info.op() << " " << ops;
if (returned_conv_dims != nullptr) {
*returned_conv_dims = conv_dims;
}
return ops;
}
int64 OpLevelCostEstimator::CalculateTensorElementCount(
const OpInfo::TensorProperties& tensor, bool* found_unknown_shapes) {
VLOG(2) << " with " << DataTypeString(tensor.dtype()) << " tensor of shape "
<< tensor.shape().DebugString();
int64 tensor_size = 1;
int num_dims = std::max(1, tensor.shape().dim_size());
auto tensor_shape =
MaybeGetMinimumShape(tensor.shape(), num_dims, found_unknown_shapes);
for (const auto& dim : tensor_shape.dim()) {
tensor_size *= dim.size();
}
return tensor_size;
}
int64 OpLevelCostEstimator::CalculateTensorSize(
const OpInfo::TensorProperties& tensor, bool* found_unknown_shapes) {
int64 count = CalculateTensorElementCount(tensor, found_unknown_shapes);
int size = DataTypeSize(BaseType(tensor.dtype()));
VLOG(2) << "Count: " << count << " DataTypeSize: " << size;
return count * size;
}
int64 OpLevelCostEstimator::CalculateInputSize(const OpInfo& op_info,
bool* found_unknown_shapes) {
int64 total_input_size = 0;
for (auto& input : op_info.inputs()) {
int64 input_size = CalculateTensorSize(input, found_unknown_shapes);
total_input_size += input_size;
VLOG(1) << "Input Size: " << input_size
<< " Total Input Size:" << total_input_size;
}
return total_input_size;
}
int64 OpLevelCostEstimator::CalculateLargestInputCount(
const OpInfo& op_info, bool* found_unknown_shapes) {
int64 largest_input_count = 0;
for (auto& input : op_info.inputs()) {
int64 input_count =
CalculateTensorElementCount(input, found_unknown_shapes);
if (input_count > largest_input_count) {
largest_input_count = input_count;
}
VLOG(1) << "Input Count: " << input_count
<< " Largest Input Count:" << largest_input_count;
}
return largest_input_count;
}
int64 OpLevelCostEstimator::CalculateOutputSize(const OpInfo& op_info,
bool* found_unknown_shapes) {
int64 total_output_size = 0;
// Sum the sizes in bytes of all outputs, using a minimum shape if unknown.
for (const auto& output : op_info.outputs()) {
DataType dt = output.dtype();
const auto& original_output_shape = output.shape();
int64 output_size = DataTypeSize(BaseType(dt));
int num_dims = std::max(1, original_output_shape.dim_size());
auto output_shape = MaybeGetMinimumShape(original_output_shape, num_dims,
found_unknown_shapes);
for (const auto& dim : output_shape.dim()) {
output_size *= dim.size();
}
total_output_size += output_size;
VLOG(1) << "Output Size: " << output_size
<< " Total Output Size:" << total_output_size;
}
return total_output_size;
}
Costs OpLevelCostEstimator::PredictConv2D(const OpContext& op_context) const {
const auto& op_info = op_context.op_info;
bool found_unknown_shapes = false;
auto costs = PredictOpCountBasedCost(
CountConv2DOperations(op_info, &found_unknown_shapes), op_info);
costs.inaccurate = found_unknown_shapes;
costs.num_ops_with_unknown_shapes = found_unknown_shapes;
return costs;
}
Costs OpLevelCostEstimator::PredictConv2DBackpropInput(
const OpContext& op_context) const {
const auto& op_info = op_context.op_info;
bool found_unknown_shapes = false;
auto costs =
PredictOpCountBasedCost(CountConv2DBackpropInputOperations(
op_info, nullptr, &found_unknown_shapes),
op_info);
costs.inaccurate = found_unknown_shapes;
costs.num_ops_with_unknown_shapes = found_unknown_shapes;
return costs;
}
Costs OpLevelCostEstimator::PredictConv2DBackpropFilter(
const OpContext& op_context) const {
const auto& op_info = op_context.op_info;
bool found_unknown_shapes = false;
auto costs =
PredictOpCountBasedCost(CountConv2DBackpropFilterOperations(
op_info, nullptr, &found_unknown_shapes),
op_info);
costs.inaccurate = found_unknown_shapes;
costs.num_ops_with_unknown_shapes = found_unknown_shapes;
return costs;
}
Costs OpLevelCostEstimator::PredictFusedConv2DBiasActivation(
const OpContext& op_context) const {
// FusedConv2DBiasActivation computes a fused kernel that implements a
// 2D convolution, adds the side input (with separate scaling applied to the
// convolution output and the side input), then adds the bias, and finally
// applies the ReLU activation function to the result:
//
//   Input -> Conv2D -> Add -> BiasAdd -> ReLU
//              ^         ^        ^
//            Filter  Side Input  Bias
//
// Note that when adding the side input, the operation multiplies the output
// of Conv2D by conv_input_scale, confusingly, and the side_input by
// side_input_scale.
//
// Note that in the special case that side_input_scale is 0, which we infer
// from side_input having dimensions [], we skip that addition operation.
//
// For more information, see
// contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
// TODO(yaozhang): Support other data formats (NCHW_VECT_C, NHWC_VECT_W) and
// filter formats (OIHW_VECT_I).
string data_format = GetDataFormat(op_context.op_info);
if (data_format != "NCHW" && data_format != "NHWC") {
LOG(WARNING) << "unsupported data format: " << data_format;
Costs cost = Costs::ZeroCosts();
cost.inaccurate = true;
return cost;
}
string filter_format = GetFilterFormat(op_context.op_info);
if (filter_format != "HWIO" && filter_format != "OIHW") {
LOG(WARNING) << "unsupported filter format: " << filter_format;
Costs cost = Costs::ZeroCosts();
cost.inaccurate = true;
return cost;
}
auto& conv_input = op_context.op_info.inputs(0);
auto& filter = op_context.op_info.inputs(1);
auto& bias = op_context.op_info.inputs(2);
auto& side_input = op_context.op_info.inputs(3);
auto& conv_input_scale = op_context.op_info.inputs(4);
auto& side_input_scale = op_context.op_info.inputs(5);
// Manually compute our convolution dimensions.
bool found_unknown_shapes = false;
auto dims = ConvolutionDimensionsFromInputs(
conv_input.shape(), filter.shape(), op_context.op_info,
&found_unknown_shapes);
// Construct the shape of our output tensor from our convolution dimensions
// and format, as it may not be available yet.
// TODO(varomodt): should we centralize the Conv2D input/output shapes?
OpInfo::TensorProperties output;
if (data_format == "NCHW") {
output = DescribeTensor(DT_FLOAT, {dims.batch, dims.oz, dims.oy, dims.ox});
} else if (data_format == "NHWC") {
output = DescribeTensor(DT_FLOAT, {dims.batch, dims.oy, dims.ox, dims.oz});
}
// Add the operations the fused op always computes.
std::vector<OpContext> component_ops = {
FusedChildContext(op_context, "Conv2D", output, {conv_input, filter}),
FusedChildContext(op_context, "Mul", output, {output, conv_input_scale}),
FusedChildContext(op_context, "BiasAdd", output, {output, bias}),
FusedChildContext(op_context, "Relu", output, {output})};
// Add our side_input iff it's non-empty.
if (side_input.shape().dim_size() > 0) {
component_ops.push_back(FusedChildContext(op_context, "Mul", side_input,
{side_input, side_input_scale}));
component_ops.push_back(
FusedChildContext(op_context, "Add", output, {side_input, output}));
}
// Construct an op_context which definitely has our output shape.
auto op_context_with_output = op_context;
op_context_with_output.op_info.mutable_outputs()->Clear();
*op_context_with_output.op_info.mutable_outputs()->Add() = output;
// Construct component operations and run the cost computation.
auto costs = PredictFusedOp(op_context_with_output, component_ops);
costs.inaccurate |= found_unknown_shapes;
costs.num_ops_with_unknown_shapes = costs.inaccurate;
return costs;
}
Costs OpLevelCostEstimator::PredictMatMul(const OpContext& op_context) const {
const auto& op_info = op_context.op_info;
bool found_unknown_shapes = false;
auto costs = PredictOpCountBasedCost(
CountMatMulOperations(op_info, &found_unknown_shapes), op_info);
costs.inaccurate = found_unknown_shapes;
costs.num_ops_with_unknown_shapes = found_unknown_shapes;
return costs;
}
Costs OpLevelCostEstimator::PredictEinsum(const OpContext& op_context) const {
// Einsum computes a generalized contraction between tensors of arbitrary
// dimension as defined by the equation written in the Einstein summation
// convention. The number of tensors in the computation and the number of
// contractions can be arbitrarily long. The current model only covers
// Einsum equations that can be translated into a single BatchMatMul
// operation. Einsum operations with more than two operands are not currently
// supported. Equations where an axis appears more than once for a single
// input, as well as ellipses, are currently also excluded. See:
// https://www.tensorflow.org/api_docs/python/tf/einsum
// We distinguish four kinds of dimensions, depending on their placement in
// the equation:
// + B: Batch dimensions: Dimensions which appear in both operands and RHS.
// + K: Contracting dimensions: These appear in both inputs but not RHS.
// + M: Operand A dimensions: These appear in the first operand and the RHS.
// + N: Operand B dimensions: These appear in the second operand and the RHS.
// Then, the operation to estimate is BatchMatMul([B,M,K],[B,K,N])
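// For example, the equation "bij,bjk->bik" maps to a BatchMatMul with
// B = {b}, M = {i}, K = {j}, and N = {k}.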
const auto& op_info = op_context.op_info;
auto it = op_info.attr().find("equation");
if (it == op_info.attr().end()) return Costs::ZeroCosts(/*inaccurate=*/true);
const string& equation = it->second.s();
std::vector<string> equation_split = absl::StrSplit(equation, "->");
if (equation_split.empty()) {
LOG(WARNING) << "Einsum with malformed equation";
return PredictCostOfAnUnknownOp(op_context);
}
std::vector<string> input_split = absl::StrSplit(equation_split[0], ',');
// The current model covers Einsum operations with two operands and an RHS.
if (op_info.inputs_size() != 2 || equation_split.size() != 2) {
VLOG(1) << "Missing accurate estimator for op: " << op_info.op();
return PredictCostOfAnUnknownOp(op_context);
}
string rhs_str = equation_split[1];
string a_input_str = input_split[0];
string b_input_str = input_split[1];
// Ellipses are not currently supported.
if (a_input_str.find("...") != std::string::npos ||
b_input_str.find("...") != std::string::npos) {
VLOG(1) << "Missing accurate estimator for op: " << op_info.op()
<< ", ellipsis not supported";
return PredictCostOfAnUnknownOp(op_context);
}
const auto& a_input = op_info.inputs(0);
const auto& b_input = op_info.inputs(1);
const int matrix_rank = 2;
bool found_unknown_shapes = false;
bool a_input_shape_unknown = false;
bool b_input_shape_unknown = false;
TensorShapeProto a_input_shape = MaybeGetMinimumShape(
a_input.shape(), std::max(matrix_rank, a_input.shape().dim_size()),
&a_input_shape_unknown);
TensorShapeProto b_input_shape = MaybeGetMinimumShape(
b_input.shape(), std::max(matrix_rank, b_input.shape().dim_size()),
&b_input_shape_unknown);
found_unknown_shapes = a_input_shape_unknown || b_input_shape_unknown ||
(a_input.shape().dim_size() < matrix_rank) ||
(b_input.shape().dim_size() < matrix_rank);
if (a_input_str.size() != a_input_shape.dim_size() ||
b_input_str.size() != b_input_shape.dim_size()) {
VLOG(1) << "Missing accurate estimator for op: " << op_info.op()
<< ", equation subscripts don't match tensor rank.";
return PredictCostOfAnUnknownOp(op_context);
}
// Subscripts where an axis appears more than once for a single input are not
// yet supported.
if (CheckRepeatedDimensions(a_input_str) ||
CheckRepeatedDimensions(b_input_str) ||
CheckRepeatedDimensions(rhs_str)) {
VLOG(1) << "Missing accurate estimator for op: " << op_info.op()
<< ", Subscripts where axis appears more than once for a single "
"input are not yet supported";
return PredictCostOfAnUnknownOp(op_context);
}
OpInfo batch_matmul_op_info = op_info;
batch_matmul_op_info.mutable_inputs()->Clear();
batch_matmul_op_info.set_op("BatchMatMul");
AttrValue transpose_attribute;
transpose_attribute.set_b(false);
(*batch_matmul_op_info.mutable_attr())["transpose_a"] = transpose_attribute;
(*batch_matmul_op_info.mutable_attr())["transpose_b"] = transpose_attribute;
OpInfo::TensorProperties* a_matrix = batch_matmul_op_info.add_inputs();
TensorShapeProto* a_matrix_shape = a_matrix->mutable_shape();
a_matrix->set_dtype(a_input.dtype());
OpInfo::TensorProperties* b_matrix = batch_matmul_op_info.add_inputs();
b_matrix->set_dtype(b_input.dtype());
TensorShapeProto* b_matrix_shape = b_matrix->mutable_shape();
TensorShapeProto_Dim m_dim;
TensorShapeProto_Dim n_dim;
TensorShapeProto_Dim k_dim;
m_dim.set_size(1);
n_dim.set_size(1);
k_dim.set_size(1);
for (int i_idx = 0; i_idx < a_input_str.size(); ++i_idx) {
if (b_input_str.find(a_input_str[i_idx]) == std::string::npos) {
if (rhs_str.find(a_input_str[i_idx]) == std::string::npos) {
VLOG(1) << "Missing accurate estimator for op: " << op_info.op();
return PredictCostOfAnUnknownOp(op_context);
}
m_dim.set_size(m_dim.size() * a_input_shape.dim(i_idx).size());
continue;
} else if (rhs_str.find(a_input_str[i_idx]) == std::string::npos) {
// The dimension does not appear in the RHS; therefore it is a contracting
// dimension.
k_dim.set_size(k_dim.size() * a_input_shape.dim(i_idx).size());
continue;
}
// It appears in both input operands; therefore we place it as an outer
// dimension for the BatchMatMul.
*(a_matrix_shape->add_dim()) = a_input_shape.dim(i_idx);
*(b_matrix_shape->add_dim()) = a_input_shape.dim(i_idx);
}
for (int i_idx = 0; i_idx < b_input_str.size(); ++i_idx) {
if (a_input_str.find(b_input_str[i_idx]) == std::string::npos) {
if (rhs_str.find(b_input_str[i_idx]) == std::string::npos) {
VLOG(1) << "Missing accurate estimator for op: " << op_info.op();
return PredictCostOfAnUnknownOp(op_context);
}
n_dim.set_size(n_dim.size() * b_input_shape.dim(i_idx).size());
}
}
// The two innermost dimensions of the BatchMatMul are added.
*(a_matrix_shape->add_dim()) = m_dim;
*(a_matrix_shape->add_dim()) = k_dim;
*(b_matrix_shape->add_dim()) = k_dim;
*(b_matrix_shape->add_dim()) = n_dim;
OpContext batch_matmul_op_context = op_context;
batch_matmul_op_context.op_info = batch_matmul_op_info;
Costs costs = PredictCosts(batch_matmul_op_context);
costs.inaccurate = costs.inaccurate || found_unknown_shapes;
costs.num_ops_with_unknown_shapes = found_unknown_shapes;
return costs;
}
Costs OpLevelCostEstimator::PredictSparseTensorDenseMatMul(
const OpContext& op_context) const {
const auto& op_info = op_context.op_info;
bool found_unknown_shapes = false;
// input[0]: indices in sparse matrix a
// input[1]: values in sparse matrix a
// input[2]: shape of matrix a
// input[3]: matrix b
// See
// https://github.com/tensorflow/tensorflow/blob/9a43dfeac5/tensorflow/core/ops/sparse_ops.cc#L85
int64 num_elems_in_a =
CalculateTensorElementCount(op_info.inputs(1), &found_unknown_shapes);
auto b_matrix = op_info.inputs(3);
auto b_matrix_shape =
MaybeGetMinimumShape(b_matrix.shape(), 2, &found_unknown_shapes);
int64 n_dim = b_matrix_shape.dim(1).size();
// Each nonzero element of A is multiplied with, and added to, an element from
// each of the N columns of B.
const int64 op_count = kOpsPerMac * num_elems_in_a * n_dim;
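// Worked example (illustrative shapes): for a sparse A with 1,000 nonzero
// values and a dense B of shape [K, 64], op_count = 2 * 1,000 * 64 = 128,000,
// independent of K.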
int64 a_indices_input_size =
CalculateTensorSize(op_info.inputs(0), &found_unknown_shapes);
int64 a_values_input_size =
CalculateTensorSize(op_info.inputs(1), &found_unknown_shapes);
int64 a_shape_input_size =
CalculateTensorSize(op_info.inputs(2), &found_unknown_shapes);
int64 b_input_size =
num_elems_in_a * n_dim * DataTypeSize(BaseType(b_matrix.dtype()));
double input_size = a_indices_input_size + a_values_input_size +
a_shape_input_size + b_input_size;
double output_size = CalculateOutputSize(op_info, &found_unknown_shapes);
auto costs =
PredictOpCountBasedCost(op_count, input_size, output_size, op_info);
costs.inaccurate = found_unknown_shapes;
costs.num_ops_with_unknown_shapes = found_unknown_shapes;
costs.max_memory = output_size;
return costs;
}
Costs OpLevelCostEstimator::PredictNoOp(const OpContext& op_context) const {
const auto& op_info = op_context.op_info;
VLOG(1) << "Op:" << op_info.op() << " Execution Time 0 (ns)";
return Costs::ZeroCosts();
}
Costs OpLevelCostEstimator::PredictIdentity(const OpContext& op_context) const {
const auto& op_info = op_context.op_info;
VLOG(1) << "Op:" << op_info.op() << " Execution Time 0 (ns)";
Costs result = Costs::ZeroCosts();
result.max_memory = CalculateOutputSize(op_info, &result.inaccurate);
result.num_ops_with_unknown_shapes = result.inaccurate;
// Assign the minimum amount of time we can represent to the identity op since
// it tends to be really cheap.
result.compute_time = kMinComputeTime;
result.execution_time = result.compute_time;
return result;
}
Costs OpLevelCostEstimator::PredictVariable(const OpContext& op_context) const {
const auto& op_info = op_context.op_info;
VLOG(1) << "Op:" << op_info.op() << " Execution Time 0 (ns)";
Costs result = Costs::ZeroCosts();
result.persistent_memory = CalculateOutputSize(op_info, &result.inaccurate);
result.num_ops_with_unknown_shapes = result.inaccurate;
result.compute_time = kMinComputeTime;
result.execution_time = result.compute_time;
return result;
}
Costs OpLevelCostEstimator::PredictBatchMatMul(
const OpContext& op_context) const {
const auto& op_info = op_context.op_info;
bool found_unknown_shapes = false;
Costs costs = PredictOpCountBasedCost(
CountBatchMatMulOperations(op_info, &found_unknown_shapes), op_info);
costs.inaccurate = found_unknown_shapes;
costs.num_ops_with_unknown_shapes = found_unknown_shapes;
return costs;
}
Costs OpLevelCostEstimator::PredictMetadata(const OpContext& op_context) const {
const auto& op_info = op_context.op_info;
Costs costs = Costs::ZeroCosts();
costs.max_memory = CalculateOutputSize(op_info, &costs.inaccurate);
costs.num_ops_with_unknown_shapes = costs.inaccurate;
// Metadata operations are so cheap we assume they take the minimum amount of
// time we can represent (1 ns).
costs.compute_time = kMinComputeTime;
costs.execution_time = costs.compute_time;
return costs;
}
Costs OpLevelCostEstimator::PredictGatherOrSlice(
const OpContext& op_context) const {
// Gather & Slice ops can have a very large input, but only access a small
// part of it. For these op the size of the output determines the memory cost.
const auto& op_info = op_context.op_info;
const int inputs_needed = op_info.op() == "Slice" ? 3 : 2;
if (op_info.outputs_size() == 0 || op_info.inputs_size() < inputs_needed) {
Costs costs = Costs::ZeroCosts();
costs.inaccurate = true;
return costs;
}
bool unknown_shapes = false;
// Each output element is a copy of some element of the input.
// For the roofline estimate we assume each copy has unit cost.
const int64 op_count =
CalculateTensorElementCount(op_info.outputs(0), &unknown_shapes);
const double output_size = CalculateOutputSize(op_info, &unknown_shapes);
double input_size = output_size;
if (op_info.op() == "Slice") {
// Add 'begin' & 'size' tensors sizes.
input_size +=
CalculateTensorElementCount(op_info.inputs(1), &unknown_shapes) +
CalculateTensorElementCount(op_info.inputs(2), &unknown_shapes);
} else {
// Assuming this is "Gather" or "GatherV2" op, add 'indices' size.
input_size +=
CalculateTensorElementCount(op_info.inputs(1), &unknown_shapes);
}
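// Worked example (illustrative shapes): gathering 128 rows of 1,024 float32
// elements each yields op_count = 128 * 1024 = 131,072 unit copies, and
// input_size is roughly the 512 KiB output plus the 128 index entries,
// regardless of how large the gathered-from tensor is.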
Costs costs =
PredictOpCountBasedCost(op_count, input_size, output_size, op_info);
costs.inaccurate = unknown_shapes;
costs.num_ops_with_unknown_shapes = unknown_shapes;
costs.max_memory = output_size;
return costs;
}
Costs OpLevelCostEstimator::PredictScatter(const OpContext& op_context) const {
// Scatter ops sparsely access a reference input and output tensor.
const auto& op_info = op_context.op_info;
bool found_unknown_shapes = false;
// input[0]: ref tensor that will be sparsely accessed
// input[1]: indices - A tensor of indices into the first dimension of ref.
// input[2]: updates where updates.shape = indices.shape + ref.shape[1:]
// See
// https://www.tensorflow.org/api_docs/python/tf/scatter_add and
// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/ops/state_ops.cc#L146
const int64 num_indices =
CalculateTensorElementCount(op_info.inputs(1), &found_unknown_shapes);
int64 num_elems_in_ref_per_index = 1;
auto ref_tensor_shape = MaybeGetMinimumShape(
op_info.inputs(0).shape(), op_info.inputs(0).shape().dim_size(),
&found_unknown_shapes);
for (int i = 1; i < ref_tensor_shape.dim().size(); ++i) {
num_elems_in_ref_per_index *= ref_tensor_shape.dim(i).size();
}
const int64 op_count = num_indices * num_elems_in_ref_per_index;
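// Worked example (illustrative shapes): a ScatterAdd of 32 indices into a
// float32 ref of shape [1000, 256] touches 256 elements per index, so
// op_count = 32 * 256 = 8,192 and only about 32 KiB of ref is read and
// written, not the full 1000 x 256 tensor.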
// The ref tensor is accessed sparsely, so the input size depends on the
// number of operations.
int64 ref_input_size =
op_count * DataTypeSize(BaseType(op_info.inputs(0).dtype()));
int64 indices_input_size =
CalculateTensorSize(op_info.inputs(1), &found_unknown_shapes);
int64 updates_input_size =
CalculateTensorSize(op_info.inputs(2), &found_unknown_shapes);
double total_input_size =
ref_input_size + indices_input_size + updates_input_size;
// The ref tensor is updated sparsely, so the output size also depends on the
// number of operations.
double total_output_size =
op_count * DataTypeSize(BaseType(op_info.outputs(0).dtype()));
auto costs = PredictOpCountBasedCost(op_count, total_input_size,
total_output_size, op_info);
costs.inaccurate = found_unknown_shapes;
costs.num_ops_with_unknown_shapes = found_unknown_shapes;
return costs;
}
Costs OpLevelCostEstimator::PredictFusedOp(
const OpContext& op_context,
const std::vector<OpContext>& fused_op_contexts) const {
// Note that PredictOpCountBasedCost will get the correct memory_time from
// the node's inputs and outputs; however, we don't want to re-implement the
// operation-count logic for each of our component operations here, so we
// simply add the compute times of the component operations and then update
// the execution time.
Costs fused_cost = PredictOpCountBasedCost(0, op_context.op_info);
fused_cost.compute_time = 0;
fused_cost.inaccurate = false;
for (auto& fused_op : fused_op_contexts) {
auto op_cost = PredictCosts(fused_op);
fused_cost.compute_time += op_cost.compute_time;
fused_cost.inaccurate |= op_cost.inaccurate;
fused_cost.intermediate_memory_time += op_cost.intermediate_memory_time;
}
CombineCostsAndUpdateExecutionTime(compute_memory_overlap_, &fused_cost);
return fused_cost;
}
/* static */
OpContext OpLevelCostEstimator::FusedChildContext(
const OpContext& parent, const string& op_name,
const OpInfo::TensorProperties& output,
const std::vector<OpInfo::TensorProperties>& inputs) {
// Setup the base parameters of our new context.
OpContext new_context;
new_context.name = op_name;
new_context.device_name = parent.device_name;
new_context.op_info = parent.op_info;
new_context.op_info.set_op(op_name);
// Setup the inputs of our new context.
new_context.op_info.mutable_inputs()->Clear();
for (const auto& input : inputs) {
*new_context.op_info.mutable_inputs()->Add() = input;
}
// Setup the output of our new context.
new_context.op_info.mutable_outputs()->Clear();
*new_context.op_info.mutable_outputs()->Add() = output;
return new_context;
}
/* static */
OpInfo::TensorProperties OpLevelCostEstimator::DescribeTensor(
DataType type, const std::vector<int64>& dims) {
OpInfo::TensorProperties ret;
ret.set_dtype(type);
auto shape = ret.mutable_shape();
for (const int dim : dims) {
shape->add_dim()->set_size(dim);
}
return ret;
}
/* static */
OpLevelCostEstimator::ConvolutionDimensions
OpLevelCostEstimator::OpDimensionsFromInputs(
const TensorShapeProto& original_image_shape, const OpInfo& op_info,
bool* found_unknown_shapes) {
VLOG(2) << "op features: " << op_info.DebugString();
VLOG(2) << "Original image shape: " << original_image_shape.DebugString();
auto image_shape =
MaybeGetMinimumShape(original_image_shape, 4, found_unknown_shapes);
VLOG(2) << "Image shape: " << image_shape.DebugString();
int x_index, y_index, channel_index;
const string& data_format = GetDataFormat(op_info);
if (data_format == "NCHW") {
channel_index = 1;
y_index = 2;
x_index = 3;
} else {
y_index = 1;
x_index = 2;
channel_index = 3;
}
int64 batch = image_shape.dim(0).size();
int64 ix = image_shape.dim(x_index).size();
int64 iy = image_shape.dim(y_index).size();
int64 iz = image_shape.dim(channel_index).size();
// Note that FusedBatchNorm doesn't have ksize attr, but GetKernelSize returns
// {1, 1, 1, 1} in that case.
std::vector<int64> ksize = GetKernelSize(op_info);
int64 kx = ksize[x_index];
int64 ky = ksize[y_index];
// These ops don't support groupwise operation, therefore kz == iz.
int64 kz = iz;
std::vector<int64> strides = GetStrides(op_info);
int64 sx = strides[x_index];
int64 sy = strides[y_index];
const auto padding = GetPadding(op_info);
int64 ox = GetOutputSize(ix, kx, sx, padding);
int64 oy = GetOutputSize(iy, ky, sy, padding);
int64 oz = iz;
OpLevelCostEstimator::ConvolutionDimensions conv_dims = {
batch, ix, iy, iz, kx, ky, kz, oz, ox, oy, sx, sy, padding};
return conv_dims;
}
Costs OpLevelCostEstimator::PredictMaxPool(const OpContext& op_context) const {
bool found_unknown_shapes = false;
const auto& op_info = op_context.op_info;
// x: op_info.inputs(0)
ConvolutionDimensions dims = OpDimensionsFromInputs(
op_info.inputs(0).shape(), op_info, &found_unknown_shapes);
// kx * ky - 1 comparisons per output (when kx * ky > 1),
// or 1 copy per output (when kx * ky == 1).
int per_output_ops = dims.kx * dims.ky == 1 ? 1 : dims.kx * dims.ky - 1;
int64 ops = dims.batch * dims.ox * dims.oy * dims.oz * per_output_ops;
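// Worked example (illustrative shapes): a 2x2, stride-2 MaxPool over a
// [1, 224, 224, 64] NHWC input has ox = oy = 112 and 3 comparisons per
// output, so ops = 1 * 112 * 112 * 64 * 3 = 2,408,448.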
double total_input_size = 0;
if (dims.ky >= dims.sy) {
total_input_size =
CalculateTensorSize(op_info.inputs(0), &found_unknown_shapes);
} else { // dims.ky < dims.sy
// The vertical stride is larger than the vertical kernel; assuming row-major
// format, skip the unnecessary rows (i.e., read only ky rows out of every sy
// rows, as the others do not contribute to the output).
const auto data_size = DataTypeSize(BaseType(op_info.inputs(0).dtype()));
total_input_size =
data_size * dims.batch * dims.ix * dims.ky * dims.oy * dims.iz;
}
const double total_output_size =
CalculateOutputSize(op_info, &found_unknown_shapes);
Costs costs = PredictOpCountBasedCost(ops, total_input_size,
total_output_size, op_info);
costs.inaccurate = found_unknown_shapes;
costs.num_ops_with_unknown_shapes = found_unknown_shapes;
costs.max_memory = total_output_size;
return costs;
}
Costs OpLevelCostEstimator::PredictMaxPoolGrad(
const OpContext& op_context) const {
bool found_unknown_shapes = false;
const auto& op_info = op_context.op_info;
// x: op_info.inputs(0)
// y: op_info.inputs(1)
// y_grad: op_info.inputs(2)
if (op_info.inputs_size() < 3) return Costs::ZeroCosts(/*inaccurate=*/true);
ConvolutionDimensions dims = OpDimensionsFromInputs(
op_info.inputs(0).shape(), op_info, &found_unknown_shapes);
int64 ops = 0;
if (dims.kx == 1 && dims.ky == 1) {
// 1x1 window. No need to know which input was max.
ops = dims.batch * dims.ix * dims.iy * dims.iz;
} else if (dims.kx <= dims.sx && dims.ky <= dims.sy) {
// Non-overlapping window: re-run maxpool, then assign zero or y_grad.
ops = dims.batch * dims.iz *
(dims.ox * dims.oy * (dims.kx * dims.ky - 1) + dims.ix * dims.iy);
} else {
// Overlapping window: initialize with zeros, re-run maxpool, then
// accumulate y_grad into the proper x_grad locations.
ops = dims.batch * dims.iz *
(dims.ox * dims.oy * (dims.kx * dims.ky - 1) + dims.ix * dims.iy * 2);
}
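// Worked example (illustrative shapes): for the non-overlapping case with a
// 2x2, stride-2 window over a [1, 112, 112, 64] input (ox = oy = 56),
// ops = 1 * 64 * (56 * 56 * 3 + 112 * 112) = 1,404,928.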
// Just read x and y_grad; no need to read y, as we assume MaxPoolGrad
// re-runs MaxPool internally.
double total_input_size =
CalculateTensorSize(op_info.inputs(0), &found_unknown_shapes);
total_input_size +=
CalculateTensorSize(op_info.inputs(2), &found_unknown_shapes);
// Write x_grad; size equal to x.
const double total_output_size =
CalculateTensorSize(op_info.inputs(0), &found_unknown_shapes);
Costs costs = PredictOpCountBasedCost(ops, total_input_size,
total_output_size, op_info);
costs.inaccurate = found_unknown_shapes;
costs.num_ops_with_unknown_shapes = found_unknown_shapes;
costs.max_memory = total_output_size;
return costs;
}
/* This predict function handles three types of TensorFlow ops:
* AssignVariableOp, AssignAddVariableOp, and AssignSubVariableOp.
* Broadcasting is not possible for these ops, so the input tensors' shapes
* are enough to compute the cost. */
Costs OpLevelCostEstimator::PredictAssignVariableOps(
const OpContext& op_context) const {
bool found_unknown_shapes = false;
const auto& op_info = op_context.op_info;
/* The first input of these ops is a reference to the assignee. */
if (op_info.inputs_size() != 2) return Costs::ZeroCosts(true);
const double total_input_size =
CalculateInputSize(op_info, &found_unknown_shapes);
const double flops = op_info.op() == kAssignVariableOp
? 0.0
: CalculateTensorElementCount(op_info.inputs(1),
&found_unknown_shapes);
Costs costs = PredictOpCountBasedCost(flops, total_input_size, 0, op_info);
costs.inaccurate = found_unknown_shapes;
costs.num_ops_with_unknown_shapes = found_unknown_shapes;
return costs;
}
Costs OpLevelCostEstimator::PredictAvgPool(const OpContext& op_context) const {
bool found_unknown_shapes = false;
const auto& op_info = op_context.op_info;
// x: op_info.inputs(0)
ConvolutionDimensions dims = OpDimensionsFromInputs(
op_info.inputs(0).shape(), op_info, &found_unknown_shapes);
// kx * ky - 1 additions and 1 multiplication per output.
int64 ops = dims.batch * dims.ox * dims.oy * dims.oz * dims.kx * dims.ky;
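// Worked example (illustrative shapes): a 3x3, stride-1 AvgPool with SAME
// padding over a [1, 56, 56, 128] input keeps ox = oy = 56, so
// ops = 1 * 56 * 56 * 128 * 9 = 3,612,672.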
double total_input_size = 0;
if (dims.ky >= dims.sy) {
total_input_size =
CalculateTensorSize(op_info.inputs(0), &found_unknown_shapes);
} else { // dims.ky < dims.sy
// The vertical stride is larger than the vertical kernel; assuming row-major
// format, skip the unnecessary rows (i.e., read only ky rows out of every sy
// rows, as the others do not contribute to the output).
const auto data_size = DataTypeSize(BaseType(op_info.inputs(0).dtype()));
total_input_size =
data_size * dims.batch * dims.ix * dims.ky * dims.oy * dims.iz;
}
const double total_output_size =
CalculateOutputSize(op_info, &found_unknown_shapes);
Costs costs = PredictOpCountBasedCost(ops, total_input_size,
total_output_size, op_info);
costs.inaccurate = found_unknown_shapes;
costs.num_ops_with_unknown_shapes = found_unknown_shapes;
costs.max_memory = total_output_size;
return costs;
}
Costs OpLevelCostEstimator::PredictAvgPoolGrad(
const OpContext& op_context) const {
bool found_unknown_shapes = false;
const auto& op_info = op_context.op_info;
// x's shape: op_info.inputs(0)
// y_grad: op_info.inputs(1)
// Extract x_shape from op_info.inputs(0).value() or op_info.outputs(0).
bool shape_found = false;
TensorShapeProto x_shape;
if (op_info.inputs_size() >= 1 && op_info.inputs(0).has_value()) {
const TensorProto& value = op_info.inputs(0).value();
shape_found = GetTensorShapeProtoFromTensorProto(value, &x_shape);
}
if (!shape_found && op_info.outputs_size() > 0) {
x_shape = op_info.outputs(0).shape();
shape_found = true;
}
if (!shape_found) {
// Set the minimum shape that's feasible.
x_shape.Clear();
for (int i = 0; i < 4; ++i) {
x_shape.add_dim()->set_size(1);
}
found_unknown_shapes = true;
}
ConvolutionDimensions dims =
OpDimensionsFromInputs(x_shape, op_info, &found_unknown_shapes);
int64 ops = 0;
if (dims.kx <= dims.sx && dims.ky <= dims.sy) {
// Non-overlapping window.
ops = dims.batch * dims.iz * (dims.ix * dims.iy + dims.ox * dims.oy);
} else {
// Overlapping window.
ops = dims.batch * dims.iz *
(dims.ix * dims.iy + dims.ox * dims.oy * (dims.kx * dims.ky + 1));
}
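// Worked example (illustrative shapes): for the non-overlapping 2x2, stride-2
// case with x of shape [1, 112, 112, 64] (ox = oy = 56),
// ops = 1 * 64 * (112 * 112 + 56 * 56) = 1,003,520.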
const double total_input_size =
CalculateInputSize(op_info, &found_unknown_shapes);
const double total_output_size =
CalculateOutputSize(op_info, &found_unknown_shapes);
Costs costs = PredictOpCountBasedCost(ops, total_input_size,
total_output_size, op_info);
costs.inaccurate = found_unknown_shapes;
costs.num_ops_with_unknown_shapes = found_unknown_shapes;
costs.max_memory = total_output_size;
return costs;
}
Costs OpLevelCostEstimator::PredictFusedBatchNorm(
const OpContext& op_context) const {
bool found_unknown_shapes = false;
const auto& op_info = op_context.op_info;
// x: op_info.inputs(0)
// scale: op_info.inputs(1)
// offset: op_info.inputs(2)
// mean: op_info.inputs(3) --> only for inference
// variance: op_info.inputs(4) --> only for inference
ConvolutionDimensions dims = OpDimensionsFromInputs(
op_info.inputs(0).shape(), op_info, &found_unknown_shapes);
const bool is_training = IsTraining(op_info);
int64 ops = 0;
const auto rsqrt_cost = Eigen::internal::functor_traits<
Eigen::internal::scalar_rsqrt_op<float>>::Cost;
if (is_training) {
ops = dims.iz * (dims.batch * dims.ix * dims.iy * 4 + 6 + rsqrt_cost);
} else {
ops = dims.batch * dims.ix * dims.iy * dims.iz * 2;
}
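// Worked example (illustrative shapes): for inference on a [32, 28, 28, 256]
// NHWC input, ops = 32 * 28 * 28 * 256 * 2 = 12,845,056; in training, each of
// the 256 channels additionally pays the 6 + rsqrt_cost term.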
const double size_nhwc =
CalculateTensorSize(op_info.inputs(0), &found_unknown_shapes);
const double size_c =
CalculateTensorSize(op_info.inputs(1), &found_unknown_shapes);
double total_input_size = 0.0;
double total_internal_read_size = 0.0;
double total_output_size = 0.0;
if (is_training) {
total_input_size = size_nhwc + size_c * 2;
total_output_size = size_nhwc + size_c * 4;
total_internal_read_size = size_nhwc;
} else {
total_input_size = size_nhwc + size_c * 4;
total_output_size = size_nhwc;
}
Costs costs =
PredictOpCountBasedCost(ops, total_input_size + total_internal_read_size,
total_output_size, op_info);
costs.inaccurate = found_unknown_shapes;
costs.num_ops_with_unknown_shapes = found_unknown_shapes;
costs.max_memory = total_output_size;
return costs;
}
Costs OpLevelCostEstimator::PredictFusedBatchNormGrad(
const OpContext& op_context) const {
bool found_unknown_shapes = false;
const auto& op_info = op_context.op_info;
// y_backprop: op_info.inputs(0)
// x: op_info.inputs(1)
// scale: op_info.inputs(2)
// mean: op_info.inputs(3)
// variance or inverse of variance: op_info.inputs(4)
ConvolutionDimensions dims = OpDimensionsFromInputs(
op_info.inputs(1).shape(), op_info, &found_unknown_shapes);
int64 ops = 0;
const auto rsqrt_cost = Eigen::internal::functor_traits<
Eigen::internal::scalar_rsqrt_op<float>>::Cost;
ops = dims.iz * (dims.batch * dims.ix * dims.iy * 11 + 5 + rsqrt_cost);
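// Worked example (illustrative shapes): for x of shape [32, 28, 28, 256],
// ops = 256 * (32 * 28 * 28 * 11 + 5 + rsqrt_cost)
//     = 70,649,088 + 256 * rsqrt_cost.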
const double size_nhwc =
CalculateTensorSize(op_info.inputs(1), &found_unknown_shapes);
const double size_c =
CalculateTensorSize(op_info.inputs(2), &found_unknown_shapes);
double total_input_size = size_nhwc * 2 + size_c * 2;
double total_internal_read_size = size_nhwc;
double total_output_size = size_nhwc * 1 + size_c * 2;
Costs costs =
PredictOpCountBasedCost(ops, total_input_size + total_internal_read_size,
total_output_size, op_info);
costs.inaccurate = found_unknown_shapes;
costs.num_ops_with_unknown_shapes = found_unknown_shapes;
costs.max_memory = total_output_size;
return costs;
}
} // end namespace grappler
} // end namespace tensorflow