/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/contrib/tensorrt/convert/convert_graph.h"
#include <fstream>
#include <list>
#include <map>
#include <set>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
#include "tensorflow/contrib/tensorrt/convert/utils.h"
#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
#include "tensorflow/contrib/tensorrt/resources/trt_resource_manager.h"
#include "tensorflow/contrib/tensorrt/resources/trt_resources.h"
#include "tensorflow/contrib/tensorrt/segment/segment.h"
#include "tensorflow/contrib/tensorrt/test/utils.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
#include "tensorflow/core/common_runtime/gpu/gpu_process_state.h"
#include "tensorflow/core/framework/function.h"
#include "tensorflow/core/framework/graph_to_functiondef.h"
#include "tensorflow/core/framework/node_def_builder.h"
#include "tensorflow/core/graph/algorithm.h"
#include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/graph/graph_constructor.h"
#include "tensorflow/core/grappler/clusters/virtual_cluster.h"
#include "tensorflow/core/grappler/costs/graph_properties.h"
#include "tensorflow/core/grappler/devices.h"
#include "tensorflow/core/grappler/grappler_item.h"
#include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
#include "tensorflow/core/grappler/utils.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/strings/numbers.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/protobuf/config.pb.h" // NOLINT
#include "tensorflow/core/protobuf/device_properties.pb.h" // NOLINT
#include "tensorflow/core/protobuf/rewriter_config.pb.h" // NOLINT
#include "tensorflow/core/util/device_name_utils.h"
#if GOOGLE_CUDA
#if GOOGLE_TENSORRT
#include "cuda/include/cuda_runtime_api.h"
#include "tensorrt/include/NvInfer.h"
namespace tensorflow {
namespace tensorrt {
namespace convert {
using ::tensorflow::strings::StrAppend;
using ::tensorflow::strings::StrCat;
// Returns the compiled TensorRT version as {major, minor, patch}.
std::vector<int> GetLinkedTensorRTVersion() {
return {NV_TENSORRT_MAJOR, NV_TENSORRT_MINOR, NV_TENSORRT_PATCH};
}
// Returns the loaded TensorRT library version as {major, minor, patch}.
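// getInferLibVersion() packs the version as major * 1000 + minor * 100 +
// patch (as the decoding below assumes); e.g. a return value of 4012
// decodes to {4, 0, 12}.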
std::vector<int> GetLoadedTensorRTVersion() {
int ver = getInferLibVersion();
int ver_major = ver / 1000;
ver = ver - ver_major * 1000;
int ver_minor = ver / 100;
int ver_patch = ver - ver_minor * 100;
return {ver_major, ver_minor, ver_patch};
}
namespace {
bool IsTensorRTCandidate(const tensorflow::Node* node) {
// LINT.IfChange
// TODO(jie): Segmentation shouldn't be associated with op names.
// Split it into a registration for each kernel.
static const std::set<string> candidate_ops = {
"Identity",
"Snapshot",
"Const",
"Conv2D",
"MaxPool",
"BiasAdd",
"Relu",
"Add",
"Mul",
"Sub",
"Rsqrt",
"Pad",
"Mean",
"AvgPool",
"ConcatV2",
"DepthwiseConv2dNative",
"FusedBatchNorm",
"FusedBatchNormV2",
"Div",
"RealDiv",
"Rsqrt",
"Reciprocal",
"Exp",
"Log",
"Sqrt",
"Abs",
"Neg",
#if NV_TENSORRT_MAJOR > 3
"MatMul",
"BatchMatMul",
"Softmax",
"Minimum",
"Maximum",
"TopKV2",
"Sum",
"Prod",
"Max",
"Min",
#endif
// TODO(ben,jie): ...
};
// LINT.ThenChange(//tensorflow/contrib/tensorrt/convert/convert_nodes.cc)
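// Any op type outside this list that is also not a registered TRT plugin
// acts as a segment boundary during segmentation.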
return (candidate_ops.count(node->type_string()) ||
PluginFactoryTensorRT::GetInstance()->IsPlugin(node->type_string()));
}
tensorflow::Status BuildNodeMap(
const tensorflow::Graph& graph,
std::unordered_map<string, tensorflow::Node*>* node_map) {
for (auto* node : graph.op_nodes()) {
if (!node_map->insert({node->name(), node}).second) {
return tensorflow::errors::AlreadyExists(
"Node name is not unique in graph: " + node->name());
}
}
return tensorflow::Status::OK();
}
} // namespace
// Function to get the calibration data from the ResourceMgr and put it into
// the nodedef.
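// A typical int8 workflow (a sketch): run the calibration graph on
// representative inputs so that each TRTEngineOp's calibrator collects
// statistics, then call e.g.
//   ConvertCalibGraphToInferGraph(calib_graph_def, &infer_graph_def,
//                                 /*is_dyn_op=*/true);
// to bake the resulting calibration tables into the inference graphdef.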
tensorflow::Status ConvertCalibGraphToInferGraph(
const tensorflow::GraphDef& graph_def, tensorflow::GraphDef* infer_graph,
bool is_dyn_op) {
VLOG(0) << "Starting Calib Conversion";
infer_graph->CopyFrom(graph_def);
auto trt_rm = TRTResourceManager::instance();
auto calib_rm = trt_rm->getManager("TRTCalibration");
int num_nodes = infer_graph->node_size();
if (!is_dyn_op) {
LOG(WARNING) << "Construction of static int8 engine is not implemented "
"yet!. Dynamic engine will be constructed";
}
for (int i = 0; i < num_nodes; ++i) {
auto n = infer_graph->mutable_node(i);
if (n->op() == "TRTEngineOp") {
VLOG(1) << "Processing " << n->name();
const string& container_name = n->attr().at("segment_funcdef_name").s();
TRTCalibrationResource* cres = nullptr;
auto status = calib_rm->Lookup(container_name, "Calibrator", &cres);
if (!status.ok()) {
LOG(ERROR) << "Could not get Calibration information. Did you run with "
"calibration data?";
return tensorflow::errors::FailedPrecondition(
"Need to run graph with calibration data first!");
}
if (cres->calibrator_) {
cres->calibrator_->waitAndSetDone();
cres->thr_->join();
const auto& calibration_table =
cres->calibrator_->getCalibrationTableAsString();
if (calibration_table.empty()) {
LOG(ERROR) << "Calibration table is empty";
return tensorflow::errors::Unknown(
"Calibration table is missing. This shouldn't have happened!");
}
n->mutable_attr()->at("calibration_data").set_s(calibration_table);
} else {
LOG(ERROR) << "Can't get TRTCalibrator from resource manager!";
return tensorflow::errors::Unknown(
"Can't get TRTCalibrator from resource manager!");
}
cres->Unref();
TF_RETURN_IF_ERROR(calib_rm->Cleanup(container_name));
}
}
return tensorflow::Status::OK();
}
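// Example call (a sketch; assumes the caller has already built `graph_def`
// and the list of output tensor names `outputs`):
//   tensorflow::GraphDef trt_graph_def;
//   TF_RETURN_IF_ERROR(ConvertGraphDefToTensorRT(
//       graph_def, outputs, /*max_batch_size=*/8,
//       /*max_workspace_size_bytes=*/1 << 30, &trt_graph_def,
//       /*precision_mode=*/FP32MODE, /*minimum_segment_size=*/3,
//       /*is_dyn_op=*/false, /*max_cached_engines=*/1,
//       /*cached_engine_batches=*/{}));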
tensorflow::Status ConvertGraphDefToTensorRT(
const tensorflow::GraphDef& graph_def,
const std::vector<string>& output_names, size_t max_batch_size,
size_t max_workspace_size_bytes, tensorflow::GraphDef* new_graph_def,
int precision_mode, int minimum_segment_size, bool is_dyn_op,
int max_cached_engines, std::vector<int> cached_engine_batches) {
// Create GrapplerItem.
tensorflow::grappler::GrapplerItem item;
item.fetch = output_names;
item.graph = graph_def;
// TODO(aaroey): we should have used a single machine cluster like the
// following, but the problem is that wrap_conversion would then depend on
// direct_session and cause double-linking problems. To fix this we need to
// fix or get rid of the swig dependency. Here we use VirtualCluster as a
// workaround, and we need to create a session to initialize the underlying
// device before calling this method.
#if 0
// Create single machine cluster. Note that this will create a session and
// initialize the gpu devices.
const int num_cpu_cores =
tensorflow::grappler::GetNumAvailableLogicalCPUCores();
const int num_gpus = tensorflow::grappler::GetNumAvailableGPUs();
VLOG(2) << "cpu_cores: " << num_cpu_cores;
VLOG(2) << "gpus: " << num_gpus;
const int timeout_s = 60 * 10;
std::unique_ptr<tensorflow::grappler::Cluster> cluster(
new tensorflow::grappler::SingleMachine(
timeout_s, num_cpu_cores, num_gpus));
// These settings are the defaults in tensorflow/python/grappler/cluster.py.
cluster->DisableDetailedStats(true);
cluster->AllowSoftPlacement(true);
cluster->SetNumWarmupSteps(10);
TF_RETURN_IF_ERROR(cluster->Provision());
#else
// Create a virtual cluster. Grappler requires a virtual cluster with a
// proper GPU device in order to calculate flops>0; otherwise it fails with
// a FATAL error in debug mode. We use numbers from a Pascal card here to
// get flops>0.
tensorflow::DeviceProperties device_properties;
device_properties.set_type("GPU");
device_properties.mutable_environment()->insert({"architecture", "6"});
device_properties.set_num_cores(3584);
device_properties.set_frequency(1531);
std::unique_ptr<tensorflow::grappler::Cluster> cluster(
new tensorflow::grappler::VirtualCluster(
{{"/GPU:0", device_properties}}));
#endif
// Create RewriterConfig.
tensorflow::RewriterConfig rw_cfg;
// TODO(aaroey): use only const folding and layout for the time being since
// new optimizers break the graph for trt.
rw_cfg.add_optimizers("constfold");
rw_cfg.add_optimizers("layout");
auto optimizer = rw_cfg.add_custom_optimizers();
optimizer->set_name("TensorRTOptimizer");
auto& parameters = *(optimizer->mutable_parameter_map());
parameters["minimum_segment_size"].set_i(minimum_segment_size);
parameters["max_batch_size"].set_i(max_batch_size);
parameters["is_dynamic_op"].set_b(is_dyn_op);
parameters["max_workspace_size_bytes"].set_i(max_workspace_size_bytes);
TF_RETURN_IF_ERROR(GetPrecisionModeName(
precision_mode, parameters["precision_mode"].mutable_s()));
parameters["maximum_cached_engines"].set_i(max_cached_engines);
if (!cached_engine_batches.empty()) {
auto list = parameters["cached_engine_batches"].mutable_list();
for (const int batch : cached_engine_batches) {
list->add_i(batch);
}
}
// Run optimizer.
tensorflow::grappler::MetaOptimizer meta_opt(nullptr, rw_cfg);
TF_RETURN_IF_ERROR(meta_opt.Optimize(cluster.get(), item, new_graph_def));
if (VLOG_IS_ON(5)) {
std::fstream f;
f.open("TRTConversionInput.pb",
std::fstream::out | std::fstream::binary | std::fstream::trunc);
f << new_graph_def->SerializeAsString();
f.close();
}
return Status::OK();
}
// Function to get subsegment information structure.
tensorflow::Status GetEngineInfo(
const tensorflow::Graph* g,
const tensorflow::grappler::GraphProperties& graph_properties,
const std::set<string>& segment_nodes,
const std::unordered_map<string, tensorflow::Node*>& node_map,
const std::vector<tensorflow::Node*>& reverse_topo_order,
EngineInfo* info) {
std::vector<int> subgraph_node_ids; // Topologically sorted node ids.
std::set<string> subgraph_node_names = segment_nodes;
std::set<int> added_const_node_ids; // Used to prevent double insertion.
std::set<string> segment_devices;
// Map from src_node_name:port to the unique port number of the TRT op.
// src_node_name is the name of the source node of the input/output edge;
// there must not be any duplicates, since source nodes of input/output
// edges must be in a different split of the graph.
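// For example, two data edges that both originate from tensor "conv:0"
// outside the segment will share a single engine input port.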
// TODO(aaroey): consider using node id and port instead.
// TODO(aaroey): using topo order instead of reverting reverse topo order.
std::unordered_map<string, int> input_to_engine_port, output_to_engine_port;
for (auto it = reverse_topo_order.rbegin(); it != reverse_topo_order.rend();
++it) {
const auto& node_name = (*it)->name();
if (segment_nodes.count(node_name) == 0) continue;
auto node = *it;
auto node_device = node->requested_device();
if (!node_device.empty()) {
segment_devices.insert(node_device);
} else {
if (node->has_assigned_device_name()) {
segment_devices.insert(node->assigned_device_name());
} else {
VLOG(2) << "Node " << node->name()
<< " neither have requested device nor assigned device";
}
}
const int node_id = node->id();
subgraph_node_ids.push_back(node_id);
// Create input connections.
for (const auto edge : node->in_edges()) {
auto input_node = edge->src();
if (input_node->IsSource() || segment_nodes.count(input_node->name())) {
continue;
}
if (edge->IsControlEdge()) {
// Control input.
info->connections.emplace_back(input_node->name(), input_node->id(),
node_name, node_id,
/*input_edge=*/true);
} else if (input_node->type_string() == "Const") {
// Add constant data input nodes into the segment graphdef (thus also in
// the engine). We don't care if it has other output edges going into
// other engines or TF nodes. Since we add it only to the segment
// graphdef, not the segment itself, it won't be removed from the graph.
// If it doesn't have any edges, TF will prune it out.
//
// Note that the segmenter already ensures that the constant data input
// is valid and supported by the engine.
if (!added_const_node_ids.insert(input_node->id()).second) {
// Already added before.
continue;
}
VLOG(1) << "Adding const node " << input_node->name();
QCHECK(subgraph_node_names.insert(input_node->name()).second);
// Since we have already added (duplicated) the const input node to the
// segment graphdef, it is no longer a data dependency; but to keep the
// dependency correct we still add a control dependency.
info->connections.emplace_back(input_node->name(), input_node->id(),
node_name, node_id,
/*input_edge=*/true);
} else {
// Non-const data input.
int port = Graph::kControlSlot - 1;
// Use the source non-segment node name/port as key.
const string s = StrCat(input_node->name(), ":", edge->src_output());
VLOG(1) << "Input edge = " << s;
if (input_to_engine_port.count(s)) {
port = input_to_engine_port.at(s);
} else {
port = input_to_engine_port.size();
input_to_engine_port.insert({s, port});
}
info->connections.emplace_back(
input_node->name(), input_node->id(), edge->src_output(), node_name,
node_id, edge->dst_input(), /*input_edge=*/true, port);
}
}
// Create output connections.
for (const auto edge : node->out_edges()) {
auto output_node = edge->dst();
if (output_node->IsSink() || segment_nodes.count(output_node->name())) {
continue;
}
if (edge->IsControlEdge()) {
// Control output.
info->connections.emplace_back(output_node->name(), output_node->id(),
node_name, node_id,
/*input_edge=*/false);
} else {
// Data output.
int port = Graph::kControlSlot - 1;
// Use the source segment node name/port as key.
const string s = StrCat(node_name, ":", edge->src_output());
VLOG(1) << "Output edge = " << s;
if (output_to_engine_port.count(s)) {
port = output_to_engine_port.at(s);
} else {
port = output_to_engine_port.size();
output_to_engine_port.insert({s, port});
}
info->connections.emplace_back(
output_node->name(), output_node->id(), edge->dst_input(),
node_name, node_id, edge->src_output(), /*input_edge=*/false, port);
}
}
} // For each segment node in topological order.
// Construct the const nodes first.
subgraph_node_ids.insert(subgraph_node_ids.begin(),
added_const_node_ids.begin(),
added_const_node_ids.end());
TF_RETURN_IF_ERROR(ConvertSegmentToGraphDef(
g, graph_properties, subgraph_node_names, subgraph_node_ids,
&info->connections, &info->segment_graph_def, &info->engine_name));
// TODO(sami): This should not happen once segmenter is updated.
if (segment_devices.size() == 1) {
info->device = *segment_devices.begin();
} else if (segment_devices.size() > 1) {
LOG(WARNING) << "Detected multiple(" << segment_devices.size()
<< ") devices for the segment. Picking first one to continue "
<< "but this shouldn't have happened";
info->device = *segment_devices.begin();
} else {
LOG(ERROR) << "Can't find a device placement for the op!";
}
return Status::OK();
}
// Helper function to update an edge connection from a removed node to the
// engine node that absorbed it. If an outside node is gone, it must have
// been absorbed into an engine node; find that engine node.
void UpdateToEngineNode(const std::vector<EngineInfo>& infos,
const size_t my_engine_id,
const std::vector<Node*>& engine_nodes,
const bool is_input_edge, const string& node_name,
tensorflow::Node** node, int* port) {
for (size_t t = 0; t < infos.size(); ++t) {
if (t == my_engine_id) {
continue;
}
const auto& info = infos.at(t);
for (const auto& eng_conn : info.connections) {
// If the connection being updated is an input connection, the source of
// the connection must be an output connection of another engine, and vice
// versa.
if (is_input_edge == eng_conn.is_input_edge) continue;
if (eng_conn.inside_node_name == node_name &&
eng_conn.inside_port == *port) {
*node = CHECK_NOTNULL(engine_nodes[t]);
QCHECK_EQ(info.engine_name, (**node).name())
<< "Engine name mismatch: " << info.engine_name << " vs "
<< (**node).name();
*port = eng_conn.port_number;
return;
}
}
}
LOG(FATAL) << "Node " << (**node).name() << " not found in any engine.";
}
// Function to insert a TRT engine node into the graph.
// Engine nodes are created in the following way:
// 1. Each invocation of CreateTRTNode creates an engine node for infos[pos].
// 2. When an engine node is created, it is added to the graph with the
// necessary re-wiring:
// 2.1. If the outside connected node still exists, connect the engine
// node to it.
// 2.2. If the outside connected node is gone, it must have been absorbed
// into another engine node (which was processed before the current
// one). Connect to that pre-existing engine node instead.
// 3. In this way, we ensure the graph is topologically sortable after each
// invocation of CreateTRTNode().
tensorflow::Status CreateTRTNode(const std::vector<EngineInfo>& infos, int pos,
int max_batch_size, tensorflow::Graph* graph,
nvinfer1::IGpuAllocator* alloc,
std::vector<Node*>* engine_nodes) {
const auto& info = infos.at(pos);
TRT_RETURN_IF_TEST_VALUE(StrCat(info.engine_name, ":CreateTRTNode"), "fail");
std::vector<tensorflow::TensorShapeProto> output_shape_protos;
std::vector<tensorflow::TensorShapeProto> input_shape_protos;
std::vector<tensorflow::PartialTensorShape> input_shapes;
std::vector<tensorflow::NodeDefBuilder::NodeOut> inputs;
std::vector<tensorflow::Node*> input_nodes;
std::vector<tensorflow::Node*> control_input_nodes;
std::unordered_set<string> control_input_names;
std::vector<tensorflow::DataType> out_types;
VLOG(1) << "Processing " << info.engine_name;
// Collect needed info for creating the engine node in the graph
for (const auto& conn : info.connections) {
// Control edges
if (conn.is_control_edge()) {
// Skip control outputs for now: control output info is not needed for
// node creation and will be processed later.
if (!conn.is_input_edge) continue;
// Rewire the control input if it's not found in the original graph.
tensorflow::Node* input_node = graph->FindNodeId(conn.outside_id);
int port = tensorflow::Graph::kControlSlot;
if (!input_node) {
UpdateToEngineNode(infos, pos, *engine_nodes, /*is_input_edge=*/true,
conn.outside_node_name, &input_node, &port);
QCHECK_EQ(Graph::kControlSlot, port);
}
if (!control_input_names.insert(input_node->name()).second) {
continue;
}
control_input_nodes.push_back(input_node);
VLOG(1) << "Engine Control Input " << input_node->name() << " -> "
<< info.engine_name;
} else {
// Data edges
if (!conn.is_input_edge) {
// Set the shapes and data types of output edge.
tensorflow::TensorShapeProto out_shape;
// shape of the output node inside segment
conn.inside_shape.AsProto(&out_shape);
if (output_shape_protos.size() <= conn.port_number) {
output_shape_protos.resize(conn.port_number + 1);
out_types.resize(conn.port_number + 1);
}
output_shape_protos.at(conn.port_number) = out_shape;
out_types.at(conn.port_number) = conn.connection_type;
} else {
// Set the shapes and data types of input edge.
tensorflow::TensorShapeProto in_shape;
conn.outside_shape.AsProto(&in_shape);
if (input_shape_protos.size() <= conn.port_number) {
input_shape_protos.resize(conn.port_number + 1);
input_shapes.resize(conn.port_number + 1);
}
input_shape_protos.at(conn.port_number) = in_shape;
input_shapes.at(conn.port_number) = conn.outside_shape;
// Rewire the data input if it's not found in the original graph.
tensorflow::Node* input_node = graph->FindNodeId(conn.outside_id);
int port = conn.outside_port;
if (!input_node) {
UpdateToEngineNode(infos, pos, *engine_nodes, /*is_input_edge=*/true,
conn.outside_node_name, &input_node, &port);
}
if (std::find_if(
std::begin(inputs), std::end(inputs),
[input_node, &port](const NodeDefBuilder::NodeOut& inp) {
return inp.node == input_node->name() && inp.index == port;
}) == std::end(inputs)) {
inputs.emplace_back(input_node->name(), port, conn.connection_type);
input_nodes.push_back(CHECK_NOTNULL(input_node));
VLOG(1) << "Engine Input " << input_node->name() << ":" << port
<< " -> " << info.engine_name << ":" << inputs.size() - 1;
}
}
}
}
string segment_string;
if (info.engine_type == EngineInfo::EngineType::TRTStatic ||
info.precision_mode == INT8MODE) {
// Create a static engine for fp32/fp16 mode, and test the validity of the
// engine for int8 mode. We don't want the engine to fail at calibration
// time, so we construct an FP32 engine here to check its validity; if it
// is a valid engine then we put the serialized graphdef into the op.
// Otherwise we skip node creation for this engine.
Logger trt_logger;
TrtUniquePtrType<nvinfer1::ICudaEngine> engine;
// TODO(sami): What happens if 1st dim is not batch?
TF_RETURN_IF_ERROR(ConvertGraphDefToEngine(
info.segment_graph_def,
info.precision_mode == INT8MODE ? FP32MODE : info.precision_mode,
max_batch_size, info.max_workspace_size_bytes, input_shapes,
&trt_logger, alloc, /*calibrator=*/nullptr, &engine,
/*convert_successfully=*/nullptr));
TrtUniquePtrType<nvinfer1::IHostMemory> engine_data(engine->serialize());
segment_string = string(static_cast<const char*>(engine_data->data()),
engine_data->size());
if (info.precision_mode == INT8MODE) {
// See the comment above on why this is not inside the 'else' branch.
segment_string = info.segment_graph_def.SerializeAsString();
}
} else {
segment_string = info.segment_graph_def.SerializeAsString();
}
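// At this point segment_string holds either a serialized TensorRT engine
// (static fp32/fp16 engines) or the serialized segment graphdef (dynamic
// engines and int8 mode, which build their engines at runtime).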
// TODO(aaroey): use enum instead, and add a helper method to do the
// conversion.
string prec_string;
TF_RETURN_IF_ERROR(GetPrecisionModeName(info.precision_mode, &prec_string));
if (info.precision_mode == INT8MODE &&
!TRTResourceManager::instance()->getManager("TRTCalibration")) {
LOG(ERROR) << "Failed to construct calibration storage";
}
tensorflow::NodeDefBuilder node_builder(info.engine_name, "TRTEngineOp");
if (!info.device.empty()) node_builder.Device(info.device);
if (VLOG_IS_ON(1)) {
string ins = StrCat(info.engine_name, " inputs= ");
for (const auto& ii : inputs) {
StrAppend(&ins, ii.node, ":", ii.index, " ");
}
VLOG(1) << ins;
}
node_builder.Input(inputs);
for (const string& c : control_input_names) {
node_builder.ControlInput(c);
}
if (info.engine_type == EngineInfo::EngineType::TRTStatic &&
!info.cached_engine_batches.empty()) {
LOG(WARNING) << "Cached engine batches are ignored for static engines";
}
tensorflow::NodeDef trt_node;
tensorflow::Status status =
node_builder.Attr("input_shapes", input_shape_protos)
.Attr("output_shapes", output_shape_protos)
.Attr("static_engine",
info.engine_type == EngineInfo::EngineType::TRTStatic)
.Attr("segment_funcdef_name",
StrCat(info.engine_name, "_native_segment"))
.Attr("serialized_segment", segment_string)
.Attr("calibration_data", "")
.Attr("max_cached_engines_count", info.maximum_cached_engines)
.Attr("cached_engine_batches", {max_batch_size})
.Attr("workspace_size_bytes", info.max_workspace_size_bytes)
.Attr("precision_mode", prec_string)
.Attr("OutT", out_types)
.Finalize(&trt_node);
if (!status.ok()) {
LOG(ERROR) << "Node construction failed with" << status;
return status;
}
VLOG(1) << "Adding TRTEngine " << info.engine_name << " to graph";
// Up until this point the graph is not modified. If we return !status.ok()
// from here on, this segment will be skipped.
// TODO(aaroey): let the following logic return a proper error status
// instead of CHECK-failing.
tensorflow::Node* engine_node = graph->AddNode(trt_node, &status);
(*engine_nodes)[pos] = engine_node;
if (!status.ok()) {
LOG(ERROR) << "Adding node failed " << status;
return status;
}
// Add control input and input edges to the engine node.
for (const auto in : control_input_nodes) {
VLOG(1) << "Connecting control edge from " << in->name() << " to "
<< engine_node->name();
graph->AddControlEdge(in, engine_node);
}
VLOG(1) << "input_nodes size = " << input_nodes.size();
for (int i = 0; i < input_nodes.size(); ++i) {
Node* n = CHECK_NOTNULL(input_nodes[i]);
const auto& in = inputs[i];
VLOG(1) << "Connecting data edge from " << n->name() << ":" << in.index
<< " to " << engine_node->name() << ":" << i;
graph->AddEdge(n, in.index, engine_node, i);
}
// Update the inputs of the destination nodes of the output edges to point
// to the engine node.
for (auto& conn : info.connections) {
if (conn.is_input_edge) {
continue;
}
tensorflow::Node* output_node = graph->FindNodeId(conn.outside_id);
int port = conn.outside_port;
if (!output_node) {
UpdateToEngineNode(infos, pos, *engine_nodes, /*is_input_edge=*/false,
conn.outside_node_name, &output_node, &port);
}
VLOG(1) << "Updating " << engine_node->name() << ":" << conn.port_number
<< " to " << output_node->name() << ":" << port;
if (conn.is_control_edge()) {
QCHECK_EQ(Graph::kControlSlot, port);
graph->AddControlEdge(engine_node, output_node);
} else {
auto new_edge =
graph->AddEdge(engine_node, conn.port_number, output_node, port);
QCHECK(new_edge) << "Adding a new edge failed " << engine_node->name()
<< ":" << conn.port_number << " -> "
<< output_node->name() << ":" << conn.outside_port;
}
}
return Status::OK();
}
// Function to construct a funcdef from the segment and add it to the graph.
tensorflow::Status RegisterSegmentFunctionToFunctionLibrary(
tensorflow::Graph* graph, const tensorflow::GraphDef& segment,
const string& engine_name) {
tensorflow::Graph sgraph(graph->flib_def());
tensorflow::GraphConstructorOptions gcopts;
TF_RETURN_IF_ERROR(
tensorflow::ConvertGraphDefToGraph(gcopts, segment, &sgraph));
std::map<string, tensorflow::Node*> io_nodes;
int num_inputs = 0;
for (auto n : sgraph.op_nodes()) {
if (tensorflow::str_util::StartsWith(n->name(), kInputPHName)) {
num_inputs++;
io_nodes.insert({n->name(), n});
} else if (tensorflow::str_util::StartsWith(n->name(), kOutputPHName)) {
io_nodes.insert({n->name(), n});
}
}
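// The I/O placeholder nodes produced for the segment are named by index
// (kInputPHName + i and kOutputPHName + i), which is what the lookups in
// the two loops below rely on.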
for (int i = 0; i < num_inputs; ++i) {
auto name = StrCat(kInputPHName, i);
auto node = io_nodes[name];
tensorflow::NodeDef nd;
tensorflow::NodeDefBuilder node_builder(
StrCat(name, "_Arg"), tensorflow::FunctionLibraryDefinition::kArgOp);
VLOG(1) << "Adding " << StrCat(name, "_Arg");
TF_RETURN_IF_ERROR(node_builder.Attr("T", node->output_type(0))
.Attr("index", i)
.Finalize(&nd));
tensorflow::Status s;
auto node_arg = sgraph.AddNode(nd, &s);
if (!s.ok()) {
LOG(ERROR) << "Couldn't add _Arg node for " << name;
return s;
}
for (auto edge : node->out_edges()) {
sgraph.AddEdge(node_arg, 0, edge->dst(), edge->dst_input());
VLOG(1) << "Updating funcdef input " << node_arg->name() << ":" << 0
<< " -> " << edge->dst()->name() << ":" << edge->dst_input();
}
sgraph.RemoveNode(node);
}
for (int i = 0; i < io_nodes.size() - num_inputs; ++i) {
auto name = StrCat(kOutputPHName, i);
auto node = io_nodes[name];
tensorflow::NodeDef nd;
tensorflow::NodeDefBuilder node_builder(
StrCat(name, "_Ret"), tensorflow::FunctionLibraryDefinition::kRetOp);
auto edge = *(node->in_edges().begin());
tensorflow::NodeDefBuilder::NodeOut nout(
edge->src()->name(), edge->src_output(),
edge->src()->output_type(edge->src_output()));
VLOG(1) << " input " << nout.node << ":" << nout.index
<< " dtype=" << tensorflow::DataTypeString(nout.data_type);
// nvcc complains that Input(<brace-enclosed initializer list>) is
// ambiguous, so do not use Input({nout}).
node_builder.Input(nout);
TF_RETURN_IF_ERROR(node_builder.Attr("T", node->output_type(0))
.Attr("index", i)
.Finalize(&nd));
if (VLOG_IS_ON(3)) {
VLOG(3) << nd.DebugString();
}
tensorflow::Status s;
auto node_ret = sgraph.AddNode(nd, &s);
if (!s.ok()) {
LOG(ERROR) << "Couldn't add _Ret node for " << name;
return s;
}
VLOG(1) << "Update edge from " << edge->src()->name() << ":"
<< edge->src_output() << " -> " << node_ret->name() << ":" << 0;
sgraph.AddEdge(edge->src(), edge->src_output(), node_ret, 0);
s = sgraph.UpdateEdge(edge->src(), edge->src_output(), node_ret, 0);
if (!s.ok()) {
LOG(ERROR) << "Failed to update edge from " << edge->src()->name() << ":"
<< edge->src_output() << " -> " << node_ret->name() << ":"
<< 0;
}
sgraph.RemoveNode(node);
}
tensorflow::FunctionDefLibrary fdeflib;
auto native_segment = fdeflib.add_function();
TF_RETURN_IF_ERROR(tensorflow::GraphToFunctionDef(
sgraph, StrCat(engine_name, "_native_segment"), native_segment));
if (VLOG_IS_ON(7)) {
VLOG(7) << engine_name << " Function_Def ";
VLOG(7) << native_segment->DebugString();
}
VLOG(1) << "Adding funcdef to graphlib";
TF_RETURN_IF_ERROR(graph->AddFunctionLibrary(fdeflib));
return tensorflow::Status::OK();
}
std::pair<int, tensorflow::Allocator*> GetDeviceAndAllocator(
const ConversionParams& params, const EngineInfo& engine) {
int cuda_device_id = -1;
tensorflow::Allocator* dev_allocator = nullptr;
if (params.cluster == nullptr || params.cluster->GetDeviceSet() == nullptr ||
engine.device.empty()) {
// If device is not set, use the first found GPU device for the conversion.
for (int tf_gpu_id_value = 0; tf_gpu_id_value < 100; ++tf_gpu_id_value) {
TfGpuId tf_gpu_id(tf_gpu_id_value);
PlatformGpuId platform_gpu_id;
Status s = GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id);
if (s.ok()) {
VLOG(1) << "Found TF GPU " << tf_gpu_id.value() << " at cuda device "
<< platform_gpu_id.value();
cuda_device_id = platform_gpu_id.value();
GPUOptions gpu_options;
// If the TF to CUDA gpu id mapping exists, the device and corresponding
// allocator must have been initialized already, so the
// GetGPUAllocator() call won't create a new allocator.
dev_allocator = GPUProcessState::singleton()->GetGPUAllocator(
gpu_options, tf_gpu_id, 1);
break;
}
LOG(ERROR) << "TF GPU with id " << tf_gpu_id_value << " does not exist "
<< s;
}
return std::make_pair(cuda_device_id, dev_allocator);
}
// Use the device requested by the engine.
auto device_set = params.cluster->GetDeviceSet();
std::vector<tensorflow::Device*> devices;
DeviceNameUtils::ParsedName parsed_name;
if (DeviceNameUtils::ParseFullName(engine.device, &parsed_name) &&
parsed_name.has_id) {
device_set->FindMatchingDevices(parsed_name, &devices);
}
if (!devices.empty()) {
if (devices.size() > 1) {
string msg = "Found multiple matching devices using name '";
StrAppend(&msg, engine.device, "': ");
for (auto d : devices) StrAppend(&msg, d->name(), ", ");
StrAppend(&msg, ". Will get the allocator from first one.");
LOG(WARNING) << msg;
}
tensorflow::AllocatorAttributes alloc_attr;
cuda_device_id = devices[0]->tensorflow_gpu_device_info()->gpu_id;
dev_allocator = devices[0]->GetAllocator(alloc_attr);
VLOG(1) << "Using allocator " << dev_allocator->Name()
<< " and cuda_device_id " << cuda_device_id;
} else {
LOG(WARNING) << "Cluster is set but device '" << engine.device
<< "' is not found in the cluster";
}
return std::make_pair(cuda_device_id, dev_allocator);
}
// Entry function from optimization pass.
// TODO(aaroey): parameter should use pointer type.
tensorflow::Status ConvertAfterShapes(ConversionParams& params) {
// Convert graphdef to graph.
tensorflow::FunctionLibraryDefinition flib(tensorflow::OpRegistry::Global(),
params.input_graph_def->library());
tensorflow::Graph graph(flib);
TF_RETURN_IF_ERROR(tensorflow::ConvertGraphDefToGraph(
tensorflow::GraphConstructorOptions(), *params.input_graph_def, &graph));
// Segment the graph into subgraphs that can be converted to TensorRT
tensorflow::tensorrt::segment::SegmentOptions segment_options;
// TODO(ben,jie,sami): exclude output nodes (DISCUSS IT)
for (auto node : *(params.output_names)) {
segment_options.exclude_node_list.insert(node);
}
segment_options.minimum_segment_size = params.minimum_segment_size;
tensorflow::tensorrt::segment::SegmentNodesVector initial_segments;
TF_RETURN_IF_ERROR(tensorrt::segment::SegmentGraph(
&graph, IsTensorRTCandidate, InputEdgeValidator(*params.graph_properties),
OutputEdgeValidator(), segment_options, &initial_segments));
if (initial_segments.size() > 1) {
VLOG(0) << "MULTIPLE tensorrt candidate conversion: "
<< initial_segments.size();
}
// Get the EngineInfo for each segment.
std::unordered_map<string, tensorflow::Node*> node_map;
TF_RETURN_IF_ERROR(BuildNodeMap(graph, &node_map));
float total_num_nodes_in_segments = 0.;
std::vector<EngineInfo> engine_segments;
engine_segments.reserve(initial_segments.size());
std::vector<tensorflow::Node*> reverse_topo_order;
tensorflow::GetPostOrder(graph, &reverse_topo_order);
size_t total_engine_bytes_size = 0;
std::vector<size_t> engine_bytes_size;
tensorflow::tensorrt::segment::SegmentNodesVector converted_segments;
converted_segments.reserve(initial_segments.size());
for (size_t t = 0; t < initial_segments.size(); t++) {
auto& curr_segment = initial_segments.at(t);
EngineInfo curr_engine;
Status status =
GetEngineInfo(&graph, *params.graph_properties, curr_segment.first,
node_map, reverse_topo_order, &curr_engine);
if (!status.ok()) {
LOG(WARNING) << "Failed to get engine info for segment " << t << ": "
<< status;
continue;
}
curr_engine.precision_mode = params.precision_mode;
curr_engine.engine_type =
(params.is_dyn_op || params.precision_mode == INT8MODE
? EngineInfo::EngineType::TRTDynamic
: EngineInfo::EngineType::TRTStatic);
curr_engine.cached_engine_batches = params.cached_engine_batches;
curr_engine.maximum_cached_engines = params.max_cached_engines;
StrAppend(&curr_engine.engine_name, "my_trt_op_", t);
status = RegisterSegmentFunctionToFunctionLibrary(
&graph, curr_engine.segment_graph_def, curr_engine.engine_name);
if (!status.ok()) {
LOG(WARNING) << "Failed to register segment graphdef as a function " << t
<< ": " << status;
continue;
}
engine_bytes_size.push_back(curr_engine.segment_graph_def.ByteSizeLong());
total_engine_bytes_size += engine_bytes_size.back();
total_num_nodes_in_segments += curr_segment.first.size();
engine_segments.push_back(std::move(curr_engine));
converted_segments.push_back(std::move(curr_segment));
if (VLOG_IS_ON(8)) {
// Note: curr_engine was moved into engine_segments above, so read the
// name and graphdef back from the vector (its index may also differ from
// t when earlier segments were skipped).
string fname = engine_segments.back().engine_name;
StrAppend(&fname, ".pb");
std::fstream f;
f.open(fname.c_str(), std::fstream::out | std::fstream::binary);
f << engine_segments.back().segment_graph_def.SerializeAsString();
f.close();
}
}
// Create a TRT node for each segment using its EngineInfo.
int old_cuda_device = 0;
auto err = cudaGetDevice(&old_cuda_device);
if (err != cudaSuccess) {
LOG(ERROR) << "Couldn't get current device: " << cudaGetErrorString(err);
}
VLOG(1) << "Current cuda device is " << old_cuda_device;
std::vector<Node*> engine_nodes;
engine_nodes.resize(engine_segments.size());
for (int i = 0; i < engine_segments.size(); ++i) {
auto& engine = engine_segments.at(i);
// Partition the workspace size by the average of the engine byte-size
// ratio and the segment node-count ratio.
engine.max_workspace_size_bytes =
params.max_workspace_size_bytes *
(engine_bytes_size.at(i) / total_engine_bytes_size +
converted_segments.at(i).first.size() / total_num_nodes_in_segments) /
2.0;
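// For example (illustrative numbers): a segment holding 30% of all engine
// bytes and 50% of all segment nodes is granted
// (0.3 + 0.5) / 2 = 40% of params.max_workspace_size_bytes.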
// The allocator is used to build the engine. The builder and the built
// engine will be destroyed after we get the serialized engine string, so
// it's fine to use a unique_ptr here.
std::unique_ptr<TRTBaseAllocator> alloc;
auto device_alloc = GetDeviceAndAllocator(params, engine);
int cuda_device_id = 0;
if (device_alloc.first >= 0) {
cuda_device_id = device_alloc.first;
alloc.reset(new TRTDeviceAllocator(device_alloc.second));
} else {
// Leaving the allocator as nullptr should revert to using cudaMalloc.
LOG(WARNING) << "Can't identify the cuda device. Running on device 0 ";
}
cudaSetDevice(cuda_device_id);
auto status = CreateTRTNode(engine_segments, i, params.max_batch_size,
&graph, alloc.get(), &engine_nodes);
// If status is ok, we successfully added the node to the graph and can
// remove the segment ops. Otherwise the graph is not modified.
const string msg = StrCat("Engine ", engine.engine_name,
" creation for segment ", i, ", composed of ",
converted_segments.at(i).first.size(), " nodes");
if (status.ok()) {
LOG(INFO) << msg << " succeeded.";
for (auto node_name : converted_segments.at(i).first) {
graph.RemoveNode(node_map.at(node_name));
}
} else {
// Graph is not modified.
LOG(WARNING) << msg << " failed: " << status << ". Skipping...";
}
}
cudaSetDevice(old_cuda_device);
graph.ToGraphDef(params.output_graph_def);
VLOG(1) << "Returning from conversion";
return tensorflow::Status::OK();
}
} // namespace convert
} // namespace tensorrt
} // namespace tensorflow
#endif // GOOGLE_TENSORRT
#endif // GOOGLE_CUDA