/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_CONVERT_NODES_H_
#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_CONVERT_NODES_H_
#include <set>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "tensorflow/compiler/tf2tensorrt/convert/utils.h"
#include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h"
#include "tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h"
#include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h"
#include "tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h"
#include "tensorflow/core/framework/graph.pb.h"
#include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/grappler/costs/graph_properties.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/stream_executor/lib/statusor.h"
#if GOOGLE_CUDA
#if GOOGLE_TENSORRT
#include "third_party/tensorrt/NvInfer.h"
namespace tensorflow {
namespace tensorrt {
namespace convert {
using ::stream_executor::port::StatusOr;
struct EngineConnection {
// Constructs a non-control edge.
EngineConnection(const string& outside, int out_id, int out_port,
const string& inside, int in_id, int in_port,
bool input_edge, int port)
: outside_node_name(outside),
outside_id(out_id),
outside_port(out_port),
inside_node_name(inside),
inside_id(in_id),
inside_port(in_port),
is_input_edge(input_edge),
port_number(port) {}
// Constructs a control edge.
EngineConnection(const string& outside, int out_id, const string& inside,
int in_id, bool input_edge)
: outside_node_name(outside),
outside_id(out_id),
outside_port(Graph::kControlSlot),
inside_node_name(inside),
inside_id(in_id),
inside_port(Graph::kControlSlot),
is_input_edge(input_edge),
port_number(Graph::kControlSlot) {}
bool is_control_edge() const { return port_number == Graph::kControlSlot; }
const string outside_node_name;
const int outside_id;
const int outside_port;
PartialTensorShape outside_shape; // Only set for input edge.
const string inside_node_name;
const int inside_id;
const int inside_port;
PartialTensorShape inside_shape; // Only set for output edge.
DataType connection_type;
const bool is_input_edge;
// The port number of the TRT node connected with this edge.
const int port_number;
};
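// Illustrative usage sketch: the node names, ids, and ports below are
// hypothetical values chosen for demonstration only.
//
//   EngineConnection data_edge("tf_node", /*out_id=*/3, /*out_port=*/0,
//                              "trt_node", /*in_id=*/7, /*in_port=*/0,
//                              /*input_edge=*/true, /*port=*/0);
//   EngineConnection control_edge("tf_node", /*out_id=*/3, "trt_node",
//                                 /*in_id=*/7, /*input_edge=*/true);
//   // data_edge.is_control_edge() == false
//   // control_edge.is_control_edge() == true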
struct EngineInfo {
EngineInfo()
: engine_type(EngineType::TRTStatic),
max_workspace_size_bytes(0),
precision_mode(TrtPrecisionMode::FP32),
use_calibration(true),
allow_build_at_runtime(true) {}
string engine_name;
string device;
GraphDef segment_graph_def;
// Non-control input connections inside this vector are sorted such that
// the segment nodes they connect to are in topological order. In addition,
// there must be no duplicate non-control connections.
std::vector<EngineConnection> connections;
enum class EngineType { TRTStatic = 0, TRTDynamic = 1 };
EngineType engine_type;
int64 max_workspace_size_bytes;
int maximum_cached_engines;
TrtPrecisionMode precision_mode;
bool use_calibration;
bool allow_build_at_runtime;
};
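// Illustrative sketch of populating an EngineInfo; every value below is a
// hypothetical assumption for demonstration.
//
//   EngineInfo info;
//   info.engine_name = "segment_0";
//   info.device = "/device:GPU:0";
//   info.engine_type = EngineInfo::EngineType::TRTDynamic;
//   info.max_workspace_size_bytes = 1 << 30;  // 1 GiB
//   info.maximum_cached_engines = 1;
//   info.precision_mode = TrtPrecisionMode::FP16;
//   info.use_calibration = false;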
// Constructs a GraphDef from the segment in the given graph. Adds _Arg
// nodes for input edges (InputPH_*) and _Retval nodes for output edges
// (OutputPH_*). This function needs to be called before the TensorRT nodes
// are inserted, in order to correctly get sizes from the original graph.
//
// - subgraph_nodes: the nodes of the subgraph; must be sorted in
// topological order.
// - connections: the connections between the subgraph and the rest of the
// graph.
// - segment_def: the output GraphDef, whose non-input/output nodedefs will be
// sorted in topological order.
// - scope_name: the name of the scope where the TRTEngineOp will be placed.
//
// TODO(aaroey): add tests to validate these properties.
Status ConvertSegmentToGraphDef(
const Graph* graph, const grappler::GraphProperties& graph_properties,
const std::vector<const Node*>& subgraph_nodes,
std::vector<EngineConnection>* connections, GraphDef* segment_def,
string* scope_name);
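// Illustrative call sketch; 'graph', 'graph_properties', 'subgraph_nodes'
// and 'connections' are assumed to be set up by the caller and are not
// defined in this header.
//
//   GraphDef segment_def;
//   string scope_name;
//   TF_RETURN_IF_ERROR(ConvertSegmentToGraphDef(
//       &graph, graph_properties, subgraph_nodes, &connections,
//       &segment_def, &scope_name));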
// Converts the given subgraph to a TRT engine saved in 'engine'. Returns OK
// iff the engine is successfully built. If the result is not OK, 'engine'
// will be set to nullptr. Once this function returns, the TRT builder used
// internally for the conversion is no longer needed and is destroyed.
//
// - convert_successfully: indicates whether the conversion to a TensorRT
// network was successful. This is different from successfully building the
// engine: building can still fail afterwards.
Status ConvertGraphDefToEngine(
const GraphDef& gdef, TrtPrecisionMode precision_mode, int max_batch_size,
size_t max_workspace_size_bytes,
const std::vector<PartialTensorShape>& input_shapes,
nvinfer1::ILogger* logger, nvinfer1::IGpuAllocator* allocator,
TRTInt8Calibrator* calibrator,
TrtUniquePtrType<nvinfer1::ICudaEngine>* engine, bool use_calibration,
const bool use_implicit_batch, bool* convert_successfully,
TrtShapeOptimizationProfile* profiles);
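// Illustrative call sketch; 'segment_graph_def', 'input_shapes', 'logger'
// and 'allocator' are hypothetical caller-provided objects.
//
//   TrtUniquePtrType<nvinfer1::ICudaEngine> engine;
//   bool convert_ok = false;
//   Status s = ConvertGraphDefToEngine(
//       segment_graph_def, TrtPrecisionMode::FP32, /*max_batch_size=*/8,
//       /*max_workspace_size_bytes=*/1 << 30, input_shapes, &logger,
//       allocator, /*calibrator=*/nullptr, &engine,
//       /*use_calibration=*/false, /*use_implicit_batch=*/true,
//       &convert_ok, /*profiles=*/nullptr);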
// Helper class for the segmenter to determine whether an output edge from the
// TRT segment is valid.
class OutputEdgeValidator {
public:
// Return true if the specified edge is eligible to be an output edge of the
// TRT segment.
bool operator()(const Edge* out_edge) const;
};
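// Return the total number of elements in 'dims' (the product of its
// dimension sizes), for weight dims and tensor dims respectively.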
int64_t TrtWeightDimsNumElements(const nvinfer1::Dims& dims);
int64_t TrtTensorDimsNumElements(const nvinfer1::Dims& dims);
// Class to convert TF compile-time constants (e.g. Const nodes) to TRT
// weights.
class TRT_ShapedWeights {
public:
explicit TRT_ShapedWeights(
nvinfer1::DataType type = nvinfer1::DataType::kFLOAT);
// Copies from another TRT_ShapedWeights.
//
// NOTE: this does not copy the underlying buffer, but only increases its
// reference count.
TRT_ShapedWeights(const TRT_ShapedWeights& rhs);
nvinfer1::Weights GetTrtWeights() const;
const Tensor& GetTensor() const { return tensor_; }
// Returns the raw pointer to the underlying buffer which holds the weight
// values.
void* GetValues() const {
return const_cast<char*>(tensor_.tensor_data().data());
}
int64_t count() const;
size_t size_bytes() const;
string DebugString() const;
template <typename T>
absl::Span<const T> GetSpan() const {
return absl::Span<const T>(tensor_.flat<T>().data(), count());
}
template <typename T>
std::vector<T> ToVector() const {
auto span = GetSpan<T>();
return std::vector<T>(span.data(), span.data() + span.size());
}
nvinfer1::DataType TrtDType() const { return type_; }
// TODO(aaroey): make these private.
nvinfer1::Dims shape_; // Note: shape.type[] is not used.
private:
// This constructor is only used by TrtWeightStore, which creates the
// underlying buffer.
TRT_ShapedWeights(nvinfer1::DataType type, nvinfer1::Dims dims,
Tensor tensor);
nvinfer1::DataType type_;
// All weights should be stored inside TrtWeightStore to make sure the
// underlying tensors stay alive until the engine is built. For this reason,
// tensor_ should never be reassigned to a value that is not already present
// in the TrtWeightStore.
Tensor tensor_;
friend class TrtWeightStore;
};
// Container for TRT_ShapedWeights. We need this container because TRT doesn't
// manage the lifetime of the weights buffer; it only keeps a pointer to it and
// requires that the data referenced by the pointer remain available until the
// building of the engine is complete. For more information see
// https://docs.nvidia.com/deeplearning/sdk/tensorrt-api/c_api/classnvinfer1_1_1_weights.html
//
// TODO(laigd): consider adding garbage collection to the unused weights.
class TrtWeightStore {
public:
// Get a TRT_ShapedWeights with 'type' and 'dims'.
TRT_ShapedWeights GetTempWeights(nvinfer1::DataType trt_type,
const nvinfer1::Dims& dims);
// Get a TRT_ShapedWeights with the same data type and dimensions as
// 'weights'.
TRT_ShapedWeights GetTempWeights(const TRT_ShapedWeights& weights) {
return GetTempWeights(weights.TrtDType(), weights.shape_);
}
private:
// The backend storage of the TRT_ShapedWeights.
std::vector<Tensor> store_;
};
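// Illustrative usage sketch; the dtype and dimension values are hypothetical.
//
//   TrtWeightStore store;
//   nvinfer1::Dims dims;
//   dims.nbDims = 2;
//   dims.d[0] = 2;
//   dims.d[1] = 3;
//   TRT_ShapedWeights weights =
//       store.GetTempWeights(nvinfer1::DataType::kFLOAT, dims);
//   // weights.count() == 6; the underlying buffer stays alive inside 'store'
//   // until the engine is built.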
// Represents a TRT-style input to a TF node. It can be either an
// nvinfer1::ITensor or a TRT_ShapedWeights, the latter being a compile-time
// constant.
//
// TODO(laigd): maybe rename it to TrtArgument, or mimic XlaCompiler::Argument.
class TRT_TensorOrWeights {
public:
TRT_TensorOrWeights() {}
// Constructor that makes it an ITensor, doesn't take ownership of 'tensor'.
// This is used by Converter when building the TRT network, where the ITensor
// is owned by the TRT network being built. See comment for 'tensor_' below.
explicit TRT_TensorOrWeights(nvinfer1::ITensor* tensor, int batch_size = -1);
// Constructor that makes it an ITensor by creating one using provided data
// type and shape, and takes ownership of the created ITensor. This is used by
// TrtNodeValidator to encapsulate the type and shape information for
// validation of graph nodes, and the created ITensor is fake and temporary,
// and should not be used to build any TRT network. See comment for
// 'simple_itensor_' below.
explicit TRT_TensorOrWeights(nvinfer1::DataType trt_dtype,
const nvinfer1::Dims& trt_dims, int batch_size);
// Constructor that makes it a TRT_ShapedWeights, i.e. a compile-time
// constant.
explicit TRT_TensorOrWeights(const TRT_ShapedWeights& weights);
TRT_TensorOrWeights(const TRT_TensorOrWeights& rhs);
void operator=(const TRT_TensorOrWeights& rhs);
bool is_tensor() const { return initialized_ && is_tensor_; }
bool is_weights() const { return initialized_ && !is_tensor_; }
nvinfer1::ITensor* tensor() const;
TRT_ShapedWeights& weights() {
CHECK(is_weights());
return weights_;
}
const TRT_ShapedWeights& weights() const {
CHECK(is_weights());
return weights_;
}
nvinfer1::Dims GetTrtDims() const;
Status GetTfType(DataType* tf_type) const;
int batch_size() const { return batch_size_; }
string DebugString() const;
private:
class SimpleITensor;
void set_batch_size(int batch_size) { batch_size_ = batch_size; }
// When it represents an ITensor, the ITensor can either be passed by the
// caller via the constructor that takes an ITensor* as parameter, or be
// created internally as a SimpleITensor.
//
// In the first case, the ITensor pointer is stored in 'tensor_' below, and
// the ITensor itself is not owned by this class. This mode is used by
// Converter (e.g. AddInputTensor) and op converters during TRT network
// construction, where the TRT network owns the ITensor.
//
// In the second case, the created SimpleITensor is stored in
// 'simple_itensor_' below and is owned by this class. SimpleITensor is a fake
// implementation of ITensor and is used only by TrtNodeValidator to validate
// the graph nodes.
nvinfer1::ITensor* tensor_ = nullptr; // Not owned.
std::shared_ptr<SimpleITensor> simple_itensor_ = nullptr;
// The first dimension of the TF tensor (NOT of tensor_) represented by
// tensor_ is treated as the "batch dimension" by TRT, and tensor_'s
// dimensions (obtained via tensor_->getDimensions()) do not contain the batch
// dimension. For example, when a TF tensor with shape (A,B,C) is represented
// in TRT, tensor_->getDimensions() will be (B,C) and batch_size_ will be A.
//
// This requires that all tensors in the subgraph that is converted to a TRT
// engine have the same batch size, represented by the first dimension of
// their shapes, and Converter will verify this during conversion. The
// drawback is that currently it cannot convert a graph whose shapes do not
// represent the batch size, or whose batch sizes differ. See b/118387490 for
// more details.
//
// If use_implicit_batch is false, batch_size_ is unused and
// tensor_->getDimensions() will contain the entire shape (A,B,C).
int batch_size_ = -1;
TRT_ShapedWeights weights_;
bool initialized_ = false;
bool is_tensor_ = false;
friend class Converter;
};
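// Illustrative sketch of how a caller typically branches on an input; the
// helper below is hypothetical and not part of this header.
//
//   void InspectInput(const TRT_TensorOrWeights& input) {
//     if (input.is_tensor()) {
//       nvinfer1::ITensor* t = input.tensor();  // Runtime tensor.
//       (void)t;
//     } else if (input.is_weights()) {
//       const TRT_ShapedWeights& w = input.weights();  // Compile-time const.
//       (void)w;
//     }
//   }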
class Converter;
// Parameters for each op converter.
struct OpConverterParams {
// Constructor used for validation only.
OpConverterParams(const NodeDef& node_def,
const std::vector<TRT_TensorOrWeights>& inputs,
std::vector<TRT_TensorOrWeights>* outputs,
TrtWeightStore* weight_store,
TrtPrecisionMode precision_mode, bool use_calibration,
bool use_implicit_batch);
// Constructor used for conversion.
OpConverterParams(Converter* converter, const NodeDef& node_def,
const std::vector<TRT_TensorOrWeights>& inputs,
std::vector<TRT_TensorOrWeights>* outputs,
TrtWeightStore* weight_store);
Converter* converter = nullptr;
const NodeDef& node_def;
const std::vector<TRT_TensorOrWeights>& inputs;
std::vector<TRT_TensorOrWeights>* outputs;
const bool validation_only;
TrtWeightStore* weight_store;
const TrtPrecisionMode precision_mode;
const bool use_calibration;
const bool use_implicit_batch;
};
using OpConverter = std::function<Status(OpConverterParams*)>;
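// Illustrative sketch of an OpConverter: a hypothetical pass-through
// converter, not one registered by this codebase.
//
//   Status ConvertPassThrough(OpConverterParams* params) {
//     if (params->inputs.size() != 1) {
//       return errors::InvalidArgument("Expected a single input");
//     }
//     if (params->validation_only) return Status::OK();
//     params->outputs->push_back(params->inputs.at(0));
//     return Status::OK();
//   }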
// Class to verify if specific TF node is supported by TRT.
class TrtNodeValidator {
public:
// 'graph_properties' is the GraphProperties of the graph whose nodes will be
// checked by IsTensorRTCandidate() later. It is used to get the shape and
// data type information of a tensor for validation purposes.
TrtNodeValidator(const grappler::GraphProperties& graph_properties,
TrtPrecisionMode precision_mode, bool use_calibration,
bool use_implicit_batch);
// Returns OK iff 'node' is a TF-TRT conversion candidate, which will be
// added to the TRT subgraph and later converted into a TRT engine.
Status IsTensorRTCandidate(const Node* node);
private:
static const std::set<string>* quantize_ops;
void RegisterOpValidators();
// Convert a Const node to a TRT_TensorOrWeights.
Status ConvertConstToWeights(const NodeDef& const_node_def,
const std::vector<TRT_TensorOrWeights>& inputs,
TRT_TensorOrWeights* output);
// Converts the output tensor at 'output_port' of 'node_def' to a
// TRT_TensorOrWeights, which will later be used as an input to other nodes
// during their validation.
Status ConvertToTensorOrWeights(const NodeDef& node_def, int output_port,
TRT_TensorOrWeights* tensor_or_weights);
// Stores all the validators by op type. If no validator is registered for a
// specific op, it means no validation is needed and validation will return
// OK.
std::unordered_map<string, OpConverter> op_validators_;
// Stores the weights added during validation. Some validations (e.g.
// validation for a Const node) may produce weights.
TrtWeightStore weight_store_;
// GraphProperties of the graph whose nodes are to be validated by
// IsTensorRTCandidate().
const grappler::GraphProperties& graph_properties_;
// Quantization ops are only converted when using quantized precisions.
const TrtPrecisionMode precision_mode_;
const bool use_calibration_;
const bool use_implicit_batch_;
friend class ValidatorTest;
friend class OpConverterTest;
};
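// Illustrative usage sketch; 'graph' and 'graph_properties' are assumed to
// be provided by the caller.
//
//   TrtNodeValidator validator(graph_properties, TrtPrecisionMode::FP32,
//                              /*use_calibration=*/false,
//                              /*use_implicit_batch=*/true);
//   for (const Node* node : graph.op_nodes()) {
//     const bool is_candidate = validator.IsTensorRTCandidate(node).ok();
//     (void)is_candidate;
//   }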
// Class to convert TF nodes to TRT network.
class Converter {
public:
// Used for Converter::RenameAndMarkOutputTensors()
struct EngineOutputInfo {
// The TRT tensor name which produces the output.
string source_tensor_name;
// The TensorFlow node name that receives the output from the TRT engine.
// This should always be the Identity node created in
// ConvertSegmentToGraphDef.
string dest_node_name;
// Output type. TensorRT requires this to be explicitly set for engine
// outputs.
nvinfer1::DataType trt_dtype;
};
static StatusOr<std::unique_ptr<Converter>> Create(
TrtPrecisionMode precision_mode, bool use_calibration,
nvinfer1::ILogger* trt_logger, const bool use_implicit_batch);
//////////////////////////////////////////////////////////////////////////////
// Methods used by the TRT engine builder to build a TRT network from a TF
// function/subgraph.
// Converts the node and adds the resulting layers to the TRT network.
Status ConvertNode(const NodeDef& node_def);
// Add input tensor to the TRT network with given 'name', 'dtype', 'dims' and
// 'batch_size'.
Status AddInputTensor(const string& name, nvinfer1::DataType dtype,
const nvinfer1::Dims& dims, int batch_size);
// Marks the tensors with the names specified by source_tensor_name as
// outputs of the TRT network, and sets their names in the TRT network to
// dest_node_name.
Status RenameAndMarkOutputTensors(
const std::vector<EngineOutputInfo>& output_tensors);
// Build a TRT engine using the created network.
Status BuildCudaEngine(TrtUniquePtrType<nvinfer1::ICudaEngine>* engine,
int max_batch_size, size_t max_workspace_size_bytes,
nvinfer1::IGpuAllocator* allocator,
TRTInt8Calibrator* calibrator,
TrtShapeOptimizationProfile* profiles);
//////////////////////////////////////////////////////////////////////////////
// Methods used by op converters to convert individual TF node and add layers
// to the TRT network.
// Op converters (e.g. ConvertReshape) need to access the TRT network in order
// to add TRT layers.
nvinfer1::INetworkDefinition* network() { return trt_network_.get(); }
// What precision are we targeting?
TrtPrecisionMode precision_mode() const { return precision_mode_; }
// Whether calibration will be or was previously performed on this network.
bool use_calibration() const { return use_calibration_; }
// Whether implicit batch mode is enabled
bool use_implicit_batch() const { return use_implicit_batch_; }
// This should be called on the inputs and outputs of any layer we create
// where we know that the quantization range does not change during that
// operation (e.g. Reshape, Transpose, Identity, MaxPool).
void MarkQuantizationRangesAsInferrable(nvinfer1::ITensor* input,
nvinfer1::ITensor* output);
// This function should be called when we know the quantization range of a
// tensor, either from a quantize/dequantize node or when the output is a
// fixed range (e.g. SoftMax, Relu6, Sigmoid).
void ProvideQuantizationRange(nvinfer1::ITensor* tensor, float min_range,
float max_range);
// Should be called when the full TRT network has been constructed, before
// building the engine.
void MaybeApplyQuantizationRanges();
// Below are helper methods for op converters to add different layers to the
// TRT network.
// Transposes 'input_tensor' with the given permutation 'order_with_batch_dim'
// and writes the result to 'output_tensor'. The permutation includes the
// batch dimension, whose entry must always be 0 (the batch dimension cannot
// be permuted).
Status TransposeTensor(nvinfer1::ITensor* input_tensor,
const std::vector<int>& order_with_batch_dim,
absl::string_view name,
nvinfer1::ITensor** output_tensor);
// Converts 'input' into 'tensor' with the shape specified by 'dims' (which
// doesn't contain the batch dimension).
//
// If validation_only is true, it doesn't do the conversion but only performs
// minimal validation of the eligibility of the conversion, and *tensor will
// be set to nullptr.
Status PrepareTensorForShape(const TRT_TensorOrWeights& input,
const nvinfer1::Dims& dims,
const bool validation_only,
nvinfer1::ITensor** tensor);
// Helper function to add a squeeze op to the network.
//
// The input_dims argument stores the TRT dimensions of the input tensor,
// where the dimensions to be squeezed are replaced by 0.
Status SqueezeTensor(nvinfer1::ITensor* input, std::vector<int>* input_dims,
nvinfer1::ITensor** output);
// Creates an IConstantLayer using 'weights' whose dimensions are specified by
// 'dims', and returns the output ITensor.
nvinfer1::ITensor* CreateConstantLayer(const TRT_ShapedWeights& weights,
const nvinfer1::Dims& dims);
private:
Converter(TrtPrecisionMode precision_mode, bool use_calibration,
nvinfer1::ILogger* trt_logger, const bool use_implicit_batch);
Status Init(nvinfer1::ILogger* trt_logger);
// Verify the provided batch_size is consistent with batch_size_ and update it
// if necessary.
Status MaybeUpdateBatchSize(int batch_size);
// Add the provided tensor/weights to the map trt_tensors_.
Status AddTensorOrWeights(const string& name, TRT_TensorOrWeights input);
// Get the tensor/weights from trt_tensors_ by 'name'.
Status GetTensorOrWeights(const string& name, TRT_TensorOrWeights* output);
// Get the inputs of 'node_def' from trt_tensors_.
Status GetInputs(const NodeDef& node_def,
std::vector<TRT_TensorOrWeights>* inputs) const;
void RegisterOpConverters();
void PropagateQuantizationRanges();
// Gets the min and max values in a TRT_ShapedWeights.
Status GetWeightRange(const TRT_ShapedWeights& weights, float* out_min,
float* out_max) const;
// Registered op converters by op type.
std::unordered_map<string, OpConverter> op_registry_;
// Tensors/weights added during construction of trt_network_.
std::unordered_map<string, TRT_TensorOrWeights> trt_tensors_;
// The TRT builder used to create the network and build the engine. Owned by
// this class.
TrtUniquePtrType<nvinfer1::IBuilder> trt_builder_;
// The TRT network being built.
TrtUniquePtrType<nvinfer1::INetworkDefinition> trt_network_;
// Store the weights added during construction of trt_network_.
TrtWeightStore weight_store_;
// During conversion, this table is populated with quantization ranges per
// tensor. MaybeApplyQuantizationRanges() will use this table to set the TRT
// quantization ranges. Since TRT only supports symmetric ranges, we will
// store the range as a single float = max(abs(min_range), abs(max_range)).
// Range refers to the floating point values, e.g. min_range = 0.0f, max_range
// = 6.0f for Relu6.
std::unordered_map<nvinfer1::ITensor*, float> quantization_ranges_;
// Edges where quantization ranges can be inferred (copied) across ops - from
// first tensor to second tensor. PropagateQuantizationRanges() will propagate
// known ranges from quantization_ranges_ across these edges, adding the new
// ranges to quantization_ranges_ so that they can be applied in
// MaybeApplyQuantizationRanges().
std::vector<std::pair<nvinfer1::ITensor*, nvinfer1::ITensor*>>
quantization_infer_;
const TrtPrecisionMode precision_mode_;
const bool use_calibration_;
// If this is false, all dimensions including the batch dimension are
// set explicitly.
const bool use_implicit_batch_;
// Batch size of inputs to trt_network_ added by AddInputTensor(). During
// network construction the Converter will update this value, use it to verify
// that the batch sizes of all inputs are compatible, and make sure each
// individual TF node is acceptable to TRT.
int batch_size_ = -1;
friend class ConverterTest;
friend class OpConverterTest;
};
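// Illustrative end-to-end sketch of driving a Converter; 'logger', 'dims',
// 'segment_graph_def', 'output_info' and 'allocator' are hypothetical
// caller-provided objects.
//
//   auto converter_or = Converter::Create(TrtPrecisionMode::FP32,
//                                         /*use_calibration=*/false, &logger,
//                                         /*use_implicit_batch=*/true);
//   TF_RETURN_IF_ERROR(converter_or.status());
//   Converter& converter = *converter_or.ValueOrDie();
//   TF_RETURN_IF_ERROR(converter.AddInputTensor(
//       "input_0", nvinfer1::DataType::kFLOAT, dims, /*batch_size=*/8));
//   for (const NodeDef& node_def : segment_graph_def.node()) {
//     TF_RETURN_IF_ERROR(converter.ConvertNode(node_def));
//   }
//   TF_RETURN_IF_ERROR(converter.RenameAndMarkOutputTensors(output_info));
//   TrtUniquePtrType<nvinfer1::ICudaEngine> engine;
//   TF_RETURN_IF_ERROR(converter.BuildCudaEngine(
//       &engine, /*max_batch_size=*/8, /*max_workspace_size_bytes=*/1 << 30,
//       allocator, /*calibrator=*/nullptr, /*profiles=*/nullptr));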
// Return OK if the broadcast scheme is supported and compute the shapes after
// broadcasting. check_feasibility can be set to false in cases where dimensions
// do not need to match exactly (as in the case of BatchMatMulV2).
Status GetTrtBroadcastShape(const TRT_TensorOrWeights& operand_l,
const TRT_TensorOrWeights& operand_r,
const bool check_feasibility,
const bool use_implicit_batch,
nvinfer1::Dims* operand_l_new_dims,
nvinfer1::Dims* operand_r_new_dims);
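// Illustrative example (numpy-style right-aligned broadcasting; the shapes
// are hypothetical): for operand dims (3, 1) and (5), the computed new dims
// would be (3, 1) and (1, 5), which TRT elementwise layers can then
// broadcast to (3, 5).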
// Map of all supported UnaryOperations.
const std::unordered_map<string, nvinfer1::UnaryOperation>* UnaryOperationMap();
// Map of all supported ActivationTypes.
const std::unordered_map<string, nvinfer1::ActivationType>* ActivationTypeMap();
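// Illustrative lookup sketch; whether "Exp" is a registered unary op is an
// assumption, not a guarantee made by this header.
//
//   const auto* unary_map = UnaryOperationMap();
//   const auto it = unary_map->find("Exp");
//   if (it != unary_map->end()) {
//     const nvinfer1::UnaryOperation op = it->second;
//     (void)op;
//   }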
} // namespace convert
} // namespace tensorrt
} // namespace tensorflow
#endif // GOOGLE_TENSORRT
#endif // GOOGLE_CUDA
#endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_CONVERT_NODES_H_