| /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. |
| |
| Licensed under the Apache License, Version 2.0 (the "License"); |
| you may not use this file except in compliance with the License. |
| You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| ==============================================================================*/ |
| |
| // Neural Net operation support for StreamExecutor instances. |
| // |
| // This is an abstract interface for a platform to optionally support common |
| // neural net operations; it accommodates implementations such as the cudnn |
| // library operations. |
| |
| #ifndef TENSORFLOW_STREAM_EXECUTOR_DNN_H_ |
| #define TENSORFLOW_STREAM_EXECUTOR_DNN_H_ |
| |
| #include <functional> |
| #include <limits> |
| #include <memory> |
| #include <tuple> |
| #include <type_traits> |
| |
| #include "absl/types/optional.h" |
| #include "absl/types/span.h" |
| #include "tensorflow/stream_executor/device_memory.h" |
| #include "tensorflow/stream_executor/dnn.pb.h" |
| #include "tensorflow/stream_executor/lib/array_slice.h" |
| #include "tensorflow/stream_executor/lib/status.h" |
| #include "tensorflow/stream_executor/lib/statusor.h" |
| #include "tensorflow/stream_executor/platform/logging.h" |
| #include "tensorflow/stream_executor/platform/port.h" |
| |
| namespace Eigen { |
| struct half; |
| } // namespace Eigen |
| |
| namespace stream_executor { |
| |
| class HostBuffer; |
| class Stream; |
| class ScratchAllocator; |
| |
| namespace dnn { |
| |
| // Specifies an index to use when accessing specific spatial dimensions. |
| enum class DimIndex : int { |
| X = 0, |
| Y = 1, |
| Z = 2, |
| }; |
| |
| // Helper functions to make methods more readable. |
| inline int64 GetDim(absl::Span<const int64> data, DimIndex dim) { |
| return data.rbegin()[static_cast<int64>(dim)]; |
| } |
| |
| inline void SetDim(absl::Span<int64> data, DimIndex dim, int64 value) { |
| data.rbegin()[static_cast<int64>(dim)] = value; |
| } |
| |
| inline void SetDim(std::vector<int64>* data, DimIndex dim, int64 value) { |
| return SetDim(absl::MakeSpan(*data), dim, value); |
| } |
| |
| // int64 is not the same type as tensorflow::protobuf_int64 in open-source. This |
| // wrapper function gives an int64 array slice view of a repeated int64 protobuf |
| // field. |
| // |
| // T should be a protobuf RepeatedField. |
| template <typename T> |
| inline absl::Span<const int64> AsInt64Slice(const T& repeated_field) { |
| using data_ty = |
| typename std::remove_reference<decltype(*repeated_field.data())>::type; |
| static_assert(std::is_integral<data_ty>::value && |
| std::is_signed<data_ty>::value && sizeof(data_ty) == 8, |
| "repeated_field.data() must return a pointer to a signed " |
| "64-bit integer type."); |
| return absl::Span<const int64>( |
| reinterpret_cast<const int64*>(repeated_field.data()), |
| repeated_field.size()); |
| } |
| template <typename T> |
| inline absl::Span<int64> AsInt64Slice(T* repeated_field) { |
| using data_ty = |
| typename std::remove_reference<decltype(*repeated_field->data())>::type; |
| static_assert(std::is_integral<data_ty>::value && |
| std::is_signed<data_ty>::value && sizeof(data_ty) == 8, |
| "repeated_field->data() must return a pointer to a signed " |
| "64-bit integer type."); |
| return absl::Span<int64>( |
| reinterpret_cast<int64*>(repeated_field->mutable_data()), |
| repeated_field->size()); |
| } |
| |
| // Returns a string representation of the given data layout. |
| string DataLayoutString(DataLayout layout); |
| |
| // Specifies a quantization for activations in a given BatchDescriptor. |
| enum class QuantizedActivationMode { |
| k8Bit = 1, |
| k16Bit = 2, |
| k32Bit = 4, |
| }; |
| |
| // A helper class to convert C/C++ types to the proper enums. |
| template <typename T> |
| struct ToDataType; |
| template <> |
| struct ToDataType<float> { |
| static constexpr DataType value = DataType::kFloat; |
| }; |
| template <> |
| struct ToDataType<double> { |
| static constexpr DataType value = DataType::kDouble; |
| }; |
| template <> |
| struct ToDataType<Eigen::half> { |
| static constexpr DataType value = DataType::kHalf; |
| }; |
| template <> |
| struct ToDataType<int8> { |
| static constexpr DataType value = DataType::kInt8; |
| }; |
| template <> |
| struct ToDataType<int32> { |
| static constexpr DataType value = DataType::kInt32; |
| }; |
| |
| // Specifies the types of a RNN model. |
| enum class RnnMode { |
| kRnnRelu = 0, |
| kRnnTanh = 1, |
| kRnnLstm = 2, |
| kRnnGru = 3, |
| }; |
| |
| // Specifies the input model and whether there is a linear transformation |
| // between the input state and the first layer hidden state. |
| enum class RnnInputMode { |
| kRnnLinearSkip = 0, |
| kRnnSkipInput = 1, |
| }; |
| |
| // Specifies the number of directions used in a RNN model. When bidirection |
| // is used, the input states and output sequence contain data for both |
| // directions. |
| enum class RnnDirectionMode { |
| kRnnUnidirectional = 0, |
| kRnnBidirectional = 1, |
| }; |
| |
| // Relevant to DepthToSpace and SpaceToDepth. This is the write layout when |
| // performing depth to space and the read layout when performing space to depth. |
| // It's specified with most-major dimension first and most-minor dimension last. |
| // In DepthToSpace, the D*M^2 values are read in and then, for DepthHeightWidth, |
| // written out to the output patch, by varying first width, then height, then |
| // depth. In C array format, it looks like [depth][height][width]. See |
| // DepthToSpace comment for more information. |
| enum class DepthToSpaceLayout { DepthHeightWidth }; |
| |
| // Specifies the descriptor for a RNN model. |
| // |
| // An example use case: |
| // * The user first creates a model through createRnnDescriptor. |
| // * The user queries the size of the underlying opaque parameter buffer. |
| // * The user creates and initializes a parameter buffer of the proper size. |
| // * The user runs forward and backward operations using this RNN descriptor. |
| // * Once a while, user queries maintainable weights and bias regions from |
| // the underlying parameter buffer. They are more likely to be forward |
| // compatible and should used in saving and restoring a model. |
| // * The user releases the RNN descriptor when the model is no longer in use. |
| class RnnDescriptor { |
| public: |
| struct ParamsRegion { |
| int64 offset; |
| int64 size; |
| }; |
| typedef std::vector<ParamsRegion> ParamsRegions; |
| virtual ~RnnDescriptor() {} |
| virtual int64 ParamsSizeInBytes() const { return -1; } |
| virtual ParamsRegions ParamsWeightRegions() const { return ParamsRegions(); } |
| virtual ParamsRegions ParamsBiasRegions() const { return ParamsRegions(); } |
| }; |
| |
| // Specifies the sequence in a RNN model. |
| // |
| // The user is responsible for releasing this descriptor when it is no longer |
| // in use. The destructor releases the underlying descriptors. |
| class RnnSequenceTensorDescriptor { |
| public: |
| virtual ~RnnSequenceTensorDescriptor() {} |
| }; |
| |
| // Specifies either the input and hidden state in a RNN model. |
| // |
| // The user is responsible for releasing this descriptor when it is no longer |
| // in use. The destructor releases the underlying descriptors. |
| class RnnStateTensorDescriptor { |
| public: |
| virtual ~RnnStateTensorDescriptor() {} |
| }; |
| |
| // Returns a string representation of the given quantization mode. |
| string QuantizedActivationModeString(QuantizedActivationMode mode); |
| |
| // Describes the dimensions that a layer consumes/produces. |
| // |
| // This is a matrix (height, width), its "depth" (feature_map_count), |
| // how many of these matrices are present (count), |
| // and the maximum and minimum values expected in the matrix (value_max, |
| // value_min). |
| // If input is quantized, all values greater |
| // than value_max will be clipped to value_max and all values less than |
| // value_min will be clipped to value_min. |
| // When quantized output is dequantized no value will be greater than |
| // value_max or less than value_min. |
| // |
| // Uses the named argument construction form: |
| // |
| // auto input_batch_dimensions = |
| // BatchDescriptor().set_count(42).set_feature_map_count(7)... |
| // |
| // Details: |
| // |
| // For a convolutional layer, a single inference takes a 3-dimensional matrix |
| // of input and produces a 3-dimensional matrix of output. We call the three |
| // dimensions height, width and feature_map_count, where for an image, the |
| // height and width correspond to the Y and X pixel indices, respectively, and |
| // the feature_map_count corresponds to the RGB dimension of the input data. |
| // Then the count indicates how many 3D matrices are being presented to be |
| // processed at once; this corresponds to the neural network concept of |
| // minibatch size. |
| // |
| // For a fully connected layer, it's better to put the nodes of the layer in |
| // the feature_map_count, and leave the height and weight as degenerate (== 1). |
| // Count indicates how many input vectors (degenerate 3D matrices) are to be |
| // processed. |
| // |
| // If unspecified, value_max and value_min default to 0.0. |
| // If value_max == value_min the Stream will attempt to derive valid values - |
| // for example the output of Relu6 activation will always be in the range |
| // [0.0, 6.0]. |
| // |
| // If unspecified, layout defaults to kYXDepthBatch. |
| class BatchDescriptor { |
| public: |
| // Creates a "blank" batch descriptor, which should be initialized via the |
| // named argument helpers. |
| BatchDescriptor(); |
| explicit BatchDescriptor(int ndims); |
| |
| // Clones values from 'other' for initialization. |
| void CloneFrom(const BatchDescriptor& other); |
| |
| string ToString() const; |
| string ToShortString() const; |
| |
| // Pre-condition: |
| // value_max_ == 0 |
| // value_min_ == 0 |
| // quantized_activation_mode_ == QuantizedActivationMode::k8Bit |
| TensorDescriptorProto ToProto(DataType data_type) const; |
| |
| // Accessors. |
| int64 count() const { return tensor_.dimensions(0); } |
| int64 feature_map_count() const { return tensor_.dimensions(1); } |
| int64 height() const { return GetDim(spatial_size(), DimIndex::Y); } |
| int64 width() const { return GetDim(spatial_size(), DimIndex::X); } |
| int64 spatial_dim(DimIndex dim) const { return GetDim(spatial_size(), dim); } |
| int ndims() const { return spatial_size().size(); } |
| float value_max() const { return value_max_; } |
| float value_min() const { return value_min_; } |
| DataLayout layout() const { return tensor_.data_layout(); } |
| QuantizedActivationMode quantized_activation_mode() const { |
| return quantized_activation_mode_; |
| } |
| // Full dimensions of the underlying data, ordered according to a specific |
| // layout. |
| std::vector<int64> full_dims(const DataLayout& layout) const; |
| |
| // Full strides of the underlying data, ordered according to a specific |
| // layout. |
| std::vector<int64> full_strides(const DataLayout& layout) const; |
| |
| // Named-argument helpers for avoiding user error during construction. |
| BatchDescriptor& set_count(int64 value) { |
| tensor_.set_dimensions(0, value); |
| return *this; |
| } |
| BatchDescriptor& set_feature_map_count(int64 value) { |
| tensor_.set_dimensions(1, value); |
| return *this; |
| } |
| BatchDescriptor& set_height(int64 value) { |
| SetDim(spatial_size(), DimIndex::Y, value); |
| return *this; |
| } |
| BatchDescriptor& set_width(int64 value) { |
| SetDim(spatial_size(), DimIndex::X, value); |
| return *this; |
| } |
| BatchDescriptor& set_spatial_dim(DimIndex dim, int64 value) { |
| SetDim(spatial_size(), dim, value); |
| return *this; |
| } |
| BatchDescriptor& set_value_max(float value) { |
| value_max_ = value; |
| return *this; |
| } |
| BatchDescriptor& set_value_min(float value) { |
| value_min_ = value; |
| return *this; |
| } |
| BatchDescriptor& set_layout(DataLayout layout) { |
| tensor_.set_data_layout(layout); |
| return *this; |
| } |
| BatchDescriptor& set_quantized_activation_mode( |
| QuantizedActivationMode quantized_activation_mode) { |
| quantized_activation_mode_ = quantized_activation_mode; |
| return *this; |
| } |
| |
| // Return the number of nodes in a single feature map. |
| int64 NodesPerFeatureMap() const; |
| |
| // Return the number of nodes across all feature maps. Note that this is not |
| // affected by the batch count. |
| int64 NodesAcrossFeatureMaps() const; |
| |
| // Returns the number of elements (e.g. RGB pixel values) required to hold a |
| // given batch descriptor, given a no-padding assumption. Note that this is |
| // affected by the batch count. |
| int64 ElementCount() const; |
| |
| // Return the number of weights required to fully connect a layer with |
| // dimensions given by the 'input' descriptor with a layer with dimensions |
| // given by the 'output' descriptor. |
| static int64 FullyConnectedWeightCount(const BatchDescriptor& input, |
| const BatchDescriptor& output); |
| |
| // Return the number of biases required to fully connect to an output layer |
| // with dimensions given the 'output' descriptor. |
| static int64 FullyConnectedBiasCount(const BatchDescriptor& output); |
| |
| // Return a BatchDescriptor for the output of a depth concatenation |
| // with the given input descriptors. The inputs should have the same |
| // dimensions, except possibly for feature_map_count(), though this |
| // function does not verify that. |
| static BatchDescriptor DepthConcatenateOutputDescriptor( |
| port::ArraySlice<dnn::BatchDescriptor> inputs); |
| |
| private: |
| absl::Span<const int64> spatial_size() const { |
| return AsInt64Slice(tensor_.dimensions()).subspan(2); |
| } |
| |
| absl::Span<int64> spatial_size() { |
| return AsInt64Slice(tensor_.mutable_dimensions()).subspan(2); |
| } |
| |
| TensorDescriptorProto tensor_; |
| float value_max_; |
| float value_min_; |
| QuantizedActivationMode quantized_activation_mode_; |
| }; |
| |
| // Returns a string representation of the given filter layout. |
| string FilterLayoutString(FilterLayout layout); |
| |
| // Describes a filter for the convolution. This is the "window" from |
| // height-by-width patches of each of the feature maps in the input layer to the |
| // cells within the output feature map. |
| // |
| // Uses the named argument construction form: |
| // |
| // FilterDescriptor filter_dimensions; |
| // filter_dimensions |
| // .set_output_feature_map_count(42) |
| // .set_input_feature_map_count(7) |
| // ... |
| // |
| // Arguments: |
| // - output_feature_map_count: number of feature maps in the output layer. |
| // - input_feature_map_count: number of feature maps in the input layer (from |
| // which the filter patch is taken). |
| // - input_filter_height: "height" number of neurons used in the sliding window |
| // over the input layer. |
| // - input_filter_width: "width" number of neurons used in the sliding window |
| // over the input layer. |
| // |
| // Sometimes names like "filter input height" are referred to by synonymous |
| // terminology, such as "kernel y size". |
| // |
| // If unspecified, layout defaults to kOutputInputYX. |
| class FilterDescriptor { |
| public: |
| // By default construction, all dimensions are set to zero, so they should all |
| // be populated by the user via the named-argument helpers below. (See class |
| // comment for details.) |
| FilterDescriptor(); |
| explicit FilterDescriptor(int ndims); |
| ~FilterDescriptor(); |
| |
| // Named-argument helpers for avoiding user error during construction. |
| FilterDescriptor& set_output_feature_map_count(int64 value) { |
| tensor_.set_dimensions(0, value); |
| return *this; |
| } |
| FilterDescriptor& set_input_feature_map_count(int64 value) { |
| tensor_.set_dimensions(1, value); |
| return *this; |
| } |
| FilterDescriptor& set_input_filter_height(int64 value) { |
| SetDim(input_filter_dims(), DimIndex::Y, value); |
| return *this; |
| } |
| FilterDescriptor& set_input_filter_width(int64 value) { |
| SetDim(input_filter_dims(), DimIndex::X, value); |
| return *this; |
| } |
| FilterDescriptor& set_layout(FilterLayout layout) { |
| tensor_.set_filter_layout(layout); |
| return *this; |
| } |
| FilterDescriptor& set_spatial_dim(DimIndex dim, int64 value) { |
| SetDim(input_filter_dims(), dim, value); |
| return *this; |
| } |
| int ndims() const { return input_filter_dims().size(); } |
| |
| void CloneFrom(const FilterDescriptor& other); |
| |
| string ToString() const; |
| string ToShortString() const; |
| TensorDescriptorProto ToProto(DataType data_type) const; |
| |
| // Returns the number of weights required as parameters for a convolution |
| // using this filter descriptor. |
| int64 ComputeWeightCount() const; |
| |
| // Returns the number of biases required as parameters for a convolution |
| // using this filter descriptor. |
| int64 bias_count() const { return output_feature_map_count(); } |
| |
| int64 output_feature_map_count() const { return tensor_.dimensions(0); } |
| int64 input_feature_map_count() const { return tensor_.dimensions(1); } |
| int64 input_filter_height() const { |
| return GetDim(input_filter_dims(), DimIndex::Y); |
| } |
| int64 input_filter_width() const { |
| return GetDim(input_filter_dims(), DimIndex::X); |
| } |
| int64 input_filter_dim(DimIndex dim) const { |
| return GetDim(input_filter_dims(), dim); |
| } |
| |
| FilterLayout layout() const { return tensor_.filter_layout(); } |
| |
| absl::Span<const int64> input_filter_dims() const { |
| return AsInt64Slice(tensor_.dimensions()).subspan(2); |
| } |
| |
| private: |
| absl::Span<int64> input_filter_dims() { |
| return AsInt64Slice(tensor_.mutable_dimensions()).subspan(2); |
| } |
| |
| TensorDescriptorProto tensor_; |
| }; |
| |
| // Describes how padding should be aligned when the total number of pad |
| // elements is odd. |
| enum class PadAlignment : int64 { |
| kDefault = 0, // default padding for the device. |
| kCudnnPadding, // cuDNN padding - prefer to pad at the start. |
| kTensorFlowPadding, // TensorFlow padding - prefer to pad at the end. |
| }; |
| |
| // Returns a string representation of the given padding alignment. |
| string PadAlignmentString(PadAlignment alignment); |
| |
| // Print alignment to str. Needed to use CHECK_EQ between two PadAlignments. |
| std::ostream& operator<<(std::ostream& str, dnn::PadAlignment alignment); |
| |
| // Describes a convolution. |
| // |
| // Uses the named argument construction form: |
| // |
| // ConvolutionDescriptor convolution_dimensions; |
| // convolution_dimensions |
| // .set_vertical_filter_stride(2) |
| // .set_horizontal_filter_stride(2) |
| // ... |
| // |
| // Arguments: |
| // - zero_padding_height: padding of the "y dimension" of the input data. Note |
| // that this is different from the height of the filter. |
| // - zero_padding_width: analogous to the height above, but in the "x |
| // dimension". |
| // - vertical_filter_stride: the convolution slides a 2-dimensional window of |
| // filter-height-by-filter-width over the input layer -- the center of that |
| // window is moved in the "y dimension" according to this stride value. |
| // - horizontal_filter_stride: analogous to the vertical stride above, but in |
| // the "x dimension". |
| // - vertical_dilation_rate: there will be (vertical_dilation_rate - 1) skipped |
| // cells between each filter element in the "y dimension". |
| // - horizontal_dilation_rate: there will be (horizontal_dilation_rate - 1) |
| // skipped cells between each filter element in the "x dimension". |
| // - convolution_not_crosscor: By default (convolution_not_crosscor == false), |
| // we perform cross correlation rather than convolution. With the flag set, |
| // we perform convolution. Convolution and cross correlation are related by |
| // rotating the filter by 180 degrees (or equivalently flipping all spatial |
| // dimensions). |
| class ConvolutionDescriptor { |
| public: |
| // By default construction, there is no zero-padding and the filter stride is |
| // 1x1 (centering the filter on every cell in the input layer's |
| // width-by-height area). |
| ConvolutionDescriptor(); |
| explicit ConvolutionDescriptor(int ndims); |
| ~ConvolutionDescriptor(); |
| |
| string ToString() const; |
| string ToShortString() const; |
| ConvolutionDescriptorProto ToProto() const { return proto_; } |
| |
| ConvolutionDescriptor& set_zero_padding_height(int64 value) { |
| SetDim(padding(), DimIndex::Y, value); |
| return *this; |
| } |
| ConvolutionDescriptor& set_zero_padding_width(int64 value) { |
| SetDim(padding(), DimIndex::X, value); |
| return *this; |
| } |
| ConvolutionDescriptor& set_zero_padding(DimIndex dim, int64 value) { |
| SetDim(padding(), dim, value); |
| return *this; |
| } |
| ConvolutionDescriptor& set_vertical_filter_stride(int64 value) { |
| SetDim(strides(), DimIndex::Y, value); |
| return *this; |
| } |
| ConvolutionDescriptor& set_horizontal_filter_stride(int64 value) { |
| SetDim(strides(), DimIndex::X, value); |
| return *this; |
| } |
| ConvolutionDescriptor& set_filter_stride(DimIndex dim, int64 value) { |
| SetDim(strides(), dim, value); |
| return *this; |
| } |
| ConvolutionDescriptor& set_vertical_dilation_rate(int64 value) { |
| SetDim(dilations(), DimIndex::Y, value); |
| return *this; |
| } |
| ConvolutionDescriptor& set_horizontal_dilation_rate(int64 value) { |
| SetDim(dilations(), DimIndex::X, value); |
| return *this; |
| } |
| ConvolutionDescriptor& set_dilation_rate(DimIndex dim, int64 value) { |
| SetDim(dilations(), dim, value); |
| return *this; |
| } |
| ConvolutionDescriptor& set_group_count(int group_count) { |
| proto_.set_group_count(group_count); |
| return *this; |
| } |
| ConvolutionDescriptor& set_convolution_not_crosscorr(bool conv) { |
| proto_.set_convolution_mode(conv ? ConvolutionMode::CONVOLUTION |
| : ConvolutionMode::CROSS_CORRELATION); |
| return *this; |
| } |
| ConvolutionDescriptor& set_name(const string& name) { |
| proto_.set_name(name); |
| return *this; |
| } |
| int64 zero_padding_height() const { return GetDim(padding(), DimIndex::Y); } |
| int64 zero_padding_width() const { return GetDim(padding(), DimIndex::X); } |
| int64 vertical_filter_stride() const { |
| return GetDim(strides(), DimIndex::Y); |
| } |
| int64 horizontal_filter_stride() const { |
| return GetDim(strides(), DimIndex::X); |
| } |
| int64 vertical_dilation_rate() const { |
| return GetDim(dilations(), DimIndex::Y); |
| } |
| int64 horizontal_dilation_rate() const { |
| return GetDim(dilations(), DimIndex::X); |
| } |
| |
| int zero_padding(DimIndex dim) const { return GetDim(padding(), dim); } |
| int filter_stride(DimIndex dim) const { return GetDim(strides(), dim); } |
| int dilation_rate(DimIndex dim) const { return GetDim(dilations(), dim); } |
| // TODO(timshen): remove this function. No users of this class is setting a |
| // non-default pad alignment. |
| PadAlignment pad_alignment() const { return PadAlignment::kDefault; } |
| int group_count() const { return proto_.group_count(); } |
| int ndims() const { return padding().size(); } |
| bool convolution_not_crosscorr() const { |
| return proto_.convolution_mode() == ConvolutionMode::CONVOLUTION; |
| } |
| |
| absl::Span<const int64> strides() const { |
| return AsInt64Slice(proto_.strides()); |
| } |
| |
| absl::Span<const int64> dilations() const { |
| return AsInt64Slice(proto_.dilations()); |
| } |
| |
| absl::Span<const int64> padding() const { |
| return AsInt64Slice(proto_.paddings()); |
| } |
| |
| string name() const { return proto_.name(); } |
| |
| private: |
| absl::Span<int64> strides() { return AsInt64Slice(proto_.mutable_strides()); } |
| |
| absl::Span<int64> dilations() { |
| return AsInt64Slice(proto_.mutable_dilations()); |
| } |
| |
| absl::Span<int64> padding() { |
| return AsInt64Slice(proto_.mutable_paddings()); |
| } |
| |
| ConvolutionDescriptorProto proto_; |
| |
| // TODO(leary) cudnn provides these fields, but need to characterize what |
| // their effect is -- they may be boolean rather than integral. |
| // int64 upscale_input_x; |
| // int64 upscale_input_y; |
| }; |
| |
| // A patch of values in the input can be pooled via either a max or an average |
| // operation. |
| // Specify int64 so there's no padding in PoolingDescriptor. |
| enum class PoolingMode : int64 { |
| kMaximum, |
| kAverage, |
| }; |
| |
| // Specify the dimension in which to concatenate inputs in space. |
| // Specify int64 so there's no padding in SpaceConcatenateMode. |
| enum class SpaceConcatenateMode : int64 { |
| XDirection, |
| YDirection, |
| }; |
| |
| // Returns a short name for the pooling mode, e.g. "Avg". |
| string ShortPoolingModeString(PoolingMode mode); |
| |
| // Describes a pooling operation to be enqueued onto a stream via a platform's |
| // DnnSupport. |
| // |
| // TODO(broune): describe how padding works and what happens if the |
| // window height/width is not divisible by the vertical/horizontal |
| // stride. |
| // |
| // Arguments: |
| // pooling_mode: pooling operator to use on the input patch |
| // window_height: height of input window |
| // window_width: width of input window |
| // vertical_stride: vertical delta for center of the input patch |
| // horizontal_stride: horizontal delta for center of the input patch |
| class PoolingDescriptor { |
| public: |
| PoolingDescriptor(); |
| explicit PoolingDescriptor(int ndims); |
| |
| PoolingDescriptor& set_pooling_mode(PoolingMode value) { |
| mode_ = value; |
| return *this; |
| } |
| PoolingDescriptor& set_window_height(int64 value) { |
| SetDim(&window_, DimIndex::Y, value); |
| return *this; |
| } |
| PoolingDescriptor& set_window_width(int64 value) { |
| SetDim(&window_, DimIndex::X, value); |
| return *this; |
| } |
| PoolingDescriptor& set_window(DimIndex dim, int64 value) { |
| SetDim(&window_, dim, value); |
| return *this; |
| } |
| PoolingDescriptor& set_vertical_padding(int64 value) { |
| SetDim(&padding_, DimIndex::Y, value); |
| return *this; |
| } |
| PoolingDescriptor& set_horizontal_padding(int64 value) { |
| SetDim(&padding_, DimIndex::X, value); |
| return *this; |
| } |
| PoolingDescriptor& set_padding(DimIndex dim, int64 value) { |
| SetDim(&padding_, dim, value); |
| return *this; |
| } |
| PoolingDescriptor& set_vertical_stride(int64 value) { |
| SetDim(&strides_, DimIndex::Y, value); |
| return *this; |
| } |
| PoolingDescriptor& set_horizontal_stride(int64 value) { |
| SetDim(&strides_, DimIndex::X, value); |
| return *this; |
| } |
| PoolingDescriptor& set_stride(DimIndex dim, int64 value) { |
| SetDim(&strides_, dim, value); |
| return *this; |
| } |
| PoolingDescriptor& set_propagate_nans(bool value) { |
| propagate_nans_ = value; |
| return *this; |
| } |
| PoolingDescriptor& set_name(const string& name) { |
| name_ = name; |
| return *this; |
| } |
| |
| int ndims() const { return ndims_; } |
| void CloneFrom(const PoolingDescriptor& other); |
| |
| string ToString() const; |
| string ToShortString() const; |
| |
| PoolingMode mode() const { return mode_; } |
| int64 window_height() const { return GetDim(window_, DimIndex::Y); } |
| int64 window_width() const { return GetDim(window_, DimIndex::X); } |
| int64 window(DimIndex dim) const { return GetDim(window_, dim); } |
| int64 vertical_padding() const { return GetDim(padding_, DimIndex::Y); } |
| int64 horizontal_padding() const { return GetDim(padding_, DimIndex::X); } |
| int64 padding(DimIndex dim) const { return GetDim(padding_, dim); } |
| int64 vertical_stride() const { return GetDim(strides_, DimIndex::Y); } |
| int64 horizontal_stride() const { return GetDim(strides_, DimIndex::X); } |
| int64 stride(DimIndex dim) const { return GetDim(strides_, dim); } |
| absl::Span<const int64> window() const { return window_; } |
| absl::Span<const int64> padding() const { return padding_; } |
| absl::Span<const int64> strides() const { return strides_; } |
| bool propagate_nans() const { return propagate_nans_; } |
| string name() const { return name_; } |
| |
| private: |
| PoolingMode mode_; |
| int ndims_; |
| bool propagate_nans_; |
| string name_; // Name as in Tensorflow NodeDef, for debugging purposes. |
| |
| // Stored as: ..., y, x. |
| std::vector<int64> window_; |
| std::vector<int64> padding_; |
| std::vector<int64> strides_; |
| }; |
| |
| // Collects parameters for DNN algorithms |
| class AlgorithmDesc { |
| public: |
| typedef int64 Index; |
| AlgorithmDesc() : AlgorithmDesc(0, false) {} |
| AlgorithmDesc(Index a, bool use_tensor_ops) { |
| proto_.set_algo_id(a); |
| proto_.set_math_type(use_tensor_ops ? AlgorithmProto::TENSOR_OP_MATH |
| : AlgorithmProto::DEFAULT_MATH); |
| } |
| bool tensor_ops_enabled() const { |
| return proto_.math_type() == AlgorithmProto::TENSOR_OP_MATH; |
| } |
| Index algo_id() const { return proto_.algo_id(); } |
| bool operator==(const AlgorithmDesc& other) const { |
| return algo_id() == other.algo_id() && |
| tensor_ops_enabled() == other.tensor_ops_enabled(); |
| } |
| uint64 hash() const; |
| |
| AlgorithmProto ToProto() const { return proto_; } |
| |
| string ToString() const; |
| |
| private: |
| AlgorithmProto proto_; |
| }; |
| |
| // Describes the result from a perf experiment. |
| // |
| // Arguments: |
| // algorithm: returns the exact algorithm that was used. |
| // elapsed_time_in_ms: returns the measured elapsed time in milliseconds. |
| class ProfileResult { |
| public: |
| bool is_valid() const { |
| return algorithm_.has_value() && |
| elapsed_time_in_ms() != std::numeric_limits<float>::max(); |
| } |
| |
| AlgorithmDesc algorithm() const { return *algorithm_; } |
| void set_algorithm(AlgorithmDesc val) { algorithm_ = val; } |
| |
| float elapsed_time_in_ms() const { return elapsed_time_in_ms_; } |
| void set_elapsed_time_in_ms(float val) { elapsed_time_in_ms_ = val; } |
| |
| size_t scratch_size() const { return scratch_size_; } |
| void set_scratch_size(size_t val) { scratch_size_ = val; } |
| |
| private: |
| absl::optional<AlgorithmDesc> algorithm_; |
| float elapsed_time_in_ms_ = std::numeric_limits<float>::max(); |
| // The scratch size algorithm_ requires. Currently it's only populated by |
| // convolutions. |
| size_t scratch_size_ = 0; |
| }; |
| |
| // Describes the configuration for the algorithms that will used. |
| // |
| // Arguments: |
| // algorithm: the primary algorithm that should be used. |
| // algorithm_no_scratch: a secondary algorithm that should be used, if the |
| // the allocation for the scratch memory fails. |
| // scrach_size: specify the size of scratch memory in bytes needed for the |
| // algorithm used. |
| // |
| // On CUDA platform with CUDNN library, algorithm and algorithm_no_scratch |
| // would be used. On ROCm platform with MIOpen library, algorithm and |
| // scratch_size would be used. The major difference between the two platforms |
| // are whether it's possible to get an algorithm without scratch memory. On |
| // CUDA + CUDNN it's possible, and algorithm_no_scratch can be used to track |
| // such information, whereas on ROCm + MIOpen there is no guarantee to getting |
| // one without scratch memory, and scratch_size field is used to track it. |
| class AlgorithmConfig { |
| public: |
| AlgorithmConfig() {} |
| explicit AlgorithmConfig(AlgorithmDesc algorithm) : algorithm_(algorithm) {} |
| AlgorithmConfig(AlgorithmDesc algorithm, size_t scratch_size) |
| : algorithm_(algorithm), scratch_size_(scratch_size) {} |
| AlgorithmConfig(AlgorithmDesc algorithm, AlgorithmDesc algorithm_no_scratch) |
| : algorithm_(algorithm), algorithm_no_scratch_(algorithm_no_scratch) {} |
| absl::optional<AlgorithmDesc> algorithm() const { return algorithm_; } |
| void set_algorithm(AlgorithmDesc val) { algorithm_ = val; } |
| absl::optional<AlgorithmDesc> algorithm_no_scratch() const { |
| return algorithm_no_scratch_; |
| } |
| void set_algorithm_no_scratch(AlgorithmDesc val) { |
| algorithm_no_scratch_ = val; |
| } |
| absl::optional<size_t> scratch_size() const { return scratch_size_; } |
| void set_scratch_size(size_t val) { scratch_size_ = val; } |
| bool operator==(const AlgorithmConfig& other) const { |
| return this->algorithm_ == other.algorithm_ && |
| this->algorithm_no_scratch_ == other.algorithm_no_scratch_ && |
| this->scratch_size_ == other.scratch_size_; |
| } |
| bool operator!=(const AlgorithmConfig& other) const { |
| return !(*this == other); |
| } |
| string ToString() const; |
| |
| private: |
| absl::optional<AlgorithmDesc> algorithm_; |
| absl::optional<AlgorithmDesc> algorithm_no_scratch_; |
| absl::optional<size_t> scratch_size_; |
| }; |
| |
| // Describes a local response normalization (LRN). LRN is used e.g. in |
| // dist_belief. |
| // |
| // Let V be the vector of feature maps at some (batch, y, x) |
| // coordinate. LRN applies independently to each vector V in the |
| // input, across all coordinates (batch, y, x), by mapping each V to |
| // another vector U of the same size using the formula |
| // |
| // U_i = V_i / ((bias + alpha * (sum_j V_j^2)) ^ beta) |
| // |
| // where the sum is taken over j in the closed range [i - range, i + range]. |
| // |
| // When calculating U_i the j in the sum can extend beyond the bounds |
| // of V. If wrap_around is true, then V_j = V_{j mod F} where F is the |
| // size of V, which is the number of feature maps. If wrap_around is |
| // false, then V_j = 0 for j outside [0, F-1]. |
| // |
| // If segment_size <= F, where F is the number of feature_maps, then |
| // segment_size has no effect. Otherwise, each consecutive segment of |
| // segment_size entries in V are normalized separately. |
| // |
| // Not all StreamExecutors allow wrap_around == true or segment_size |
| // != 64. Some do not implement normalization at all. |
| class NormalizeDescriptor { |
| public: |
| NormalizeDescriptor(); |
| |
| NormalizeDescriptor& set_bias(float bias) { |
| bias_ = bias; |
| return *this; |
| } |
| |
| NormalizeDescriptor& set_range(int32 range) { |
| range_ = range; |
| return *this; |
| } |
| |
| NormalizeDescriptor& set_alpha(float alpha) { |
| alpha_ = alpha; |
| return *this; |
| } |
| |
| NormalizeDescriptor& set_beta(float beta) { |
| beta_ = beta; |
| return *this; |
| } |
| |
| NormalizeDescriptor& set_wrap_around(bool wrap_around) { |
| wrap_around_ = wrap_around; |
| return *this; |
| } |
| |
| NormalizeDescriptor& set_segment_size(int32 segment_size) { |
| segment_size_ = segment_size; |
| return *this; |
| } |
| |
| void CloneFrom(const NormalizeDescriptor& other); |
| |
| string ToString() const; |
| string ToShortString() const; |
| |
| float bias() const { return bias_; } |
| int32 range() const { return range_; } |
| float alpha() const { return alpha_; } |
| float beta() const { return beta_; } |
| bool wrap_around() const { return wrap_around_; } |
| int32 segment_size() const { return segment_size_; } |
| |
| private: |
| float bias_; |
| int32 range_; |
| float alpha_; |
| float beta_; |
| bool wrap_around_; |
| int32 segment_size_; |
| }; |
| |
| // Returns a string representation of the given activation mode. |
| string ActivationModeString(ActivationMode mode); |
| |
| // Describes the operation that DoElementwiseOperation should perform on its |
| // inputs. |
| enum class ElementwiseOperation { kAdd, kMultiply }; |
| |
| string ElementwiseOperationString(ElementwiseOperation op); |
| |
| // A simple class representing the version of the backing library, to |
| // workaround the "too perfect forwarding" issue in gcc6+ compilers. |
| // See PR#16309 and issue #18402 for links discussing the issue. |
| class VersionInfo { |
| public: |
| VersionInfo(int major = 0, int minor = 0, int patch = 0) |
| : major_(major), minor_(minor), patch_(patch) {} |
| int major_version() const { return major_; } |
| int minor_version() const { return minor_; } |
| int patch() const { return patch_; } |
| |
| private: |
| int major_; |
| int minor_; |
| int patch_; |
| }; |
| |
| // Suite of operations typically used for implementing Deep/Convolutional Neural |
| // Nets. Note: A false return value of an operation indicates the |
| // implementation is not available. |
| // |
| // TODO(b/118763918): this class (or rather dispatch table) has several |
| // problems: |
| // * Some overloads are missing. Ideally we want to have template virtual |
| // functions while the template arguments is a closed set. However, we don't |
| // get that from the language. |
| // * The API is a union of cuDNN and another private backend. Only 10% of the |
| // functions are actually implemented by both backends, the rest are |
| // actually backend-specific. The massive interface creates extra mental |
| // burden. |
| // * Poor error handling: the API should return Status objects. |
| // |
| // PrepareForConvolution is an example for how new APIs should be written. |
| class DnnSupport { |
| public: |
| DnnSupport() {} |
| virtual ~DnnSupport() {} |
| |
| virtual port::Status Init() = 0; |
| |
| // Gets the version of the backing library, as a VersionInfo object. |
| virtual port::StatusOr<VersionInfo> GetVersion() { |
| return port::UnimplementedError( |
| "DnnSupport::GetVersion not implemented on this platform."); |
| } |
| |
| // Performs a single-precision forward batch normalization operation onto |
| // the stream. |
| // |
| // Arguments: |
| // stream: borrowed pointer to the stream that the batch normalization |
| // operation should be enqueued onto. |
| // x: input data. |
| // scale: scaling parameters. |
| // offset: offset parameters. |
| // estimated_mean: population mean estimated during training. |
| // Used for inference only; empty for training. |
| // estimated_variance: population variance estimated during training, |
| // used for inference only; empty for training. |
| // side_input: optional input that is element-wise added to the output of |
| // batch normalization. |
| // x_desc: dimensions of the input data, which is the same as the dimensions |
| // of the output and side input. |
| // scale_offset_desc: dimensions of scale and offset. |
| // epsilon: a small floating point number added to the variance of x. |
| // activation_mode: activation applied to the result of batch normalization |
| // (or after adding optional side input) |
| // y: output data. |
| // batch_mean: batch mean, to be used to compute the running mean. |
| // batch_variance: batch variance, to be used to compute |
| // the running variance. |
| // reserve_space_1: saved mean, to be reused in the backward gradient |
| // computation. |
| // reserve_space_2: saved inv_var (1/sqrt(epsilon + variance), to be reused |
| // in the backward gradient computation. |
| // is_training: Set to true for training, false for inference. |
| // var_to_inv_var: a function to convert the variance to inverted variance |
| // for cuDNN v4 forward inference. |
| // inv_var_to_var: a function to convert the inverted variance to |
| // variance for cuDNN v4 forward training, to be used for TensorFlow |
| // to calculate the running variance. |
| virtual bool DoBatchNormalizationForward( |
| Stream* stream, const DeviceMemory<float>& x, |
| const DeviceMemory<float>& scale, const DeviceMemory<float>& offset, |
| const DeviceMemory<float>& estimated_mean, |
| const DeviceMemory<float>& estimated_variance, |
| const DeviceMemory<float>& side_input, const dnn::BatchDescriptor& x_desc, |
| const dnn::BatchDescriptor& scale_offset_desc, const double epsilon, |
| const double exponential_average_factor, |
| dnn::ActivationMode activation_mode, DeviceMemory<float>* y, |
| DeviceMemory<float>* batch_mean, DeviceMemory<float>* batch_var, |
| DeviceMemory<float>* reserve_space_1, |
| DeviceMemory<float>* reserve_space_2, bool is_training, |
| ScratchAllocator* reserve_space_allocator, |
| ScratchAllocator* workspace_allocator, |
| std::function<const DeviceMemory<float>&()> var_to_inv_var, |
| std::function<void()> inv_var_to_var) { |
| return false; |
| } |
| |
| // Performs a half-precision forwards batch normalization operation onto the |
| // stream. See DoBatchNormalizationForward above for argument details. |
| virtual bool DoBatchNormalizationForward( |
| Stream* stream, const DeviceMemory<Eigen::half>& x, |
| const DeviceMemory<float>& scale, const DeviceMemory<float>& offset, |
| const DeviceMemory<float>& estimated_mean, |
| const DeviceMemory<float>& estimated_variance, |
| const DeviceMemory<float>& side_input, const dnn::BatchDescriptor& x_desc, |
| const dnn::BatchDescriptor& scale_offset_desc, const double epsilon, |
| const double exponential_average_factor, |
| dnn::ActivationMode activation_mode, DeviceMemory<Eigen::half>* y, |
| DeviceMemory<float>* batch_mean, DeviceMemory<float>* batch_var, |
| DeviceMemory<float>* reserve_space_1, |
| DeviceMemory<float>* reserve_space_2, bool is_training, |
| ScratchAllocator* reserve_space_allocator, |
| ScratchAllocator* workspace_allocator, |
| std::function<const DeviceMemory<float>&()> var_to_inv_var, |
| std::function<void()> inv_var_to_var) { |
| return false; |
| } |
| |
| // Performs a single-precision backward batch normalization gradient |
| // computation operation onto the stream. |
| // |
| // Arguments: |
| // stream: borrowed pointer to the stream that the batch normalization |
| // gradient computation operation should be enqueued onto. |
| // y_backprop: gradient with regard to output y. |
| // x: input data. |
| // scale: scaling parameters. |
| // inv_var: 1/sqrt(epsilon + variance) of x. |
| // x_desc: dimensions of the input data, which is the same as the dimensions |
| // of the output. |
| // scale_offset_desc: dimensions of scale and offset. |
| // epsilon: a small floating point number added to the variance of x. |
| // x_backprop: gradient with respect to input x. |
| // scale_backprop: gradient with respect to scale. |
| // offset_backprop: gradient with respect to offset. |
| virtual bool DoBatchNormalizationBackward( |
| Stream* stream, const DeviceMemory<float>& y_backprop, |
| const DeviceMemory<float>& x, const DeviceMemory<float>& scale, |
| const DeviceMemory<float>& mean, const DeviceMemory<float>& inv_var, |
| const dnn::BatchDescriptor& x_desc, |
| const dnn::BatchDescriptor& scale_offset_desc, const double epsilon, |
| DeviceMemory<float>* x_backprop, DeviceMemory<float>* scale_backprop, |
| DeviceMemory<float>* offset_backprop, |
| DeviceMemory<uint8>* reserve_space_data, |
| ScratchAllocator* workspace_allocator) { |
| return false; |
| } |
| |
| // Performs a half-precision backward batch normalization gradient computation |
| // operation onto the stream. See DoBatchNormalizationBackward above for |
| // argument details. |
| virtual bool DoBatchNormalizationBackward( |
| Stream* stream, const DeviceMemory<Eigen::half>& y_backprop, |
| const DeviceMemory<Eigen::half>& x, const DeviceMemory<float>& scale, |
| const DeviceMemory<float>& mean, const DeviceMemory<float>& inv_var, |
| const dnn::BatchDescriptor& x_desc, |
| const dnn::BatchDescriptor& scale_offset_desc, const double epsilon, |
| DeviceMemory<Eigen::half>* x_backprop, |
| DeviceMemory<float>* scale_backprop, DeviceMemory<float>* offset_backprop, |
| DeviceMemory<uint8>* reserve_space_data, |
| ScratchAllocator* workspace_allocator) { |
| return false; |
| } |
| |
| // Enqueues a fused convolution operation onto the stream. |
| // We provide several variants with different types for inputs, biases and |
| // scaling parameters. |
| // |
| // Arguments (all borrowed): |
| // stream: borrowed pointer to the stream that the 'convolve' operation |
| // should be enqueued onto. |
| // conv_input_descriptor: dimensions of the convolution input layer. |
| // conv_input_data: un-owned device memory region which contains the |
| // convolution input. |
| // conv_input_scale: a floating point scale to multiply with each element |
| // of conv_input_data. |
| // filter_descriptor: dimensions of the convolution filter. |
| // filter_data: un-owned device memory region which contains the |
| // convolution filter weights. |
| // convolution_descriptor: stride of the convolution filter. |
| // biases: un-owned device memory region containing biases to add to the |
| // input. |
| // activation_mode: Type of activation to perform. |
| // side_input_data: un-owned device memory region which contains optional |
| // side input data. If 'side_input_scale' is non-zero, then this must |
| // point to data in the tensor shape specified by output_shape. |
| // It will be scaled by 'side_input_scale' and added to the convolution |
| // result and bias prior to applying the activation function. |
| // side_input_scale: a floating point scale to multiply with each element |
| // of side_input_data. |
| // output_descriptor: dimensions of the output layer. |
| // output_data: un-owned device memory region in which to place the |
| // convolution result. |
| // scratch_allocator: un-owned, may-be-null object that may allocate scratch |
| // space in order to speed up the convolution operation. |
| // algorithm_config: specifies which algorithm should be used for the |
| // operation. |
| // output_profile_result: the output profile result for this call. The |
| // profiling is only enabled when this is not nullptr. |
| // |
| // conv_input_descriptor, filter_descriptor, convolution_descriptor and |
| // output_descriptor together specify exactly how the convolution is aligned |
| // with the input data: |
| // |
| // * (input dimensions - filter size + 1) / filter stride == output dimensions |
| // corresponds to dist_belief padding = VALID, i.e. the input is not padded. |
| // * input dimensions / filter stride == output dimensions |
| // corresponds to dist_belief padding = SAME, i.e. input and output are the |
| // same size - this requires padding the input. |
| // * (input dimensions + filter size - 1) / filter stride == output dimensions |
| // corresponds to dist_belief padding = FULL, i.e. the output is sized so |
| // that if the inverse of the filter is applied to the output in VALID mode |
| // the result is the same size as the input - this requires even more |
| // padding of the input. |
| virtual bool DoFusedConvolve( |
| Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor, |
| const DeviceMemory<double>& conv_input_data, double conv_input_scale, |
| const dnn::FilterDescriptor& filter_descriptor, |
| const DeviceMemory<double>& filter_data, |
| const dnn::ConvolutionDescriptor& convolution_descriptor, |
| const DeviceMemory<double>& side_input_data, double side_input_scale, |
| const dnn::BatchDescriptor& bias_descriptor, |
| const DeviceMemory<double>& biases, dnn::ActivationMode activation_mode, |
| const dnn::BatchDescriptor& output_descriptor, |
| DeviceMemory<double>* output_data, ScratchAllocator* scratch_allocator, |
| const dnn::AlgorithmConfig& algorithm_config, |
| dnn::ProfileResult* output_profile_result) { |
| return false; |
| } |
| |
| // This is the float version of DoFusedConvolve. |
| virtual bool DoFusedConvolve( |
| Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor, |
| const DeviceMemory<float>& conv_input_data, float conv_input_scale, |
| const dnn::FilterDescriptor& filter_descriptor, |
| const DeviceMemory<float>& filter_data, |
| const dnn::ConvolutionDescriptor& convolution_descriptor, |
| const DeviceMemory<float>& side_input_data, float side_input_scale, |
| const dnn::BatchDescriptor& bias_descriptor, |
| const DeviceMemory<float>& biases, dnn::ActivationMode activation_mode, |
| const dnn::BatchDescriptor& output_descriptor, |
| DeviceMemory<float>* output_data, ScratchAllocator* scratch_allocator, |
| const dnn::AlgorithmConfig& algorithm_config, |
| dnn::ProfileResult* output_profile_result) { |
| return false; |
| } |
| |
| // This is the Eigen::half version of DoFusedConvolve. |
| // The scaling parameters are still floats. |
| virtual bool DoFusedConvolve( |
| Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor, |
| const DeviceMemory<Eigen::half>& conv_input_data, float conv_input_scale, |
| const dnn::FilterDescriptor& filter_descriptor, |
| const DeviceMemory<Eigen::half>& filter_data, |
| const dnn::ConvolutionDescriptor& convolution_descriptor, |
| const DeviceMemory<Eigen::half>& side_input_data, float side_input_scale, |
| const dnn::BatchDescriptor& bias_descriptor, |
| const DeviceMemory<Eigen::half>& biases, |
| dnn::ActivationMode activation_mode, |
| const dnn::BatchDescriptor& output_descriptor, |
| DeviceMemory<Eigen::half>* output_data, |
| ScratchAllocator* scratch_allocator, |
| const dnn::AlgorithmConfig& algorithm_config, |
| dnn::ProfileResult* output_profile_result) { |
| return false; |
| } |
| |
| // This is the int8 version of DoFusedConvolve. |
| // The bias input and scaling parameters are floats. |
| virtual bool DoFusedConvolve( |
| Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor, |
| const DeviceMemory<int8>& conv_input_data, float conv_input_scale, |
| const dnn::FilterDescriptor& filter_descriptor, |
| const DeviceMemory<int8>& filter_data, |
| const dnn::ConvolutionDescriptor& convolution_descriptor, |
| const DeviceMemory<int8>& side_input_data, float side_input_scale, |
| const dnn::BatchDescriptor& bias_descriptor, |
| const DeviceMemory<float>& biases, dnn::ActivationMode activation_mode, |
| const dnn::BatchDescriptor& output_descriptor, |
| DeviceMemory<int8>* output_data, ScratchAllocator* scratch_allocator, |
| const dnn::AlgorithmConfig& algorithm_config, |
| dnn::ProfileResult* output_profile_result) { |
| return false; |
| } |
| |
| // This is the int8 version of DoFusedConvolve. |
| // The output, bias input and scaling parameters are floats. |
| virtual bool DoFusedConvolve( |
| Stream* /*stream*/, const dnn::BatchDescriptor& /*conv_input_descriptor*/, |
| const DeviceMemory<int8>& /*conv_input_data*/, float /*conv_input_scale*/, |
| const dnn::FilterDescriptor& /*filter_descriptor*/, |
| const DeviceMemory<int8>& /*filter_data*/, |
| const dnn::ConvolutionDescriptor& /*convolution_descriptor*/, |
| const DeviceMemory<float>& /*side_input_data*/, |
| float /*side_input_scale*/, |
| const dnn::BatchDescriptor& /*bias_descriptor*/, |
| const DeviceMemory<float>& /*biases*/, |
| dnn::ActivationMode /*activation_mode*/, |
| const dnn::BatchDescriptor& /*output_descriptor*/, |
| DeviceMemory<float>* /*output_data*/, |
| ScratchAllocator* /*scratch_allocator*/, |
| const dnn::AlgorithmConfig& /*algorithm_config*/, |
| dnn::ProfileResult* /*output_profile_result*/) { |
| return false; |
| } |
| |
| template <typename ElementType, typename OutputType> |
| port::Status PrepareForConvolution( |
| ConvolutionKind kind, Stream* stream, |
| const BatchDescriptor& batch_descriptor, |
| DeviceMemory<ElementType> input_data, |
| const FilterDescriptor& filter_descriptor, |
| DeviceMemory<ElementType> filter_data, |
| const BatchDescriptor& output_descriptor, |
| DeviceMemory<OutputType> output_data, |
| const ConvolutionDescriptor& convolution_descriptor, |
| const AlgorithmConfig& algorithm_config, |
| ScratchAllocator* scratch_allocator, AlgorithmDesc* algorithm_desc, |
| DeviceMemory<uint8>* scratch_memory) { |
| return DoPrepareForConvolution( |
| kind, ToDataType<ElementType>::value, stream, batch_descriptor, |
| input_data, filter_descriptor, filter_data, output_descriptor, |
| output_data, convolution_descriptor, algorithm_config, |
| scratch_allocator, algorithm_desc, scratch_memory); |
| } |
| |
| // Enqueues a single-precision convolution operation onto the stream. |
| // |
| // Arguments (all borrowed): |
| // stream: borrowed pointer to the stream that the 'convolve' operation |
| // should be enqueued onto. |
| // input_descriptor: dimensions of the input layer. |
| // input_data: un-owned device memory region which contains the |
| // convolution input. |
| // filter_descriptor: dimensions of the convolution filter. |
| // convolution_descriptor: stride of the convolution filter. |
| // output_descriptor: dimensions of the output layer. |
| // output_data: un-owned device memory region in which to place the |
| // convolution result. |
| // algorithm_desc: specifies which algorithm should be used for the |
| // operation. |
| // scratch: un-owned device memory for scratch space in order to speed up |
| // the convolution operation. |
| // output_profile_result: the output profile result for this call. The |
| // profiling is only enabled when this is not nullptr. |
| // |
| // input_descriptor, filter_descriptor, convolution_descriptor and |
| // output_descriptor together specify exactly how the convolution is aligned |
| // with the input data: |
| // |
| // * (input dimensions - filter size + 1) / filter stride == output dimensions |
| // corresponds to dist_belief padding = VALID, i.e. the input is not padded. |
| // * input dimensions / filter stride == output dimensions |
| // corresponds to dist_belief padding = SAME, i.e. input and output are the |
| // same size - this requires padding the input. |
| // * (input dimensions + filter size - 1) / filter stride == output dimensions |
| // corresponds to dist_belief padding = FULL, i.e. the output is sized so |
| // that if the inverse of the filter is applied to the output in VALID mode |
| // the result is the same size as the input - this requires even more |
| // padding of the input. |
| virtual port::Status DoConvolve( |
| ConvolutionKind kind, DataType element_type, DataType output_type, |
| Stream* stream, const BatchDescriptor& input_descriptor, |
| DeviceMemoryBase input_data, const FilterDescriptor& filter_descriptor, |
| DeviceMemoryBase filter_data, const BatchDescriptor& output_descriptor, |
| DeviceMemoryBase output_data, |
| const ConvolutionDescriptor& convolution_descriptor, |
| AlgorithmDesc algorithm_desc, DeviceMemory<uint8> scratch_memory, |
| ProfileResult* output_profile_result) = 0; |
| |
| template <typename ElementType, typename OutputType> |
| bool DoConvolve(Stream* stream, const dnn::BatchDescriptor& input_descriptor, |
| const DeviceMemory<ElementType>& input_data, |
| const dnn::FilterDescriptor& filter_descriptor, |
| const DeviceMemory<ElementType>& filter_data, |
| const dnn::ConvolutionDescriptor& convolution_descriptor, |
| const dnn::BatchDescriptor& output_descriptor, |
| DeviceMemory<OutputType>* output_data, |
| const dnn::AlgorithmDesc& algorithm_desc, |
| DeviceMemory<uint8>* scratch_memory, |
| ProfileResult* output_profile_result) { |
| return IsStatusOk( |
| DoConvolve(ConvolutionKind::FORWARD, ToDataType<ElementType>::value, |
| ToDataType<OutputType>::value, stream, input_descriptor, |
| input_data, filter_descriptor, filter_data, |
| output_descriptor, *output_data, convolution_descriptor, |
| algorithm_desc, *scratch_memory, output_profile_result), |
| !output_profile_result); |
| } |
| |
| // Return a list of algorithms supported by the forward convolution pass. |
| // cc_major and cc_minor are the compute capabilities of the device. |
| virtual bool GetConvolveAlgorithms( |
| bool with_winograd_nonfused, int cc_major, int cc_minor, |
| std::vector<AlgorithmDesc>* out_algorithms); |
| |
| virtual bool GetMIOpenConvolveAlgorithms( |
| dnn::ConvolutionKind kind, Stream* stream, dnn::DataType element_type, |
| const dnn::BatchDescriptor& input_descriptor, |
| const dnn::FilterDescriptor& filter_descriptor, |
| const dnn::ConvolutionDescriptor& convolution_descriptor, |
| const dnn::BatchDescriptor& output_descriptor, |
| std::vector<ProfileResult>* out_algorithms); |
| |
| // Returns a list of supported rnn algorithms. |
| virtual bool GetRnnAlgorithms(std::vector<AlgorithmDesc>* out_algorithms); |
| |
| // Version of DoConvolve that uses pre-quantized 8 bit coefficients. |
| // coefficient_scales specifies the scaling of each column of coefficients: |
| // original float coefficient[row * num_columns + column] = |
| // quantized coefficient[row * num_columns + column] * |
| // coefficient_scales[column]. |
| virtual bool DoConvolveQuantized( |
| Stream* stream, const dnn::BatchDescriptor& input_descriptor, |
| const DeviceMemory<float>& input_data, |
| const dnn::FilterDescriptor& filter_descriptor, |
| const DeviceMemory<int8>& filter_coefficients, |
| const DeviceMemory<float>& coefficient_scales, |
| const dnn::ConvolutionDescriptor& convolution_descriptor, |
| const dnn::BatchDescriptor& output_descriptor, |
| DeviceMemory<float>* output_data) = 0; |
| |
| // Same as DoConvolveQuantized above, but int8 filter coefficients. |
| virtual bool DoConvolveQuantized( |
| Stream* stream, const dnn::BatchDescriptor& input_descriptor, |
| const DeviceMemory<float>& input_data, |
| const dnn::FilterDescriptor& filter_descriptor, |
| const DeviceMemory<int16>& filter_coefficients, |
| const DeviceMemory<float>& coefficient_scales, |
| const dnn::ConvolutionDescriptor& convolution_descriptor, |
| const dnn::BatchDescriptor& output_descriptor, |
| DeviceMemory<float>* output_data) = 0; |
| |
| // Variation of the above with the weight matrix split into two matrices. |
| // first_weights: Coefficients of the first matrix. |
| // second_weights: Coefficients of the second matrix. |
| // depth_multiplier: specifies the columns of the first matrix and rows |
| // of the second one - first_weights columns = depth_multiplier, |
| // second_weights rows = depth_multiplier * |
| // filter_descriptor.input_feature_map_count(). |
| // see go/separable for documentation on separable convolutions. |
| virtual bool DoSeparableConvolve( |
| Stream* stream, const BatchDescriptor& input_descriptor, |
| const DeviceMemory<float>& input_data, |
| const FilterDescriptor& filter_descriptor, int depth_multiplier, |
| const DeviceMemory<float>& first_weights, |
| const DeviceMemory<float>& second_weights, |
| const ConvolutionDescriptor& convolution_descriptor, |
| const BatchDescriptor& output_descriptor, |
| DeviceMemory<float>* output_data) = 0; |
| |
| // Enqueues a single-precision backward convolution (for data) operation onto |
| // the stream. |
| // |
| // Arguments: |
| // stream: borrowed pointer to the stream that the 'convolve' operation |
| // should be enqueued onto. |
| // filter_descriptor: dimensions of the convolution filter. |
| // filter_data: coefficients for the convolution filter. |
| // output_descriptor: dimensions of the output gradients, which is the same |
| // as the dimensions of the output. |
| // backward_output_data: un-owned device memory region which contains the |
| // backprop of the output. |
| // convolution_descriptor: stride of the convolution filter. |
| // input_descriptor: dimensions of the input layer. |
| // backward_input_data: un-owned device memory region in which to place the |
| // backprop of the input. |
| // scratch_allocator: un-owned, may-be-null object that may allocate scratch |
| // space in order to speed up the convolution operation. |
| template <typename ElementType> |
| bool DoConvolveBackwardData( |
| Stream* stream, const dnn::FilterDescriptor& filter_descriptor, |
| const DeviceMemory<ElementType>& filter_data, |
| const dnn::BatchDescriptor& output_descriptor, |
| const DeviceMemory<ElementType>& backward_output_data, |
| const dnn::ConvolutionDescriptor& convolution_descriptor, |
| const dnn::BatchDescriptor& input_descriptor, |
| DeviceMemory<ElementType>* backward_input_data, |
| const dnn::AlgorithmDesc& algorithm_desc, |
| DeviceMemory<uint8>* scratch_memory, |
| ProfileResult* output_profile_result) { |
| return IsStatusOk( |
| DoConvolve( |
| ConvolutionKind::BACKWARD_DATA, ToDataType<ElementType>::value, |
| ToDataType<ElementType>::value, stream, input_descriptor, |
| *backward_input_data, filter_descriptor, filter_data, |
| output_descriptor, backward_output_data, convolution_descriptor, |
| algorithm_desc, *scratch_memory, output_profile_result), |
| !output_profile_result); |
| } |
| |
| // Return a list of algorithms supported by the backward convolution pass for |
| // data. |
| virtual bool GetConvolveBackwardDataAlgorithms( |
| bool with_winograd_nonfused, int cc_major, int cc_minor, |
| std::vector<AlgorithmDesc>* out_algorithms); |
| |
| // Enqueues a single-precision backward convolution (for filter) operation |
| // onto the stream. |
| // |
| // Arguments: |
| // stream: borrowed pointer to the stream that the 'convolve' operation |
| // should be enqueued onto. |
| // input_descriptor: dimensions of the input layer. |
| // input_data: un-owned device memory region which contains the |
| // convolution input. |
| // output_descriptor: dimensions of the output gradients, which is the same |
| // as the dimensions of the output. |
| // backward_output_data: un-owned device memory region which contains the |
| // backprop of the output. |
| // convolution_descriptor: stride of the convolution filter. |
| // filter_descriptor: dimensions of the convolution filter. |
| // backward_filter_data: un-owned device memory region in which to place the |
| // backprop of the filter. |
| // scratch_allocator: un-owned, may-be-null object that may allocate scratch |
| // space in order to speed up the convolution operation. |
| template <typename ElementType> |
| bool DoConvolveBackwardFilter( |
| Stream* stream, const BatchDescriptor& input_descriptor, |
| const DeviceMemory<ElementType>& input_data, |
| const BatchDescriptor& output_descriptor, |
| const DeviceMemory<ElementType>& backward_output_data, |
| const ConvolutionDescriptor& convolution_descriptor, |
| const FilterDescriptor& filter_descriptor, |
| DeviceMemory<ElementType>* backward_filter_data, |
| const dnn::AlgorithmDesc& algorithm_desc, |
| DeviceMemory<uint8>* scratch_memory, |
| ProfileResult* output_profile_result) { |
| return IsStatusOk( |
| DoConvolve( |
| ConvolutionKind::BACKWARD_FILTER, ToDataType<ElementType>::value, |
| ToDataType<ElementType>::value, stream, input_descriptor, |
| input_data, filter_descriptor, *backward_filter_data, |
| output_descriptor, backward_output_data, convolution_descriptor, |
| algorithm_desc, *scratch_memory, output_profile_result), |
| !output_profile_result); |
| } |
| |
| // Return a list of algorithms supported by the backward convolution pass for |
| // filters. |
| virtual bool GetConvolveBackwardFilterAlgorithms( |
| bool with_winograd_nonfused, int cc_major, int cc_minor, |
| std::vector<AlgorithmDesc>* out_algorithms); |
| |
| // Enqueues a single-precision backward convolution (for bias) operation onto |
| // the stream. |
| // |
| // Arguments: |
| // stream: borrowed pointer to the stream that the 'convolve' operation |
| // should be enqueued onto. |
| // input_descriptor: dimensions of the input layer. |
| // input_data: un-owned device memory region which contains the |
| // convolution input. |
| // bias_descriptor: dimensions of the bias tensor. Should be the same as the |
| // input dimensions, but with the spatial dimensions set to 1. |
| // backward_filter_data: un-owned device memory region in which to place the |
| // backprop of the bias. |
| virtual bool DoConvolveBackwardBias(Stream* stream, |
| const BatchDescriptor& input_descriptor, |
| const DeviceMemory<float>& input_data, |
| const BatchDescriptor& bias_descriptor, |
| DeviceMemory<float>* backward_bias_data) { |
| return false; |
| } |
| |
| virtual bool DoConvolveBackwardBias( |
| Stream* stream, const BatchDescriptor& input_descriptor, |
| const DeviceMemory<double>& input_data, |
| const BatchDescriptor& bias_descriptor, |
| DeviceMemory<double>* backward_bias_data) { |
| return false; |
| } |
| |
| virtual bool DoConvolveBackwardBias( |
| Stream* stream, const BatchDescriptor& input_descriptor, |
| const DeviceMemory<Eigen::half>& input_data, |
| const BatchDescriptor& bias_descriptor, |
| DeviceMemory<Eigen::half>* backward_bias_data) { |
| return false; |
| } |
| |
| // Fully connects the "nodes" (float values) in input_data with |
| // shape input_dimensions to output_data with output_dimensions |
| // using provided weights. This is equivalent to computing a matrix |
| // product, hence the name MatMul. |
| // |
| // A BatchDescriptor has four dimensions: batch, y, x, depth. Matrix products |
| // happen in two dimensions. To get down to two dimensions, we consider the |
| // input y, x and depth dimension as one combined dimension T. For now, |
| // assume that the output height and width are 1 and let OD be the output |
| // depth. |
| // |
| // There are three device memory buffers passed in to this |
| // function. We can now view all three as matrices: |
| // |
| // input_data: A batch x T matrix |
| // weights: A T x OD matrix |
| // output_data: A batch x OD matrix |
| // |
| // This function then computes the matrix product of input_data and |
| // weights and writes the result into output_data. |
| // |
| // Here the weights buffer is in row major order, i.e. the first OD |
| // entries in weights are the first row, the second OD entries in |
| // weights are the second row and so on. |
| // |
| // The case for output width*height > 1 is more complicated. Let K = |
| // OY * OX where OY is the output height and OX is the output |
| // width. Then weights is divided into K sub-arrays W_i, for |
| // i=0,...,k-1, that each represent a T x OD matrix. This function |
| // then computes the K matrix multiplications of input_data with |
| // each W_i. This creates K matrices with dimensions batch x |
| // OD. These K matrices are concatenated horizontally to form one |
| // larger matrix with dimensions batch x (K*OD); note that this is |
| // not the same as concatenating the bytes of the matrices. The |
| // combined matrix can then be interpreted as a tensor with |
| // dimensions (batch, OY, OX, OD). If the output tensor format is |
| // not kBatchYXDepth, this function would then need to arrange for |
| // the output to be in the requested layout, if that is |
| // supported. Note that the case K=1 is equivalent to the |
| // description above. It is recommended to prefer the case K=1. |
| // |
| // Arguments (all borrowed): |
| // stream: borrowed pointer to the stream that the 'fully connect' operation |
| // should be enqueued onto. |
| // output_data: un-owned device memory region in which to place the |
| // fully connected result. |
| virtual bool DoMatMul(Stream* stream, const DeviceMemory<float>& input_data, |
| const DeviceMemory<float>& weights, |
| const dnn::BatchDescriptor& input_dimensions, |
| const dnn::BatchDescriptor& output_dimensions, |
| DeviceMemory<float>* output_data) = 0; |
| |
| // Version of DoMatMul that uses pre-quantized 8 bit weights. |
| // weight_scales specifies the scaling of each column of weights: |
| // original float weight[row * num_columns + column] = |
| // quantized_weight[row * nnum_columns + column] * weight_scales[column]. |
| virtual bool DoMatMulQuantized(Stream* stream, |
| const DeviceMemory<float>& input_data, |
| const DeviceMemory<int8>& quantized_weights, |
| const DeviceMemory<float>& weight_scales, |
| const dnn::BatchDescriptor& input_dimensions, |
| const dnn::BatchDescriptor& output_dimensions, |
| DeviceMemory<float>* output_data) = 0; |
| |
| // Version of DoMatMul that uses pre-quantized 16 bit weights. |
| // weight_scales specifies the scaling of each column of weights: |
| // original float weight[row * num_columns + column] = |
| // quantized_weight[row * nnum_columns + column] * weight_scales[column]. |
| virtual bool DoMatMulQuantized(Stream* stream, |
| const DeviceMemory<float>& input_data, |
| const DeviceMemory<int16>& quantized_weights, |
| const DeviceMemory<float>& weight_scales, |
| const dnn::BatchDescriptor& input_dimensions, |
| const dnn::BatchDescriptor& output_dimensions, |
| DeviceMemory<float>* output_data) = 0; |
| |
| // Adds biases to the feature maps in input_data producing |
| // output_data. input_data can equal output_data, but must not |
| // partially overlap it. |
| // |
| // Let K = count() * height() * width() and N = feature_map_count() |
| // on dimensions. Then input_value contains K*N values and biases |
| // contains N values. We can thus logically consider input_value to |
| // contain K vectors of N elements each. This function adds biases |
| // to each of those N vectors. |
| // |
| // TODO(broune): This works differently when width() * height() > 1 |
| // and the call to ThenBiasAdd() follows a call to ThenMatMul(). In |
| // that case there should be width() * height() * |
| // feature_map_count() biases, but this is not implemented on all |
| // StreamExecutors. |
| // |
| // Arguments (all borrowed): |
| // stream: borrowed pointer to the stream that the 'bias add' operation |
| // should be enqueued onto. |
| // input_data: un-owned device memory region containing the input. |
| // biases: un-owned device memory region containing biases to add to the |
| // input. |
| // dimensions: dimensions of input_data and output_data. |
| // output_data: un-owned device memory region in which to place the result. |
| virtual bool DoBiasAdd(Stream* stream, const DeviceMemory<float>& input_data, |
| const DeviceMemory<float>& biases, |
| const dnn::BatchDescriptor& dimensions, |
| DeviceMemory<float>* output_data) = 0; |
| |
| // Performs a forward pooling operation on input_data, writing to |
| // output_data. See PoolingDescriptor for how to configure the |
| // pooling operation. |
| // |
| // Pooling happens as a window that moves across the Y and X |
| // dimensions of input_data, where each position of the window |
| // yields one output value. E.g. for max pooling, the computed value |
| // is the maximum element in the window. The operation is applied |
| // independently to each batch and at each feature map (depth), so |
| // that the output depth and feature_map_count are the same as for |
| // the input. The output width and height can be different. |
| // |
| // See PoolingDescriptor for how to configure the pooling operation. |
| virtual bool DoPoolForward(Stream* stream, |
| const dnn::PoolingDescriptor& pooling_dimensions, |
| const dnn::BatchDescriptor& input_dimensions, |
| const DeviceMemory<float>& input_data, |
| const dnn::BatchDescriptor& output_dimensions, |
| DeviceMemory<float>* output_data, |
| ScratchAllocator* workspace_allocator) = 0; |
| |
| virtual bool DoPoolForward(Stream* stream, |
| const dnn::PoolingDescriptor& pooling_dimensions, |
| const dnn::BatchDescriptor& input_dimensions, |
| const DeviceMemory<double>& input_data, |
| const dnn::BatchDescriptor& output_dimensions, |
| DeviceMemory<double>* output_data, |
| ScratchAllocator* workspace_allocator) { |
| LOG(FATAL) << "DoPoolForward not implemented for double."; |
| return false; |
| } |
| |
| virtual bool DoPoolForward(Stream* stream, |
| const dnn::PoolingDescriptor& pooling_dimensions, |
| const dnn::BatchDescriptor& input_dimensions, |
| const DeviceMemory<Eigen::half>& input_data, |
| const dnn::BatchDescriptor& output_dimensions, |
| DeviceMemory<Eigen::half>* output_data, |
| ScratchAllocator* workspace_allocator) { |
| LOG(FATAL) << "DoPoolForward not implemented for float16."; |
| return false; |
| } |
| |
| virtual bool DoPoolForward(Stream* stream, |
| const dnn::PoolingDescriptor& pooling_dimensions, |
| const dnn::BatchDescriptor& input_dimensions, |
| const DeviceMemory<int8>& input_data, |
| const dnn::BatchDescriptor& output_dimensions, |
| DeviceMemory<int8>* output_data, |
| ScratchAllocator* workspace_allocator) { |
| LOG(FATAL) << "DoPoolForward not implemented for int8."; |
| return false; |
| } |
| |
| // Performs differentiation of the pooling operation. |
| virtual bool DoPoolBackward(Stream* stream, |
| const dnn::PoolingDescriptor& pooling_dimensions, |
| const dnn::BatchDescriptor& input_dimensions, |
| const DeviceMemory<double>& input_data, |
| const dnn::BatchDescriptor& output_dimensions, |
| const DeviceMemory<double>& output_data, |
| const DeviceMemory<double>& input_diff_data, |
| DeviceMemory<double>* output_diff_data, |
| ScratchAllocator* workspace_allocator) { |
| LOG(FATAL) << "DoPoolBackward not implemented."; |
| return false; |
| } |
| |
| virtual bool DoPoolBackward(Stream* stream, |
| const dnn::PoolingDescriptor& pooling_dimensions, |
| const dnn::BatchDescriptor& input_dimensions, |
| const DeviceMemory<float>& input_data, |
| const dnn::BatchDescriptor& output_dimensions, |
| const DeviceMemory<float>& output_data, |
| const DeviceMemory<float>& input_diff_data, |
| DeviceMemory<float>* output_diff_data, |
| ScratchAllocator* workspace_allocator) { |
| LOG(FATAL) << "DoPoolBackward not implemented."; |
| return false; |
| } |
| |
| virtual bool DoPoolBackward(Stream* stream, |
| const dnn::PoolingDescriptor& pooling_dimensions, |
| const dnn::BatchDescriptor& input_dimensions, |
| const DeviceMemory<Eigen::half>& input_data, |
| const dnn::BatchDescriptor& output_dimensions, |
| const DeviceMemory<Eigen::half>& output_data, |
| const DeviceMemory<Eigen::half>& input_diff_data, |
| DeviceMemory<Eigen::half>* output_diff_data, |
| ScratchAllocator* workspace_allocator) { |
| LOG(FATAL) << "DoPoolBackward not implemented."; |
| return false; |
| } |
| |
| // Applies local response normalization to the values from input_data and |
| // writes the result to output_data. |
| // |
| // See comments on NormalizeDescriptor for a description of local response |
| // normalization. |
| virtual bool DoNormalizeWithDimensions( |
| Stream* stream, const dnn::NormalizeDescriptor& normalize_descriptor, |
| const dnn::BatchDescriptor& dimensions, |
| const DeviceMemory<float>& input_data, DeviceMemory<float>* output_data) { |
| return false; |
| } |
| |
| // Performs backpropagation for the normalization operation |
| // |
| // Given raw data, its corresponding normalized output, and a gradient of some |
| // unspecified function with respect to the normalized variables, computes the |
| // gradient of that unspecified function with respect to the raw variables. |
| // |
| // The normalized data input array is expected to match the output that would |
| // be obtained by running the raw data input array through the DoNormalize |
| // method above. |
| // |
| // See comments on NormalizeDescriptor for a description of local response |
| // normalization. |
| virtual bool DoNormalizeBackwardWithDimensions( |
| Stream* stream, const dnn::NormalizeDescriptor& normalize_descriptor, |
| const dnn::BatchDescriptor& dimensions, |
| const DeviceMemory<float>& raw_data, |
| const DeviceMemory<float>& normalized_data, |
| const DeviceMemory<float>& normalized_variable_gradient, |
| DeviceMemory<float>* raw_variable_gradient, |
| ScratchAllocator* workspace_allocator) { |
| return false; |
| } |
| |
| // Applies an activation function (see ActivationMode) to all of the values |
| // held on the device in 'input_data', whose dimensions are described by |
| // 'dimensions'. |
| // |
| // Arguments (all borrowed): |
| // stream: borrowed pointer to the stream that the 'activate' operation |
| // should be enqueued onto. |
| // activation_mode: Type of activation to perform. |
| // input_data: un-owned device memory region which contains the |
| // activate input. |
| // output_data: un-owned device memory region in which to place the |
| // activate result. |
| virtual bool DoActivate(Stream* stream, ActivationMode activation_mode, |
| const BatchDescriptor& dimensions, |
| const DeviceMemory<float>& input_data, |
| DeviceMemory<float>* output_data, uint64 options) { |
| return false; |
| } |
| |
| // Concatenates several layers into one, by concatenating the depth of each |
| // layer at matching x and y coordinates. |
| // The inputs must all have the same width and height, the output will have |
| // the same width and height as the inputs and its depth will be the sum of |
| // the input depths. |
| // |
| // Arguments (all borrowed): |
| // stream: borrowed pointer to the stream that the 'depth concatenate' |
| // operation should be enqueued onto. |
| // input_dimensions: The dimensions of each input. |
| // input_data: un-owned device memory region which contains the |
| // input data for each input layer. |
| // output_data: un-owned device memory region in which to place the |
| // depth concatenate result. |
| virtual bool DoDepthConcatenate( |
| Stream* stream, port::ArraySlice<dnn::BatchDescriptor> input_dimensions, |
| port::ArraySlice<const DeviceMemory<float>*> input_data, |
| DeviceMemory<float>* output_data) = 0; |
| |
| // Concatenates several layers into one, by concatenating each in the |
| // x-dimension or y-dimension, based on a user-specified flag. |
| // For x-concatenation, layers are aligned at matching y and depth |
| // coordinates, and for y-concatenation, they are aligned at matching x and |
| // depth coordinates. The inputs must all have the same depth and batch size. |
| // For x-concatenation, the inputs must have the same height (y-size), and the |
| // output will have the same depth and height as the inputs and its width (x- |
| // size) will be the sum of the input widths. For y-concatenation, the inputs |
| // must have the same width, and the output will have the same depth and width |
| // as the inputs, and its height will be the sum of the input heights. |
| // |
| // Arguments: |
| // stream: borrowed pointer to the stream that the 'space concatenate' |
| // operation should be enqueued onto. |
| // input_dimensions: the dimensions of each input. |
| // input_data: un-owned device memory region which contains the input data |
| // for each input layer. |
| // output_data: un-owned device memory region in which to place the space |
| // concatenate result. |
| // concat_direction: either dnn:SpaceConcatenateMode::XDirection or |
| // dnn::SpaceConcatenateMode::YDirection. |
| virtual bool DoSpaceConcatenate( |
| Stream* stream, port::ArraySlice<dnn::BatchDescriptor> input_dimensions, |
| port::ArraySlice<const DeviceMemory<float>*> input_data, |
| DeviceMemory<float>* output_data, |
| dnn::SpaceConcatenateMode concat_direction) { |
| return false; |
| } |
| |
| // Change the layout of the data by shrinking one dimension (or set of |
| // dimensions) and growing another dimension (or set of dimensions), while |
| // keeping the total number of data elements constant, and maintaining the |
| // current data ordering. |
| // |
| // Currently, the only supported operation is depth into space by a power of |
| // 2. E.g. (y, x, z) -> (y*2, x*2, z/4) |
| // |
| // Note that Reshape may not be a no-op, depending on the platform and which |
| // dimensions are being changed. |
| // |
| // Example: forgetting about batch for the moment, let's take a tensor that's |
| // 2x1x8 (y by x by z) and reshape to a tensor that's 4x2x2. The memory layout |
| // is row-major order: y,x,z. I.e. z changes the fastest, then x, then y. The |
| // elements of the tensor range from 0 to 15. The x,y,z indices are below each |
| // element. |
| // |
| // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |
| // y0 y0 y0 y0 y0 y0 y0 y0 y1 y1 y1 y1 y1 y1 y1 y1 |
| // x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 |
| // z0 z1 z2 z3 z4 z5 z6 z7 z0 z1 z2 z3 z4 z5 z6 z7 |
| // |
| // reshape to 4x2x2 |
| // |
| // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |
| // y0 y0 y0 y0 y1 y1 y1 y1 y2 y2 y2 y2 y3 y3 y3 y3 |
| // x0 x0 x1 x1 x0 x0 x1 x1 x0 x0 x1 x1 x0 x0 x1 x1 |
| // z0 z1 z0 z1 z0 z1 z0 z1 z0 z1 z0 z1 z0 z1 z0 z1 |
| virtual bool DoReshape(Stream* stream, |
| const dnn::BatchDescriptor& input_dimensions, |
| const DeviceMemory<float>& input_data, |
| const dnn::BatchDescriptor& output_dimensions, |
| DeviceMemory<float>* output_data) { |
| return false; |
| } |
| |
| // Depth to space takes an X by Y image with depth D*M^2 and changes it to an |
| // MX x MY image with depth D. Each input location (x,y) with depth D*M^2 in |
| // the input image is changed to an MxM contiguous area in the output image, |
| // with the values being laid out in the raster order by DepthToSpaceLayout, |
| // and will have a new depth of D. |
| // |
| // Example. |
| // M=2, Din =8, Xin=2, Yin=2. Xout=4, Yout=4, Dout=2 |
| // DepthHeightWidth layout |
| // Values within a 'cell' are at different depths and same x & y. |
| // Input: |
| // abcdefgh ijklmnop |
| // qrstuvwx yz012345 |
| // Output: |
| // ae bf im jn |
| // cg dh ko lp |
| // qu rv y2 z3 |
| // sw tx 04 15 |
| // |
| // sqrt_depth_reduction: 'M' in the comment above |
| virtual bool DoDepthToSpace(Stream* stream, |
| const dnn::BatchDescriptor& input_dimensions, |
| const DeviceMemory<float>& input_data, |
| const DepthToSpaceLayout& depth_to_space_layout, |
| const int& sqrt_depth_reduction, |
| DeviceMemory<float>* output_data) { |
| return false; |
| } |
| |
| // Space to depth is the inverse of depth to space. Space to depth takes each |
| // non-overlapping M by M patch (in the X and Y dimensions) with depth D of |
| // the input, and transforms it to a 1 by 1 patch with depth D*M^2. If the |
| // input has size (MX, MY, D), the output has size (X, Y, D*M^2). The number |
| // of data elements is not changed. |
| // |
| // Example. |
| // M=2, Din =2, Xin=4, Yin=4, Dout=8 |
| // DepthHeightWidth layout |
| // Values within a 'cell' are at different depths and same x & y. |
| // Input: |
| // ae bf im jn |
| // cg dh ko lp |
| // qu rv y2 z3 |
| // sw tx 04 15 |
| // Output: |
| // abcdefgh ijklmnop |
| // qrstuvwx yz012345 |
| // |
| // sqrt_depth_increase: 'M' in the comment above |
| virtual bool DoSpaceToDepth(Stream* stream, |
| const dnn::BatchDescriptor& input_dimensions, |
| const DeviceMemory<float>& input_data, |
| const DepthToSpaceLayout& space_to_depth_layout, |
| const int& sqrt_depth_increase, |
| DeviceMemory<float>* output_data) { |
| return false; |
| } |
| |
| // Computes the specified operation (e.g. addition or multiplication) |
| // between corresponding elements in the inputs and stores the result in the |
| // output element. |
| // The inputs and output must all have the same dimensions, but may have |
| // different quantization parameters (min_value and max_value). |
| // |
| // Arguments (all borrowed): |
| // stream: borrowed pointer to the stream that the 'elementwise operation' |
| // should be enqueued onto. |
| // operation: The operation to perform. |
| // input_dimensions: The dimensions of each input. |
| // input_data: un-owned device memory region which contains the |
| // input data for each input layer. |
| // output_dimensions: The dimensions of the output. |
| // output_data: un-owned device memory region in which to place the |
| // operation result. |
| virtual bool DoElementwiseOperate( |
| Stream* stream, ElementwiseOperation operation, |
| port::ArraySlice<dnn::BatchDescriptor> input_dimensions, |
| port::ArraySlice<const DeviceMemory<float>*> input_data, |
| const dnn::BatchDescriptor& output_dimensions, |
| DeviceMemory<float>* output_data) = 0; |
| |
| // Computes the specified operation (e.g. addition or multiplication) |
| // between corresponding elements in the inputs and stores the result in the |
| // output element. Each input is multiplied by a scalar constant and the |
| // result is divided by a scalar constant. |
| // e.g. To perform Z = 0.9*X + 1.1*Y, set the input multiplicands to 9 and 11 |
| // and the output divisor to 10. |
| // The inputs and output must all have the same dimensions, but may have |
| // different quantization parameters (min_value and max_value). |
| // |
| // Arguments (all borrowed): |
| // stream: borrowed pointer to the stream that the 'elementwise operation' |
| // should be enqueued onto. |
| // operation: The operation to perform. |
| // input_multiplicands: Amount to scale each input. |
| // output_divisor: Amount to divide the output. |
| // input_dimensions: The dimensions of each input. |
| // input_data: un-owned device memory region which contains the |
| // input data for each input layer. |
| // output_dimensions: The dimensions of the output. |
| // output_data: un-owned device memory region in which to place the |
| // operation result. |
| virtual bool DoElementwiseOperateScaledQuantized( |
| Stream* stream, ElementwiseOperation operation, |
| port::ArraySlice<int> input_multiplicands, int output_divisor, |
| port::ArraySlice<dnn::BatchDescriptor> input_dimensions, |
| port::ArraySlice<const DeviceMemory<float>*> input_data, |
| const dnn::BatchDescriptor& output_dimensions, |
| DeviceMemory<float>* output_data) { |
| return false; |
| } |
| |
| // Pads the input with zeros in the X and Y dimensions. The feature_map |
| // dimension is unchanged. |
| // |
| // Arguments (all borrowed): |
| // stream: borrowed pointer to the stream that the 'elementwise operation' |
| // should be enqueued onto. |
| // dimensions: The dimensions of the input. |
| // input_data: un-owned device memory region which contains the |
| // input data for the input layer. |
| // left_pad: Amount to pad the input on the left. |
| // right_pad: Amount to pad the input on the right. |
| // top_pad: Amount to pad the input at the top (low Y). |
| // bottom_pad: Amount to pad the input at the bottom (high Y). |
| // output_data: un-owned device memory region in which to place the |
| // padded result. |
| virtual bool DoXYPad(Stream* stream, const dnn::BatchDescriptor &dimensions, |
| const DeviceMemory<float> &input_data, |
| int64 left_pad, int64 right_pad, int64 top_pad, |
| int64 bottom_pad, DeviceMemory<float> *output_data) = 0; |
| |
| // Extracts a slice of the input in the X and Y dimensions. The feature_map |
| // dimension is unchanged. |
| // |
| // Arguments (all borrowed): |
| // stream: borrowed pointer to the stream that the 'elementwise operation' |
| // should be enqueued onto. |
| // dimensions: The dimensions of the input. |
| // input_data: un-owned device memory region which contains the |
| // input data for the input layer. |
| // left_trim: Amount to cut off the input on the left. |
| // right_trim: Amount to cut off the input on the right. |
| // top_trim: Amount to cut off the input at the top (low y). |
| // bottom_trim: Amount to cut off the input at the bottom (high Y). |
| // output_data: un-owned device memory region in which to place the |
| // padded result. |
| virtual bool DoXYSlice(Stream* stream, const dnn::BatchDescriptor &dimensions, |
| const DeviceMemory<float> &input_data, |
| int64 left_trim, int64 right_trim, int64 top_trim, |
| int64 bottom_trim, DeviceMemory<float> *output_data) = 0; |
| |
| // Grows the input tensor by replicating the X and Y dimensions. The batch and |
| // depth/feature_map dimensions are unchanged. Currently, the input tensor is |
| // limited to X=1 and Y=1. |
| // |
| // For example, the input has dimensions x=2, y=3, and replicate_x=3, |
| // replicate_y=2. The diagonal elements of the output would be: [x0y0, x1y1, |
| // x0y2, x1y0, x0y1, x1y2]. |
| // Here is the example as a picture. input: |
| // AB |
| // CD |
| // EF |
| // broadcast result: |
| // ABABAB |
| // CDCDCD |
| // EFEFEF |
| // ABABAB |
| // CDCDCD |
| // EFEFEF |
| // |
| // Arguments (all borrowed): |
| // stream: borrowed pointer to the stream that the 'elementwise operation' |
| // should be enqueued onto. |
| // dimensions: The dimensions of the input. |
| // input_data: un-owned device memory region which contains the |
| // input data for the input layer. |
| // replicate_x: Amount to replicate the input's X dimension. |
| // replicate_y: Amount to replicate the input's Y dimension. |
| // output_data: un-owned device memory region in which to place the |
| // padded result. |
| virtual bool DoXYBroadcast(Stream* stream, |
| const dnn::BatchDescriptor& dimensions, |
| const DeviceMemory<float>& input_data, |
| int64 replicate_x, int64 replicate_y, |
| DeviceMemory<float>* output_data) { |
| return false; |
| } |
| |
| // Enqueues an asynchronous memcpy of the *quantized* output of a layer (that |
| // is, bytes instead of scaled floats) into 'host_dst' if they are available |
| // for the underlying DNN implementation. If this quantized output is not |
| // available, false is returned, which will place 'stream' into an error |
| // state. |
| // |
| // Arguments (all borrowed): |
| // stream: borrowed pointer to the stream that the 'quantized memcpy' |
| // operation should be enqueued onto. |
| // gpu_unquantized_src: the device memory that contains the unquantized data |
| // -- this data should also have a corresponding quantized representation |
| // on the device for this operation to succeed. |
| // mode: Type of quantization of the data to write into host_dst. |
| // host_dst: un-owned host memory region that is mutated in place, |
| // it is clobbered by the values in 'gpu_unquantized_src' when the enqueued |
| // (asynchronous) memcpy operation is performed. |
| // size: size in bytes of the host_dst host memory region. |
| virtual bool DoMemcpyD2HQuantized( |
| Stream* stream, const DeviceMemory<float>& gpu_unquantized_src, |
| QuantizedActivationMode mode, void* host_dst, int64 size) = 0; |
| |
| // Enqueues an asynchronous memcpy of 'host_dst' into the *quantized* input |
| // of a layer (that is, bytes instead of scaled floats) if they are supported |
| // by the underlying DNN implementation. If this quantized input is not |
| // supported, false is returned, which will place 'stream' into an error |
| // state. |
| // |
| // Arguments (all borrowed): |
| // stream: borrowed pointer to the stream that the 'quantized memcpy' |
| // operation should be enqueued onto. |
| // host_src: un-owned host memory region that contains the quantized data. |
| // size: size in bytes of the host_src host memory region. |
| // mode: Type of quantization of the data to read from host_src. |
| // gpu_unquantized_dst: the device memory that is clobbered by the values in |
| // 'host_src' when the enqueued (asynchronous) memcpy operation is |
| // performed. -- this data should also have a corresponding quantized |
| // representation on the device for this operation to |
| // succeed. |
| virtual bool DoMemcpyH2DQuantized( |
| Stream* stream, const void* host_src, int64 size, |
| QuantizedActivationMode mode, |
| DeviceMemory<float>* gpu_unquantized_dst) = 0; |
| |
| // Create an RNN descriptor based on model shapes and configurations. |
| // The caller retains the ownership of the descriptor. |
| // |
| // Arguments: |
| // num_layers: the number of layers for a RNN model. |
| // hidden_size: the size of the hidden state. |
| // input_size: the size of the input state. |
| // cell_size: the size of the cell state |
| // input_mode: an enum to specify whether a linear transformation is added |
| // after the input state. If input_size is different from hidden_size, this |
| // is required. |
| // direction_mode: an enum to specify whether this model is unidirectional or |
| // bidirectional. |
| // rnn_mode: an enum to specify the type of model to build. |
| // data_type: an enum to specify the data types used in this model. |
| // dropout: the dropout threshold between layers. When it is 0., no dropout |
| // is added. |
| // seed: a seed for initializing the dropout layers. |
| // state_allocator: an memory allocator that will be used to store the state |
| // for dropout layer. The user has to maintain the memory until the model |
| // is no longer in use. |
| // use_padded_io: a bool to specify whether the input is using padded IO. |
| virtual port::StatusOr<std::unique_ptr<dnn::RnnDescriptor>> |
| createRnnDescriptor(int num_layers, int hidden_size, int input_size, |
| int cell_size, int batch_size, |
| dnn::RnnInputMode input_mode, |
| dnn::RnnDirectionMode direction_mode, |
| dnn::RnnMode rnn_mode, dnn::DataType data_type, |
| const dnn::AlgorithmConfig& algorithm_config, |
| float dropout, uint64 seed, |
| ScratchAllocator* state_allocator, bool use_padded_io) { |
| return port::Status(port::error::UNIMPLEMENTED, |
| "createRnnDescriptor is unimplemented"); |
| } |
| |
| // Create a RNN sequence descriptor that specifies either the input or output |
| // sequence. The caller retains the ownership of the returned descriptor. |
| // |
| // Arguments: |
| // max_seq_length: the max length of the sequences. |
| // batch_size: the size of a minibatch. |
| // data_size: the size of the state. |
| // seq_lengths: the lengths of sequences in a batch. |
| // data_type: an enum to specify the type for the underlying data. |
| virtual port::StatusOr<std::unique_ptr<dnn::RnnSequenceTensorDescriptor>> |
| createRnnSequenceTensorDescriptor(int max_seq_length, int batch_size, |
| int data_size, dnn::DataType data_type) { |
| return port::Status(port::error::UNIMPLEMENTED, |
| "createRnnSequenceTensorDescriptor is unimplemented"); |
| } |
| |
| virtual port::StatusOr<std::unique_ptr<dnn::RnnSequenceTensorDescriptor>> |
| createRnnSequenceTensorDescriptor(int max_seq_length, int batch_size, |
| int data_size, |
| const absl::Span<const int>& seq_lengths, |
| bool time_major, dnn::DataType data_type) { |
| return port::Status(port::error::UNIMPLEMENTED, |
| "createRnnSequenceTensorDescriptor is unimplemented"); |
| } |
| |
| // Create an RNN state descriptor that specifies the input or hidden state. |
| // The caller retains the ownership of the returned descriptor. |
| virtual port::StatusOr<std::unique_ptr<dnn::RnnStateTensorDescriptor>> |
| createRnnStateTensorDescriptor(int num_layer, int batch_size, int data_size, |
| dnn::DataType data_type) { |
| return port::Status(port::error::UNIMPLEMENTED, |
| "createRnnStateTensorDescriptor is unimplemented"); |
| } |
| |
| // Enqueue a forward operation of the RNN model onto the stream. |
| // |
| // Arguments: |
| // stream: pointer to the stream where this operation should be enqueued to. |
| // rnn_desc: a RNN descriptor created by createRnnDescriptor. |
| // input_desc: descriptor for the input sequence. |
| // input_data: the device memory region that contains the input data. |
| // input_h_desc: descriptor for the input "h" state. |
| // input_h_data: the device memory region that contains the input "h" data. |
| // input_c_desc: descriptor for the input "c" state. |
| // input_c_data: the device memory region that contains the input "c" data. |
| // This must be specified for LSTM models. |
| // params: the device memory region that contains the parameters used in this |
| // model. |
| // output_desc: descriptor for the output sequence. |
| // output_data: the memory region that stores the output sequence data. |
| // output_h_desc: descriptor for the output "h" state. |
| // output_h_data: the memory region that stores the output "h" data. |
| // output_c_desc: descriptor for the output "c" state. |
| // output_c_data: the memory region that stores the output "c" data. This |
| // must be specified for LSTM models. |
| // is_training: whether this is used in training or inference. That decides |
| // whether respace_space data need to be produced. |
| // reserve_space_allocator: if "is_training" is true, an memory allocator |
| // to create memory that holds the produced reserve_space. The caller is |
| // retains the data and feed it to the backward pass. |
| // workspace_allocator: an allocator to create temporary workspace used in |
| // this kernel. The caller is responsible for retaining the memory long |
| // enough for the lifespan of this operation, and recycles afterwards. |
| virtual bool DoRnnForward(Stream* stream, const dnn::RnnDescriptor& rnn_desc, |
| const dnn::RnnSequenceTensorDescriptor& input_desc, |
| const DeviceMemory<Eigen::half>& input_data, |
| const dnn::RnnStateTensorDescriptor& input_h_desc, |
| const DeviceMemory<Eigen::half>& input_h_data, |
| const dnn::RnnStateTensorDescriptor& input_c_desc, |
| const DeviceMemory<Eigen::half>& input_c_data, |
| const DeviceMemory<Eigen::half>& params, |
| const dnn::RnnSequenceTensorDescriptor& output_desc, |
| DeviceMemory<Eigen::half>* output_data, |
| const dnn::RnnStateTensorDescriptor& output_h_desc, |
| DeviceMemory<Eigen::half>* output_h_data, |
| const dnn::RnnStateTensorDescriptor& output_c_desc, |
| DeviceMemory<Eigen::half>* output_c_data, |
| bool is_training, |
| ScratchAllocator* reserve_space_allocator, |
| ScratchAllocator* workspace_allocator, |
| dnn::ProfileResult* output_profile_result) { |
| return false; |
| } |
| |
| virtual bool DoRnnForward(Stream* stream, const dnn::RnnDescriptor& rnn_desc, |
| const dnn::RnnSequenceTensorDescriptor& input_desc, |
| const DeviceMemory<float>& input_data, |
| const dnn::RnnStateTensorDescriptor& input_h_desc, |
| const DeviceMemory<float>& input_h_data, |
| const dnn::RnnStateTensorDescriptor& input_c_desc, |
| const DeviceMemory<float>& input_c_data, |
| const DeviceMemory<float>& params, |
| const dnn::RnnSequenceTensorDescriptor& output_desc, |
| DeviceMemory<float>* output_data, |
| const dnn::RnnStateTensorDescriptor& output_h_desc, |
| DeviceMemory<float>* output_h_data, |
| const dnn::RnnStateTensorDescriptor& output_c_desc, |
| DeviceMemory<float>* output_c_data, |
| bool is_training, |
| ScratchAllocator* reserve_space_allocator, |
| ScratchAllocator* workspace_allocator, |
| dnn::ProfileResult* output_profile_result) { |
| return false; |
| } |
| |
| virtual bool DoRnnForward(Stream* stream, const dnn::RnnDescriptor& rnn_desc, |
| const dnn::RnnSequenceTensorDescriptor& input_desc, |
| const DeviceMemory<double>& input_data, |
| const dnn::RnnStateTensorDescriptor& input_h_desc, |
| const DeviceMemory<double>& input_h_data, |
| const dnn::RnnStateTensorDescriptor& input_c_desc, |
| const DeviceMemory<double>& input_c_data, |
| const DeviceMemory<double>& params, |
| const dnn::RnnSequenceTensorDescriptor& output_desc, |
| DeviceMemory<double>* output_data, |
| const dnn::RnnStateTensorDescriptor& output_h_desc, |
| DeviceMemory<double>* output_h_data, |
| const dnn::RnnStateTensorDescriptor& output_c_desc, |
| DeviceMemory<double>* output_c_data, |
| bool is_training, |
| ScratchAllocator* reserve_space_allocator, |
| ScratchAllocator* workspace_allocator, |
| dnn::ProfileResult* output_profile_result) { |
| return false; |
| } |
| // Enqueue a backward operation of the RNN model onto the stream. |
| // |
| // Arguments: |
| // stream: pointer to the stream where this operation should be enqueued to. |
| // rnn_desc: a RNN descriptor created by createRnnDescriptor. |
| // input_desc: descriptor for the input sequence. |
| // input_data: the device memory region that contains the input data. |
| // input_h_desc: descriptor for the input "h" state. |
| // input_h_data: the device memory region that contains the input "h" data. |
| // input_c_desc: descriptor for the input "c" state. |
| // input_c_data: the device memory region that contains the input "c" data. |
| // This must be specified for LSTM models. |
| // params: the device memory region that contains the parameters used in this |
| // model. |
| // output_desc: descriptor for the output sequence. |
| // output_data: the memory region that stores the output sequence data. |
| // output_h_desc: descriptor for the output "h" state. |
| // output_h_data: the memory region that stores the output "h" data. |
| // output_c_desc: descriptor for the output "c" state. |
| // output_c_data: the memory region that stores the output "c" data. This |
| // must be specified for LSTM models. |
| // output_backprop_data: the device memory region that contains the backprop |
| // to the output sequence. |
| // output_h_backprop_data: the device memory region that contains the |
| // backprop to the output "h" state. |
| // output_c_backprop_data: the device memory region that contains the |
| // backprop to the output "c" state. |
| // input_backprop_data: the device memory region that stores the backprop |
| // to the input sequence. |
| // input_h_backprop_data: the device memory region that stores the backprop |
| // to the input "h" state. |
| // input_c_backprop_data: the device memory region that stores the backprop |
| // to the input "c" state. |
| // params_backprop_data: the device memory region that stores the backprop |
| // to the parameters. |
| // reserve_space_data: the reserve_space data that is produced by the forward |
| // operation. This memory region could be modified by this operation. |
| // workspace_allocator: a memory allocator that creates the temporary |
| // workspace memory used by this operation. The caller is responsible for |
| // keeping the memory alive long enough for this operation, and recylces |
| // afterwards. |
| virtual bool DoRnnBackward( |
| Stream* stream, const dnn::RnnDescriptor& rnn_desc, |
| const dnn::RnnSequenceTensorDescriptor& input_desc, |
| const DeviceMemory<Eigen::half>& input_data, |
| const dnn::RnnStateTensorDescriptor& input_h_desc, |
| const DeviceMemory<Eigen::half>& input_h_data, |
| const dnn::RnnStateTensorDescriptor& input_c_desc, |
| const DeviceMemory<Eigen::half>& input_c_data, |
| const DeviceMemory<Eigen::half>& params, |
| const dnn::RnnSequenceTensorDescriptor& output_desc, |
| const DeviceMemory<Eigen::half>& output_data, |
| const dnn::RnnStateTensorDescriptor& output_h_desc, |
| const DeviceMemory<Eigen::half>& output_h_data, |
| const dnn::RnnStateTensorDescriptor& output_c_desc, |
| const DeviceMemory<Eigen::half>& output_c_data, |
| const DeviceMemory<Eigen::half>& output_backprop_data, |
| const DeviceMemory<Eigen::half>& output_h_backprop_data, |
| const DeviceMemory<Eigen::half>& output_c_backprop_data, |
| DeviceMemory<Eigen::half>* input_backprop_data, |
| DeviceMemory<Eigen::half>* input_h_backprop_data, |
| DeviceMemory<Eigen::half>* input_c_backprop_data, |
| DeviceMemory<Eigen::half>* params_backprop_data, |
| DeviceMemory<uint8>* reserve_space_data, |
| ScratchAllocator* workspace_allocator, |
| dnn::ProfileResult* output_profile_result) { |
| return false; |
| } |
| |
| virtual bool DoRnnBackward( |
| Stream* stream, const dnn::RnnDescriptor& rnn_desc, |
| const dnn::RnnSequenceTensorDescriptor& input_desc, |
| const DeviceMemory<float>& input_data, |
| const dnn::RnnStateTensorDescriptor& input_h_desc, |
| const DeviceMemory<float>& input_h_data, |
| const dnn::RnnStateTensorDescriptor& input_c_desc, |
| const DeviceMemory<float>& input_c_data, |
| const DeviceMemory<float>& params, |
| const dnn::RnnSequenceTensorDescriptor& output_desc, |
| const DeviceMemory<float>& output_data, |
| const dnn::RnnStateTensorDescriptor& output_h_desc, |
| const DeviceMemory<float>& output_h_data, |
| const dnn::RnnStateTensorDescriptor& output_c_desc, |
| const DeviceMemory<float>& output_c_data, |
| const DeviceMemory<float>& output_backprop_data, |
| const DeviceMemory<float>& output_h_backprop_data, |
| const DeviceMemory<float>& output_c_backprop_data, |
| DeviceMemory<float>* input_backprop_data, |
| DeviceMemory<float>* input_h_backprop_data, |
| DeviceMemory<float>* input_c_backprop_data, |
| DeviceMemory<float>* params_backprop_data, |
| DeviceMemory<uint8>* reserve_space_data, |
| ScratchAllocator* workspace_allocator, |
| dnn::ProfileResult* output_profile_result) { |
| return false; |
| } |
| |
| virtual bool DoRnnBackward( |
| Stream* stream, const dnn::RnnDescriptor& rnn_desc, |
| const dnn::RnnSequenceTensorDescriptor& input_desc, |
| const DeviceMemory<double>& input_data, |
| const dnn::RnnStateTensorDescriptor& input_h_desc, |
| const DeviceMemory<double>& input_h_data, |
| const dnn::RnnStateTensorDescriptor& input_c_desc, |
| const DeviceMemory<double>& input_c_data, |
| const DeviceMemory<double>& params, |
| const dnn::RnnSequenceTensorDescriptor& output_desc, |
| const DeviceMemory<double>& output_data, |
| const dnn::RnnStateTensorDescriptor& output_h_desc, |
| const DeviceMemory<double>& output_h_data, |
| const dnn::RnnStateTensorDescriptor& output_c_desc, |
| const DeviceMemory<double>& output_c_data, |
| const DeviceMemory<double>& output_backprop_data, |
| const DeviceMemory<double>& output_h_backprop_data, |
| const DeviceMemory<double>& output_c_backprop_data, |
| DeviceMemory<double>* input_backprop_data, |
| DeviceMemory<double>* input_h_backprop_data, |
| DeviceMemory<double>* input_c_backprop_data, |
| DeviceMemory<double>* params_backprop_data, |
| DeviceMemory<uint8>* reserve_space_data, |
| ScratchAllocator* workspace_allocator, |
| dnn::ProfileResult* output_profile_result) { |
| return false; |
| } |
| |
| template <typename ElementType> |
| port::Status PrepareForCtcLoss(Stream* stream, |
| const RnnStateTensorDescriptor& probs_desc, |
| DeviceMemory<ElementType> probs_data, |
| const RnnStateTensorDescriptor& grads_desc, |
| absl::Span<const int> labels_data, |
| absl::Span<const int> labels_lengths_data, |
| absl::Span<const int> input_lengths_data, |
| ScratchAllocator* workspace_allocator, |
| DeviceMemory<uint8>* scratch_memory) { |
| return DoPrepareForCtcLoss(stream, ToDataType<ElementType>::value, |
| probs_desc, grads_desc, labels_data, |
| labels_lengths_data, input_lengths_data, |
| workspace_allocator, scratch_memory); |
| } |
| |
| // Enqueue a CTC Loss operation onto the stream. |
| // |
| // Arguments: |
| // stream: pointer to the stream where this operation should be enqueued to. |
| // element_type: date type of the input tensors |
| // probs_desc: specifies the shape and the data layout of the input tensor. |
| // probs_data: the device memory region that contains the input tensor. |
| // labels_data: the device memory region that contains the labels_value |
| // tensor. |
| // labels_lengths_data: the device memory region that contains the |
| // labels_lengths tensor |
| // input_lengths_data: the device memory region that contains the seq_lengths |
| // tensor |
| // costs_data: the device memory region that contains the costs tensor. |
| // grads_desc: specifies the shape and the data layout of the grads tensor. |
| // grads_data: the device memory region that contains the grads tensor. |
| // ctc_loss_desc: a CTCLoss descriptor. |
| // workspace_allocator: a memory allocator that creates the temporary |
| // workspace memory used by this operation. The caller is responsible for |
| // keeping the memory alive long enough for this operation, and recylces |
| // afterwards. |
| virtual port::Status DoCtcLoss(Stream* stream, dnn::DataType element_type, |
| const RnnStateTensorDescriptor& probs_desc, |
| const DeviceMemoryBase probs_data, |
| absl::Span<const int> labels_data, |
| absl::Span<const int> labels_lengths_data, |
| absl::Span<const int> input_lengths_data, |
| DeviceMemoryBase costs_data, |
| const RnnStateTensorDescriptor& grads_desc, |
| DeviceMemoryBase grads_data, |
| DeviceMemory<uint8> scratch_memory); |
| |
| template <typename ElementType> |
| bool DoCtcLoss(Stream* stream, |
| const dnn::RnnStateTensorDescriptor& probs_desc, |
| const DeviceMemory<ElementType>& probs_data, |
| absl::Span<const int> labels_data, |
| absl::Span<const int> labels_lengths_data, |
| absl::Span<const int> input_lengths_data, |
| DeviceMemory<ElementType>* costs_data, |
| const dnn::RnnStateTensorDescriptor& grads_desc, |
| DeviceMemory<ElementType>* grads_data, |
| DeviceMemory<uint8>* scratch_memory) { |
| return IsStatusOk( |
| DoCtcLoss(stream, ToDataType<ElementType>::value, probs_desc, |
| probs_data, labels_data, labels_lengths_data, |
| input_lengths_data, *costs_data, grads_desc, *grads_data, |
| *scratch_memory), |
| false); |
| } |
| |
| // Transforms a tensor into another tensor with a different layout and/or data |
| // type. |
| // |
| // Arguments: |
| // stream: pointer to the stream where this operation should be enqueued to. |
| // input_desc: specifies the shape and the data layout of the input tensor. |
| // input_type: the data type of the input tensor. |
| // input_data: the device memory region that contains the input tensor. |
| // output_desc: specifies the shape and the data layout of the output tensor. |
| // output_type: the data type of the output tensor. |
| // scale: an element-wise scaling factor to apply. |
| // output_data: the device memory region that contains the output tensor. |
| virtual bool DoTransformTensor(Stream* stream, |
| const dnn::BatchDescriptor& input_desc, |
| dnn::DataType input_type, |
| const DeviceMemoryBase& input_data, |
| const dnn::BatchDescriptor& output_desc, |
| dnn::DataType output_type, float scale, |
| DeviceMemoryBase* output_data) { |
| return false; |
| } |
| |
| // Enqueues a fused convolution+bias+activation operation onto the stream. |
| // |
| // Arguments (all borrowed): |
| // |
| // stream: borrowed pointer to the stream that the 'fusion' operation should |
| // be enqueued onto. |
| // |
| // conv_input_descriptor: dimensions of the convolution input layer. |
| // conv_input_data: device memory which contains the convolution input. |
| // |
| // filter_descriptor: dimensions of the convolution filter. |
| // filter_data: device memory which contains the convolution filter weights. |
| // |
| // convolution_descriptor: stride of the convolution filter. |
| // |
| // bias_descriptor: dimensions of the bias layer |
| // biases: device memory region containing biases to add to the convolution |
| // output |
| // |
| // activation_mode: Type of activation to perform. |
| // |
| // output_descriptor: dimensions of the output layer. |
| // output_data: device memory region in which to place the fusion result. |
| // |
| // output_profile_result: the output profile result for this call. |
| // The profiling is only enabled when this is not nullptr. |
| // |
| virtual bool DoFusedConvolutionBiasActivation( |
| Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor, |
| const DeviceMemory<float>& conv_input_data, |
| const dnn::FilterDescriptor& filter_descriptor, |
| const DeviceMemory<float>& filter_data, |
| const dnn::ConvolutionDescriptor& convolution_descriptor, |
| const dnn::BatchDescriptor& bias_descriptor, |
| const DeviceMemory<float>& bias_data, dnn::ActivationMode activation_mode, |
| const dnn::BatchDescriptor& output_descriptor, |
| DeviceMemory<float>* output_data, |
| dnn::ProfileResult* output_profile_result) { |
| return false; |
| } |
| |
| // Enqueues a fused batchnorm+activation (inference) operation onto the |
| // stream. |
| // |
| // Arguments (all borrowed): |
| // |
| // stream: borrowed pointer to the stream that the 'fusion' operation should |
| // be enqueued onto. |
| // |
| // x_descriptor: dimensions of the batchnorm input layer. |
| // x_data: device memory which contains the batchnorm input. |
| // |
| // scale_offset_mean_variance_descriptor: |
| // dimensions of the scale/offset/mean/variance tensor. |
| // scale_data: device memory which contains the scale input. |
| // offset_data: device memory which contains the offset input. |
| // mean_data: device memory which contains the mean input. |
| // variance_data: device memory which contains the variance input. |
| // epsilon : the epsilon value to use in batchnorm calculation |
| // |
| // activation_mode: Type of activation to perform. |
| // |
| // y_data: device memory region in which to place the fusion result. |
| // |
| // output_profile_result: the output profile result for this call. |
| // The profiling is only enabled when this is not nullptr. |
| // |
| virtual bool DoFusedBatchNormActivationInference( |
| Stream* stream, const dnn::BatchDescriptor& x_descriptor, |
| const DeviceMemory<float>& x_data, |
| const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor, |
| const DeviceMemory<float>& scale_data, |
| const DeviceMemory<float>& offset_data, |
| const DeviceMemory<float>& mean_data, |
| const DeviceMemory<float>& variance_data, double epsilon, |
| dnn::ActivationMode activation_mode, DeviceMemory<float>* y_data, |
| dnn::ProfileResult* output_profile_result) { |
| return false; |
| } |
| |
| virtual bool DoFusedBatchNormActivationInference( |
| Stream* stream, const dnn::BatchDescriptor& x_descriptor, |
| const DeviceMemory<Eigen::half>& x_data, |
| const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor, |
| const DeviceMemory<float>& scale_data, |
| const DeviceMemory<float>& offset_data, |
| const DeviceMemory<float>& mean_data, |
| const DeviceMemory<float>& variance_data, double epsilon, |
| dnn::ActivationMode activation_mode, DeviceMemory<Eigen::half>* y_data, |
| dnn::ProfileResult* output_profile_result) { |
| return false; |
| } |
| |
| // Enqueues a fused batchnorm+activation (training-fwd) operation onto the |
| // stream. |
| // |
| // Arguments (all borrowed): |
| // |
| // stream: borrowed pointer to the stream that the 'fusion' operation should |
| // be enqueued onto. |
| // |
| // x_descriptor: dimensions of the batchnorm input layer. |
| // x_data: device memory which contains the batchnorm input. |
| // |
| // scale_offset_mean_variance_descriptor: |
| // dimensions of the scale/offset/mean/variance tensor. |
| // scale_data: device memory which contains the scale input. |
| // offset_data: device memory which contains the offset input. |
| // epsilon : the epsilon value to use in batchnorm calculation |
| // |
| // activation_mode: Type of activation to perform. |
| // |
| // y_data: device memory region in which to place the fusion result. |
| // batch_mean_data: device memory in which to place the batch mean output. |
| // batch_var_data: device memory in which to place the batch variance output. |
| // saved_mean_data: device memory in which to save the mean for bwd pass. |
| // saved_var_data: device memory in which to save the variance for bwd pass. |
| // |
| // output_profile_result: the output profile result for this call. |
| // The profiling is only enabled when this is not nullptr. |
| // |
| virtual bool DoFusedBatchNormActivationForward( |
| Stream* stream, const dnn::BatchDescriptor& x_descriptor, |
| const DeviceMemory<float>& x_data, |
| const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor, |
| const DeviceMemory<float>& scale_data, |
| const DeviceMemory<float>& offset_data, double epsilon, |
| dnn::ActivationMode activation_mode, DeviceMemory<float>* y_data, |
| DeviceMemory<float>* batch_mean_data, DeviceMemory<float>* batch_var_data, |
| DeviceMemory<float>* saved_mean_data, DeviceMemory<float>* saved_var_data, |
| dnn::ProfileResult* output_profile_result) { |
| return false; |
| } |
| |
| virtual bool DoFusedBatchNormActivationForward( |
| Stream* stream, const dnn::BatchDescriptor& x_descriptor, |
| const DeviceMemory<Eigen::half>& x_data, |
| const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor, |
| const DeviceMemory<float>& scale_data, |
| const DeviceMemory<float>& offset_data, double epsilon, |
| dnn::ActivationMode activation_mode, DeviceMemory<Eigen::half>* y_data, |
| DeviceMemory<float>* batch_mean_data, DeviceMemory<float>* batch_var_data, |
| DeviceMemory<float>* saved_mean_data, DeviceMemory<float>* saved_var_data, |
| dnn::ProfileResult* output_profile_result) { |
| return false; |
| } |
| |
| // Enqueues a fused batchnorm+activation (training-bwd) operation onto the |
| // stream. |
| // |
| // Arguments (all borrowed): |
| // |
| // stream: borrowed pointer to the stream that the 'fusion' operation should |
| // be enqueued onto. |
| // |
| // y_act_backprop_descriptor: dimensions of the backprop input from the |
| // previous layer. y_act_backprop_data: device memory which contains the |
| // backprop input. |
| // |
| // y_act_data: device memory which contains the actv-fwd output data. |
| // |
| // activation_mode: actv-fwd type. |
| // |
| // scale_offset_mean_variance_descriptor: |
| // dimensions of the scale/offset/mean/variance tensor. |
| // scale_data: device memory which contains the scale input. |
| // offset_data: device memory which contains the offset input. |
| // saved_mean_data: device memory which contains the saved mean from fwd |
| // pass. saved_var_data: device memory which contains the saved variance from |
| // fwd pass. |
| // |
| // x_bn_backprop_data: device memory region in which to place the backprop |
| // data from this layer scale_backprop_data: device memory in which to place |
| // the scale backprop output. offset_backprop_data: device memory in which to |
| // place the offset backprop output. |
| // |
| // output_profile_result: the output profile result for this call. |
| // The profiling is only enabled when this is not nullptr. |
| // |
| virtual bool DoFusedBatchNormActivationBackward( |
| Stream* stream, const dnn::BatchDescriptor& y_act_backprop_descriptor, |
| const DeviceMemory<float>& y_act_backprop_data, |
| const DeviceMemory<float>& y_act_data, |
| dnn::ActivationMode activation_mode, const DeviceMemory<float>& x_bn_data, |
| const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor, |
| const DeviceMemory<float>& scale_data, |
| const DeviceMemory<float>& offset_data, |
| const DeviceMemory<float>& saved_mean_data, |
| const DeviceMemory<float>& saved_var_data, |
| DeviceMemory<float>* x_bn_backprop_data, |
| DeviceMemory<float>* scale_backprop_data, |
| DeviceMemory<float>* offset_backprop_data, |
| dnn::ProfileResult* output_profile_result) { |
| return false; |
| } |
| |
| virtual bool DoFusedBatchNormActivationBackward( |
| Stream* stream, const dnn::BatchDescriptor& y_act_backprop_descriptor, |
| const DeviceMemory<Eigen::half>& y_act_backprop_data, |
| const DeviceMemory<Eigen::half>& y_act_data, |
| dnn::ActivationMode activation_mode, |
| const DeviceMemory<Eigen::half>& x_bn_data, |
| const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor, |
| const DeviceMemory<float>& scale_data, |
| const DeviceMemory<float>& offset_data, |
| const DeviceMemory<float>& saved_mean_data, |
| const DeviceMemory<float>& saved_var_data, |
| DeviceMemory<Eigen::half>* x_bn_backprop_data, |
| DeviceMemory<float>* scale_backprop_data, |
| DeviceMemory<float>* offset_backprop_data, |
| dnn::ProfileResult* output_profile_result) { |
| return false; |
| } |
| |
| protected: |
| // Returns whether status is 'ok', and potentially logs the error. |
| static bool IsStatusOk(const port::Status& status, bool report_error); |
| |
| private: |
| virtual port::Status DoPrepareForConvolution( |
| ConvolutionKind kind, DataType element_type, Stream* stream, |
| const BatchDescriptor& batch_descriptor, DeviceMemoryBase input_data, |
| const FilterDescriptor& filter_descriptor, DeviceMemoryBase filter_data, |
| const BatchDescriptor& output_descriptor, DeviceMemoryBase output_data, |
| const ConvolutionDescriptor& convolution_descriptor, |
| const AlgorithmConfig& algorithm_config, |
| ScratchAllocator* scratch_allocator, AlgorithmDesc* algorithm_desc, |
| DeviceMemory<uint8>* scratch_memory) { |
| *algorithm_desc = {}; |
| *scratch_memory = {}; |
| return port::Status::OK(); |
| } |
| |
| virtual port::Status DoPrepareForCtcLoss( |
| Stream* stream, DataType element_type, |
| const RnnStateTensorDescriptor& probs_desc, |
| const RnnStateTensorDescriptor& grads_desc, |
| absl::Span<const int> labels_data, |
| absl::Span<const int> labels_lengths_data, |
| absl::Span<const int> input_lengths_data, |
| ScratchAllocator* scratch_allocator, |
| DeviceMemory<uint8>* scratch_memory) { |
| *scratch_memory = {}; |
| return port::Status::OK(); |
| } |
| |
| SE_DISALLOW_COPY_AND_ASSIGN(DnnSupport); |
| }; |
| |
| } // namespace dnn |
| } // namespace stream_executor |
| |
| #endif // TENSORFLOW_STREAM_EXECUTOR_DNN_H_ |