caffe2/operators/utility_ops.cc - platform/external/pytorch - Git at Google

 /**
  * Copyright (c) 2016-present, Facebook, Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 #include "caffe2/operators/utility_ops.h"

 #include <cmath>

 namespace caffe2 {

 template <>
 bool WeightedSumOp<CPUContext>::RunOnDevice() {
   return DoRunWithType<float>();
 }

 template <>
 template <typename T>
 void UniqueOp<CPUContext>::DoRun() {
   auto& inputTensor = Input(0);
   // use dim32 to enforce that it's fine to have remapping of type int
   int N = inputTensor.dim32(0);
   CAFFE_ENFORCE_EQ(inputTensor.ndim(), 1, "Input should be a vector");
   auto* uniqueTensor = Output(UNIQUE);

   int* remapping = nullptr;
   if (REMAPPING < OutputSize()) {
     auto* remappingTensor = Output(REMAPPING);
     remappingTensor->ResizeLike(inputTensor);
     remapping = remappingTensor->template mutable_data<int>();
   }

   const T* input = inputTensor.template data<T>();
   // TODO(dzhulgakov): if perf becomes an issue consider doing hash table
   // instead of sorting
   order_.resize(N);
   std::iota(order_.begin(), order_.end(), 0);
   std::sort(order_.begin(), order_.end(), [input](const int x, const int y) {
     return input[x] < input[y];
   });
   int K = N;
   for (int i = 1; i < N; ++i) {
     K -= input[order_[i]] == input[order_[i - 1]];
   }
   uniqueTensor->Resize(K);
   T* unique = uniqueTensor->template mutable_data<T>();
   K = 0;
   T prev = -1;
   for (int i = 0; i < N; ++i) {
     if (i == 0 || prev != input[order_[i]]) {
       prev = unique[K++] = input[order_[i]];
     }
     if (remapping) {
       remapping[order_[i]] = K - 1;
     }
   }
 }

 REGISTER_CPU_OPERATOR(WallClockTime, WallClockTimeOp<CPUContext>);
 REGISTER_CPU_OPERATOR(Print, PrintOp<CPUContext>);
 REGISTER_CPU_OPERATOR(Flatten, FlattenOp<CPUContext>);
 REGISTER_CPU_OPERATOR(FlattenToVec, FlattenToVecOp<CPUContext>);

 REGISTER_CPU_OPERATOR(Alias, AliasOp<CPUContext>);
 REGISTER_CPU_OPERATOR(ResizeLike, ResizeLikeOp<CPUContext>);
 REGISTER_CPU_OPERATOR(SumInt, SumOp<CPUContext>);
 REGISTER_CPU_OPERATOR(WeightedSum, WeightedSumOp<CPUContext>);
 REGISTER_CPU_OPERATOR(
     ScatterWeightedSum,
     ScatterWeightedSumOp<float, CPUContext>);
 REGISTER_CPU_OPERATOR(Max, MaxOp<float, CPUContext>);
 REGISTER_CPU_OPERATOR(MaxGradient, MaxGradientOp<float, CPUContext>);
 REGISTER_CPU_OPERATOR(ScatterAssign, ScatterAssignOp<CPUContext>);
 // From whatever the current context, ensure the output is TensorCPU
 REGISTER_CPU_OPERATOR(
     EnsureCPUOutput,
     CopyOp<CPUContext, CPUContext, CPUContext>);
 // From CPU, copy it to whatever the current context
 REGISTER_CPU_OPERATOR(
     CopyFromCPUInput,
     CopyOp<CPUContext, CPUContext, CPUContext>);
 REGISTER_CPU_OPERATOR(
     CopyOnDeviceLike,
     CopyOnDeviceLikeOp<CPUContext, CPUContext, CPUContext>);
 REGISTER_CPU_OPERATOR(Copy, CopyOp<CPUContext, CPUContext, CPUContext>);
 REGISTER_CPU_OPERATOR(LengthsToShape, LengthsToShapeOp<CPUContext>);
 REGISTER_CPU_OPERATOR(HasElements, HasElementsOp<CPUContext>);
 REGISTER_CPU_OPERATOR(IsEmpty, IsEmptyOp<CPUContext>);
 REGISTER_CPU_OPERATOR(Gather, GatherOp<CPUContext>);
 REGISTER_CPU_OPERATOR(GatherRanges, GatherRangesOp<CPUContext>);
 REGISTER_CPU_OPERATOR(LengthsGather, LengthsGatherOp<CPUContext>);
 REGISTER_CPU_OPERATOR(Unique, UniqueOp<CPUContext>);
 REGISTER_CPU_OPERATOR(LengthsToSegmentIds, LengthsToSegmentIdsOp<CPUContext>);
 REGISTER_CPU_OPERATOR(LengthsToRanges, LengthsToRangesOp<CPUContext>);
 REGISTER_CPU_OPERATOR(SegmentIdsToLengths, SegmentIdsToLengthsOp<CPUContext>);
 REGISTER_CPU_OPERATOR(SegmentIdsToRanges, SegmentIdsToRangesOp<CPUContext>);
 REGISTER_CPU_OPERATOR(LengthsToWeights, LengthsToWeightsOp<CPUContext>);
 REGISTER_CPU_OPERATOR(EnsureDense, EnsureDenseOp<CPUContext>);
 REGISTER_CPU_OPERATOR(
     AccumulateHistogram,
     AccumulateHistogramOp<float, CPUContext>);

 OPERATOR_SCHEMA(WallClockTime)
     .NumInputs(0)
     .NumOutputs(1)
     .SetDoc("Time since epoch in nanoseconds.")
     .Output(0, "time", "The time in nanoseconds.");

 REGISTER_CPU_OPERATOR(UnsafeCoalesce, UnsafeCoalesceOp<CPUContext>);

 OPERATOR_SCHEMA(Print)
     .NumInputs(1)
     .NumOutputs(0)
     .SetDoc("Logs shape and contents of input tensor to stderr or to a file.")
     .Arg(
         "to_file",
         "(bool) if 1, saves contents to the root folder of the current "
         "workspace, appending the tensor contents to a file named after "
         "the blob name. Otherwise, logs to stderr.")
     .Input(0, "tensor", "The tensor to print.");

 OPERATOR_SCHEMA(LengthsToShape).NumInputs(1).NumOutputs(1);

 OPERATOR_SCHEMA(Flatten)
     .NumInputs(1)
     .NumOutputs(1)
     .TensorInferenceFunction([](const OperatorDef& def,
                                 const vector<TensorShape>& in) {
       ArgumentHelper helper(def);
       const int axis = helper.GetSingleArgument<int>("axis", 1);
       vector<TensorShape> out(1);
       TIndex outer = 1;
       TIndex inner = 1;
       std::size_t index = 0;
       for (auto d : in[0].dims()) {
         if (index < axis) {
           outer *= d;
         } else {
           inner *= d;
         }
         ++index;
       }
       out[0].set_data_type(in[0].data_type());
       out[0].add_dims(outer);
       out[0].add_dims(inner);
       return out;
     })
     .SetDoc(R"DOC(
 Flattens the input tensor into a 2D matrix. If input tensor has shape
 (d_0, d_1, ... d_n) then the output will have shape
 (d_0 X d_1 ... d_(axis-1), d_axis X d_(axis+1) ... X dn)
 )DOC")
     .Input(0, "input", "A tensor of rank >= axis.")
     .Output(
         0,
         "output",
         "A 2D tensor with the contents of the input tensor, "
         "with input dimensions up to axis flattened to the outer dimension "
         "of the output and remaining input dimensions flattened into the inner "
         "dimension of the output.")
     .Arg(
         "axis",
         "(Default to 1) Indicate up to which input dimensions "
         "(exclusive) should be flattened to the outer dimension of the output");

 OPERATOR_SCHEMA(FlattenToVec)
     .NumInputs(1)
     .NumOutputs(1)
     .TensorInferenceFunction([](const OperatorDef& /*def*/,
                                 const vector<TensorShape>& in) {
       vector<TensorShape> out(1);
       int total = 1;
       for (auto d : in[0].dims()) {
         total *= d;
       }
       out[0].set_data_type(in[0].data_type());
       out[0].add_dims(total);
       return out;
     })
     .SetDoc(R"DOC(
 Flattens the input tensor into a 1D vector.
 )DOC")
     .Input(0, "input", "A tensor of rank >= 1.")
     .Output(
         0,
         "output",
         "A tensor of rank 1 with the contents of the input tensor");

 OPERATOR_SCHEMA(Alias)
     .NumInputs(1)
     .NumOutputs(1)
     .IdenticalTypeAndShape()
     .SetDoc(R"DOC(
 Makes the output and the input share the same underlying storage.

 WARNING: in general, in caffe2's operator interface different tensors should
 have different underlying storage, which is the assumption made by
 components such as the dependency engine and memory optimization. Thus, in
 normal situations you should not use the AliasOp, especially in a normal
 forward-backward pass.

 The Alias op is provided so one can achieve true asynchrony, such as
 Hogwild, in a graph. But make sure you understand all the implications
 similar to multi-thread computation before you use it explicitly.
 )DOC")
     .Input(0, "input", "Input tensor whose storage will be shared.")
     .Output(0, "output", "Tensor of same shape as input, sharing its storage.");

 OPERATOR_SCHEMA(ResizeLike)
     .NumInputs(2)
     .NumOutputs(1)
     .TensorInferenceFunction([](const OperatorDef& /*def*/,
                                 const vector<TensorShape>& in) {
       vector<TensorShape> out(1);
       out.push_back(in[1]);
       out[0].set_data_type(in[0].data_type());
       return out;
     })
     .SetDoc(R"DOC(
 Produces tensor containing data of first input and shape of second input.
 )DOC")
     .Input(0, "data", "Tensor whose data will be copied into the output.")
     .Input(1, "shape_tensor", "Tensor whose shape will be applied to output.")
     .Output(0, "output", "Tensor with data of input 0 and shape of input 1.");

 OPERATOR_SCHEMA(SumInt)
     .NumInputs(1, INT_MAX)
     .NumOutputs(1)
     .InputsCanCrossDevices()
     .TensorInferenceFunction([](const OperatorDef& /*def*/,
                                 const vector<TensorShape>& in) {
       vector<TensorShape> out(1);
       out.push_back(in[0]);
       out[0].set_data_type(TensorProto::INT32);
       return out;
     })
     .AllowInplace({{0, 0}});

 OPERATOR_SCHEMA(WeightedSum)
     .NumInputs([](int n) { return (n > 0 && n % 2 == 0); })
     .NumOutputs(1)
     .AllowInplace({{0, 0}})
     .IdenticalTypeAndShapeOfInput(0)
     .SetDoc(R"DOC(
 Element-wise weighted sum of several data, weight tensor pairs.
 Input should be in the form X_0, weight_0, X_1, weight_1, ... where X_i all
 have the same shape, and weight_i are size 1 tensors that specifies the weight
 of each vector. Note that if one wants to do in-place computation, it could
 only be done with X_0 also as the output, but not other X_i.
 )DOC")
     .Input(0, "data_0", "First of the input tensors.")
     .Input(0, "weight_0", "Weight of the first input in the sum.")
     .Output(0, "output", "Result containing weighted elem-wise sum of inputs.");

 OPERATOR_SCHEMA(ScatterWeightedSum)
     .NumInputs([](int n) { return (n > 3 && (n - 3) % 2 == 0); })
     .NumOutputs(1)
     .EnforceInplace({{0, 0}})
     .SetDoc(R"DOC(
 Similar to WeightedSum, computes the weighted sum of several tensors, with
 the difference that inputs are sliced tensors. The first tensor has to be
 in-place and only slices of it on the first dimension as indexed by INDICES
 will be updated.

 Note: The op pretty much ignores the exact shapes of the input arguments and
 cares only about sizes. It's done for performance consideration to avoid
 unnecessary reshapes. Only first dimension of X_0 is important, let's call it
 N. If M is the total size of X_0 and K is the size of INDICES then X_i is
 assumed to be of shape K x (M / N) regardless of the real shape.

 Note: Each update in INDICES is applied independently which means that if
 duplicated elements are present in INDICES the corresponding slice of X_0
 will be scaled multiple times. Manual collapsing of INDICES is required
 beforehand if necessary.

 Note: Updates are applied sequentially by inputs which might have undesired
 consequences if the input tensor is accessed concurrently by different op
 (e.g. when doing Hogwild). Other threads might see intermediate results even
 on individual slice level, e.g. X_0 scaled by weight_0 but without any
 updates applied.

 Currently only works on CPU because of access to INDICES.
 )DOC")
     .Input(0, "X_0", "Tensor to be updated.")
     .Input(
         1,
         "Weight_0",
         "Scalar weight for X_0, applied only to slices affected.")
     .Input(
         2,
         "INDICES",
         "1-D list of indices on the first dimension of X_0 "
         "that need to be updated")
     .Input(3, "X_1", "Update slices, with shape len(INDICES) + shape(X_0)[1:]")
     .Input(4, "Weight_1", "Scalar weight for X_1 update")
     .Output(0, "X_0", "Has to be exactly the same tensor as the input 0")
     .EnforceInplace({{0, 0}});

 OPERATOR_SCHEMA(Max)
     .NumInputs(1, INT_MAX)
     .NumOutputs(1)
     .IdenticalTypeAndShapeOfInput(0)
     .AllowInplace({{0, 0}})
     .SetDoc(R"DOC(
 Element-wise max of each of the input tensors. The first input tensor can be
 used in-place as the output tensor, in which case the max will be done in
 place and results will be accumulated in input0. All inputs and outputs must
 have the same shape and data type.
 )DOC")
     .Input(0, "data_0", "First of the input tensors. Can be inplace.")
     .Output(0, "max", "Output tensor. Same dimension as inputs.");

 OPERATOR_SCHEMA(MaxGradient).NumInputs(3, INT_MAX).NumOutputs(1, INT_MAX);

 OPERATOR_SCHEMA(ScatterAssign)
     .NumInputs(3)
     .NumOutputs(1)
     .EnforceInplace({{0, 0}})
     .SetDoc(R"DOC(
 Update slices of the tensor in-place by overriding current value.

 Note: The op pretty much ignores the exact shapes of the input arguments and
 cares only about sizes. It's done for performance consideration to avoid
 unnecessary reshapes. Only first dimension of X_0 is important, let's call it
 N. If M is the total size of X_0 and K is the size of INDICES then X_i is
 assumed to be of shape K x (M / N) regardless of the real shape.

 Note: Each update in INDICES is applied independently which means that if
 duplicated elements are present in INDICES arbitrary one will win.

 Currently only works on CPU because of access to INDICES.
 )DOC")
     .Input(0, "DATA", "Tensor to be updated.")
     .Input(
         1,
         "INDICES",
         "1-D list of indices on the first dimension"
         "of X_0 that need to be updated")
     .Input(
         2,
         "SLICES",
         "Update slices, with shape len(INDICES) + shape(X_0)[1:]")
     .Output(0, "DATA", "Has to be exactly the same tensor as the input 0");

 OPERATOR_SCHEMA(Copy)
     .NumInputs(1)
     .NumOutputs(1)
     .IdenticalTypeAndShape()
     .InputsCanCrossDevices()
     .SetDoc("Copy input tensor into output, potentially across devices.")
     .Input(0, "input", "The input tensor.")
     .Output(0, "output", "Tensor that will contain a copy of the input.");

 OPERATOR_SCHEMA(CopyGPUToCPU)
     .NumInputs(1)
     .NumOutputs(1)
     .IdenticalTypeAndShape()
     .InputsCanCrossDevices()
     .DeviceInferenceFunction([](const OperatorDef& def) {
       CAFFE_ENFORCE(
           def.has_device_option(),
           "CopyGPUToCPU op should have cuda device option.");
       auto& cuda_option = def.device_option();
       auto cpu_option = DeviceOption();
       vector<DeviceOption> in_dev(def.input_size(), cuda_option);
       vector<DeviceOption> out_dev(def.output_size(), cpu_option);
       return std::make_pair(in_dev, out_dev);
     })
     .SetDoc(R"DOC(
 Copy tensor for GPU to CPU context. Must be run under GPU device option.
 )DOC")
     .Input(0, "input", "The input tensor.")
     .Output(0, "output", "Tensor that will contain a copy of the input.");

 OPERATOR_SCHEMA(CopyCPUToGPU)
     .NumInputs(1)
     .NumOutputs(1)
     .IdenticalTypeAndShape()
     .InputsCanCrossDevices()
     .DeviceInferenceFunction([](const OperatorDef& def) {
       CAFFE_ENFORCE(
           def.has_device_option(),
           "CopyCPUToGPU op should have cuda device option.");
       auto& cuda_option = def.device_option();
       auto cpu_option = DeviceOption();
       vector<DeviceOption> in_dev(def.input_size(), cpu_option);
       vector<DeviceOption> out_dev(def.output_size(), cuda_option);
       return std::make_pair(in_dev, out_dev);
     })
     .SetDoc(R"DOC(
 Copy tensor for CPU to GPU context. Must be run under GPU device option.
 )DOC")
     .Input(0, "input", "The input tensor.")
     .Output(0, "output", "Tensor that will contain a copy of the input.");

 OPERATOR_SCHEMA(EnsureCPUOutput)
     .NumInputs(1)
     .NumOutputs(1)
     .IdenticalTypeAndShape()
     .InputsCanCrossDevices()
     .DeviceInferenceFunction([](const OperatorDef& def) {
       auto op_device =
           def.has_device_option() ? def.device_option() : DeviceOption();
       auto cpu_option = DeviceOption();
       vector<DeviceOption> in_dev(def.input_size(), op_device);
       vector<DeviceOption> out_dev(def.output_size(), cpu_option);
       return std::make_pair(in_dev, out_dev);
     })
     .SetDoc(R"DOC(
 Take an input tensor in the current Context (GPU or CPU) and create an output
 which is always a TensorCPU. This may involves cross-device MemCpy.
 )DOC")
     .Input(0, "input", "The input CUDA or CPU tensor.")
     .Output(0, "output", "TensorCPU that is a copy of the input.");

 OPERATOR_SCHEMA(CopyFromCPUInput)
     .NumInputs(1)
     .NumOutputs(1)
     .IdenticalTypeAndShape()
     .InputsCanCrossDevices()
     .DeviceInferenceFunction([](const OperatorDef& def) {
       auto op_device =
           def.has_device_option() ? def.device_option() : DeviceOption();
       auto cpu_option = DeviceOption();
       vector<DeviceOption> in_dev(def.input_size(), cpu_option);
       vector<DeviceOption> out_dev(def.output_size(), op_device);
       return std::make_pair(in_dev, out_dev);
     })
     .SetDoc(R"DOC(
 Take a CPU input tensor and copy it to an output in the current
 Context (GPU or CPU). This may involves cross-device MemCpy.
 )DOC")
     .Input(0, "input", "The input CPU tensor.")
     .Output(0, "output", "either a TensorCUDA or a TensorCPU");

 OPERATOR_SCHEMA(CopyOnDeviceLike)
     .NumInputs(2)
     .NumOutputs(1)
     .SetDoc("Copy input tensor into output to the specific device.")
     .Input(0, "input", "The input tensor.")
     .Input(1, "dst", "Tensor, on which device the copy will be performed.")
     .Output(0, "output", "Tensor that will contain a copy of the input.");

 OPERATOR_SCHEMA(HasElements)
     .NumInputs(1)
     .NumOutputs(1)
     .SetDoc("Returns true iff the input tensor has size > 0")
     .Input(0, "tensor", "Tensor of any type.")
     .Output(
         0,
         "has_elements",
         "Scalar bool tensor. True if input is not empty.");

 OPERATOR_SCHEMA(IsEmpty)
     .NumInputs(1)
     .NumOutputs(1)
     .SetDoc("Returns true iff the input tensor has size == 0")
     .ScalarType(::caffe2::TensorProto_DataType::TensorProto_DataType_BOOL)
     .Input(0, "tensor", "Tensor of any type.")
     .Output(0, "is_empty", "Scalar bool tensor. True if input is empty.");

 OPERATOR_SCHEMA(Gather)
     .NumInputs(2)
     .NumOutputs(1)
     .SetDoc(R"DOC(
 Given DATA tensor of rank r >= 1, and INDICES tensor of rank q, gather
 entries of the outer-most dimension of DATA indexed by INDICES, and concatenate
 them in an output tensor of rank q + (r - 1).

 Example:
   DATA  = [
       [1.0, 1.2],
       [2.3, 3.4],
       [4.5, 5.7],
   ]
   INDICES = [
       [0, 1],
       [1, 2],
   ]
   OUTPUT = [
       [
           [1.0, 1.2],
           [2.3, 3.4],
       ],
       [
           [2.3, 3.4],
           [4.5, 5.7],
       ],
   ]
 )DOC")
     .Input(0, "DATA", "Tensor of rank r >= 1.")
     .Input(1, "INDICES", "Tensor of int32/int64 indices, of any rank q.")
     .Output(0, "OUTPUT", "Tensor of rank q + (r - 1).")
     .TensorInferenceFunction([](const OperatorDef& def,
                                 const vector<TensorShape>& in) {
       vector<TensorShape> out(1);
       for (auto d : in[1].dims()) {
         out[0].add_dims(d);
       }
       for (int i = 1; i < in[0].dims_size(); ++i) {
         out[0].add_dims(in[0].dims(i));
       }
       out[0].set_data_type(in[0].data_type());
       return out;
     });

 OPERATOR_SCHEMA(GatherRanges)
     .NumInputs(2)
     .NumOutputs(2)
     .SetDoc(R"DOC(
 Given DATA tensor of rank 1, and RANGES tensor of rank 3, gather
 corresponding ranges into a 1-D tensor OUTPUT.

 RANGES dimentions description:
 1: represents list of examples within a batch
 2: represents list features
 3: two values which are start and length or a range (to be applied on DATA)

 Another output LENGTHS represents each example length within OUTPUT

 Example:
   DATA  = [1, 2, 3, 4, 5, 6]
   RANGES = [
     [
       [0, 1],
       [2, 2],
     ],
     [
       [4, 1],
       [5, 1],
     ]
   ]
   OUTPUT = [1, 3, 4, 5, 6]
   LENGTHS = [3, 2]
 )DOC")
     .Input(0, "DATA", "Tensor of rank 1.")
     .Input(
         1,
         "RANGES",
         "Tensor of int32/int64 ranges, of dims (N, M, 2). "
         "Where N is number of examples and M is a size of each example. "
         "Last dimension represents a range in the format (start, lengths)")
     .Output(0, "OUTPUT", "1-D tensor of size sum of range lengths")
     .Output(
         1,
         "LENGTHS",
         "1-D tensor of size N with lengths over gathered data"
         " for each row in a batch. sum(LENGTHS) == OUTPUT.size()");

 OPERATOR_SCHEMA(LengthsGather)
     .NumInputs(3)
     .NumOutputs(1)
     .SetDoc(R"DOC(
 Gather items from sparse tensor. Sparse tensor is described by items and
 lengths. This operator gathers items corresponding to lengths at the given
 indices. This deliberately doesn't return lengths of OUTPUTS so that both lists
 and maps can be supported without special cases. If you need lengths tensor for
  OUTPUT, use `Gather`.

 Example:
 ```
 ITEMS = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
 LENGTHS = [0, 2, 3, 1, 4]
 INDICES = [0, 2, 4]

 OUTPUT = [2, 3, 4, 6, 7, 8, 9]
 ```

 )DOC")
     .Input(0, "ITEMS", "items tensor")
     .Input(1, "LENGTHS", "lengths tensor")
     .Input(2, "INDICES", "indices into LENGTHS where items should be gathered")
     .Output(0, "OUTPUT", "1-D tensor containing gathered items");

 OPERATOR_SCHEMA(Unique)
     .NumInputs(1)
     .NumOutputs(1, 2)
     .SetDoc(R"DOC(
 Deduplicates input indices vector and optionally produces reverse remapping.
 There's no guarantees on the ordering of the output indices.
 )DOC")
     .Input(0, "indices", "1D tensor of int32 or int64 indices.")
     .Output(0, "unique_indices", "1D tensor of deduped entries.")
     .Output(
         1,
         "remapping",
         "(optional) mapping from `indices` to `unique_indices`. This has the "
         "same shape as `indices`. Its elements are the indices into "
         "`unique_indices` such that `Gather(['unique_indices', 'remapping'])` "
         "yields `indices`.")
     .TensorInferenceFunction([](const OperatorDef& def,
                                 const vector<TensorShape>& in) {
       std::vector<TensorShape> out(1);
       out[0].set_data_type(in[0].data_type());
       CAFFE_ENFORCE_EQ(in[0].dims_size(), 1);
       if (in[0].dims(0) <= 1) {
         // This special case is useful in some situation, e.g., when feeding
         // tensor inference with empty tensor (where the first dim is the batch
         // size)
         out[0].add_dims(in[0].dims(0));
       } else {
         out[0].set_unknown_shape(true);
       }
       if (def.output_size() > 1) {
         // Remapping has the same shape as the input tensor
         out.push_back(in[0]);
         out.back().set_data_type(TensorProto::INT32);
       }
       return out;
     });

 OPERATOR_SCHEMA(LengthsToSegmentIds)
     .NumInputs(1)
     .NumOutputs(1)
     .SetDoc(R"DOC(
 Given a vector of segment lengths, returns a zero-based, consecutive vector
 of segment_ids. For example, [1, 3, 0, 2] will produce [0, 1, 1, 1, 3, 3].
 In general, the inverse operation is SegmentIdsToLengths. Notice though that
 trailing empty sequence lengths can't be properly recovered from segment ids.
 )DOC")
     .Input(0, "lengths", "1D tensor of int32 or int64 segment lengths.")
     .Output(0, "segment_ids", "1D tensor of length `sum(lengths)`");

 OPERATOR_SCHEMA(LengthsToRanges)
     .NumInputs(1)
     .NumOutputs(1)
     .SetDoc(R"DOC(
 Given a vector of segment lengths, calculates offsets of each segment and packs
 them next to the lengths. For the input vector of length N the output is a Nx2
 matrix with (offset, lengths) packaged for each segment.

 For example, `[1, 3, 0, 2]` transforms into `[[0, 1], [1, 3], [4, 0], [4, 2]]`.
 )DOC")
     .Input(0, "lengths", "1D tensor of int32 segment lengths.")
     .Output(
         0,
         "ranges",
         "2D tensor of shape len(lengths) X 2 and the same type as `lengths`");

 OPERATOR_SCHEMA(SegmentIdsToLengths)
     .NumInputs(1, 2)
     .NumOutputs(1)
     .SetDoc(R"DOC(
 Transfers a vector of segment ids to a vector of segment lengths. This operation
 supports non-consecutive segment ids. Segments not appearing in the input vector
 will have length 0. If the second input is provided, the number of segments =
 the size of its first dimension. Otherwise, the number of segments = the last
 index in the first input vector + 1.

 In general, for consecutive, zero-based segment IDs, this is the inverse
 operation of LengthsToSegmentIds, except that a vector of segment IDs
 cannot represent empty segments at the end (if the second input is absent).
 )DOC")
     .Input(0, "segment_ids", "1-D int32_t or int64_t tensor of segment ids")
     .Input(
         1,
         "data (optional)",
         "if provided, number of segments = the size of its first dimension")
     .Output(0, "lengths", "1-D int64_t tensor of segment lengths");

 OPERATOR_SCHEMA(SegmentIdsToRanges)
     .NumInputs(1, 2)
     .NumOutputs(1)
     .SetDoc(R"DOC(
 Transfers a vector of segment ids to a vector of segment ranges. This operation
 supports non-consecutive segment ids. Segments not appearing in the input vector
 will have length 0. If the second input is provided, the number of segments =
 the size of its first dimension. Otherwise, the number of segments = the last
 index in the first input vector + 1.
 )DOC")
     .Input(0, "segment_ids", "1-D int32_t or int64_t tensor of segment ids")
     .Input(
         1,
         "data (optional)",
         "if provided, number of segments = the size of its first dimension")
     .Output(0, "lengths", "1-D int64_t tensor of segment lengths");

 OPERATOR_SCHEMA(LengthsToWeights)
     .NumInputs(1)
     .NumOutputs(1)
     .Arg("power", "n of 1/pow(length,n) for normalization")
     .SetDoc(R"DOC(
 Similar as LengthsToSegmentIds but output vector of segment
 weights derived by lengths. i.e 1/pow(length, power)
 )DOC")
     .Input(0, "lengths", "1-D int32_t or int64_t tensor of lengths")
     .Output(0, "a vector of weights", "1-D float tensor of weights by length");


 SHOULD_NOT_DO_GRADIENT(WallClockTime);

 OPERATOR_SCHEMA(UnsafeCoalesce)
     .NumInputsOutputs([](int inputs, int outputs) {
       return inputs + 1 == outputs;
     })
     .AllowInplace([](int input, int output) { return input == output; })
     .SetDoc(R"DOC(
 Coalesce the N inputs into N outputs and a single coalesced output blob.

 This allows operations that operate over multiple small kernels (e.g.
 biases in a deep CNN) to be coalesced into a single larger operation,
 amortizing the kernel launch overhead, synchronization costs for
 distributed computation, etc.

 The operator:

 - computes the total size of the coalesced blob by summing the input sizes
 - allocates the coalesced output blob as the total size
 - copies the input vectors into the coalesced blob, at the correct offset.
 - aliases each Output(i) to- point into the coalesced blob, at the
   corresponding offset for Input(i).

 This is 'unsafe' as the output vectors are aliased, so use with
 caution.

 )DOC");

 OPERATOR_SCHEMA(EnsureDense)
     .NumInputs(1)
     .NumOutputs(1)
     .AllowInplace({{0, 0}})
     .IdenticalTypeAndShape()
     .SetDoc(R"DOC(
 This operator converts dense or sparse gradients to dense ones.
 Therefore, sparse gradient can be back propagated to Operators that consume
 dense gradients only (e.g., FCGradient).

 The operator's behaviors:
 - In forward, simply pass in place or copy input to the output.
 - In backward, if the gradient passed-in is sparse gradient, change it to
   dense gradient in linear time; otherwise, simply pass the dense gradient.
 )DOC")
     .Input(0, "input", "Input tensors.")
     .Output(0, "output", "Output tensor. Same dimension as inputs.");

 OPERATOR_SCHEMA(AccumulateHistogram)
     .NumInputs(1)
     .NumOutputs(2)
     .SetDoc(R"DOC(
 This operator calculate thes histogram of values in input tensor.
 There're 2 outputs, one for histogram of current input tensor, and another
 for histogram of the all input tensors accumulated through history.
 The output would contain num_buckets + 2 values. index[1 ... num_buckets]
 for values in [lower_bound, upper_bound) interval. And the rest 2 for values
 smaller than lower_bound or greater than upper_bound respectively.
 )DOC")
     .Input(0, "X", "Input tensor.")
     .Output(0, "CurHist", "Output histogram of the current tensor.")
     .Output(1, "AccHist", "Accumulated histogram of the history tensor.")
     .Arg("lower_bound", "the lower bound value")
     .Arg("upper_bound", "the upper bound value")
     .Arg(
         "num_buckets",
         "number of buckets to use in [lower_bound, upper_bound)");

 class GetEnsureDenseGradient : public GradientMakerBase {
   using GradientMakerBase::GradientMakerBase;
   vector<OperatorDef> GetGradientDefs() override {
     CAFFE_ENFORCE(
         GradOut(0).IsSparse() || GradOut(0).IsDense(),
         "Input gradient ",
         O(0),
         " should be either sparse or dense.");

     if (GradOut(0).IsDense()) {
       SetDense(0, GO(0));
       return vector<OperatorDef>();
     } else {
       return SingleGradientDef(
           "SparseToDense",
           "",
           vector<string>{GO_I(0), GO_V(0), I(0)},
           vector<string>{GI(0)});
     }
   }
 };
 REGISTER_GRADIENT(EnsureDense, GetEnsureDenseGradient);

 SHOULD_NOT_DO_GRADIENT(Print);
 SHOULD_NOT_DO_GRADIENT(HasElements);
 SHOULD_NOT_DO_GRADIENT(IsEmpty);
 SHOULD_NOT_DO_GRADIENT(LengthsToShape);
 SHOULD_NOT_DO_GRADIENT(UnsafeCoalesce);

 class GetFlattenGradient : public GradientMakerBase {
   using GradientMakerBase::GradientMakerBase;
   vector<OperatorDef> GetGradientDefs() override {
     return SingleGradientDef(
         "ResizeLike", "", vector<string>{GO(0), I(0)}, vector<string>{GI(0)});
   }
 };
 REGISTER_GRADIENT(Flatten, GetFlattenGradient);

 class GetAliasGradient : public GradientMakerBase {
   using GradientMakerBase::GradientMakerBase;
   vector<OperatorDef> GetGradientDefs() override {
     // We will simply pass-along the gradient. Nothing needs to
     // be calculated.
     SetDense(0, GO(0));
     return vector<OperatorDef>();
   }
 };
 REGISTER_GRADIENT(Alias, GetAliasGradient);

 SHOULD_NOT_DO_GRADIENT(ResizeLike);

 class GetSumGradient : public GradientMakerBase {
   using GradientMakerBase::GradientMakerBase;
   vector<OperatorDef> GetGradientDefs() override {
     for (auto i = 0; i < def_.input_size(); ++i) {
       SetDense(i, GO(0));
     }
     return vector<OperatorDef>();
   }
 };
 REGISTER_GRADIENT(Sum, GetSumGradient);

 // TODO(jiayq): Weighted sum is originally intended to be used in SGD, but in
 // theory, its gradient DOES exist. Should we enable the gradient?
 SHOULD_NOT_DO_GRADIENT(WeightedSum);
 SHOULD_NOT_DO_GRADIENT(ScatterWeightedSum);
 SHOULD_NOT_DO_GRADIENT(ScatterAssign);

 class GetMaxGradient : public GradientMakerBase {
   using GradientMakerBase::GradientMakerBase;
   vector<OperatorDef> GetGradientDefs() override {
     auto gradInputs = vector<string>();
     auto inputs = vector<string>{O(0), GO(0)};
     for (int i = 0; i < def_.input_size(); i++) {
       gradInputs.push_back(GI(i));
       inputs.push_back(I(i));
     }
     return SingleGradientDef("MaxGradient", "", inputs, gradInputs);
   }
 };
 REGISTER_GRADIENT(Max, GetMaxGradient);

 class GetGatherGradient : public GradientMakerBase {
   using GradientMakerBase::GradientMakerBase;
   vector<OperatorDef> GetGradientDefs() override {
     ArgumentHelper argsHelper(def_);
     const bool dense_gradient =
         argsHelper.GetSingleArgument<bool>("dense_gradient", false);

     using Op = GatherOp<CPUContext>;

     if (dense_gradient) {
       return vector<OperatorDef>{CreateOperatorDef(
           "SparseToDense",
           "",
           vector<string>{I(Op::INDICES), GO(0), I(Op::DATA)},
           vector<string>{GI(Op::DATA)})};
     } else {
       // For now we don't do any reshaping as the consumer of this op would
       // probably be ScatterUpdate which is intenionally ignores shapes. We
       // might need to revisit it in the future for correctness purposes. The
       // right shape for the output woild be to flatten INDICES and collapse
       // first X dims of GRAD
       SetSparse(Op::DATA, I(Op::INDICES), GO(0));
       return vector<OperatorDef>();
     }
   }
 };
 REGISTER_GRADIENT(Gather, GetGatherGradient);

 struct GetFlattenToVecGradient : public GradientMakerBase {
   using GradientMakerBase::GradientMakerBase;
   vector<OperatorDef> GetGradientDefs() override {
     return SingleGradientDef(
         "ResizeLike", "", vector<string>{GO(0), I(0)}, vector<string>{GI(0)});
   }
 };
 REGISTER_GRADIENT(FlattenToVec, GetFlattenToVecGradient);

 struct GetCopyGradient : public GradientMakerBase {
   using GradientMakerBase::GradientMakerBase;
   vector<OperatorDef> GetGradientDefs() override {
     return SingleGradientDef(
         "CopyOnDeviceLike",
         "",
         vector<string>{GO(0), I(0)},
         vector<string>{GI(0)});
   }
 };
 REGISTER_GRADIENT(Copy, GetCopyGradient);

 struct GetGPUToCPUGradient : public GradientMakerBase {
   using GradientMakerBase::GradientMakerBase;
   vector<OperatorDef> GetGradientDefs() override {
     if (g_output_[0].IsDense()) {
       return SingleGradientDef(
           "CopyCPUToGPU", "", vector<string>{GO(0)}, vector<string>{GI(0)});
     } else {
       return vector<OperatorDef>{CreateOperatorDef(
                                      "CopyCPUToGPU",
                                      "",
                                      std::vector<string>{GO_I(0)},
                                      std::vector<string>{GI_I(0)}),
                                  CreateOperatorDef(
                                      "CopyCPUToGPU",
                                      "",
                                      std::vector<string>{GO_V(0)},
                                      std::vector<string>{GI_V(0)})};
     }
   }
 };
 REGISTER_GRADIENT(CopyGPUToCPU, GetGPUToCPUGradient);

 struct GetCPUToGPUGradient : public GradientMakerBase {
   using GradientMakerBase::GradientMakerBase;
   vector<OperatorDef> GetGradientDefs() override {
     if (g_output_[0].IsDense()) {
       return SingleGradientDef(
           "CopyGPUToCPU", "", vector<string>{GO(0)}, vector<string>{GI(0)});
     } else {
       return vector<OperatorDef>{CreateOperatorDef(
                                      "CopyGPUToCPU",
                                      "",
                                      std::vector<string>{GO_I(0)},
                                      std::vector<string>{GI_I(0)}),
                                  CreateOperatorDef(
                                      "CopyGPUToCPU",
                                      "",
                                      std::vector<string>{GO_V(0)},
                                      std::vector<string>{GI_V(0)})};
     }
   }
 };
 REGISTER_GRADIENT(CopyCPUToGPU, GetCPUToGPUGradient);

 SHOULD_NOT_DO_GRADIENT(Unique);
 SHOULD_NOT_DO_GRADIENT(LengthsToSegmentIds);
 SHOULD_NOT_DO_GRADIENT(SegmentIdsToLengths);
 SHOULD_NOT_DO_GRADIENT(SegmentIdsToRanges);
 SHOULD_NOT_DO_GRADIENT(SegmentIdsToLengthWeights);
 SHOULD_NOT_DO_GRADIENT(GatherRangesOp);
 SHOULD_NOT_DO_GRADIENT(LengthsGather);
 SHOULD_NOT_DO_GRADIENT(AccumulateHistogram);

 template <typename T, class Context>
 bool MaxOp<T, Context>::Compute() {
   auto& input0 = Input(0);
   const int N = input0.size();
   T* output_data = Output(0)->template mutable_data<T>();

   for (int i = 1; i < InputSize(); i++) {
     auto input_data = Input(i).template data<T>();
     EigenVectorMap<T> output_vec(output_data, N);
     output_vec = output_vec.cwiseMax(ConstEigenVectorMap<T>(input_data, N));
   }

   return true;
 }

 template <typename T, class Context>
 bool MaxGradientOp<T, Context>::RunOnDevice() {
   auto& output = Input(0);
   auto& grad_output = Input(1);
   const int kInputStartOffset = 2;

   const T* data = output.template data<T>();
   ConstEigenArrayMap<T> output_array(
       output.template data<T>(), 1, output.size());
   ConstEigenArrayMap<T> grad_out_array(
       grad_output.template data<T>(), 1, grad_output.size());

   for (int i = 0; i < OutputSize(); i++) {
     auto& input = Input(i + kInputStartOffset);
     ConstEigenArrayMap<T> input_array(
         input.template data<T>(), 1, input.size());

     auto* grad_input = Output(i);
     grad_input->ResizeLike(input);
     EigenArrayMap<T> grad_in_array(
         grad_input->template mutable_data<T>(), 1, grad_input->size());
     grad_in_array = grad_out_array *
         input_array.cwiseEqual(output_array).template cast<T>();
   }
   return true;
 }

 template <>
 bool NanCheckOp<CPUContext>::RunOnDevice() {
   auto& X = Input(0);
   auto* Y = Output(0);
   const int D = X.size();
   const float* data = X.data<float>();
   ConstEigenVectorMap<float> input_data(data, D);

   bool all_finite = input_data.allFinite();

   if (!all_finite) {
     std::cerr << "Tensor contained NaN or inf: [" << this->debug_def().input(0)
               << "]" << std::endl;

     for (int j = 0; j < InputSize(); j++) {
       std::cerr << "Tensor name: " << this->debug_def().input(j) << std::endl;
       std::cerr << "Input tensor:" << std::endl;
       tensorPrinter_.Print<float>(Input(j));
       std::cerr << "NaN idxs:" << std::endl;
       const float* x = Input(j).data<float>();
       for (size_t i = 0; i < Input(j).size(); ++i) {
         if (std::isnan(x[i]) || std::isinf(x[i])) {
           std::cerr << i << " ";
         }
       }
       std::cerr << std::endl;
     }
     return false;
   }

   if (&X != Y) {
     Y->CopyFrom(X, &context_);
   }
   return true;
 }
 REGISTER_CPU_OPERATOR(NanCheck, NanCheckOp<CPUContext>);
 REGISTER_GRADIENT(NanCheck, GetNanCheckGradient);

 OPERATOR_SCHEMA(NanCheck)
     .NumInputs(1, INT_MAX)
     .NumOutputs(1)
     .AllowInplace({{0, 0}})
     .IdenticalTypeAndShapeOfInput(0)
     .SetDoc("Identity operator, but checks all values for nan or inf")
     .Input(0, "tensor", "Tensor to check for nan/inf")
     .Output(
         0,
         "output",
         "Tensor to copy input into if no NaNs or inf."
         " Can be in-place");

 OPERATOR_SCHEMA(Size)
     .NumInputs(1)
     .NumOutputs(1)
     .SetDoc(
         "Return a 1D tensor of type int64 that contains the number "
         "of elements of the input tensor")
     .Input(0, "tensor", "Tensor to calculate number of elements")
     .Output(
         0,
         "output",
         "1D tensor of type int64 that contains the number of "
         "elements in the input tensor.");

 REGISTER_CPU_OPERATOR(Size, SizeOp<CPUContext>);
 NO_GRADIENT(Size);

 template <>
 template <typename T>
 bool RangeOp<CPUContext>::DoRunOnDevice(
     const T& start,
     const T& step,
     Tensor<CPUContext>* output) {
   auto* output_data = output->template mutable_data<T>();
   for (int i = 0; i < output->size(); ++i) {
     output_data[i] = i * step + start;
   }
   return true;
 }

 OPERATOR_SCHEMA(Range)
     .NumInputs(1, 3)
     .NumOutputs(1)
     .SetDoc(
         "Values are generated within the half-open interval [start, stop) "
         "(in other words, the interval including start but excluding stop). "
         "When called with a single value, this will return `[0, v]` with the "
         "result type inferred from the input types.")
     .Input(
         0,
         "start",
         "Optional scalar Tensor with the start of the interval (inclusive).")
     .Input(1, "stop", "scalar Tensor with the end of the interval (exclusive)")
     .Input(2, "step", "Optional scalar Tensor with spacing between values.")
     .Output(
         0,
         "output",
         "1D tensor of same type as inputs that contains the sequence.");

 REGISTER_CPU_OPERATOR(Range, RangeOp<CPUContext>);
 NO_GRADIENT(Range);

 } // namespace caffe2