| #ifndef CAFFE2_OPERATORS_UTILITY_OPS_H_ |
| #define CAFFE2_OPERATORS_UTILITY_OPS_H_ |
| |
| #include "caffe2/core/common_omp.h" |
| #include "caffe2/core/context.h" |
| #include "caffe2/core/logging.h" |
| #include "caffe2/core/operator.h" |
| #include "caffe2/utils/math.h" |
| |
| namespace caffe2 { |
| |
| template <class Context> |
| class WallClockTimeOp final : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| |
| WallClockTimeOp(const OperatorDef& operator_def, Workspace* ws) |
| : Operator<Context>(operator_def, ws) {} |
| |
| bool RunOnDevice() override { |
| int64_t nanoseconds = static_cast<int64_t>(
| std::chrono::duration_cast<std::chrono::nanoseconds>( |
| std::chrono::high_resolution_clock::now().time_since_epoch()) |
| .count()); |
| |
| TensorCPU* output = OperatorBase::Output<TensorCPU>(0); |
| output->Resize(); |
| *output->template mutable_data<int64_t>() = nanoseconds; |
| |
| return true; |
| } |
| }; |
| |
| const char kPrintFileExtension[] = ".log"; |
| |
| template <class Context> |
| class PrintOp final : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_DISPATCH_HELPER; |
| PrintOp(const OperatorDef& operator_def, Workspace* ws) |
| : Operator<Context>(operator_def, ws), |
| tensor_printer_( |
| def().input(0), |
| OperatorBase::GetSingleArgument<int>("to_file", 0) |
| ? ws->RootFolder() + "/" + def().input(0) + kPrintFileExtension |
| : "", |
| OperatorBase::GetSingleArgument<int>("limit", 0)) {} |
| |
| bool RunOnDevice() override { |
| if (!OperatorBase::InputIsType<Tensor<Context>>(0) && |
| !OperatorBase::InputIsType<TensorCPU>(0)) { |
| LOG(INFO) << "Blob of type: " |
| << OperatorBase::Inputs().at(0)->meta().name(); |
| return true; |
| } |
| // special-case empty tensors since they may have no meta() |
| if (Input(0).size() == 0) { |
| tensor_printer_.PrintMeta(Input(0)); |
| return true; |
| } |
| |
| using Types = TensorTypes< |
| float, |
| double, |
| int, |
| long, |
| bool, |
| char, |
| unsigned char, |
| std::string>; |
| |
| if (OperatorBase::InputIsType<TensorCPU>(0)) { |
| return DispatchHelper<Types>::call( |
| this, OperatorBase::Input<TensorCPU>(0)); |
| } else { |
| return DispatchHelper<Types>::call(this, Input(0)); |
| } |
| } |
| |
| private: |
| template <typename T> |
| bool DoRunWithType() { |
| // A simple strategy: copy the tensor to CPU if needed, and have the tensor
| // pointer point at the right instance. Note that tensor_copy_if_needed
| // handles memory deallocation itself, so no smart pointer is needed.
| const TensorCPU* tensor; |
| TensorCPU tensor_copy_if_needed; |
| if (OperatorBase::InputIsType<TensorCPU>(0)) { |
| tensor = &OperatorBase::Input<TensorCPU>(0); |
| } else { |
| tensor_copy_if_needed.CopyFrom(Input(0), &context_); |
| // Make sure that the copy is finished. |
| context_.FinishDeviceComputation(); |
| tensor = &tensor_copy_if_needed; |
| } |
| tensor_printer_.Print<T>(*tensor); |
| return true; |
| } |
| |
| TensorPrinter tensor_printer_; |
| }; |
| |
| /** |
| * @brief Alias op makes the output and the input share the same underlying |
| * storage. |
| * |
| * WARNING: in general, in caffe2's operator interface different tensors should |
| * have different underlying storage, which is the assumption made by |
| * components such as the dependency engine and memory optimization. Thus, in |
| * normal situations you should not use the AliasOp, especially in a normal |
| * forward-backward pass. |
| * |
| * The Alias op is provided so one can achieve true asynchrony, such as
| * Hogwild, in a graph. But make sure you understand all the implications of
| * multi-threaded computation before you use it explicitly.
| */ |
| template <class Context> |
| class AliasOp final : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_SIMPLE_CTOR_DTOR(AliasOp); |
| |
| bool RunOnDevice() override { |
| auto& input = Input(0); |
| DCHECK_GT(input.size(), 0); |
| Output(0)->ResizeLike(input); |
| Output(0)->ShareData(input); |
| return true; |
| } |
| }; |
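|
| // An illustrative sketch of AliasOp semantics (hypothetical values):
| //   X = [1, 2, 3]; Y = Alias(X)
| //   After writing X[0] = 7, Y reads back [7, 2, 3]: Y shares X's underlying
| //   storage instead of owning a copy.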
| |
| /** |
| * @brief Pass inputs to outputs. |
| * Input: |
| * DATA - dense tensor. |
| * Output: |
| * DATA - same tensor as input. |
| */ |
| template <class Context> |
| class EnsureDenseOp final : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_SIMPLE_CTOR_DTOR(EnsureDenseOp) |
| |
| bool RunOnDevice() override { |
| const auto& input = Input(0); |
| auto* output = Output(0); |
| CAFFE_ENFORCE_GT(input.ndim(), 0, "Input has to be at least a vector."); |
| // The output is allowed to alias the input (in-place overwrite); otherwise
| // the output is copied from the input.
| if (&input != output) { |
| output->ResizeLike(input); |
| output->CopyFrom(input, &context_); |
| } |
| return true; |
| } |
| }; |
| |
| template <class Context> |
| class FlattenOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_SIMPLE_CTOR_DTOR(FlattenOp); |
| |
| bool RunOnDevice() override { |
| auto& input = Input(0); |
| auto* output = Output(0); |
| DCHECK_GT(input.size(), 0); |
| CAFFE_ENFORCE_GE( |
| input.dims().size(), 2, "The rank of the tensor must be >= 2."); |
| output->Resize(input.dim(0), input.size() / input.dim(0)); |
| context_.template CopyItems<Context, Context>( |
| input.meta(), |
| input.size(), |
| input.raw_data(), |
| output->raw_mutable_data(input.meta())); |
| return true; |
| } |
| }; |
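|
| // A worked example of FlattenOp (hypothetical shapes): an input of shape
| // (2, 3, 4) becomes an output of shape (2, 12), i.e. (dim(0), size / dim(0)).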
| |
| template <class Context> |
| class FlattenToVecOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_SIMPLE_CTOR_DTOR(FlattenToVecOp); |
| |
| bool RunOnDevice() override { |
| auto& input = Input(0); |
| auto* output = Output(0); |
| DCHECK_GT(input.size(), 0); |
| CAFFE_ENFORCE_GE( |
| input.dims().size(), 1, "The rank of the tensor must be >= 1."); |
| output->Resize(input.size()); |
| |
| context_.template CopyItems<Context, Context>( |
| input.meta(), |
| input.size(), |
| input.raw_data(), |
| output->raw_mutable_data(input.meta())); |
| return true; |
| } |
| }; |
| |
| // Output gets the data of input(0), but reshapes it like input(1). |
| template <class Context> |
| class ResizeLikeOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_SIMPLE_CTOR_DTOR(ResizeLikeOp); |
| |
| bool RunOnDevice() override { |
| auto& input0 = Input(0); |
| auto& input1 = Input(1); |
| auto* output = Output(0); |
| DCHECK_EQ(input0.size(), input1.size()); |
| output->ResizeLike(input1);
| context_.template CopyItems<Context, Context>( |
| input0.meta(), |
| input0.size(), |
| input0.raw_data(), |
| output->raw_mutable_data(input0.meta())); |
| return true; |
| } |
| }; |
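|
| // A worked example of ResizeLikeOp (hypothetical shapes): with input0 of
| // shape (6,) and input1 of shape (2, 3), the output contains input0's data
| // viewed with input1's shape (2, 3).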
| |
| template <typename T, class Context> |
| class SumOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_SIMPLE_CTOR_DTOR(SumOp); |
| |
| bool RunOnDevice() override { |
| auto& input0 = Input(0); |
| auto* output = Output(0); |
| if (InputSize() == 1) { |
| output->CopyFrom(input0, &context_); |
| return true; |
| } |
| output->ResizeLike(input0); |
| T* output_data = output->template mutable_data<T>(); |
| // Dimension checking |
| for (int i = 1; i < InputSize(); ++i) { |
| if (output->dims() != Input(i).dims()) { |
| CAFFE_THROW( |
| "Check failed: output->dims() == Input(i).dims().", |
| "Description: Input #", |
| i, |
| ", input dimension:", |
| Input(i).dims(), |
| " should match output dimension: ", |
| output->dims()); |
| } |
| } |
| |
| // Add the first two - works if in-place or not. |
| math::Add( |
| output->size(), |
| input0.template data<T>(), |
| Input(1).template data<T>(), |
| output_data, |
| &context_); |
| // Add remaining. |
| for (int i = 2; i < InputSize(); ++i) { |
| math::Add( |
| output->size(), |
| output_data, |
| Input(i).template data<T>(), |
| output_data, |
| &context_); |
| } |
| return true; |
| } |
| }; |
| |
| // WeightedSumOp computes the weighted sum of several tensors. The input should
| // be in the form X_0, weight_0, X_1, weight_1, ... where the X_i all have the
| // same shape, and each weight_i is a size-1 tensor that specifies the weight
| // of the corresponding X_i. Note that if one wants to do in-place computation,
| // it can only be done with X_0 as the output, not any other X_i.
| template <typename T, class Context> |
| class WeightedSumOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_SIMPLE_CTOR_DTOR(WeightedSumOp); |
| |
| bool RunOnDevice() override { |
| DCHECK_EQ(InputSize() % 2, 0); |
| auto& X0 = Input(0); |
| auto& weight0 = Input(1); |
| DCHECK_GT(X0.size(), 0); |
| DCHECK_EQ(weight0.size(), 1); |
| int size = X0.size(); |
| auto* output = Output(0); |
| output->ResizeLike(X0); |
| math::Scale<T, Context>( |
| size, |
| weight0.template data<T>(), |
| X0.template data<T>(), |
| output->template mutable_data<T>(), |
| &context_); |
| for (int i = 2; i < InputSize(); i += 2) { |
| auto& X = Input(i); |
| // Do a check: if the input is the same as output, we have a problem - |
| // in-place update should always only happen with the zeroth input. |
| if (&X == output) { |
| LOG(ERROR) << "Input #" << i << " is the same as output. " |
| << "If you want to do in-place updates, put the output as " |
| << "input #0."; |
| return false; |
| } |
| auto& weight = Input(i + 1); |
| DCHECK_EQ(X.size(), size); |
| DCHECK_EQ(weight.size(), 1); |
| math::Axpy<T, Context>( |
| size, |
| weight.template data<T>(), |
| X.template data<T>(), |
| output->template mutable_data<T>(), |
| &context_); |
| } |
| return true; |
| } |
| }; |
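|
| // A worked example of WeightedSumOp (hypothetical values):
| //   X_0 = [1, 2], weight_0 = [2], X_1 = [3, 4], weight_1 = [0.5]
| //   Output = 2 * [1, 2] + 0.5 * [3, 4] = [3.5, 6.0]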
| |
| /** |
| * @brief Update slices of the tensor in-place with weighted sum. |
| * |
| * ScatterWeightedSumOp is similar to WeightedSum and computes the weighted sum
| * of several tensors. The first tensor has to be updated in-place, and only the
| * slices of it on the first dimension indexed by INDICES will be updated.
| * |
| * Input: |
| * X_0 - tensor to be updated |
| * weight_0 - scalar weight for X_0, applied only to slices affected, |
| * INDICES - 1-D list of indices on the first dimension of X_0 that need to be |
| * updated |
| * X_1 - update slices, has to have shape of len(INDICES) + shape(X_0)[1:] |
| * weight_1 - scalar weight for X_1 update |
| * X_2, weight_2, ... |
| * |
| * Output: |
| * X_0 - has to be exactly the same tensor as the input 0 |
| * |
| * Note: The op pretty much ignores the exact shapes of the input arguments and
| * cares only about sizes. This is done for performance considerations, to avoid
| * unnecessary reshapes. Only the first dimension of X_0 is important; let's
| * call it N. If M is the total size of X_0 and K is the size of INDICES, then
| * each X_i is assumed to be of shape K x (M / N) regardless of its real shape.
| * |
| * Note: Each update in INDICES is applied independently, which means that if
| * duplicated elements are present in INDICES, the corresponding slice of X_0
| * will be scaled multiple times. Manually collapse INDICES beforehand if
| * necessary.
| * |
| * Note: Updates are applied sequentially by inputs which might have undesired |
| * consequences if the input tensor is accessed concurrently by different op |
| * (e.g. when doing Hogwild). Other threads might see intermediate results even |
| * on individual slice level, e.g. X_0 scaled by weight_0 but without any |
| * updates applied. |
| * |
| * For now this really works only on CPU because INDICES are accessed directly.
| */ |
| template <typename T, class Context> |
| class ScatterWeightedSumOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_SIMPLE_CTOR_DTOR(ScatterWeightedSumOp); |
| USE_DISPATCH_HELPER; |
| |
| bool RunOnDevice() override { |
| return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(this, Input(2)); |
| } |
| |
| private: |
| template <typename Index> |
| bool DoRunWithType() { |
| TIndex block_size = Input(0).size_from_dim(1); |
| return DispatchHelper<FixedValues<1>, Index>::call(this, block_size); |
| } |
| |
| template <typename Index, int FixedSize> |
| bool DoRunWithValue() { |
| DCHECK_EQ(InputSize() % 2, 1); |
| auto& X0 = Input(0); |
| auto& weight0 = Input(1); |
| auto& indices = Input(2); |
| auto* output = Output(0); |
| CAFFE_ENFORCE_EQ(&X0, output, "In place operation is required"); |
| |
| DCHECK_GT(X0.size(), 0); |
| DCHECK_GT(X0.ndim(), 0) << "X0 has to be at least a vector";
| DCHECK_EQ(weight0.size(), 1); |
| TIndex M = X0.size(); |
| TIndex N = X0.dim(0); |
| TIndex K = indices.size(); |
| TIndex block_size = M / N; |
| T* data = output->template mutable_data<T>(); |
| const Index* idxs = indices.template data<Index>(); |
| T w0 = *weight0.template data<T>(); |
| // It's most likely a constant so exact comparison is fine |
| if (w0 != 1.0) { |
| for (int i = 0; i < K; ++i) { |
| Index idx = idxs[i]; |
| DCHECK(0 <= idx && idx < N) << "Index out of bounds: " << idx |
| << ", range 0 to " << N; |
| math::ScaleFixedSize<T, Context, FixedSize>( |
| block_size, |
| w0, |
| data + block_size * idx, |
| data + block_size * idx, |
| &context_); |
| } |
| } |
| for (int inp = 3; inp < InputSize(); inp += 2) { |
| auto& X = Input(inp); |
| auto& weight = Input(inp + 1); |
| DCHECK_EQ(X.size(), block_size * K); |
| DCHECK_EQ(weight.size(), 1); |
| const T* x_data = X.template data<T>(); |
| T w = *weight.template data<T>(); |
| for (int i = 0; i < K; ++i) { |
| Index idx = idxs[i]; |
| // double-checking the indices, but it's fine as it's DCHECK only |
| DCHECK(0 <= idx && idx < N) << "Index out of bounds: " << idx |
| << ", range 0 to " << N; |
| math::AxpyFixedSize<T, Context, FixedSize>( |
| block_size, |
| w, |
| x_data + block_size * i, |
| data + block_size * idx, |
| &context_); |
| } |
| } |
| return true; |
| } |
| }; |
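|
| // A worked example of ScatterWeightedSumOp (hypothetical values):
| //   X_0 = [[1, 1], [2, 2], [3, 3]], weight_0 = [1],
| //   INDICES = [0, 2], X_1 = [[10, 10], [20, 20]], weight_1 = [0.5]
| //   Row 0 becomes [1, 1] + 0.5 * [10, 10] = [6, 6], row 2 becomes
| //   [3, 3] + 0.5 * [20, 20] = [13, 13], and row 1 is untouched.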
| |
| template <typename T, class Context> |
| class SumElementsOp : public Operator<Context> { |
| public: |
| USE_SIMPLE_CTOR_DTOR(SumElementsOp); |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| |
| bool RunOnDevice() override { |
| bool average = OperatorBase::GetSingleArgument<bool>("average", false); |
| auto& X = Input(0); |
| auto* sum = Output(0); |
| sum->Resize(vector<TIndex>()); |
| math::Sum<T, Context>( |
| X.size(), |
| X.template data<T>(), |
| sum->template mutable_data<T>(), |
| &context_); |
| if (average) { |
| math::Scale<T, Context>( |
| 1, |
| static_cast<T>(1.) / X.size(), |
| sum->template data<T>(), |
| sum->template mutable_data<T>(), |
| &context_); |
| } |
| return true; |
| } |
| }; |
| |
| template <typename T, class Context> |
| class SumElementsGradientOp final : public Operator<Context> { |
| public: |
| USE_SIMPLE_CTOR_DTOR(SumElementsGradientOp); |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| |
| bool RunOnDevice() override { |
| bool average = OperatorBase::GetSingleArgument<bool>("average", false); |
| auto& X = Input(0); |
| TensorCPU sum_grad = TensorCPU(Input(1)); |
| auto* dX = Output(0); |
| dX->ResizeLike(X); |
| DCHECK_EQ(sum_grad.size(), 1); |
| math::Set<T, Context>( |
| dX->size(), |
| static_cast<T>(sum_grad.data<T>()[0] * (average ? 1.0 / X.size() : 1)), |
| dX->template mutable_data<T>(), |
| &context_); |
| return true; |
| } |
| }; |
| |
| template <typename T, class Context> |
| class MaxOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_SIMPLE_CTOR_DTOR(MaxOp); |
| |
| bool RunOnDevice() override { |
| auto& input0 = Input(0); |
| auto* output = Output(0); |
| |
| output->ResizeLike(input0); |
| output->CopyFrom(input0, &context_); |
| |
| if (InputSize() == 1) { |
| return true; |
| } |
| |
| // Dimension checking |
| for (int i = 1; i < InputSize(); ++i) { |
| CAFFE_ENFORCE_EQ( |
| output->dims(), |
| Input(i).dims(), |
| "Description: Input #", |
| i, |
| ", input dimension:", |
| Input(i).dims(), |
| " should match output dimension: ", |
| output->dims()); |
| } |
| |
| return Compute(); |
| } |
| |
| virtual bool Compute(); |
| }; |
| |
| template <typename T, class Context> |
| class MaxGradientOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_SIMPLE_CTOR_DTOR(MaxGradientOp); |
| |
| bool RunOnDevice() override; |
| }; |
| |
| /** |
| * @brief Update slices of the tensor in-place by overriding. |
| * |
| * Input: |
| * DATA - tensor to be updated |
| * INDICES - 1-D list of indices on the first dimension of DATA that need to be
| * updated
| * SLICES - update slices, has to have shape of len(INDICES) + shape(DATA)[1:]
| * |
| * Output: |
| * DATA - has to be exactly the same tensor as the input 0 |
| * |
| * Note: The op pretty much ignores the exact shapes of the input arguments and
| * cares only about sizes. This is done for performance considerations, to avoid
| * unnecessary reshapes. Only the first dimension of DATA is important; let's
| * call it N. If M is the total size of DATA and K is the size of INDICES, then
| * SLICES is assumed to be of shape K x (M / N) regardless of its real shape.
| *
| * Note: Each update in INDICES is applied independently, which means that if
| * duplicated elements are present in INDICES, an arbitrary one will win.
| *
| * For now this really works only on CPU because INDICES are accessed directly.
| */ |
| template <typename T, class Context> |
| class ScatterAssignOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_SIMPLE_CTOR_DTOR(ScatterAssignOp); |
| |
| bool RunOnDevice() override { |
| // Use run-time polymorphism |
| auto& indices = Input(INDICES); |
| if (indices.template IsType<int32_t>()) { |
| DoRun<int32_t>(); |
| } else if (indices.template IsType<int64_t>()) { |
| DoRun<int64_t>(); |
| } else { |
| LOG(FATAL) << "Unsupported type of INDICES in ScatterAssignOp: " |
| << indices.meta().name(); |
| } |
| return true; |
| } |
| |
| private: |
| template <typename Index> |
| void DoRun() { |
| auto& input = Input(DATA); |
| auto& indices = Input(INDICES); |
| auto& slices = Input(SLICES); |
| auto* output = Output(0); |
| CAFFE_ENFORCE_EQ(&input, output, "In place operation is required"); |
| |
| DCHECK_GT(input.ndim(), 0) << "X0 has to be at least the vector"; |
| TIndex M = input.size(); |
| TIndex N = input.dim(0); |
| TIndex K = indices.size(); |
| TIndex block_size = M / N; |
| DCHECK_EQ(slices.size(), block_size * K); |
| // TODO(dzhulgakov): it can be made to work with arbitrary data type by |
| // using raw_mutable_data |
| T* data = output->template mutable_data<T>(); |
| const Index* idxs = indices.template data<Index>(); |
| const T* slicesData = slices.template data<T>(); |
| for (int i = 0; i < K; ++i) { |
| Index idx = idxs[i]; |
| // double-checking the indices, but it's fine as it's DCHECK only |
| DCHECK(0 <= idx && idx < N) << "Index out of bounds: " << idx |
| << ", range 0 to " << N; |
| context_.template Copy<T, Context, Context>( |
| block_size, slicesData + block_size * i, data + block_size * idx); |
| } |
| } |
| |
| INPUT_TAGS(DATA, INDICES, SLICES); |
| }; |
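|
| // A worked example of ScatterAssignOp (hypothetical values):
| //   DATA = [[1, 1], [2, 2], [3, 3]], INDICES = [2, 0],
| //   SLICES = [[10, 10], [20, 20]]
| //   After the op: DATA = [[20, 20], [2, 2], [10, 10]]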
| |
| template <class Context, class DstContext, class SrcContext> |
| class CopyOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_SIMPLE_CTOR_DTOR(CopyOp); |
| |
| bool RunOnDevice() override { |
| auto& input = OperatorBase::Input<Tensor<SrcContext>>(0); |
| auto* output = OperatorBase::Output<Tensor<DstContext>>(0); |
| output->ResizeLike(input); |
| this->context_.template CopyItems<SrcContext, DstContext>( |
| input.meta(), |
| input.size(), |
| input.raw_data(), |
| output->raw_mutable_data(input.meta())); |
| return true; |
| } |
| }; |
| |
| template <class Context, class DstContext, class SrcContext> |
| class CopyOnDeviceLikeOp : public CopyOp<Context, DstContext, SrcContext> { |
| public: |
| CopyOnDeviceLikeOp(const OperatorDef& operator_def, Workspace* ws) |
| : CopyOp<Context, DstContext, SrcContext>(operator_def, ws) {} |
| }; |
| |
| template <class Context> |
| class LengthsToSegmentIdsOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_SIMPLE_CTOR_DTOR(LengthsToSegmentIdsOp); |
| |
| bool RunOnDevice() override { |
| auto& input = Input(0); |
| auto* output = Output(0); |
| auto* input_data = input.template data<int32_t>(); |
| |
| CAFFE_ENFORCE(input.dims().size() == 1, "Input must be a vector."); |
| auto total_length = |
| std::accumulate(input_data, input_data + input.size(), 0); |
| |
| output->Resize(total_length); |
| auto* output_data = output->template mutable_data<int32_t>(); |
| |
| for (int i = 0; i < input.size(); ++i) { |
| auto len = input_data[i]; |
| std::fill(output_data, output_data + len, i); |
| output_data += len; |
| } |
| return true; |
| } |
| }; |
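|
| // A worked example of LengthsToSegmentIdsOp (hypothetical values):
| //   lengths = [1, 3, 0, 2]  ->  segment ids = [0, 1, 1, 1, 3, 3]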
| |
| template <class Context> |
| class LengthsToRangesOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_SIMPLE_CTOR_DTOR(LengthsToRangesOp); |
| |
| bool RunOnDevice() override { |
| auto& input = Input(0); |
| auto* output = Output(0); |
| auto* input_data = input.template data<int32_t>(); |
| |
| CAFFE_ENFORCE(input.dims().size() == 1, "Input must be a vector."); |
| auto size = input.size(); |
| |
| output->Resize(size, 2); |
| auto* output_data = output->template mutable_data<int32_t>(); |
| |
| int32_t offset = 0; |
| for (int i = 0; i < size; ++i) { |
| auto len = input_data[i]; |
| output_data[i * 2] = offset; |
| output_data[i * 2 + 1] = len; |
| offset += len; |
| } |
| return true; |
| } |
| }; |
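|
| // A worked example of LengthsToRangesOp (hypothetical values):
| //   lengths = [1, 3, 0, 2]
| //   ranges  = [[0, 1], [1, 3], [4, 0], [4, 2]]  // (offset, length) rows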
| |
| template <class Context> |
| class SegmentIdsToLengthsOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_SIMPLE_CTOR_DTOR(SegmentIdsToLengthsOp); |
| |
| bool RunOnDevice() override { |
| return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(this, Input(0)); |
| } |
| |
| template <typename Index> |
| bool DoRunWithType() { |
| auto& input = Input(0); |
| if (input.ndim() == 2) { |
| CAFFE_ENFORCE( |
| input.dim32(0) == 1 || input.dim32(1) == 1, |
| "Input must be a vector."); |
| } else { |
| CAFFE_ENFORCE_EQ(input.ndim(), 1, "Input must be a vector."); |
| } |
| auto* input_data = input.template data<Index>(); |
| auto input_size = input.size(); |
| auto* output = Output(0); |
| // segment id starts from 0 |
| auto num_segments = input_size ? input_data[input_size - 1] + 1 : 0; |
| if (InputSize() > 1) { |
| CAFFE_ENFORCE_GE(Input(1).ndim(), 1); |
| CAFFE_ENFORCE_LE( |
| num_segments, |
| Input(1).dim(0), |
| "The number of segments inferred should *NOT* be larger " |
| "than the size of Input(1)'s first dimension"); |
| num_segments = Input(1).dim(0); |
| } |
| CAFFE_ENFORCE(0 <= num_segments, "Segment ids must be non-negative");
| output->Resize(num_segments); |
| auto* output_data = output->template mutable_data<int32_t>(); |
| if (num_segments == 0) { |
| return true; |
| } |
| std::fill(output_data, output_data + num_segments, 0); |
| Index prev = 0; // Assume that segment_id >= 0. |
| for (int64_t i = 0; i < input_size; i++) { |
| CAFFE_ENFORCE( |
| prev <= input_data[i], |
| "Segment ids must be sorted: ", |
| prev, |
| " vs ", |
| input_data[i]); |
| prev = input_data[i]; |
| output_data[input_data[i]] += 1; |
| } |
| |
| return true; |
| } |
| }; |
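|
| // A worked example of SegmentIdsToLengthsOp (hypothetical values):
| //   segment ids = [0, 0, 1, 3]  ->  lengths = [2, 1, 0, 1]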
| |
| template <class Context> |
| class SegmentIdsToRangesOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_SIMPLE_CTOR_DTOR(SegmentIdsToRangesOp); |
| |
| bool RunOnDevice() override { |
| return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(this, Input(0)); |
| } |
| |
| template <typename Index> |
| bool DoRunWithType() { |
| auto& input = Input(0); |
| CAFFE_ENFORCE(input.dims().size() == 1, "Input must be a vector."); |
| auto* input_data = input.template data<Index>(); |
| auto input_size = input.size(); |
| auto* output = Output(0); |
| // segment id starts from 0 |
| auto num_segments = input_size ? input_data[input_size - 1] + 1 : 0; |
| if (InputSize() > 1) { |
| CAFFE_ENFORCE_GE(Input(1).ndim(), 1); |
| CAFFE_ENFORCE_LE( |
| num_segments, |
| Input(1).dim(0), |
| "The number of segments inferred should *NOT* be larger " |
| "than the size of Input(1)'s first dimension"); |
| num_segments = Input(1).dim(0); |
| } |
| CAFFE_ENFORCE(0 <= num_segments, "Segment ids must be non-negative");
| output->Resize(num_segments, 2); |
| auto* output_data = output->template mutable_data<int32_t>(); |
| if (num_segments == 0) { |
| return true; |
| } |
| std::fill(output_data, output_data + num_segments * 2, 0); |
| Index prev = input_data[0]; |
| for (int64_t i = 0; i < input_size; i++) { |
| CAFFE_ENFORCE( |
| prev <= input_data[i], |
| "Segment ids must be sorted: ", |
| prev, |
| " vs ", |
| input_data[i]); |
| while (prev != input_data[i]) { |
| ++prev; |
| output_data[prev * 2] = i; |
| } |
| output_data[input_data[i] * 2 + 1] += 1; |
| } |
| |
| return true; |
| } |
| }; |
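|
| // A worked example of SegmentIdsToRangesOp (hypothetical values):
| //   segment ids = [0, 0, 1, 3]
| //   ranges      = [[0, 2], [2, 1], [3, 0], [3, 1]]  // (start, length) rows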
| |
| template <class Context> |
| class LengthsToWeightsOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| LengthsToWeightsOp(const OperatorDef& operator_def, Workspace* ws) |
| : Operator<Context>(operator_def, ws), |
| power_(OperatorBase::GetSingleArgument<float>("power", 0.5)) {} |
| |
| bool RunOnDevice() override { |
| return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(this, Input(0)); |
| } |
| |
| template <typename Index> |
| bool DoRunWithType() { |
| auto& input = Input(0); |
| CAFFE_ENFORCE(input.dims().size() == 1, "Input must be a vector."); |
| auto* input_data = input.template data<Index>(); |
| auto input_size = input.size(); |
| auto* output = Output(0); |
| |
| int64_t output_size = 0; |
| for (auto i = 0; i < input_size; i++) { |
| CAFFE_ENFORCE_GE(input_data[i], 0, "unexpected negative length value"); |
| output_size += input_data[i]; |
| } |
| |
| std::function<float(const int64_t& length, const float& power)> getWeight; |
| if (power_ == 0.5) { |
| getWeight = [](const int64_t& length, const float& power) { |
| return 1.0 / std::sqrt(length); |
| }; |
| } else if (power_ == 1) { |
| getWeight = [](const int64_t& length, const float& power) { |
| return 1.0 / length; |
| }; |
| } else { |
| getWeight = [](const int64_t& length, const float& power) { |
| return 1.0 / std::pow(length, power); |
| }; |
| } |
| |
| output->Resize(output_size); |
| auto* output_data = output->template mutable_data<float>(); |
| int64_t cnt = 0; |
| for (auto i = 0; i < input_size; i++) { |
| auto len = input_data[i]; |
| if (len == 0) { |
| continue; |
| } |
| CAFFE_ENFORCE_LE(cnt + len, output_size, "unexpected lengths value"); |
| |
| float weight_value = getWeight(len, power_); |
| std::fill(output_data + cnt, output_data + cnt + len, weight_value); |
| cnt += len; |
| } |
| |
| return true; |
| } |
| |
| private: |
| float power_; |
| }; |
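|
| // A worked example of LengthsToWeightsOp (hypothetical values, default
| // power = 0.5):
| //   lengths = [2, 3]
| //   output  = [1/sqrt(2), 1/sqrt(2), 1/sqrt(3), 1/sqrt(3), 1/sqrt(3)]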
| |
| template <class SIndex, class Context> |
| class SliceOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_SIMPLE_CTOR_DTOR(SliceOp); |
| |
| bool RunOnDevice() override { |
| auto* output = Output(0); |
| auto& data = Input(0); |
| |
| auto& starts = Input(1); |
| auto& ends = Input(2); |
| auto* starts_data = starts.template data<SIndex>(); |
| auto* ends_data = ends.template data<SIndex>(); |
| |
| CAFFE_ENFORCE_EQ(starts.ndim(), 1); |
| CAFFE_ENFORCE_EQ(ends.ndim(), 1); |
| CAFFE_ENFORCE_GE(data.ndim(), starts.size()); |
| CAFFE_ENFORCE_EQ(starts.size(), ends.size()); |
| |
| std::vector<SIndex> starts_idx(data.ndim()); |
| std::vector<SIndex> ends_idx(data.ndim()); |
| std::vector<SIndex> dst_sizes(data.ndim()); |
| |
| for (int i = 0; i < data.ndim(); ++i) { |
| if (i >= starts.size()) {
| starts_idx[i] = 0;
| ends_idx[i] = data.dims()[i];
| dst_sizes[i] = data.dims()[i];
| continue;
| }
| if (data.dims()[i] > 0) { |
| auto start = starts_data[i]; |
| auto end = ends_data[i]; |
| if (start < 0) { |
| start = data.dims()[i] + 1 + start; |
| } |
| if (end < 0) { |
| end = data.dims()[i] + 1 + end; |
| } |
| CAFFE_ENFORCE_GE(start, 0); |
| CAFFE_ENFORCE_GE(end, 0); |
| CAFFE_ENFORCE_LT(start, data.dims()[i]); |
| CAFFE_ENFORCE_LE(end, data.dims()[i]); |
| CAFFE_ENFORCE_GE(end, start); |
| starts_idx[i] = start; |
| ends_idx[i] = end; |
| dst_sizes[i] = end - start; |
| } else { |
| starts_idx[i] = 0; |
| ends_idx[i] = 0; |
| dst_sizes[i] = 0; |
| } |
| } |
| |
| if (data.size() <= 0) { |
| // When the input is empty, we do not need to do any copy.
| output->Resize(dst_sizes); |
| output->raw_mutable_data(data.meta()); |
| return true; |
| } |
| // for now only supports slicing in 1 dimension |
| int dim = -1; |
| for (int i = 0; i < data.ndim(); ++i) { |
| if (starts_idx[i] > 0 || ends_idx[i] < data.dims()[i]) { |
| CAFFE_ENFORCE_EQ( |
| dim, -1, "Currently only possible to slice in 1 dimension."); |
| dim = i; |
| } |
| } |
| if (dim == -1) { |
| output->CopyFrom(data, &context_); |
| return true; |
| } |
| auto unit = std::accumulate( |
| data.dims().begin() + dim + 1, |
| data.dims().end(), |
| 1, |
| std::multiplies<SIndex>()); |
| auto num_blocks = std::accumulate( |
| data.dims().begin(), |
| data.dims().begin() + dim, |
| 1, |
| std::multiplies<SIndex>()); |
| output->Resize(dst_sizes); |
| auto* src_bytes = static_cast<const char*>(data.raw_data());
| auto* dst_bytes = static_cast<char*>(output->raw_mutable_data(data.meta()));
| |
| auto src_nbytes = data.nbytes(); |
| auto dst_nbytes = output->nbytes(); |
| |
| auto src_block_size = unit * data.dims()[dim]; |
| auto dst_block_size = unit * (ends_idx[dim] - starts_idx[dim]); |
| auto src_offset = unit * starts_idx[dim]; |
| |
| if (num_blocks == 0 || dst_block_size == 0) { |
| return true; |
| } |
| |
| auto itemsize = data.meta().itemsize(); |
| auto src_block_size_bytes = itemsize * src_block_size; |
| auto dst_block_size_bytes = itemsize * dst_block_size; |
| auto src_offset_bytes = src_bytes + itemsize * src_offset; |
| auto dst_offset_bytes = dst_bytes; |
| for (int i = 0; i < num_blocks; ++i) { |
| DCHECK_LE( |
| src_offset_bytes + dst_block_size_bytes, src_bytes + src_nbytes); |
| DCHECK_LE( |
| dst_offset_bytes + dst_block_size_bytes, dst_bytes + dst_nbytes); |
| this->context_.template CopyItems<Context, Context>( |
| data.meta(), |
| dst_block_size, |
| src_offset_bytes,
| dst_offset_bytes);
| src_offset_bytes += src_block_size_bytes; |
| dst_offset_bytes += dst_block_size_bytes; |
| } |
| return true; |
| } |
| |
| DISABLE_COPY_AND_ASSIGN(SliceOp); |
| }; |
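|
| // A worked example of SliceOp (hypothetical values):
| //   data = [[1, 2, 3, 4], [5, 6, 7, 8]], starts = [0, 1], ends = [2, 3]
| //   Only dimension 1 is actually sliced, giving output [[2, 3], [6, 7]].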
| |
| template <class Context> |
| class HasElementsOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_SIMPLE_CTOR_DTOR(HasElementsOp); |
| |
| bool RunOnDevice() override { |
| auto& input = Input(0); |
| auto* output = OperatorBase::Output<TensorCPU>(0); |
| output->Resize(std::vector<TIndex>{}); |
| *output->template mutable_data<bool>() = input.size() > 0; |
| return true; |
| } |
| }; |
| |
| template <class Context> |
| class IsEmptyOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_SIMPLE_CTOR_DTOR(IsEmptyOp); |
| |
| bool RunOnDevice() override { |
| auto& input = Input(0); |
| auto* output = OperatorBase::Output<TensorCPU>(0); |
| output->Resize(std::vector<TIndex>{}); |
| *output->template mutable_data<bool>() = (input.size() == 0); |
| return true; |
| } |
| }; |
| |
| // ShapeOp records the shape of the input tensor in a vector of TIndex. You
| // mostly don't need this operator explicitly, and it is mostly used in the
| // autodiff process.
| template <class Context> |
| class ShapeOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_SIMPLE_CTOR_DTOR(ShapeOp); |
| |
| bool RunOnDevice() override { |
| auto& input = Input(0); |
| auto* output = OperatorBase::Output<Tensor<Context>>(0); |
| output->Resize(input.ndim()); |
| TIndex* output_data = output->template mutable_data<TIndex>(); |
| context_.template CopyBytes<Context, Context>( |
| input.ndim() * sizeof(TIndex), input.dims().data(), output_data); |
| return true; |
| } |
| }; |
| |
| // Takes a data tensor and a new shape (given as an argument or a second
| // input) and reshapes it.
| template <typename F, class Context> |
| class ReshapeOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| ReshapeOp(const OperatorDef& operator_def, Workspace* ws) |
| : Operator<Context>(operator_def, ws), |
| new_shape_(OperatorBase::GetRepeatedArgument<int64_t>("shape")) {} |
| |
| bool RunOnDevice() override { |
| if (InputSize() == 2) { |
| return DispatchHelper<TensorTypes<int, int64_t>>::call(this, Input(1)); |
| } |
| CAFFE_ENFORCE( |
| OperatorBase::HasArgument("shape"), "Argument `shape` is missing."); |
| return this->template DoRunWithType<int64_t>(); |
| } |
| |
| template <typename T> |
| bool DoRunWithType() { |
| auto& input = Input(0); |
| |
| vector<int64_t> actual_new_shape = new_shape_; |
| if (InputSize() == 2) { |
| CAFFE_ENFORCE( |
| !OperatorBase::HasArgument("shape"), |
| "New shape is specified by the input blob, do not pass in " |
| "the argument `shape`."); |
| |
| auto& shape = Input(1); |
| CAFFE_ENFORCE(shape.ndim() == 1, "Shape should be 1-D"); |
| |
| const T* shape_data = shape.template data<T>(); |
| |
| // A bit awkward, but needed so this works on both CPU and CUDA contexts.
| std::vector<T> tmpv(shape.size()); |
| context_.template CopyBytes<Context, CPUContext>( |
| shape.size() * sizeof(T), shape_data, &tmpv[0]); |
| actual_new_shape.assign(tmpv.begin(), tmpv.begin() + shape.size()); |
| } |
| |
| // Dimensions specified as zero are copied over from the input.
| for (int i = 0; i < actual_new_shape.size(); ++i) { |
| if (actual_new_shape[i] == 0) { |
| actual_new_shape[i] = input.dim(i); |
| } |
| } |
| |
| // Checks if the new shape is valid and fills in the missing dimension |
| // specified by -1. |
| // NOTE: At most one dimension can be -1. |
| auto total_size = input.size_from_dim(0); |
| T size = 1; |
| int unknown_idx = -1; |
| for (int i = 0; i < actual_new_shape.size(); ++i) { |
| const auto dim = actual_new_shape[i]; |
| if (dim == -1) { |
| CAFFE_ENFORCE( |
| unknown_idx == -1, |
| "Argument `shape` has more than one missing dimension."); |
| unknown_idx = i; |
| } else { |
| size *= dim; |
| } |
| } |
| |
| if (unknown_idx != -1) { |
| CAFFE_ENFORCE( |
| total_size % size == 0, |
| "Argument `shape` does not agree with the input data.", |
| " (", |
| total_size, |
| " vs ", |
| size, |
| ")"); |
| actual_new_shape[unknown_idx] = total_size / size; |
| } else { |
| CAFFE_ENFORCE_EQ( |
| total_size, |
| size, |
| "Argument `shape` does not agree with the input data.", |
| " (", |
| total_size, |
| " != ", |
| size, |
| ")"); |
| } |
| |
| // Write the original shape to the second output. |
| auto* old_shape = Output(1); |
| old_shape->Resize(input.ndim()); |
| T* old_shape_data = old_shape->template mutable_data<T>(); |
| for (int i = 0; i < input.ndim(); ++i) { |
| math::Set<T, Context>(1, input.dim(i), old_shape_data + i, &context_); |
| } |
| |
| auto* output = Output(0); |
| output->Resize(actual_new_shape); |
| if (output != &input) { |
| // If we are not doing in-place computation, a copy is needed. |
| context_.template CopyBytes<Context, Context>( |
| input.nbytes(), |
| input.raw_data(), |
| output->raw_mutable_data(input.meta())); |
| } |
| |
| return true; |
| } |
| |
| private: |
| vector<int64_t> new_shape_; |
| }; |
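|
| // A worked example of the 0/-1 conventions in ReshapeOp (hypothetical
| // shapes): an input of shape (2, 3, 4) with shape argument [0, -1] keeps
| // dim 0 as 2 and infers -1 as 12, producing an output of shape (2, 12);
| // the second output records the old shape [2, 3, 4].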
| |
| // Takes a length vector, checks that all lengths are equal, and
| // returns a shape to be passed to Reshape
| template <class Context> |
| class LengthsToShapeOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_SIMPLE_CTOR_DTOR(LengthsToShapeOp); |
| |
| bool RunOnDevice() override { |
| auto& input = Input(0); |
| |
| CAFFE_ENFORCE(input.dims().size() == 1, "Input must be a vector."); |
| auto* output = Output(0); |
| auto* input_data = input.template data<int32_t>(); |
| |
| auto size = input.size();
| CAFFE_ENFORCE_GT(size, 0, "Input cannot be empty.");
| auto first = input_data[0];
| |
| for (int i = 1; i < size; i++) { |
| CAFFE_ENFORCE( |
| input_data[i] == first, "All elements of input must be same "); |
| } |
| |
| output->Resize(2); |
| auto* output_data = output->template mutable_data<int32_t>(); |
| output_data[0] = size; |
| output_data[1] = first; |
| |
| return true; |
| } |
| }; |
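|
| // A worked example of LengthsToShapeOp (hypothetical values):
| //   lengths = [3, 3, 3, 3]  ->  shape = [4, 3]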
| |
| template <class Context> |
| class SqueezeOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| SqueezeOp(const OperatorDef& operator_def, Workspace* ws) |
| : Operator<Context>(operator_def, ws), |
| dims_(OperatorBase::GetRepeatedArgument<int>("dims")) { |
| auto originalSize = dims_.size(); |
| CAFFE_ENFORCE(originalSize > 0, "Parameter `dims` must be provided."); |
| |
| std::sort(dims_.begin(), dims_.end()); |
| dims_.erase(std::unique(dims_.begin(), dims_.end()), dims_.end()); |
| if (dims_.size() < originalSize) { |
| LOG(WARNING) << "Parameter `dims` has repeated dimensions."; |
| } |
| CAFFE_ENFORCE(dims_.front() >= 0, "Dimension ids must be non-negative."); |
| } |
| |
| bool RunOnDevice() override { |
| auto& input = Input(0); |
| auto* output = Output(0); |
| output->CopyFrom(input, &context_); |
| |
| CAFFE_ENFORCE(
| input.dims().size() >= dims_.back() + 1,
| "Input needs at least ",
| (dims_.back() + 1),
| " dimensions.");
| int j = 0; |
| std::vector<int> newDims; |
| for (int i = 0; i < input.dims().size(); ++i) { |
| if (j < dims_.size() && dims_[j] == i) { |
| CAFFE_ENFORCE( |
| input.dims()[i] == 1, |
| "Dimension ", |
| i, |
| " of input must be 1", |
| " instead of ", |
| input.dims()[i], |
| "."); |
| ++j; |
| continue; |
| } |
| newDims.push_back(input.dims().at(i)); |
| } |
| output->Reshape(newDims); |
| return true; |
| } |
| |
| private: |
| vector<int> dims_; |
| |
| public: |
| DISABLE_COPY_AND_ASSIGN(SqueezeOp); |
| }; |
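|
| // A worked example of SqueezeOp (hypothetical shapes): an input of shape
| // (1, 3, 1, 4) with dims = [0, 2] is squeezed to shape (3, 4).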
| |
| template <class Context> |
| class ExpandDimsOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| ExpandDimsOp(const OperatorDef& operator_def, Workspace* ws) |
| : Operator<Context>(operator_def, ws), |
| dims_(OperatorBase::GetRepeatedArgument<int>("dims")) { |
| auto originalSize = dims_.size(); |
| CAFFE_ENFORCE(originalSize > 0, "Parameter `dims` must be provided."); |
| std::sort(dims_.begin(), dims_.end()); |
| dims_.erase(std::unique(dims_.begin(), dims_.end()), dims_.end()); |
| if (dims_.size() < originalSize) { |
| LOG(WARNING) << "Parameter `dims` has repeated dimensions."; |
| } |
| CAFFE_ENFORCE(dims_.front() >= 0, "Dimension ids must be non-negative."); |
| } |
| |
| bool RunOnDevice() override { |
| auto& input = Input(0); |
| auto* output = Output(0); |
| output->CopyFrom(input, &context_); |
| if (dims_.empty()) { |
| return true; |
| } |
| |
| auto newDims = input.dims(); |
| CAFFE_ENFORCE_GE( |
| input.dims().size() + dims_.size(), |
| dims_.back() + 1, |
| "Input needs at least ", |
| (1 + dims_.back() - dims_.size()), |
| " dimensions given `dims`."); |
| for (const auto dim : dims_) { |
| newDims.insert(newDims.begin() + dim, 1); |
| } |
| output->Reshape(newDims); |
| return true; |
| } |
| |
| private: |
| vector<int> dims_; |
| }; |
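|
| // A worked example of ExpandDimsOp (hypothetical shapes): an input of shape
| // (3, 4) with dims = [0, 2] is expanded to shape (1, 3, 1, 4).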
| |
| template <class Context> |
| class GatherOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_SIMPLE_CTOR_DTOR(GatherOp); |
| |
| bool RunOnDevice() override { |
| return DispatchHelper<TensorTypes<int32_t, int64_t>>::call( |
| this, OperatorBase::Input<TensorCPU>(INDICES)); |
| } |
| |
| template <typename Index> |
| bool DoRunWithType() { |
| // If we end up using it on GPU, doing O(N) memcpy is probably not best :)
| // TODO: implement prefetching if it starts mattering (TF does it) |
| auto& data = Input(DATA); |
| auto& indices = Input(INDICES); |
| auto* output = Output(0); |
| |
| CAFFE_ENFORCE_GE(data.ndim(), 1, "DATA should be at least 1-D"); |
| auto shape = indices.dims(); |
| shape.insert(shape.end(), data.dims().begin() + 1, data.dims().end()); |
| output->Resize(shape); |
| |
| int block_size = data.size() / data.dim(0); |
| auto block_bytesize = data.size_from_dim(1) * data.meta().itemsize(); |
| CAFFE_ENFORCE( |
| block_bytesize == data.nbytes() / data.dim(0), |
| "block_bytesize should be consistent with data dim"); |
| int N = indices.size(); |
| |
| auto src_base = static_cast<const char*>(data.raw_data()); |
| const Index* idxs = indices.template data<Index>(); |
| auto out = static_cast<char*>(output->raw_mutable_data(data.meta())); |
| |
| for (int i = 0; i < N; ++i) { |
| auto idx = idxs[i]; |
| CAFFE_ENFORCE( |
| 0 <= idx && idx < data.dim(0), |
| "INDICES element is out of DATA bounds"); |
| auto src = src_base + idx * block_bytesize; |
| context_.template CopyItems<Context, Context>( |
| data.meta(), block_size, src, out + block_bytesize * i); |
| } |
| return true; |
| } |
| |
| INPUT_TAGS(DATA, INDICES); |
| }; |
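|
| // A worked example of GatherOp (hypothetical values):
| //   DATA = [[1, 2], [3, 4], [5, 6]], INDICES = [2, 0]
| //   OUTPUT = [[5, 6], [1, 2]]; with INDICES of shape (2, 2) the output
| //   would instead have shape (2, 2, 2).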
| |
| template <class Context> |
| class GatherRangesOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_SIMPLE_CTOR_DTOR(GatherRangesOp); |
| |
| bool RunOnDevice() override { |
| return DispatchHelper<TensorTypes<int32_t, int64_t>>::call( |
| this, OperatorBase::Input<TensorCPU>(RANGES)); |
| } |
| |
| template <typename Index> |
| bool DoRunWithType() { |
| auto& data = Input(DATA); |
| auto& ranges = Input(RANGES); |
| auto* outputData = Output(0); |
| auto* outputLengths = Output(1); |
| |
| auto batchSize = ranges.dim(0); |
| CAFFE_ENFORCE(data.ndim() == 1, "Data has to be 1-D"); |
| CAFFE_ENFORCE(ranges.ndim() == 3, "Ranges must be 3-D"); |
| CAFFE_ENFORCE(ranges.dim(1) > 0, "There has to be at least one range"); |
| CAFFE_ENFORCE_EQ( |
| ranges.dim(2), 2, "Ranges last dimension should be of size 2");
| |
| auto* rawData = static_cast<const char*>(data.raw_data()); |
| auto* rangesData = ranges.template data<Index>(); |
| |
| outputLengths->Resize(batchSize); |
| auto* outputLengthsPtr = outputLengths->template mutable_data<int32_t>(); |
| size_t start = 0; |
| size_t blockSize = ranges.size_from_dim(1); |
| for (size_t i = 0; i < batchSize; ++i) { |
| auto end = start + blockSize; |
| outputLengthsPtr[i] = accumulate(rangesData, start, end); |
| start = end; |
| } |
| |
| size_t outputSize = accumulate(rangesData, 0, ranges.size()); |
| outputData->Resize(outputSize); |
| |
| auto outputRawData = |
| static_cast<char*>(outputData->raw_mutable_data(data.meta())); |
| VLOG(1) << "Copying data"; |
| size_t outputOffsetBytes = 0; |
| auto itemsize = data.meta().itemsize(); |
| for (int i = 0; i < ranges.size(); i += 2) { |
| auto rangeStart = rangesData[i]; |
| auto rangeLength = rangesData[i + 1]; |
| if (!rangeLength) { |
| continue; |
| } |
| auto rangeSizeBytes = rangeLength * itemsize; |
| CAFFE_ENFORCE(outputOffsetBytes < outputSize * itemsize); |
| CAFFE_ENFORCE(rangeStart + rangeLength <= data.size()); |
| context_.template CopyItems<Context, Context>( |
| data.meta(), |
| rangeLength, |
| rawData + rangeStart * itemsize, |
| outputRawData + outputOffsetBytes); |
| outputOffsetBytes += rangeSizeBytes; |
| } |
| CAFFE_ENFORCE(outputOffsetBytes == outputSize * itemsize); |
| return true; |
| } |
| |
| INPUT_TAGS(DATA, RANGES, LENGTHS); |
| |
| private: |
| template <typename Index> |
| size_t accumulate(Index* ranges, size_t start, size_t end) { |
| size_t result = 0; |
| for (int i = start + 1; i < end; i += 2) { |
| result += ranges[i]; |
| } |
| return result; |
| } |
| }; |
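|
| // A worked example of GatherRangesOp (hypothetical values):
| //   DATA = [1, 2, 3, 4, 5, 6, 7, 8], RANGES = [[[0, 2], [4, 1]]]
| //   OUTPUT = [1, 2, 5] (elements 0-1 and element 4), LENGTHS = [3]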
| |
| // Since we just do copying, consider untemplating it on T and using raw_data() |
| /** |
| * Deduplicates input indices vector and optionally produces reverse remapping. |
| * Current implementation produces a sorted list but it's not guaranteed in |
| * general. |
| */ |
| template <class Context> |
| class UniqueOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_SIMPLE_CTOR_DTOR(UniqueOp); |
| |
| bool RunOnDevice() override { |
| // Use run-time polymorphism |
| auto& input = Input(0); |
| if (input.template IsType<int32_t>()) { |
| DoRun<int32_t>(); |
| } else if (input.template IsType<int64_t>()) { |
| DoRun<int64_t>(); |
| } else { |
| LOG(FATAL) << "Unsupported type of input in Unique: " |
| << input.meta().name(); |
| } |
| return true; |
| } |
| |
| private: |
| vector<int> order_; |
| |
| template <typename T> |
| void DoRun() { |
| auto& inputTensor = Input(0); |
| // use dim32 so that the remapping indices are guaranteed to fit in an int
| int N = inputTensor.dim32(0); |
| CAFFE_ENFORCE_EQ(inputTensor.ndim(), 1, "Input should be a vector"); |
| auto* uniqueTensor = Output(UNIQUE); |
| |
| int* remapping = nullptr; |
| if (REMAPPING < OutputSize()) { |
| auto* remappingTensor = Output(REMAPPING); |
| remappingTensor->ResizeLike(inputTensor); |
| remapping = remappingTensor->template mutable_data<int>(); |
| } |
| |
| const T* input = inputTensor.template data<T>(); |
| // TODO(dzhulgakov): if perf becomes an issue consider doing hash table |
| // instead of sorting |
| order_.resize(N); |
| std::iota(order_.begin(), order_.end(), 0); |
| std::sort(order_.begin(), order_.end(), [input](const int x, const int y) { |
| return input[x] < input[y]; |
| }); |
| int K = N; |
| for (int i = 1; i < N; ++i) { |
| K -= input[order_[i]] == input[order_[i - 1]]; |
| } |
| uniqueTensor->Resize(K); |
| T* unique = uniqueTensor->template mutable_data<T>(); |
| K = 0; |
| T prev = -1; |
| for (int i = 0; i < N; ++i) { |
| if (i == 0 || prev != input[order_[i]]) { |
| prev = unique[K++] = input[order_[i]]; |
| } |
| if (remapping) { |
| remapping[order_[i]] = K - 1; |
| } |
| } |
| } |
| |
| public: |
| OUTPUT_TAGS(UNIQUE, REMAPPING); |
| }; |
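|
| // A worked example of UniqueOp (hypothetical values):
| //   input = [3, 1, 3, 2]
| //   UNIQUE = [1, 2, 3], REMAPPING = [2, 0, 2, 1]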
| |
| template <class Context> |
| class UnsafeCoalesceOp final : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| using Operator<Context>::Operator; |
| |
| bool RunOnDevice() override { |
| size_t coalesced_size = 0; |
| for (int i = 0; i < InputSize(); ++i) { |
| CAFFE_ENFORCE( |
| !Input(i).meta().ctor(), |
| "Must only coalesce fundamental types, error at input: ", |
| i); |
| } |
| |
| auto roundToAlignment = [](size_t bytes) -> size_t { |
| return ((bytes + gCaffe2Alignment - 1) / gCaffe2Alignment) * |
| gCaffe2Alignment; |
| }; |
| |
| for (int i = 0; i < InputSize(); ++i) { |
| coalesced_size += roundToAlignment(Input(i).nbytes()); |
| } |
| |
| auto* coalesced = Output(OutputSize() - 1); |
| coalesced->Resize(coalesced_size); |
| math::Set<uint8_t, Context>( |
| coalesced_size, |
| 0,
| coalesced->template mutable_data<uint8_t>(), |
| &context_); |
| |
| size_t coalesced_offset = 0; |
| for (auto i = 0; i < InputSize(); ++i) { |
| const auto input_nbytes = Input(i).nbytes(); |
| context_.template CopyBytes<Context, Context>( |
| input_nbytes, |
| (const uint8_t*)Input(i).raw_data(), |
| coalesced->template mutable_data<uint8_t>() + coalesced_offset); |
| |
| // Note: this could cause Input(i) to free its data if
| // Output(i) and Input(i) alias each other. This is safe on a |
| // GPU (as the copy will happen-before the free), but it's |
| // worth mentioning. |
| |
| Output(i)->ResizeLike(Input(i)); |
| Output(i)->ShareExternalPointer( |
| coalesced->template mutable_data<uint8_t>() + coalesced_offset, |
| Input(i).meta(), |
| input_nbytes); |
| coalesced_offset += roundToAlignment(input_nbytes); |
| } |
| return true; |
| } |
| }; |
| |
| template <typename T, class Context> |
| class AccumulateHistogramOp : public Operator<Context> { |
| public: |
| AccumulateHistogramOp(const OperatorDef& def, Workspace* ws) |
| : Operator<Context>(def, ws), |
| lower_bound_( |
| OperatorBase::GetSingleArgument<float>("lower_bound", 0.0)), |
| upper_bound_( |
| OperatorBase::GetSingleArgument<float>("upper_bound", 1.0)), |
| num_buckets_(OperatorBase::GetSingleArgument<int>("num_buckets", 1)) { |
| CAFFE_ENFORCE_GT(num_buckets_, 0); |
| // 2 more for histograms < lower_bound, >= upper_bound respectively |
| num_output_buckets_ = num_buckets_ + 2; |
| accumulate_hist_ = std::vector<int64_t>(num_output_buckets_, 0); |
| } |
| |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| |
| bool RunOnDevice() override { |
| auto& X = Input(X_IN); |
| auto* X_data = X.template data<T>(); |
| int N = X.size(); |
| auto* cur_hist = Output(CUR_HIST); |
| auto* acc_hist = Output(ACC_HIST); |
| cur_hist->Resize(num_output_buckets_); |
| acc_hist->Resize(num_output_buckets_); |
| auto* cur_hist_data = cur_hist->template mutable_data<int64_t>(); |
| auto* acc_hist_data = acc_hist->template mutable_data<int64_t>(); |
| auto segment = (upper_bound_ - lower_bound_) / num_buckets_; |
| math::Set<int64_t, Context>( |
| num_output_buckets_, 0, cur_hist_data, &context_); |
| |
| for (int i = 0; i < N; i++) { |
| int bucket_index = -1; |
| if (X_data[i] < lower_bound_) { |
| bucket_index = 0; |
| } else if (X_data[i] >= upper_bound_) { |
| bucket_index = num_buckets_ + 1; |
| } else { |
| bucket_index = (int)((X_data[i] - lower_bound_) / segment) + 1; |
| } |
| cur_hist_data[bucket_index] += 1; |
| accumulate_hist_[bucket_index] += 1; |
| } |
| |
| for (int i = 0; i < num_output_buckets_; i++) { |
| acc_hist_data[i] = accumulate_hist_[i]; |
| } |
| |
| return true; |
| } |
| |
| private: |
| float lower_bound_; |
| float upper_bound_; |
| int num_buckets_; |
| int num_output_buckets_; |
| std::vector<int64_t> accumulate_hist_; |
| |
| INPUT_TAGS(X_IN); |
| OUTPUT_TAGS(CUR_HIST, ACC_HIST); |
| }; |
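|
| // A worked example of AccumulateHistogramOp (hypothetical values): with
| // lower_bound = 0, upper_bound = 1 and num_buckets = 2, the four output
| // buckets are (-inf, 0), [0, 0.5), [0.5, 1) and [1, inf). For
| // X = [-0.5, 0.3, 0.7, 1.2], CUR_HIST = [1, 1, 1, 1], and ACC_HIST holds
| // these counts accumulated across all calls so far.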
| } // namespace caffe2 |
| |
| #endif // CAFFE2_OPERATORS_UTILITY_OPS_H_ |