|  | #include "caffe2/operators/concat_split_op.h" | 
|  |  | 
|  | namespace caffe2 { | 
|  | namespace { | 
|  | std::pair<std::vector<DeviceOption>, std::vector<DeviceOption>> splitOpDevInfer( | 
|  | const OperatorDef& def) { | 
|  | auto op_device = | 
|  | def.has_device_option() ? def.device_option() : DeviceOption(); | 
|  | vector<DeviceOption> in_dev(def.input_size(), op_device); | 
|  | vector<DeviceOption> out_dev(def.output_size(), op_device); | 
|  |  | 
|  | // If we obtain split from input tensor, then 2nd input's type is always CPU. | 
|  | if (def.input_size() == SplitOp<CPUContext>::kSplitOpInputSize) { | 
|  | CAFFE_ENFORCE_GT(in_dev.size(), 1); | 
|  | in_dev[1] = DeviceOption(); | 
|  | } | 
|  | return std::make_pair(in_dev, out_dev); | 
|  | } | 
|  |  | 
|  | vector<TensorShape> TensorInferenceForSplit( | 
|  | const OperatorDef& def, | 
|  | const vector<TensorShape>& in) { | 
|  | auto ret_invalid_shape = [&def]() { | 
|  | vector<TensorShape> out(def.output().size()); | 
|  | for (auto& out_ts : out) { | 
|  | out_ts.set_unknown_shape(true); | 
|  | } | 
|  | return out; | 
|  | }; | 
|  | // We only support shape inference of Split with 1 input | 
|  | if (def.input_size() != 1 || in.empty() || in.front().unknown_shape()) { | 
|  | return ret_invalid_shape(); | 
|  | } else if (def.output_size() == 0) { | 
|  | return vector<TensorShape>(); | 
|  | } | 
|  | ArgumentHelper helper(def); | 
|  | const int axis = helper.HasArgument("axis") | 
|  | ? helper.GetSingleArgument<int>("axis", -1) | 
|  | : GetDimFromOrderString( | 
|  | helper.GetSingleArgument<string>("order", "NCHW")); | 
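  // Note: `add_axis` is only honored when `axis` is given explicitly (see the
  // guard below); it is primarily used when computing the gradient of a
  // Concat that used `add_axis`.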
  const int add_axis = helper.HasArgument("axis")
      ? helper.GetSingleArgument<int>("add_axis", 0)
      : 0;
  const auto& input = in[0];
  const int canonical_axis = canonical_axis_index_(axis, input.dims_size());
  const int input_channels = input.dims(canonical_axis);
  auto split = helper.GetRepeatedArgument<int>("split");
  // Equally split the input into outputs
  const int output_size = def.output_size();
  if (def.input_size() == caffe2::SplitOp<CPUContext>::kSplitOpInputSize) {
    if (!split.empty()) {
      LOG(WARNING) << "If you set split with an input blob, do not pass in "
                      "split in the argument.";
    }
    // We cannot infer output shape until we see the value of split input
    return ret_invalid_shape();
  } else if (split.empty()) {
    if (input_channels % output_size != 0) {
      LOG(WARNING) << "Input channels (" << input_channels
                   << ") should be divisible by number of outputs ("
                   << output_size << ")";
      return ret_invalid_shape();
    }
    split.resize(output_size, input_channels / output_size);
    // NOLINTNEXTLINE(clang-diagnostic-sign-compare)
  } else if (split.size() != output_size) {
    LOG(WARNING) << "`split` size (" << split.size()
                 << ") should be equal to output size (" << output_size << ")";
    return ret_invalid_shape();
  }

  // Check validity of the split
  const int total_channels = add_axis
      ? def.output_size()
      : std::accumulate(split.begin(), split.begin() + output_size, 0);
  if (total_channels != input_channels) {
    LOG(WARNING) << "Input channels (" << input_channels
                 << ") is not equal to total output channels ("
                 << total_channels << ")";
    return ret_invalid_shape();
  }

  vector<int> output_dims(input.dims().begin(), input.dims().end());
  if (add_axis) {
    output_dims.erase(output_dims.begin() + canonical_axis);
  }
  vector<TensorShape> output_shapes;
  for (int i = 0; i < output_size; ++i) {
    if (!add_axis) {
      output_dims[canonical_axis] = split[i];
    }
    output_shapes.emplace_back(
        CreateTensorShape(output_dims, input.data_type()));
  }
  return output_shapes;
}

OpSchema::Cost CostInferenceForSplit(
    const OperatorDef&,
    const vector<TensorShape>& in) {
  CAFFE_ENFORCE_GT(in.size(), 0);
  struct OpSchema::Cost cost;
  cost.flops = 0;
  auto const& input_0_element_size_byte =
      DataTypeToTypeMeta(in[0].data_type()).itemsize();
  auto input_bytes_count = nElemFromDim(in[0]) * input_0_element_size_byte;
  auto split_bytes_count = in.size() > 1
      ? nElemFromDim(in[1]) * DataTypeToTypeMeta(in[1].data_type()).itemsize()
      : 0;
  // There can be two input blobs:
  // (1) actual tensor to be split
  // (2) lengths of outputs along split axis
  // So, bytes_read is the sum of the bytes in the two blobs.
  cost.bytes_read = input_bytes_count + split_bytes_count;
  // Split operator only changes shape, does not change element count. So,
  // bytes_written is same as input_bytes_count.
  cost.bytes_written = input_bytes_count;
  cost.params_bytes = 0;
  return cost;
}
} // namespace

REGISTER_CPU_OPERATOR(Split, SplitOp<CPUContext>);
REGISTER_CPU_OPERATOR(SplitByLengths, SplitByLengthsOp<CPUContext>);
OPERATOR_SCHEMA(Split)
    .NumInputs(1, 2)
    .NumOutputs(1, INT_MAX)
    .Input(0, "input", "(*Tensor*): tensor to split")
    .Input(
        1,
        "split",
        "(*Tensor`<int>`*): [OPTIONAL] list of output lengths (see also arg `split`)")
    .Arg("axis", "(*int*): axis to split on")
    .Arg(
        "add_axis",
        "*(type: int)* Pass a non-zero integer to remove the dimension specified in `axis` from the output tensors.")
    .Arg("split", "(*Tuple(int)*): length of each output")
    .Arg(
        "order",
        // NOLINTNEXTLINE(modernize-raw-string-literal)
        "(*string*): order of dimensions of input and output blobs; either \"NCHW\" or \"NHWC\"")
    .Output(0, "[output_0, output_1, ...]", "(*Tensor*): output tensor")
    .TensorInferenceFunction(TensorInferenceForSplit)
    .CostInferenceFunction(CostInferenceForSplit)
    .DeviceInferenceFunction(splitOpDevInfer)
    .SetDoc(R"DOC(
Split an `input` tensor into a list of tensors, along the axis specified by the `axis` argument. The lengths of the split can be specified using the `split` argument or the optional second input blob to the operator. Otherwise, the tensor is split into equal-sized parts.

Github Links:
- https://github.com/pytorch/pytorch/blob/main/caffe2/operators/concat_split_op.cc

<details>

<summary> <b>Example</b> </summary>

**Code**

```

workspace.ResetWorkspace()

op = core.CreateOperator(
    "Split",
    ["input"],
    ["output_0","output_1","output_2"],
    split=(3,2,4),
    axis=0
)

workspace.FeedBlob("input", np.random.randint(10, size=(9)))
print("input:", workspace.FetchBlob("input"))
workspace.RunOperatorOnce(op)
print("output_0:", workspace.FetchBlob("output_0"))
print("output_1:", workspace.FetchBlob("output_1"))
print("output_2:", workspace.FetchBlob("output_2"))

```

**Result**

```

input: [2 2 6 6 6 0 5 7 4]
output_0: [2 2 6]
output_1: [6 6]
output_2: [0 5 7 4]

```

</details>

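<details>

<summary> <b>Example 2: equal split</b> </summary>

If neither the `split` argument nor the second input blob is given, the input is split into equal-sized parts along `axis`. This is a minimal sketch assuming the same `workspace`/`core`/`np` setup as the example above; the result follows directly from the fixed input and the equal-split rule.

**Code**

```

workspace.ResetWorkspace()

# No `split` argument and no second input: the 9 elements are divided
# evenly over the 3 outputs, 3 elements each.
op = core.CreateOperator(
    "Split",
    ["input"],
    ["output_0","output_1","output_2"],
    axis=0
)

workspace.FeedBlob("input", np.arange(9))
workspace.RunOperatorOnce(op)
print("output_0:", workspace.FetchBlob("output_0"))
print("output_1:", workspace.FetchBlob("output_1"))
print("output_2:", workspace.FetchBlob("output_2"))

```

**Result**

```

output_0: [0 1 2]
output_1: [3 4 5]
output_2: [6 7 8]

```

</details>
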
|  | )DOC") | 
|  | .InheritOnnxSchema(); | 
|  |  | 
|  | OPERATOR_SCHEMA(SplitByLengths) | 
|  | .NumInputs(2) | 
|  | .NumOutputs(1, INT_MAX) | 
|  | .Input(0, "input", "The tensor to split") | 
|  | .Input(1, "legnths", "The tensor `l_i` indicates the logic block of input.") | 
|  | .Arg("axis", "Which axis to split on") | 
|  | .Arg("order", "Either NHWC or NCWH, will split on C axis, defaults to NCHW") | 
|  | .Arg( | 
|  | "use_scaling_lengths", | 
|  | "(*bool*): Enables automatic scaling of the lengths values. When enabled " | 
|  | "will automatically find a value K >= 1, such that sum(lengths) * K == len(input).") | 
|  | .DeviceInferenceFunction([](const OperatorDef& def) { | 
|  | auto op_device = | 
|  | def.has_device_option() ? def.device_option() : DeviceOption(); | 
|  | vector<DeviceOption> in_dev(def.input_size(), op_device); | 
|  | vector<DeviceOption> out_dev(def.output_size(), op_device); | 
|  | // lengths input should be on CPU | 
|  | in_dev[1] = DeviceOption(); | 
|  | return std::make_pair(in_dev, out_dev); | 
|  | }) | 
|  | .SetDoc(R"DOC( | 
|  | Split a tensor into a list of tensors, given a lengths input, along the specified | 
|  | 'axis'. If `K` outputs are provided, the op assumes `len(lengths) % K == 0`. | 
|  | The `input` will be split into `K` parts. Each part of length | 
|  | `sum(lengths[i*k:i*k+k))` | 
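For example, with `lengths = [1, 2, 3, 4]` and `K = 2` outputs, `k = 2`, so
`output_0` has length `1 + 2 = 3` and `output_1` has length `3 + 4 = 7`.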

<details>

<summary> <b>Example 1</b> </summary>

**Code**

```

workspace.ResetWorkspace()

op = core.CreateOperator(
    "SplitByLengths",
    ["input", "lengths"],
    ["output_0","output_1","output_2"],
    axis=0
)

workspace.FeedBlob("input", np.random.randint(10, size=(9)))
workspace.FeedBlob("lengths", np.array([3,2,4], dtype=np.int32))
print("input:", workspace.FetchBlob("input"))
print("lengths:", workspace.FetchBlob("lengths"))
workspace.RunOperatorOnce(op)
print("output_0:", workspace.FetchBlob("output_0"))
print("output_1:", workspace.FetchBlob("output_1"))
print("output_2:", workspace.FetchBlob("output_2"))

```

**Result**

```

input: [2 2 6 6 6 0 5 7 4]
lengths: [3 2 4]
output_0: [2 2 6]
output_1: [6 6]
output_2: [0 5 7 4]

```

</details>

<details>

<summary> <b>Example 2</b> </summary>

**Code**

```

workspace.ResetWorkspace()

op = core.CreateOperator(
    "SplitByLengths",
    ["input", "lengths"],
    ["output_0","output_1","output_2"],
    axis=0,
    use_scaling_lengths=True,
)

workspace.FeedBlob("input", np.random.randint(10, size=(9)))
workspace.FeedBlob("lengths", np.array([1,1,1], dtype=np.int32))
print("input:", workspace.FetchBlob("input"))
print("lengths:", workspace.FetchBlob("lengths"))
workspace.RunOperatorOnce(op)
print("output_0:", workspace.FetchBlob("output_0"))
print("output_1:", workspace.FetchBlob("output_1"))
print("output_2:", workspace.FetchBlob("output_2"))

```

**Result**

```

input: [2 2 6 6 6 0 5 7 4]
lengths: [1 1 1]
output_0: [2 2 6]
output_1: [6 6 0]
output_2: [5 7 4]

```

</details>

)DOC");

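// Cost inference for Concat: every element of every input is read once and
// the concatenated result is written once, together with the small int32
// split_info blob; no floating-point work is done.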
OpSchema::Cost CostInferenceForConcat(
    const OperatorDef& def,
    const vector<TensorShape>& in) {
  ArgumentHelper helper(def);
  const int axis = helper.HasArgument("axis")
      ? helper.GetSingleArgument<int>("axis", -1)
      : GetDimFromOrderString(
            helper.GetSingleArgument<string>("order", "NCHW"));
  bool add_axis = helper.GetSingleArgument<int>("add_axis", 0) != 0;
  int adj_size = in[0].dims_size() + (add_axis ? 1 : 0);
  const int canonical_axis = canonical_axis_index_(axis, adj_size);
  CAFFE_ENFORCE_LT(canonical_axis, adj_size, "Axis not in input ndim range.");
  CAFFE_ENFORCE_GT(in.size(), 0);
  vector<int> out_shape(in[0].dims().begin(), in[0].dims().end());
  if (add_axis) {
    out_shape.insert(out_shape.begin() + canonical_axis, in.size());
  } else {
    for (size_t i = 1; i < in.size(); ++i) {
      out_shape[canonical_axis] += in[i].dims(canonical_axis);
    }
  }
  uint64_t nElemRead = 0;
  // NOLINTNEXTLINE(modernize-loop-convert,clang-diagnostic-sign-compare)
  for (int i = 0; i < in.size(); ++i) {
    nElemRead += nElemFromDim(in[i]);
  }
  int size = 1;
  for (auto& s : out_shape) {
    size *= s;
  }
  auto split_info_bytes_count = in.size() * sizeof(int);

  auto const& input_0_element_size_byte =
      DataTypeToTypeMeta(in[0].data_type()).itemsize();
  struct OpSchema::Cost cost;
  cost.flops = 0;
  cost.bytes_read = nElemRead * input_0_element_size_byte;
  cost.bytes_written =
      size * input_0_element_size_byte + split_info_bytes_count;
  cost.params_bytes = 0;
  return cost;
}

namespace {
std::pair<std::vector<DeviceOption>, std::vector<DeviceOption>>
concatOpDevInfer(const OperatorDef& def) {
  auto op_device =
      def.has_device_option() ? def.device_option() : DeviceOption();
  vector<DeviceOption> in_dev(def.input_size(), op_device);
  vector<DeviceOption> out_dev(def.output_size(), op_device);

  // 2nd output's type is always CPU irrespective of op's device option.
  CAFFE_ENFORCE_GT(out_dev.size(), 1);
  out_dev[1] = DeviceOption();
  return std::make_pair(in_dev, out_dev);
}
} // namespace

vector<TensorShape> TensorInferenceForConcat(
    const OperatorDef& def,
    const vector<TensorShape>& in) {
  ArgumentHelper helper(def);
  const int axis = helper.HasArgument("axis")
      ? helper.GetSingleArgument<int>("axis", -1)
      : GetDimFromOrderString(
            helper.GetSingleArgument<string>("order", "NCHW"));
  bool add_axis = helper.GetSingleArgument<int>("add_axis", 0) != 0;
  int adj_size = in[0].dims_size() + (add_axis ? 1 : 0);
  const int canonical_axis = canonical_axis_index_(axis, adj_size);
  CAFFE_ENFORCE_LT(canonical_axis, adj_size, "Axis not in input ndim range.");
  CAFFE_ENFORCE_GT(in.size(), 0);
  vector<int> split_shape(1, in.size());
  vector<int> out_shape(in[0].dims().begin(), in[0].dims().end());
  if (add_axis) {
    // NOLINTNEXTLINE(clang-diagnostic-sign-compare)
    for (int i = 1; i < in.size(); ++i) {
      CAFFE_ENFORCE_EQ(
          in[0].dims().size(),
          in[i].dims().size(),
          "All inputs of Concat should have same dims when add_axis = 1. "
          "Got different sizes for inputs 0 and ",
          i);
      for (int j = 0; j < in[0].dims().size(); ++j) {
        CAFFE_ENFORCE_EQ(
            in[0].dims(j),
            in[i].dims(j),
            "All inputs of Concat should have same dims when add_axis = 1. "
            "Got different dims for inputs 0 and ",
            i,
            ". At dim: ",
            j);
      }
    }
    out_shape.insert(out_shape.begin() + canonical_axis, in.size());
  } else {
    // NOLINTNEXTLINE(clang-diagnostic-sign-compare)
    for (int i = 1; i < in.size(); ++i) {
      CAFFE_ENFORCE(
          in[0].dims_size() == in[i].dims_size() ||
              (canonical_axis == in[0].dims_size() - 1 &&
               in[0].dims_size() == in[i].dims_size() + 1),
          "All inputs of Concat should have same dims except "
          "canonical_axis dim that is equal to ",
          canonical_axis,
          ". Got different sizes for inputs 0 and ",
          i);
      for (int j = 0; j < in[0].dims_size(); ++j) {
        if (j == canonical_axis) {
          continue;
        }
        CAFFE_ENFORCE_EQ(
            in[0].dims(j),
            in[i].dims(j),
            "All inputs of Concat should have same dims except "
            "canonical_axis dim that is equal to ",
            canonical_axis,
            ". Got different dims for inputs 0 and ",
            i,
            ". At dim: ",
            j);
      }
    }

    // NOLINTNEXTLINE(clang-diagnostic-sign-compare)
    for (int i = 1; i < in.size(); ++i) {
      out_shape[canonical_axis] += in[i].dims(canonical_axis);
    }
  }
  if (def.output_size() == 1) {
    return vector<TensorShape>{CreateTensorShape(out_shape, in[0].data_type())};
  }
  return vector<TensorShape>{
      CreateTensorShape(out_shape, in[0].data_type()),
      CreateTensorShape(split_shape, TensorProto::INT32)};
}

REGISTER_CPU_OPERATOR(Concat, ConcatOp<CPUContext>);
OPERATOR_SCHEMA(Concat)
    .NumInputs(1, INT_MAX)
    .NumOutputs(2)
    .Arg("axis", "*(type: int; default: -1)* Axis to concatenate on.")
    .Arg(
        "order",
        "*(type: string; default='NCHW')* Order of blob dimensions. Concats on the C dimension.")
    .Arg(
        "add_axis",
        "*(type: int)* Pass non-zero integer to add the axis specified in `axis` to all input tensors.")
    .TensorInferenceFunction(
        OpSchema::NeedsAllInputShapes(TensorInferenceForConcat))
    .CostInferenceFunction(CostInferenceForConcat)
    .DeviceInferenceFunction(concatOpDevInfer)
    .SetDoc(R"DOC(
Concatenate a list of tensors into a single tensor. Similar functionality to
Numpy's [concatenate](https://docs.scipy.org/doc/numpy/reference/generated/numpy.concatenate.html)
function. The `axis` argument specifies the axis along which the arrays will be
concatenated. Setting the `add_axis` argument to a non-zero value (default 0) adds
the axis specified in `axis` to all input tensors, so the inputs are stacked along
that new axis. The second output, `split_info`, records the size of each input
along the concatenation axis.

Github Links:

- https://github.com/pytorch/pytorch/blob/main/caffe2/operators/concat_split_op.cc
- https://github.com/pytorch/pytorch/blob/main/caffe2/operators/concat_split_op.h


<details>

<summary> <b>Example</b> </summary>

**Code**

```

workspace.ResetWorkspace()

op = core.CreateOperator(
    "Concat",
    ["X1",  "X2"],
    ["Y", "split_info"],
    axis=0
)

workspace.FeedBlob("X1", np.array([[1,2],[3,4]]))
workspace.FeedBlob("X2", np.array([[5,6]]))
print("X1:", workspace.FetchBlob("X1"))
print("X2:", workspace.FetchBlob("X2"))
workspace.RunOperatorOnce(op)
print("Y:", workspace.FetchBlob("Y"))
print("split_info:", workspace.FetchBlob("split_info"))

```

**Result**

```

X1: [[1 2]
 [3 4]]
X2: [[5 6]]
Y: [[1 2]
 [3 4]
 [5 6]]
split_info: [2 1]

```

</details>

<details>

<summary> <b>Example 2</b> </summary>

**Code**

```

workspace.ResetWorkspace()

op = core.CreateOperator(
    "Concat",
    ["X1",  "X2"],
    ["Y", "split_info"],
    add_axis=1,
    axis=3
)

workspace.FeedBlob("X1", np.random.randint(10, size=(1, 1, 5, 5)))  # NCHW
workspace.FeedBlob("X2", np.random.randint(10, size=(1, 1, 5, 5)))  # NCHW
print("X1:", workspace.FetchBlob("X1"))
print("X2:", workspace.FetchBlob("X2"))
workspace.RunOperatorOnce(op)
print("Y:", workspace.FetchBlob("Y"))
print("split_info:", workspace.FetchBlob("split_info"))

```

**Result**

```

X1: [[[[1 8 3 9 0]
   [6 4 6 5 6]
   [3 9 1 9 9]
   [5 1 0 7 7]
   [9 4 0 0 9]]]]
X2: [[[[7 0 2 6 1]
   [3 9 4 0 3]
   [5 3 8 9 4]
   [3 4 2 1 0]
   [0 8 8 8 1]]]]
Y: [[[[[1 8 3 9 0]
    [7 0 2 6 1]]

   [[6 4 6 5 6]
    [3 9 4 0 3]]

   [[3 9 1 9 9]
    [5 3 8 9 4]]

   [[5 1 0 7 7]
    [3 4 2 1 0]]

   [[9 4 0 0 9]
    [0 8 8 8 1]]]]]
split_info: [1 1]

```

</details>

)DOC")
|  | .Input(0, "X1, X2, ...", "*(type: Tensor`<float>`)* List of input tensors.") | 
|  | .Output( | 
|  | 0, | 
|  | "concat_result", | 
|  | "*(type: Tensor`<float>`)* Concatenated tensor.") | 
|  | .Output( | 
|  | 1, | 
|  | "split_info", | 
|  | "*(type: Tensor`<int>`)* The dimensions of the inputs.") | 
|  | .InheritOnnxSchema(); | 
|  |  | 
|  | // Backward compatibility names. | 
|  | REGISTER_CPU_OPERATOR(DepthSplit, SplitOp<CPUContext>); | 
|  | REGISTER_CPU_OPERATOR(DepthConcat, ConcatOp<CPUContext>); | 
|  | OPERATOR_SCHEMA(DepthSplit) | 
|  | .NumInputs(1, 2) | 
|  | .NumOutputs(1, INT_MAX) | 
|  | .SetDoc("Backward compatible operator name for Split."); | 
|  | OPERATOR_SCHEMA(DepthConcat) | 
|  | .NumInputs(1, INT_MAX) | 
|  | .NumOutputs(2) | 
|  | .SetDoc("Backward compatible operator name for Concat."); | 
|  |  | 
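// Gradient for Split: concatenate the available output gradients back along
// the same axis to recover the gradient of the input; the extra "_dims" blob
// produced by Concat is only an intermediate.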
class GetSplitGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    vector<string> output_grads;
    for (int i = 0; i < def_.output_size(); ++i) {
      if (!GradOut(i).IsEmpty()) {
        output_grads.push_back(GO(i));
      }
    }
    if (output_grads.empty()) {
      return {};
    }
    return SingleGradientDef(
        "Concat",
        "",
        output_grads,
        vector<string>{GI(0), "_" + GI(0) + "_dims"});
  }
};
REGISTER_GRADIENT(Split, GetSplitGradient);
REGISTER_GRADIENT(DepthSplit, GetSplitGradient);
REGISTER_GRADIENT(SplitByLengths, GetSplitGradient);

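// Gradient for Concat: split the output gradient back into per-input
// gradients, using the split_info blob recorded by the forward pass as the
// split sizes.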
class GetConcatGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    if (GradOut(0).IsEmpty()) {
      return {};
    }
    vector<string> grads;
    for (int i = 0; i < def_.input_size(); ++i) {
      // NOLINTNEXTLINE(performance-inefficient-vector-operation)
      grads.push_back(GI(i));
    }
    return SingleGradientDef("Split", "", vector<string>{GO(0), O(1)}, grads);
  }
};
REGISTER_GRADIENT(Concat, GetConcatGradient);
REGISTER_GRADIENT(DepthConcat, GetConcatGradient);
} // namespace caffe2