| /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. |
| |
| Licensed under the Apache License, Version 2.0 (the "License"); |
| you may not use this file except in compliance with the License. |
| You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| ==============================================================================*/ |
| |
| // See docs in ../ops/math_ops.cc. |
| |
| #define EIGEN_USE_THREADS |
| |
| #include <numeric> |
| |
| #include "tensorflow/core/kernels/aggregate_ops.h" |
| #include "tensorflow/core/kernels/aggregate_ops_cpu.h" |
| |
| #include "tensorflow/core/framework/numeric_op.h" |
| #include "tensorflow/core/framework/register_types.h" |
| #include "tensorflow/core/framework/variant.h" |
| #include "tensorflow/core/framework/variant_encode_decode.h" |
| #include "tensorflow/core/framework/variant_op_registry.h" |
| #include "tensorflow/core/lib/gtl/inlined_vector.h" |
| #include "tensorflow/core/platform/logging.h" |
| |
| namespace tensorflow { |
| |
| typedef Eigen::ThreadPoolDevice CPUDevice; |
| typedef Eigen::GpuDevice GPUDevice; |
| #ifdef TENSORFLOW_USE_SYCL |
| typedef Eigen::SyclDevice SYCLDevice; |
| #endif // TENSORFLOW_USE_SYCL |
| |
| template <typename Device, typename T> |
| class AddNOp : public OpKernel { |
| public: |
| explicit AddNOp(OpKernelConstruction* context) : OpKernel(context) {} |
| |
| void Compute(OpKernelContext* ctx) override { |
| if (!ctx->ValidateInputsAreSameShape(this)) return; |
| |
| const Tensor& input0 = ctx->input(0); |
| const int num = ctx->num_inputs(); |
| |
| if (num == 1) { |
| ctx->set_output(0, input0); |
| return; |
| } |
| |
| // Try to forward and accumulate the result in one of the input buffers. |
| int reused_input = -1; |
| gtl::InlinedVector<int, 8> input_indices(num); |
| std::iota(input_indices.begin(), input_indices.end(), 0); |
| Tensor* output = nullptr; |
| for (int input_idx = 0; input_idx < num; ++input_idx) { |
| if (ctx->forward_input_to_output_with_shape(input_idx, 0, input0.shape(), |
| &output)) { |
| reused_input = input_idx; |
| break; |
| } |
| } |
| if (reused_input == -1) { |
| OP_REQUIRES_OK(ctx, ctx->allocate_output(0, input0.shape(), &output)); |
| } else if (reused_input > 0) { |
| // Move the forwarded buffer to the front so we don't double count |
| // anything if there are more than 8 inputs. |
| input_indices[0] = reused_input; |
| input_indices[reused_input] = 0; |
| } |
| auto To = output->flat<T>(); |
| |
| #define I(IDX) ctx->input(input_indices[IDX]).flat<T>() |
| |
| #if defined(__ANDROID_TYPES_SLIM__) |
| // On Android by default,we only support additions of two arguments, so we |
| // can reduce the number of template instantiations. |
| OP_REQUIRES(ctx, num == 2, |
| errors::InvalidArgument("Only additions of two arguments " |
| "supported. Num inputs: ", |
| num)); |
| functor::Add2Functor<Device, T> functor2; |
| functor2(ctx->template eigen_device<Device>(), To, I(0), I(1)); |
| #else |
| static const int kWidth = 8; |
| int r = num % kWidth; |
| |
| switch (r) { |
| case 2: { |
| functor::Add2Functor<Device, T> functor2; |
| functor2(ctx->template eigen_device<Device>(), To, I(0), I(1)); |
| break; |
| } |
| case 3: { |
| functor::Add3Functor<Device, T> functor3; |
| functor3(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2)); |
| break; |
| } |
| case 4: { |
| functor::Add4Functor<Device, T> functor4; |
| functor4(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2), |
| I(3)); |
| break; |
| } |
| case 5: { |
| functor::Add5Functor<Device, T> functor5; |
| functor5(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2), |
| I(3), I(4)); |
| break; |
| } |
| case 6: { |
| functor::Add6Functor<Device, T> functor6; |
| functor6(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2), |
| I(3), I(4), I(5)); |
| break; |
| } |
| case 7: { |
| functor::Add7Functor<Device, T> functor7; |
| functor7(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2), |
| I(3), I(4), I(5), I(6)); |
| break; |
| } |
| case 0: { |
| functor::Add8Functor<Device, T> functor8; |
| functor8(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2), |
| I(3), I(4), I(5), I(6), I(7)); |
| r = 8; |
| break; |
| } |
| case 1: { |
| functor::Add9Functor<Device, T> functor9; |
| functor9(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2), |
| I(3), I(4), I(5), I(6), I(7), I(8)); |
| r = 9; |
| break; |
| } |
| } |
| |
| for (; r < num; r += kWidth) { |
| functor::Add8pFunctor<Device, T> functor8p; |
| functor8p(ctx->template eigen_device<Device>(), To, I(r), I(r + 1), |
| I(r + 2), I(r + 3), I(r + 4), I(r + 5), I(r + 6), I(r + 7)); |
| } |
| #endif // defined(__ANDROID_TYPES_SLIM__) |
| |
| #undef I |
| } |
| }; |
| |
| template <typename Device> |
| class AddNOp<Device, Variant> : public OpKernel { |
| public: |
| explicit AddNOp(OpKernelConstruction* context) : OpKernel(context) {} |
| |
| void Compute(OpKernelContext* ctx) override { |
| if (!ctx->ValidateInputsAreSameShape(this)) return; |
| |
| const Tensor& input0 = ctx->input(0); |
| const int num = ctx->num_inputs(); |
| |
| if (num == 1) { |
| ctx->set_output(0, input0); |
| return; |
| } |
| |
| for (int i = 0; i < num; ++i) { |
| // Step 1: ensure unary variants. |
| OP_REQUIRES( |
| ctx, ctx->input(i).dims() == 0, |
| errors::InvalidArgument( |
| "AddN of non-scalar Tensor with dtype=DT_VARIANT is not " |
| "supported; inputs[", |
| i, " has shape: ", ctx->input(i).shape().DebugString(), ".")); |
| } |
| |
| // Step 2: attempt to add using |
| // BinaryOpVariants(ADD_VARIANT_BINARY_OP, ...) |
| // For the output create a default-constructed variant object. |
| // TODO(ebrevdo): Perform summation in a tree-structure. |
| Tensor out(cpu_allocator(), DT_VARIANT, TensorShape({})); |
| Variant* v_out = &(out.scalar<Variant>()()); |
| OP_REQUIRES_OK( |
| ctx, BinaryOpVariants<Device>( |
| ctx, ADD_VARIANT_BINARY_OP, ctx->input(0).scalar<Variant>()(), |
| ctx->input(1).scalar<Variant>()(), v_out)); |
| for (int i = 2; i < num; ++i) { |
| const Variant tmp = std::move(*v_out); |
| const Variant& inp = ctx->input(i).scalar<Variant>()(); |
| OP_REQUIRES_OK(ctx, BinaryOpVariants<Device>(ctx, ADD_VARIANT_BINARY_OP, |
| inp, tmp, v_out)); |
| } |
| ctx->set_output(0, out); |
| } |
| }; |
| |
| #define REGISTER_ADDN(type, dev) \ |
| REGISTER_KERNEL_BUILDER( \ |
| Name("AddN").Device(DEVICE_##dev).TypeConstraint<type>("T"), \ |
| AddNOp<dev##Device, type>) |
| |
| #define REGISTER_ADDN_CPU(type) REGISTER_ADDN(type, CPU) |
| |
| TF_CALL_NUMBER_TYPES(REGISTER_ADDN_CPU); |
| REGISTER_ADDN_CPU(Variant); |
| |
| #undef REGISTER_ADDN_CPU |
| |
| #if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \ |
| (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM) |
| #define REGISTER_ADDN_GPU(type) REGISTER_ADDN(type, GPU) |
| TF_CALL_GPU_NUMBER_TYPES(REGISTER_ADDN_GPU); |
| TF_CALL_int64(REGISTER_ADDN_GPU); |
| TF_CALL_complex64(REGISTER_ADDN_GPU); |
| TF_CALL_complex128(REGISTER_ADDN_GPU); |
| TF_CALL_variant(REGISTER_ADDN_GPU); |
| #undef REGISTER_ADDN_GPU |
| |
| // A special GPU kernel for int32. |
| // TODO(b/25387198): Also enable int32 in device memory. This kernel |
| // registration requires all int32 inputs and outputs to be in host memory. |
| REGISTER_KERNEL_BUILDER(Name("AddN") |
| .Device(DEVICE_GPU) |
| .TypeConstraint<int32>("T") |
| .HostMemory("inputs") |
| .HostMemory("sum"), |
| AddNOp<CPUDevice, int32>); |
| |
| #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM |
| |
| #ifdef TENSORFLOW_USE_SYCL |
| REGISTER_ADDN(float, SYCL); |
| REGISTER_ADDN(double, SYCL); |
| |
| // A special GPU kernel for int32. |
| // TODO(b/25387198): Also enable int32 in device memory. This kernel |
| // registration requires all int32 inputs and outputs to be in host memory. |
| REGISTER_KERNEL_BUILDER(Name("AddN") |
| .Device(DEVICE_SYCL) |
| .TypeConstraint<int32>("T") |
| .HostMemory("inputs") |
| .HostMemory("sum"), |
| AddNOp<CPUDevice, int32>); |
| #endif // TENSORFLOW_USE_SYCL |
| |
| #undef REGISTER_ADDN |
| |
| } // namespace tensorflow |