/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include <math.h>

#include <algorithm>
#include <cstdint>
#include <memory>
#include <new>
#include <optional>
#include <random>
#include <type_traits>
#include <utility>

#define EIGEN_USE_THREADS

#include "absl/types/span.h"
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/compiler/xla/array2d.h"
#include "tensorflow/compiler/xla/client/client_library.h"
#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/primitive_util.h"
#include "tensorflow/compiler/xla/service/hlo_computation.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
#include "tensorflow/compiler/xla/service/hlo_module.h"
#include "tensorflow/compiler/xla/service/hlo_opcode.h"
#include "tensorflow/compiler/xla/service/platform_util.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
#include "tensorflow/compiler/xla/tests/literal_test_util.h"
#include "tensorflow/compiler/xla/tests/test_macros.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/protobuf.h"
#include "tensorflow/core/platform/test_benchmark.h"
namespace xla {
namespace {
const int test_width = 2, test_height = 3;
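// Three 2x3 operand value patterns; an Arity-ary elementwise test consumes
// the first Arity of them.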
const float test_float_vals[3][test_width][test_height] = {
{{-1.0, -1.0, 1.0}, {-3.0, 0.0, -1.0}},
{{-3.0, 2.0, 1.0}, {0.0, -3.0, 1.0}},
{{-3.0, 0.0, -3.0}, {-1.0, -2.0, 1.0}}};
// Tests that fusion operations are emitted without errors and compute
// accurate outputs.
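//
// For example, TestElementwise2D<float, 2>(HloOpcode::kAdd) builds a module
// whose entry computation is, roughly (a sketch, not verbatim HLO text), a
// single loop fusion with the constants cloned into the fused computation:
//
//   fusion = f32[2,3] fusion(), kind=kLoop, calls=
//     { c0 = f32[2,3] constant(...)
//       c1 = f32[2,3] constant(...)
//       ROOT add = f32[2,3] add(c0, c1) }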
class CpuGpuFusionTest : public HloTestBase {
protected:
template <typename T, int Arity>
void TestElementwise2D(
HloOpcode opcode,
std::optional<ComparisonDirection> direction = std::nullopt) {
// Comparison opcodes produce a bool result and require a comparison
// direction; detect them via the result type.
bool is_compare = std::is_same<T, bool>::value;
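// The operand_data elements are default-constructed empty; re-initialize
// each one in place with the test dimensions via placement new.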
Array2D<float> operand_data[Arity];
for (int i = 0; i < Arity; ++i) {
new (&operand_data[i]) Array2D<float>(test_width, test_height);
}
Array2D<T> answer_data(test_width, test_height);
for (int i = 0; i < test_width; ++i) {
for (int j = 0; j < test_height; ++j) {
float xs[Arity];
for (int k = 0; k < Arity; ++k) {
xs[k] = test_float_vals[k][i][j];
operand_data[k](i, j) = xs[k];
}
if (is_compare) {
answer_data(i, j) = ComputeElementwiseAnswerCompare(*direction, xs);
} else {
answer_data(i, j) = ComputeElementwiseAnswerFloat(opcode, xs);
}
}
}
auto builder = HloComputation::Builder(TestName());
auto hlo_module = CreateNewVerifiedModule();
auto prim_type = primitive_util::NativeToPrimitiveType<T>();
HloInstruction* hlos[4];
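// hlos[0] will hold the fusion root; hlos[1..Arity] hold the constant
// operands.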
for (int i = 0; i < Arity; ++i) {
hlos[i + 1] = builder.AddInstruction(HloInstruction::CreateConstant(
LiteralUtil::CreateR2FromArray2D(operand_data[i])));
}
auto answer_shape =
ShapeUtil::MakeShape(prim_type, {test_width, test_height});
std::unique_ptr<HloInstruction> root_hlo;
switch (Arity) {
case 1:
root_hlo = HloInstruction::CreateUnary(answer_shape, opcode, hlos[1]);
break;
case 2:
if (is_compare) {
root_hlo = HloInstruction::CreateCompare(answer_shape, hlos[1],
hlos[2], *direction);
} else {
root_hlo = HloInstruction::CreateBinary(answer_shape, opcode, hlos[1],
hlos[2]);
}
break;
case 3:
root_hlo = HloInstruction::CreateTernary(answer_shape, opcode, hlos[1],
hlos[2], hlos[3]);
break;
default:
LOG(FATAL) << "Bad arity: " << Arity;
}
hlos[0] = builder.AddInstruction(std::move(root_hlo));
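// Fuse the root together with its Arity operands into one loop fusion.
// CreateFusionInstruction expects the instructions in reverse topological
// order, so hlos[0] (the root) comes first in the span.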
hlo_module->AddEntryComputation(builder.Build())
->CreateFusionInstruction(
absl::Span<HloInstruction* const>(hlos).subspan(0, Arity + 1),
HloInstruction::FusionKind::kLoop);
auto expected = LiteralUtil::CreateR2FromArray2D(answer_data);
auto actual = ExecuteAndTransfer(std::move(hlo_module), {});
if (primitive_util::IsFloatingPointType(prim_type)) {
EXPECT_TRUE(LiteralTestUtil::Near(expected, actual, ErrorSpec(1e-4)));
} else {
EXPECT_TRUE(LiteralTestUtil::Equal(expected, actual));
}
}
private:
float ComputeElementwiseAnswerFloat(HloOpcode opcode,
absl::Span<const float> xs);
bool ComputeElementwiseAnswerCompare(ComparisonDirection direction,
absl::Span<const float> xs);
DebugOptions GetDebugOptionsForTest() override {
DebugOptions debug_options = HloTestBase::GetDebugOptionsForTest();
debug_options.add_xla_disable_hlo_passes("layout-assignment");
return debug_options;
}
};
float CpuGpuFusionTest::ComputeElementwiseAnswerFloat(
HloOpcode opcode, absl::Span<const float> xs) {
switch (opcode) {
case HloOpcode::kAdd:
return xs[0] + xs[1];
case HloOpcode::kSubtract:
return xs[0] - xs[1];
case HloOpcode::kMultiply:
return xs[0] * xs[1];
case HloOpcode::kDivide:
return xs[0] / xs[1];
case HloOpcode::kPower:
return powf(xs[0], xs[1]);
case HloOpcode::kMinimum:
return std::min(xs[0], xs[1]);
case HloOpcode::kMaximum:
return std::max(xs[0], xs[1]);
case HloOpcode::kClamp:
return std::min(xs[2], std::max(xs[1], xs[0]));
default:
LOG(FATAL) << "No elementwise opcode: " << opcode;
}
}
bool CpuGpuFusionTest::ComputeElementwiseAnswerCompare(
ComparisonDirection direction, absl::Span<const float> xs) {
switch (direction) {
case ComparisonDirection::kEq:
return xs[0] == xs[1];
case ComparisonDirection::kNe:
return xs[0] != xs[1];
case ComparisonDirection::kGt:
return xs[0] > xs[1];
case ComparisonDirection::kLt:
return xs[0] < xs[1];
case ComparisonDirection::kGe:
return xs[0] >= xs[1];
case ComparisonDirection::kLe:
return xs[0] <= xs[1];
}
}
XLA_TEST_F(CpuGpuFusionTest, Test) {
// test expression:
// slice(select({{T, F, T}, {F, T, F}},
// concat(transpose({{1.0}, {2.0}, {3.0}} +
// {{-1.0}, {-1.0}, {-1.0}}),
// {{1.62, 2.72, 3.14}}) +
// (-{{1.0, 1.0, 1.0}, {0.0, 0.0, 0.0}}),
// {{0.5, 0.5, 0.5}, {0.5, 0.5, 0.5}})) = {{0.5}, {2.72}}
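//
// Step by step: transpose(add2) = {{0, 1, 2}}; concatenating with const4 and
// adding the negated constant gives add8 = {{-1, 0, 1}, {1.62, 2.72, 3.14}};
// the select yields {{-1, 0.5, 1}, {0.5, 2.72, 0.5}}, and slicing column 1
// gives {{0.5}, {2.72}}.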
auto builder = HloComputation::Builder(TestName());
auto hlo_module = CreateNewVerifiedModule();
auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
LiteralUtil::CreateR2<float>({{1.0}, {2.0}, {3.0}})));
auto const1 = builder.AddInstruction(HloInstruction::CreateConstant(
LiteralUtil::CreateR2<float>({{-1.0}, {-1.0}, {-1.0}})));
auto add2 = builder.AddInstruction(HloInstruction::CreateBinary(
ShapeUtil::MakeShape(F32, {3, 1}), HloOpcode::kAdd, const0, const1));
auto reshape3 = builder.AddInstruction(HloInstruction::CreateTranspose(
ShapeUtil::MakeShape(F32, {1, 3}), add2, {1, 0}));
auto const4 = builder.AddInstruction(HloInstruction::CreateConstant(
LiteralUtil::CreateR2<float>({{1.62, 2.72, 3.14}})));
auto concat5 = builder.AddInstruction(HloInstruction::CreateConcatenate(
ShapeUtil::MakeShape(F32, {2, 3}), {reshape3, const4}, 0));
auto const6 = builder.AddInstruction(HloInstruction::CreateConstant(
LiteralUtil::CreateR2<float>({{1.0, 1.0, 1.0}, {0.0, 0.0, 0.0}})));
auto negate7 = builder.AddInstruction(HloInstruction::CreateUnary(
ShapeUtil::MakeShape(F32, {2, 3}), HloOpcode::kNegate, const6));
auto add8 = builder.AddInstruction(HloInstruction::CreateBinary(
ShapeUtil::MakeShape(F32, {2, 3}), HloOpcode::kAdd, concat5, negate7));
auto const9 = builder.AddInstruction(HloInstruction::CreateConstant(
LiteralUtil::CreateR2<float>({{0.5, 0.5, 0.5}, {0.5, 0.5, 0.5}})));
auto const10 = builder.AddInstruction(
HloInstruction::CreateConstant(LiteralUtil::CreateR2<bool>(
{{true, false, true}, {false, true, false}})));
auto select11 = builder.AddInstruction(
HloInstruction::CreateTernary(ShapeUtil::MakeShape(F32, {2, 3}),
HloOpcode::kSelect, const10, add8, const9));
auto slice12 = builder.AddInstruction(HloInstruction::CreateSlice(
ShapeUtil::MakeShape(F32, {2, 1}), select11, {0, 1}, {2, 2}, {1, 1}));
// CreateFusionInstruction needs the `instructions_to_fuse` argument in
// reverse topological order, so the first element in `instructions_to_fuse`
// must be the root.
hlo_module->AddEntryComputation(builder.Build())
->CreateFusionInstruction(
{slice12, select11, const10, const9, add8, negate7, const6, concat5,
const4, reshape3, add2, const1, const0},
HloInstruction::FusionKind::kLoop);
EXPECT_TRUE(LiteralTestUtil::Near(
LiteralUtil::CreateR2<float>({{0.5}, {2.72}}),
ExecuteAndTransfer(std::move(hlo_module), {}), ErrorSpec(1e-4)));
}
// Test whether we emit appropriate code for parameters of fusion instructions.
XLA_TEST_F(CpuGpuFusionTest, Parameter) {
// Build a computation and fuse part of it so the fusion instruction has an
// operand parameter.
auto builder = HloComputation::Builder(TestName());
auto hlo_module = CreateNewVerifiedModule();
auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
LiteralUtil::CreateR2<float>({{1.0, 2.0, 3.0}})));
auto copy1 = builder.AddInstruction(HloInstruction::CreateUnary(
ShapeUtil::MakeShape(F32, {1, 3}), HloOpcode::kCopy, const0));
auto const2 = builder.AddInstruction(HloInstruction::CreateConstant(
LiteralUtil::CreateR2<float>({{-2.0, -2.0, -2.0}})));
// add3 = copy1 + const2 = const0 + const2 = {1,2,3} + {-2,-2,-2} = {-1,0,+1}
auto add3 = builder.AddInstruction(HloInstruction::CreateBinary(
ShapeUtil::MakeShape(F32, {1, 3}), HloOpcode::kAdd, copy1, const2));
// CreateFusionInstruction needs `instructions_to_fuse` in reverse topological
// order.
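// Fusing only {add3, const2} leaves copy1 outside the fusion, so the fusion
// instruction is created with one parameter fed by copy1.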
hlo_module->AddEntryComputation(builder.Build())
->CreateFusionInstruction(/*instructions_to_fuse=*/{add3, const2},
HloInstruction::FusionKind::kLoop);
EXPECT_TRUE(LiteralTestUtil::Near(
LiteralUtil::CreateR2<float>({{-1.0, 0.0, 1.0}}),
ExecuteAndTransfer(std::move(hlo_module), {}), ErrorSpec(1e-4)));
}
XLA_TEST_F(CpuGpuFusionTest, RandomizedParallelPartition) {
// Tests parallel partitioning of a fusion instruction.
// Create a shape with a random outer dimension size so that each test run
// exercises a different parallel partition count.
const int seed = tensorflow::testing::RandomSeed();
LOG(INFO) << "RandomizedParallelPartition seed: " << seed;
std::mt19937 generator(seed);
std::uniform_int_distribution<int> distribution(128, 1024);
const int64_t rand_dim0_size = distribution(generator);
const int64_t dim1_size = 1024;
Shape shape =
ShapeUtil::MakeShapeWithLayout(F32, {rand_dim0_size, dim1_size}, {1, 0});
// Build simple fusion computation: y = x^2 (elementwise).
auto builder = HloComputation::Builder(TestName());
auto hlo_module = CreateNewVerifiedModule();
auto two = builder.AddInstruction(
HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
auto x =
builder.AddInstruction(HloInstruction::CreateBroadcast(shape, two, {}));
auto y = builder.AddInstruction(
HloInstruction::CreateBinary(shape, HloOpcode::kMultiply, x, x));
hlo_module->AddEntryComputation(builder.Build())
->CreateFusionInstruction(/*instructions_to_fuse=*/{y, x, two},
HloInstruction::FusionKind::kLoop);
// Compute result.
auto result = ExecuteAndTransfer(std::move(hlo_module), {});
// Every element of result should be y = x^2 = 4.0.
for (int i = 0; i < rand_dim0_size; ++i) {
for (int j = 0; j < dim1_size; ++j) {
EXPECT_EQ(4.0, result.Get<float>({i, j}));
}
}
}
XLA_TEST_F(CpuGpuFusionTest, BroadcastIntoBinaryOp) {
auto builder = HloComputation::Builder(TestName());
auto hlo_module = CreateNewVerifiedModule();
auto const_vector = builder.AddInstruction(HloInstruction::CreateConstant(
LiteralUtil::CreateR1<float>({1.0, 2.0, 3.0})));
auto const_array = builder.AddInstruction(HloInstruction::CreateConstant(
LiteralUtil::CreateR2<float>({{-1.0, -2.0, -4.0}, {10.0, 20.0, 30.0}})));
auto broadcast = builder.AddInstruction(
HloInstruction::CreateBroadcast(const_array->shape(), const_vector, {1}));
// add2 = broadcast(const_vector) + const_array
// = broadcast({1,2,3}) + {{-1.0, -2.0, -4.0}, {10.0, 20.0, 30.0}}
// = {{1, 2, 3}, {1, 2, 3}} + {{-1.0, -2.0, -4.0}, {10.0, 20.0, 30.0}}
auto add2 = builder.AddInstruction(
HloInstruction::CreateBinary(ShapeUtil::MakeShape(F32, {2, 3}),
HloOpcode::kAdd, broadcast, const_array));
hlo_module->AddEntryComputation(builder.Build())
->CreateFusionInstruction(/*instructions_to_fuse=*/{add2, broadcast},
HloInstruction::FusionKind::kLoop);
EXPECT_TRUE(LiteralTestUtil::Near(
LiteralUtil::CreateR2<float>({{0.0, 0.0, -1.0}, {11.0, 22.0, 33.0}}),
ExecuteAndTransfer(std::move(hlo_module), {}), ErrorSpec(1e-4)));
}
XLA_TEST_F(CpuGpuFusionTest, ReshapeToScalar) {
auto builder = HloComputation::Builder(TestName());
auto hlo_module = CreateNewVerifiedModule();
auto single_element_array = builder.AddInstruction(
HloInstruction::CreateConstant(LiteralUtil::CreateR2<int32_t>({{5}})));
auto reshape = builder.AddInstruction(HloInstruction::CreateReshape(
ShapeUtil::MakeShape(S32, {}), single_element_array));
hlo_module->AddEntryComputation(builder.Build())
->CreateFusionInstruction(/*instructions_to_fuse=*/{reshape},
HloInstruction::FusionKind::kLoop);
EXPECT_TRUE(
LiteralTestUtil::Equal(LiteralUtil::CreateR0<int32_t>(5),
ExecuteAndTransfer(std::move(hlo_module), {})));
}
XLA_TEST_F(CpuGpuFusionTest, Reshape_3by2_1by2by3) {
auto builder = HloComputation::Builder(TestName());
auto hlo_module = CreateNewVerifiedModule();
auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
LiteralUtil::CreateR2<int32_t>({{1, 2}, {3, 4}, {5, 6}})));
auto reshape1 = builder.AddInstruction(HloInstruction::CreateReshape(
ShapeUtil::MakeShape(S32, {1, 2, 3}), const0));
hlo_module->AddEntryComputation(builder.Build())
->CreateFusionInstruction(/*instructions_to_fuse=*/{reshape1},
HloInstruction::FusionKind::kLoop);
EXPECT_TRUE(LiteralTestUtil::Equal(
LiteralUtil::CreateR3<int32_t>({{{1, 2, 3}, {4, 5, 6}}}),
ExecuteAndTransfer(std::move(hlo_module), {})));
}
XLA_TEST_F(CpuGpuFusionTest, Reshape_1by2by3_3by2) {
auto builder = HloComputation::Builder(TestName());
auto hlo_module = CreateNewVerifiedModule();
auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
LiteralUtil::CreateR3<int32_t>({{{1, 2, 3}, {4, 5, 6}}})));
auto reshape1 = builder.AddInstruction(
HloInstruction::CreateReshape(ShapeUtil::MakeShape(S32, {3, 2}), const0));
hlo_module->AddEntryComputation(builder.Build())
->CreateFusionInstruction(/*instructions_to_fuse=*/{reshape1},
HloInstruction::FusionKind::kLoop);
EXPECT_TRUE(LiteralTestUtil::Equal(
LiteralUtil::CreateR2<int32_t>({{1, 2}, {3, 4}, {5, 6}}),
ExecuteAndTransfer(std::move(hlo_module), {})));
}
XLA_TEST_F(CpuGpuFusionTest, Reshape_1by1by1_) {
auto builder = HloComputation::Builder(TestName());
auto hlo_module = CreateNewVerifiedModule();
auto const0 = builder.AddInstruction(
HloInstruction::CreateConstant(LiteralUtil::CreateR3<int32_t>({{{7}}})));
auto reshape1 = builder.AddInstruction(
HloInstruction::CreateReshape(ShapeUtil::MakeShape(S32, {}), const0));
hlo_module->AddEntryComputation(builder.Build())
->CreateFusionInstruction(/*instructions_to_fuse=*/{reshape1},
HloInstruction::FusionKind::kLoop);
EXPECT_TRUE(
LiteralTestUtil::Equal(LiteralUtil::CreateR0<int32_t>(7),
ExecuteAndTransfer(std::move(hlo_module), {})));
}
XLA_TEST_F(CpuGpuFusionTest, Reshape__1by1by1) {
auto builder = HloComputation::Builder(TestName());
auto hlo_module = CreateNewVerifiedModule();
auto const0 = builder.AddInstruction(
HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32_t>(7)));
auto reshape1 = builder.AddInstruction(HloInstruction::CreateReshape(
ShapeUtil::MakeShape(S32, {1, 1, 1}), const0));
hlo_module->AddEntryComputation(builder.Build())
->CreateFusionInstruction(/*instructions_to_fuse=*/{reshape1},
HloInstruction::FusionKind::kLoop);
EXPECT_TRUE(
LiteralTestUtil::Equal(LiteralUtil::CreateR3<int32_t>({{{7}}}),
ExecuteAndTransfer(std::move(hlo_module), {})));
}
XLA_TEST_F(CpuGpuFusionTest, Reshape__) {
auto builder = HloComputation::Builder(TestName());
auto hlo_module = CreateNewVerifiedModule();
auto const0 = builder.AddInstruction(
HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32_t>(7)));
auto reshape1 = builder.AddInstruction(
HloInstruction::CreateReshape(ShapeUtil::MakeShape(S32, {}), const0));
hlo_module->AddEntryComputation(builder.Build())
->CreateFusionInstruction(/*instructions_to_fuse=*/{reshape1},
HloInstruction::FusionKind::kLoop);
EXPECT_TRUE(
LiteralTestUtil::Equal(LiteralUtil::CreateR0<int32_t>(7),
ExecuteAndTransfer(std::move(hlo_module), {})));
}
XLA_TEST_F(CpuGpuFusionTest, Reshape_3by3_3by3) {
auto builder = HloComputation::Builder(TestName());
auto hlo_module = CreateNewVerifiedModule();
auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
LiteralUtil::CreateR2<int32_t>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}})));
auto reshape1 = builder.AddInstruction(
HloInstruction::CreateReshape(ShapeUtil::MakeShape(S32, {3, 3}), const0));
hlo_module->AddEntryComputation(builder.Build())
->CreateFusionInstruction(/*instructions_to_fuse=*/{reshape1},
HloInstruction::FusionKind::kLoop);
EXPECT_TRUE(LiteralTestUtil::Equal(
LiteralUtil::CreateR2<int32_t>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}),
ExecuteAndTransfer(std::move(hlo_module), {})));
}
XLA_TEST_F(CpuGpuFusionTest, Transpose_2by3) {
auto builder = HloComputation::Builder(TestName());
auto hlo_module = CreateNewVerifiedModule();
auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
LiteralUtil::CreateR2<int32_t>({{1, 2, 3}, {4, 5, 6}})));
auto reshape1 = builder.AddInstruction(HloInstruction::CreateTranspose(
ShapeUtil::MakeShape(S32, {3, 2}), const0, {1, 0}));
hlo_module->AddEntryComputation(builder.Build())
->CreateFusionInstruction(/*instructions_to_fuse=*/{reshape1},
HloInstruction::FusionKind::kLoop);
EXPECT_TRUE(LiteralTestUtil::Equal(
LiteralUtil::CreateR2<int32_t>({{1, 4}, {2, 5}, {3, 6}}),
ExecuteAndTransfer(std::move(hlo_module), {})));
}
XLA_TEST_F(CpuGpuFusionTest, Transpose_3by3) {
auto builder = HloComputation::Builder(TestName());
auto hlo_module = CreateNewVerifiedModule();
auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
LiteralUtil::CreateR2<int32_t>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}})));
auto reshape1 = builder.AddInstruction(HloInstruction::CreateTranspose(
ShapeUtil::MakeShape(S32, {3, 3}), const0, {1, 0}));
hlo_module->AddEntryComputation(builder.Build())
->CreateFusionInstruction(/*instructions_to_fuse=*/{reshape1},
HloInstruction::FusionKind::kLoop);
EXPECT_TRUE(LiteralTestUtil::Equal(
LiteralUtil::CreateR2<int32_t>({{1, 4, 7}, {2, 5, 8}, {3, 6, 9}}),
ExecuteAndTransfer(std::move(hlo_module), {})));
}
XLA_TEST_F(CpuGpuFusionTest, Reverse) {
auto builder = HloComputation::Builder(TestName());
auto hlo_module = CreateNewVerifiedModule();
auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
LiteralUtil::CreateR1<int32_t>({1, 2, 3})));
auto reverse1 = builder.AddInstruction(HloInstruction::CreateReverse(
ShapeUtil::MakeShape(S32, {3}), const0, {0}));
hlo_module->AddEntryComputation(builder.Build())
->CreateFusionInstruction(/*instructions_to_fuse=*/{reverse1},
HloInstruction::FusionKind::kLoop);
EXPECT_TRUE(
LiteralTestUtil::Equal(LiteralUtil::CreateR1<int32_t>({3, 2, 1}),
ExecuteAndTransfer(std::move(hlo_module), {})));
}
XLA_TEST_F(CpuGpuFusionTest, ReverseNegate) {
auto builder = HloComputation::Builder(TestName());
auto hlo_module = CreateNewVerifiedModule();
auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
LiteralUtil::CreateR1<int32_t>({1, 2, 3})));
auto reverse1 = builder.AddInstruction(HloInstruction::CreateReverse(
ShapeUtil::MakeShape(S32, {3}), const0, {0}));
auto negate2 = builder.AddInstruction(HloInstruction::CreateUnary(
ShapeUtil::MakeShape(S32, {3}), HloOpcode::kNegate, reverse1));
hlo_module->AddEntryComputation(builder.Build())
->CreateFusionInstruction(/*instructions_to_fuse=*/{negate2, reverse1},
HloInstruction::FusionKind::kLoop);
EXPECT_TRUE(
LiteralTestUtil::Equal(LiteralUtil::CreateR1<int32_t>({-3, -2, -1}),
ExecuteAndTransfer(std::move(hlo_module), {})));
}
XLA_TEST_F(CpuGpuFusionTest, BroadcastNegate) {
auto builder = HloComputation::Builder(TestName());
auto hlo_module = CreateNewVerifiedModule();
auto const0 = builder.AddInstruction(
HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32_t>(1)));
auto broadcast1 = builder.AddInstruction(HloInstruction::CreateBroadcast(
ShapeUtil::MakeShape(S32, {2}), const0, {}));
auto negate2 = builder.AddInstruction(HloInstruction::CreateUnary(
ShapeUtil::MakeShape(S32, {2}), HloOpcode::kNegate, broadcast1));
hlo_module->AddEntryComputation(builder.Build())
->CreateFusionInstruction(/*instructions_to_fuse=*/{negate2, broadcast1},
HloInstruction::FusionKind::kLoop);
EXPECT_TRUE(
LiteralTestUtil::Equal(LiteralUtil::CreateR1<int32_t>({-1, -1}),
ExecuteAndTransfer(std::move(hlo_module), {})));
}
XLA_TEST_F(CpuGpuFusionTest, SliceNegate) {
auto builder = HloComputation::Builder(TestName());
auto hlo_module = CreateNewVerifiedModule();
auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
LiteralUtil::CreateR1<int32_t>({1, 2, 3, 4})));
auto slice1 = builder.AddInstruction(HloInstruction::CreateSlice(
ShapeUtil::MakeShape(S32, {2}), const0, {0}, {4}, {2}));
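// slice1 = const0[0:4:2] = {1, 3}.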
auto negate2 = builder.AddInstruction(HloInstruction::CreateUnary(
ShapeUtil::MakeShape(S32, {2}), HloOpcode::kNegate, slice1));
hlo_module->AddEntryComputation(builder.Build())
->CreateFusionInstruction(/*instructions_to_fuse=*/{negate2, slice1},
HloInstruction::FusionKind::kLoop);
EXPECT_TRUE(
LiteralTestUtil::Equal(LiteralUtil::CreateR1<int32_t>({-1, -3}),
ExecuteAndTransfer(std::move(hlo_module), {})));
}
XLA_TEST_F(CpuGpuFusionTest, DynamicSliceNegate) {
auto builder = HloComputation::Builder(TestName());
auto hlo_module = CreateNewVerifiedModule();
auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
LiteralUtil::CreateR1<int32_t>({1, 2, 3, 4})));
auto const1 = builder.AddInstruction(
HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32_t>(1)));
auto dynamic_slice2 =
builder.AddInstruction(HloInstruction::CreateDynamicSlice(
ShapeUtil::MakeShape(S32, {2}), const0, {const1}, {2}));
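// dynamic_slice2 = const0[1:3] = {2, 3}.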
auto negate3 = builder.AddInstruction(HloInstruction::CreateUnary(
ShapeUtil::MakeShape(S32, {2}), HloOpcode::kNegate, dynamic_slice2));
hlo_module->AddEntryComputation(builder.Build())
->CreateFusionInstruction(
/*instructions_to_fuse=*/{negate3, dynamic_slice2},
HloInstruction::FusionKind::kLoop);
EXPECT_TRUE(
LiteralTestUtil::Equal(LiteralUtil::CreateR1<int32_t>({-2, -3}),
ExecuteAndTransfer(std::move(hlo_module), {})));
}
XLA_TEST_F(CpuGpuFusionTest, ReshapeNegate) {
auto builder = HloComputation::Builder(TestName());
auto hlo_module = CreateNewVerifiedModule();
auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
LiteralUtil::CreateR1<int32_t>({1, 2, 3, 4})));
auto reshape1 = builder.AddInstruction(
HloInstruction::CreateReshape(ShapeUtil::MakeShape(S32, {2, 2}), const0));
auto negate2 = builder.AddInstruction(HloInstruction::CreateUnary(
ShapeUtil::MakeShape(S32, {2, 2}), HloOpcode::kNegate, reshape1));
hlo_module->AddEntryComputation(builder.Build())
->CreateFusionInstruction(/*instructions_to_fuse=*/{negate2, reshape1},
HloInstruction::FusionKind::kLoop);
EXPECT_TRUE(LiteralTestUtil::Equal(
LiteralUtil::CreateR2<int32_t>({{-1, -2}, {-3, -4}}),
ExecuteAndTransfer(std::move(hlo_module), {})));
}
XLA_TEST_F(CpuGpuFusionTest, TransposeNegate) {
auto builder = HloComputation::Builder(TestName());
auto hlo_module = CreateNewVerifiedModule();
auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
LiteralUtil::CreateR2<int32_t>({{1, 2}, {3, 4}})));
auto transpose1 = builder.AddInstruction(HloInstruction::CreateTranspose(
ShapeUtil::MakeShape(S32, {2, 2}), const0, {1, 0}));
auto negate2 = builder.AddInstruction(HloInstruction::CreateUnary(
ShapeUtil::MakeShape(S32, {2, 2}), HloOpcode::kNegate, transpose1));
hlo_module->AddEntryComputation(builder.Build())
->CreateFusionInstruction(/*instructions_to_fuse=*/{negate2, transpose1},
HloInstruction::FusionKind::kLoop);
EXPECT_TRUE(LiteralTestUtil::Equal(
LiteralUtil::CreateR2<int32_t>({{-1, -3}, {-2, -4}}),
ExecuteAndTransfer(std::move(hlo_module), {})));
}
std::unique_ptr<HloComputation> MakeReduceTestComputation() {
auto builder = HloComputation::Builder("add");
auto lhs = builder.AddInstruction(HloInstruction::CreateParameter(
/*parameter_number=*/0, ShapeUtil::MakeShape(S32, {}), "lhs"));
auto rhs = builder.AddInstruction(HloInstruction::CreateParameter(
/*parameter_number=*/1, ShapeUtil::MakeShape(S32, {}), "rhs"));
builder.AddInstruction(HloInstruction::CreateBinary(
ShapeUtil::MakeShape(S32, {}), HloOpcode::kAdd, lhs, rhs));
return builder.Build();
}
XLA_TEST_F(CpuGpuFusionTest, DISABLED_ON_CPU(Reduce)) {
auto hlo_module = CreateNewVerifiedModule();
auto builder = HloComputation::Builder(TestName());
auto const0 = builder.AddInstruction(
HloInstruction::CreateIota(ShapeUtil::MakeShape(S32, {32}), 0));
auto const1 = builder.AddInstruction(
HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32_t>(0)));
auto reduce2 = builder.AddInstruction(HloInstruction::CreateReduce(
ShapeUtil::MakeShape(S32, {}), const0, const1, {0},
hlo_module->AddEmbeddedComputation(MakeReduceTestComputation())));
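// reduce2 = 0 + 1 + ... + 31 = 31 * 32 / 2 = 496.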
hlo_module->AddEntryComputation(builder.Build())
->CreateFusionInstruction(/*instructions_to_fuse=*/{reduce2},
HloInstruction::FusionKind::kInput);
EXPECT_TRUE(
LiteralTestUtil::Equal(LiteralUtil::CreateR0<int32_t>(496),
ExecuteAndTransfer(std::move(hlo_module), {})));
}
XLA_TEST_F(CpuGpuFusionTest, ReduceImplicitBroadcast) {
auto hlo_module = CreateNewVerifiedModule();
auto builder = HloComputation::Builder(TestName());
auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
LiteralUtil::CreateR1<int32_t>({1, 2, 4, 8})));
auto const1 = builder.AddInstruction(
HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32_t>(0)));
auto reduce2 = builder.AddInstruction(HloInstruction::CreateReduce(
ShapeUtil::MakeShape(S32, {}), const0, const1, {0},
hlo_module->AddEmbeddedComputation(MakeReduceTestComputation())));
auto negate3 = builder.AddInstruction(HloInstruction::CreateUnary(
ShapeUtil::MakeShape(S32, {}), HloOpcode::kNegate, reduce2));
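// negate3 = -(1 + 2 + 4 + 8) = -15.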
hlo_module->AddEntryComputation(builder.Build())
->CreateFusionInstruction(/*instructions_to_fuse=*/{negate3, reduce2},
HloInstruction::FusionKind::kLoop);
EXPECT_TRUE(
LiteralTestUtil::Equal(LiteralUtil::CreateR0<int32_t>(-15),
ExecuteAndTransfer(std::move(hlo_module), {})));
}
XLA_TEST_F(CpuGpuFusionTest, DISABLED_ON_CPU(ReduceWindow)) {
auto builder = HloComputation::Builder(TestName());
auto hlo_module = CreateNewVerifiedModule();
auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
LiteralUtil::CreateR2<int32_t>({{2, 3, 5}, {7, 11, 13}, {17, 19, 23}})));
auto const1 = builder.AddInstruction(
HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32_t>(1)));
Window window;
ASSERT_TRUE(
tensorflow::protobuf::TextFormat::ParseFromString("dimensions:{\n"
"size:2\n"
"stride:1\n"
"padding_low:0\n"
"padding_high:0\n"
"window_dilation:1\n"
"base_dilation:1\n"
"}\n"
"dimensions:{\n"
"size:2\n"
"stride:1\n"
"padding_low:0\n"
"padding_high:0\n"
"window_dilation:1\n"
"base_dilation:1\n"
"}\n",
&window));
auto nested_builder = HloComputation::Builder("mul");
{
auto x = nested_builder.AddInstruction(
HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(S32, {}), "x"));
auto y = nested_builder.AddInstruction(
HloInstruction::CreateParameter(1, ShapeUtil::MakeShape(S32, {}), "y"));
nested_builder.AddInstruction(HloInstruction::CreateBinary(
ShapeUtil::MakeShape(S32, {}), HloOpcode::kMultiply, x, y));
}
auto nested_computation =
hlo_module->AddEmbeddedComputation(nested_builder.Build());
auto reduce_window2 =
builder.AddInstruction(HloInstruction::CreateReduceWindow(
ShapeUtil::MakeShape(S32, {2, 2}), const0, const1, window,
nested_computation));
hlo_module->AddEntryComputation(builder.Build())
->CreateFusionInstruction(/*instructions_to_fuse=*/{reduce_window2},
HloInstruction::FusionKind::kLoop);
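// Each output element is the product over a 2x2 window of the input, seeded
// with the init value 1, e.g. output(0,0) = 2 * 3 * 7 * 11 = 462 and
// output(1,1) = 11 * 13 * 19 * 23 = 62491.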
EXPECT_TRUE(LiteralTestUtil::Equal(
LiteralUtil::CreateR2<int32_t>({{462, 2145}, {24871, 62491}}),
ExecuteAndTransfer(std::move(hlo_module), {})));
}
// When a constant (or other op) which has multiple users is imported
// into a fusion, it should remain shared, rather than being duplicated
// within the fusion.
XLA_TEST_F(CpuGpuFusionTest, SharedConstant) {
auto hlo_module = CreateNewVerifiedModule();
auto builder = HloComputation::Builder(TestName());
auto const0 = builder.AddInstruction(
HloInstruction::CreateConstant(LiteralUtil::CreateR1<int32_t>({0})));
auto const1 = builder.AddInstruction(
HloInstruction::CreateConstant(LiteralUtil::CreateR1<int32_t>({2})));
auto add1 = builder.AddInstruction(HloInstruction::CreateBinary(
ShapeUtil::MakeShape(S32, {1}), HloOpcode::kAdd, const1, const0));
auto add2 = builder.AddInstruction(HloInstruction::CreateBinary(
ShapeUtil::MakeShape(S32, {1}), HloOpcode::kAdd, const1, add1));
auto add3 = builder.AddInstruction(HloInstruction::CreateBinary(
ShapeUtil::MakeShape(S32, {1}), HloOpcode::kAdd, const1, add2));
auto add4 = builder.AddInstruction(HloInstruction::CreateBinary(
ShapeUtil::MakeShape(S32, {1}), HloOpcode::kAdd, const1, add3));
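// add4 = 2 + (2 + (2 + (2 + 0))) = {8}.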
hlo_module->AddEntryComputation(builder.Build())
->CreateFusionInstruction({add4, add3, add2, add1, const1},
HloInstruction::FusionKind::kLoop);
HloComputation* entry_comp = hlo_module->entry_computation();
// The entry computation contains only the constant(0) and the fusion.
EXPECT_EQ(entry_comp->instruction_count(), 2);
// The fused computation contains the constant(2), one parameter, and four adds.
EXPECT_EQ(entry_comp->root_instruction()->fused_instruction_count(), 6);
EXPECT_TRUE(
LiteralTestUtil::Equal(LiteralUtil::CreateR1<int32_t>({8}),
ExecuteAndTransfer(std::move(hlo_module), {})));
}
XLA_TEST_F(CpuGpuFusionTest, Add2D) {
TestElementwise2D<float, 2>(HloOpcode::kAdd);
}
XLA_TEST_F(CpuGpuFusionTest, Subtract2D) {
TestElementwise2D<float, 2>(HloOpcode::kSubtract);
}
XLA_TEST_F(CpuGpuFusionTest, Multiply2D) {
TestElementwise2D<float, 2>(HloOpcode::kMultiply);
}
XLA_TEST_F(CpuGpuFusionTest, Divide2D) {
TestElementwise2D<float, 2>(HloOpcode::kDivide);
}
XLA_TEST_F(CpuGpuFusionTest, Power2D) {
TestElementwise2D<float, 2>(HloOpcode::kPower);
}
XLA_TEST_F(CpuGpuFusionTest, Minimum2D) {
TestElementwise2D<float, 2>(HloOpcode::kMinimum);
}
XLA_TEST_F(CpuGpuFusionTest, Maximum2D) {
TestElementwise2D<float, 2>(HloOpcode::kMaximum);
}
XLA_TEST_F(CpuGpuFusionTest, Equal2D) {
TestElementwise2D<bool, 2>(HloOpcode::kCompare, ComparisonDirection::kEq);
}
XLA_TEST_F(CpuGpuFusionTest, Inequal2D) {
TestElementwise2D<bool, 2>(HloOpcode::kCompare, ComparisonDirection::kNe);
}
XLA_TEST_F(CpuGpuFusionTest, Greater2D) {
TestElementwise2D<bool, 2>(HloOpcode::kCompare, ComparisonDirection::kGt);
}
XLA_TEST_F(CpuGpuFusionTest, Lesser2D) {
TestElementwise2D<bool, 2>(HloOpcode::kCompare, ComparisonDirection::kLt);
}
XLA_TEST_F(CpuGpuFusionTest, GreaterOrEqual2D) {
TestElementwise2D<bool, 2>(HloOpcode::kCompare, ComparisonDirection::kGe);
}
XLA_TEST_F(CpuGpuFusionTest, LesserOrEqual2D) {
TestElementwise2D<bool, 2>(HloOpcode::kCompare, ComparisonDirection::kLe);
}
XLA_TEST_F(CpuGpuFusionTest, Clamp2D) {
TestElementwise2D<float, 3>(HloOpcode::kClamp);
}
class FusionClientLibraryTest : public ClientLibraryTestBase {};
XLA_TEST_F(FusionClientLibraryTest, ManyLayoutTransformations) {
// On the GPU backend, it's possible to have too many transposes within one
// fusion, causing the kernel to run out of shared memory and thus fail to
// compile. We want to check that this doesn't happen.
//
// To do this, we create a computation that computes
//
// P0 + P0*P1*P1 + P0*P2*P2 ...
//
// where even parameters have layout 1 and odd parameters have layout 2.
//
// Our goal is to tempt the backend into creating one giant multi-output
// fusion for the whole computation, including the transposes. Currently
// multi-output fusion only fuses fusions, so each of the terms in the sum
// needs to be a fusion itself, thus the contortions above.
constexpr int kNumParams = 25;
XlaBuilder b("ManyLayoutTransformations");
// This test produces values that overflow int32_t, which is UB, so use
// uint32_t, where overflow is OK.
Array2D<uint32_t> arr(32, 32);
arr.FillUnique();
Literal l1 = LiteralUtil::CreateR2FromArray2D(arr).Relayout(
LayoutUtil::MakeLayout({0, 1}));
Literal l2 = LiteralUtil::CreateR2FromArray2D(arr).Relayout(
LayoutUtil::MakeLayout({1, 0}));
XlaOp p0 = AddParam(l1, &b);
XlaOp sum = p0;
for (int i = 1; i < kNumParams; ++i) {
auto pN = AddParam((i % 2 == 0 ? l1 : l2), &b);
sum = sum + p0 * pN * pN;
}
ComputeAndCompare(&b, {});
}
void BM_ParallelFusion(::testing::benchmark::State& state) {
// Simple element-wise computation to benchmark parallel task partitioning.
se::Platform* platform = PlatformUtil::GetDefaultPlatform().ValueOrDie();
auto executors = PlatformUtil::GetStreamExecutors(platform).ValueOrDie();
se::StreamExecutorMemoryAllocator allocator(platform, executors);
const int64_t intra_op_parallelism_threads = 24;
xla::LocalClientOptions client_options;
client_options.set_platform(platform);
client_options.set_intra_op_parallelism_threads(intra_op_parallelism_threads);
auto client =
ClientLibrary::GetOrCreateLocalClient(client_options).ValueOrDie();
int device_ordinal = client->default_device_ordinal();
// Computation shape parameters.
const int64_t param0_dim0 = 1024;
const int64_t param0_dim1 = 1024;
const int64_t param1_dim0 = 1024;
const int64_t param1_dim1 = 1024;
const int64_t param2_dim0 = 1024;
const int64_t param2_dim1 = 1024;
// Create computation.
XlaBuilder builder("ParallelFusion");
Shape shape0 = ShapeUtil::MakeShape(F32, {param0_dim0, param0_dim1});
auto param0 = Parameter(&builder, 0, shape0, "param0");
Shape shape1 = ShapeUtil::MakeShape(F32, {param1_dim0, param1_dim1});
auto param1 = Parameter(&builder, 1, shape1, "param1");
Shape shape2 = ShapeUtil::MakeShape(F32, {param2_dim0, param2_dim1});
auto param2 = Parameter(&builder, 2, shape2, "param2");
auto x = Mul(param0, param1);
Add(x, param2);
auto computation = builder.Build().value();
// Transfer literals to device.
auto param0_literal =
LiteralUtil::CreateR2F32Linspace(1.0, 2.0, param0_dim0, param0_dim1);
ScopedShapedBuffer buffer0 =
client->LiteralToShapedBuffer(param0_literal, device_ordinal).value();
auto param1_literal =
LiteralUtil::CreateR2F32Linspace(1.0, 2.0, param1_dim0, param1_dim1);
ScopedShapedBuffer buffer1 =
client->LiteralToShapedBuffer(param1_literal, device_ordinal).value();
auto param2_literal =
LiteralUtil::CreateR2F32Linspace(1.0, 2.0, param2_dim0, param2_dim1);
ScopedShapedBuffer buffer2 =
client->LiteralToShapedBuffer(param2_literal, device_ordinal).value();
// Build executable.
auto executables =
client
->Compile(computation,
{&buffer0.on_host_shape(), &buffer1.on_host_shape(),
&buffer2.on_host_shape()},
ExecutableBuildOptions())
.value();
auto executable = std::move(executables[0]);
se::Stream stream(executors[device_ordinal]);
stream.Init();
// Initialize thread pool.
tensorflow::thread::ThreadPool pool(tensorflow::Env::Default(), "XLAEigen",
intra_op_parallelism_threads);
Eigen::ThreadPoolDevice device(pool.AsEigenThreadPool(), pool.NumThreads());
// Initialize ExecutableRunOptions.
ExecutableRunOptions options;
options.set_allocator(&allocator).set_stream(&stream);
options.set_intra_op_thread_pool(&device);
// Run some warm-up executions.
const int kWarmups = 2;
for (int i = 0; i < kWarmups; ++i) {
auto result = executable->Run({&buffer0, &buffer1, &buffer2}, options);
ASSERT_TRUE(result.ok());
}
// Run the benchmark, counting only the bytes read from the three input
// arrays on each iteration (the output is not included).
const int64_t total_bytes = param0_dim0 * param0_dim1 +
param1_dim0 * param1_dim1 +
param2_dim0 * param2_dim1;
for (auto s : state) {
auto result = executable->Run({&buffer0, &buffer1, &buffer2}, options);
ASSERT_TRUE(result.ok());
}
state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) *
total_bytes * sizeof(float));
}
BENCHMARK(BM_ParallelFusion)->UseRealTime();
} // namespace
} // namespace xla