[XLA:GPU] NFC: Rename `LayoutsAreReduceInputFusionFriendly` to `IsPhysicallyTransposing`, remove a dead parameter, update documentation and tests.
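
To make the new predicate concrete, here is a hypothetical HLO sketch (not
taken from the tests in this change). A transpose only counts as physically
transposing when it is not a bitcast under the given input/output layouts;
a fusion counts when any of its fused instructions does.

  p0 = f32[16,32]{1,0} parameter(0)
  // Bitcast-equivalent: the output layout {0,1} preserves the physical
  // order of the data, so IsPhysicallyTransposing should return false.
  t0 = f32[32,16]{0,1} transpose(p0), dimensions={1,0}
  // Physically transposing: the output layout {1,0} reorders the data in
  // memory, so IsPhysicallyTransposing should return true.
  t1 = f32[32,16]{1,0} transpose(p0), dimensions={1,0}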

PiperOrigin-RevId: 460641556
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc b/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc
index 7662c9b..7b02a99 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc
@@ -56,26 +56,13 @@
   return false;
 }
 
-bool IsPhysicallyTransposing(const HloInstruction* instr) {
-  return instr->opcode() == HloOpcode::kCopy ||
-         (instr->opcode() == HloOpcode::kTranspose &&
-          !ShapeUtil::TransposeIsBitcast(instr->operand(0)->shape(),
-                                         instr->shape(), instr->dimensions()));
-}
-
 }  // namespace
 
-bool LayoutsAreReduceInputFusionFriendly(const HloInstruction& producer,
-                                         const HloInstruction& reduce) {
-  if (IsPhysicallyTransposing(&producer)) {
-    return false;
-  }
-  if (producer.opcode() == HloOpcode::kFusion) {
-    for (const HloInstruction* instr : producer.fused_instructions()) {
-      if (IsPhysicallyTransposing(instr)) {
-        // Elementwise copies are only inserted in input fusion for
-        // transposition, and those are never friendly to the reduction.
-        return false;
+bool IsPhysicallyTransposing(const HloInstruction& instr) {
+  if (instr.opcode() == HloOpcode::kFusion) {
+    for (const HloInstruction* fused_instr : instr.fused_instructions()) {
+      if (IsPhysicallyTransposing(*fused_instr)) {
+        return true;
       }
     }
   }
@@ -83,7 +70,10 @@
   // A fusion iterates over its output in physically-contiguous order. This
   // applies "upwards" to operands.  Only an operator that changes an operand's
   // physical layout can create a "bad" memory access pattern.
-  return true;
+  return instr.opcode() == HloOpcode::kCopy ||
+         (instr.opcode() == HloOpcode::kTranspose &&
+          !ShapeUtil::TransposeIsBitcast(instr.operand(0)->shape(),
+                                         instr.shape(), instr.dimensions()));
 }
 
 bool IsReduceInputFusion(const HloInstruction& instr) {
@@ -228,12 +218,10 @@
     return "the fusion would create a nested loop";
   }
 
-  // Do not fuse into reduce input fusions if the resulting kernel would suffer
-  // from poor data locality (due to unfriendly input layouts).
-  if (IsInputFusibleReduction(consumer) &&
-      !LayoutsAreReduceInputFusionFriendly(producer, consumer)) {
-    return "the producer layout is not fusion-friendly for the consumer "
-           "reduction";
+  // Do not fuse into reduce input fusions if the resulting kernel would
+  // suffer from uncoalesced reads due to a transposed memory access pattern.
+  if (IsInputFusibleReduction(consumer) && IsPhysicallyTransposing(producer)) {
+    return "fusing the producer would break read coalescing";
   }
 
   // Fuse scalar constants into loop fusion nodes. This reduces the number of
@@ -297,7 +285,7 @@
   if (!ShapesCompatibleForMultiOutputFusion(producer, consumer)) {
     return false;
   }
-  if (!LayoutsAreReduceInputFusionFriendly(producer, consumer)) {
+  if (IsPhysicallyTransposing(producer)) {
     return false;
   }
   return true;
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible.h b/tensorflow/compiler/xla/service/gpu/gpu_fusible.h
index 8f84d7d..72d6a4d 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_fusible.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible.h
@@ -52,13 +52,9 @@
 
 bool IsLoopFusible(const HloInstruction& instr);
 
-// The code emitted for reduce-rooted input fusions (EmitReductionToVector)
-// suffers from poor data locality if the layouts of input parameters differ. In
-// such situations it is better not to fuse. Only input params with
-// maximum rank are considered. Params with smaller ranks will be broadcasted
-// and have not been observed to cause data locality issues.
-bool LayoutsAreReduceInputFusionFriendly(const HloInstruction& producer,
-                                         const HloInstruction& reduce);
+// Whether the op transposes the physical data layout. Fusing such ops may
+// lead to uncoalesced data accesses and may thus not be beneficial.
+bool IsPhysicallyTransposing(const HloInstruction& instr);
 
 // Note that reduction ops are lowered in different ways. Reduce input fusions
 // are lowered by IrEmitterUnnested::EmitReductionToVector and must be rooted at
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible_test.cc b/tensorflow/compiler/xla/service/gpu/gpu_fusible_test.cc
index 62f7fa3..7de63f2 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_fusible_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible_test.cc
@@ -33,8 +33,7 @@
       ROOT add = f32[] add(lhs, rhs)
     })";
 
-TEST_F(GpuFusibleTest,
-       LayoutsAreReduceInputFusionFriendly_ElementwiseProducer) {
+TEST_F(GpuFusibleTest, IsPhysicallyTransposing_ElementwiseProducer) {
   auto module = ParseAndReturnVerifiedModule(absl::StrCat(kModulePrefix, R"(
     ENTRY entry {
       p0 = f32[2,2,2]{2,1,0} parameter(0)
@@ -44,17 +43,13 @@
     })"))
                     .ValueOrDie();
   SCOPED_TRACE(module->ToString());
-  const HloInstruction* reduce =
-      module->entry_computation()->root_instruction();
-  ASSERT_EQ(reduce->opcode(), HloOpcode::kReduce);
   const HloInstruction* exp =
       module->entry_computation()->root_instruction()->operand(0);
   ASSERT_EQ(exp->opcode(), HloOpcode::kExp);
-  EXPECT_TRUE(LayoutsAreReduceInputFusionFriendly(*exp, *reduce));
+  EXPECT_FALSE(IsPhysicallyTransposing(*exp));
 }
 
-TEST_F(GpuFusibleTest,
-       LayoutsAreReduceInputFusionFriendly_MixedLayoutProducer) {
+TEST_F(GpuFusibleTest, IsPhysicallyTransposing_MixedLayoutProducer) {
   auto module = ParseAndReturnVerifiedModule(absl::StrCat(kModulePrefix, R"(
     mixed_input_layouts_computation {
       p0.1 = f16[128,1024,32,32]{1,3,2,0} parameter(0)
@@ -80,19 +75,14 @@
     })"))
                     .ValueOrDie();
   SCOPED_TRACE(module->ToString());
-  const HloInstruction* reduce_fusion =
-      module->entry_computation()->root_instruction()->operand(0);
-  ASSERT_EQ(reduce_fusion->fused_expression_root()->opcode(),
-            HloOpcode::kReduce);
   const HloInstruction* loop_fusion =
       module->entry_computation()->root_instruction()->operand(1);
   ASSERT_EQ(loop_fusion->fused_expression_root()->opcode(), HloOpcode::kSelect);
-  EXPECT_FALSE(
-      LayoutsAreReduceInputFusionFriendly(*loop_fusion, *reduce_fusion));
+  EXPECT_TRUE(IsPhysicallyTransposing(*loop_fusion));
 }
 
 TEST_F(GpuFusibleTest,
-       LayoutsAreReduceInputFusionFriendly_MixedLayoutProducerWithTrivialDim) {
+       IsPhysicallyTransposing_MixedLayoutProducerWithTrivialDim) {
   auto module = ParseAndReturnVerifiedModule(absl::StrCat(kModulePrefix, R"(
     mixed_input_layouts_computation {
       p0.1 = f16[128,1,32,32]{1,3,2,0} parameter(0)
@@ -118,18 +108,13 @@
     })"))
                     .ValueOrDie();
   SCOPED_TRACE(module->ToString());
-  const HloInstruction* reduce_fusion =
-      module->entry_computation()->root_instruction()->operand(0);
-  ASSERT_EQ(reduce_fusion->fused_expression_root()->opcode(),
-            HloOpcode::kReduce);
   const HloInstruction* loop_fusion =
       module->entry_computation()->root_instruction()->operand(1);
   ASSERT_EQ(loop_fusion->fused_expression_root()->opcode(), HloOpcode::kSelect);
-  EXPECT_TRUE(
-      LayoutsAreReduceInputFusionFriendly(*loop_fusion, *reduce_fusion));
+  EXPECT_FALSE(IsPhysicallyTransposing(*loop_fusion));
 }
 
-TEST_F(GpuFusibleTest, LayoutsAreReduceInputFusionFriendly_CopyProducer) {
+TEST_F(GpuFusibleTest, IsPhysicallyTransposing_CopyProducer) {
   auto module = ParseAndReturnVerifiedModule(absl::StrCat(kModulePrefix, R"(
     fused_reduce {
       p0.1 = f32[128,1024,32,32]{1,3,2,0} parameter(0)
@@ -143,16 +128,13 @@
     })"))
                     .ValueOrDie();
   SCOPED_TRACE(module->ToString());
-  const HloInstruction* reduce =
-      module->entry_computation()->root_instruction();
-  ASSERT_EQ(reduce->fused_expression_root()->opcode(), HloOpcode::kReduce);
   const HloInstruction* copy =
       module->entry_computation()->root_instruction()->operand(0);
   ASSERT_EQ(copy->opcode(), HloOpcode::kCopy);
-  EXPECT_FALSE(LayoutsAreReduceInputFusionFriendly(*copy, *reduce));
+  EXPECT_TRUE(IsPhysicallyTransposing(*copy));
 }
 
-TEST_F(GpuFusibleTest, LayoutsAreReduceInputFusionFriendly_PhysicalTranspose) {
+TEST_F(GpuFusibleTest, IsPhysicallyTransposing_PhysicalTranspose) {
   auto module = ParseAndReturnVerifiedModule(absl::StrCat(kModulePrefix, R"(
     fused_reduce {
       p0.1 = f32[1024,128,32,32]{3,2,1,0} parameter(0)
@@ -166,17 +148,13 @@
     })"))
                     .ValueOrDie();
   SCOPED_TRACE(module->ToString());
-  const HloInstruction* reduce =
-      module->entry_computation()->root_instruction();
-  ASSERT_EQ(reduce->fused_expression_root()->opcode(), HloOpcode::kReduce);
   const HloInstruction* transpose =
       module->entry_computation()->root_instruction()->operand(0);
   ASSERT_EQ(transpose->opcode(), HloOpcode::kTranspose);
-  EXPECT_FALSE(LayoutsAreReduceInputFusionFriendly(*transpose, *reduce));
+  EXPECT_TRUE(IsPhysicallyTransposing(*transpose));
 }
 
-TEST_F(GpuFusibleTest,
-       LayoutsAreReduceInputFusionFriendly_LayoutChangingFusionProducer) {
+TEST_F(GpuFusibleTest, IsPhysicallyTransposing_LayoutChangingFusionProducer) {
   auto module = ParseAndReturnVerifiedModule(absl::StrCat(kModulePrefix, R"(
     layout_changing_computation {
       p0.1 = f16[128,1024,32,32]{3,2,1,0} parameter(0)
@@ -201,19 +179,14 @@
     })"))
                     .ValueOrDie();
   SCOPED_TRACE(module->ToString());
-  const HloInstruction* reduce_fusion =
-      module->entry_computation()->root_instruction();
-  ASSERT_EQ(reduce_fusion->fused_expression_root()->opcode(),
-            HloOpcode::kReduce);
   const HloInstruction* loop_fusion =
       module->entry_computation()->root_instruction()->operand(0);
   ASSERT_EQ(loop_fusion->fused_expression_root()->opcode(), HloOpcode::kCopy);
-  EXPECT_FALSE(
-      LayoutsAreReduceInputFusionFriendly(*loop_fusion, *reduce_fusion));
+  EXPECT_TRUE(IsPhysicallyTransposing(*loop_fusion));
 }
 
 TEST_F(GpuFusibleTest,
-       LayoutsAreReduceInputFusionFriendly_ConsiderMaximumTrueRanksParamsOnly) {
+       IsPhysicallyTransposing_ConsiderMaximumTrueRanksParamsOnly) {
   auto module = ParseAndReturnVerifiedModule(absl::StrCat(kModulePrefix, R"(
     broadcasting_computation {
       p0.1 = f32[128,1024,32,32]{1,3,2,0} parameter(0)
@@ -231,13 +204,10 @@
     })"))
                     .ValueOrDie();
   SCOPED_TRACE(module->ToString());
-  const HloInstruction* reduce =
-      module->entry_computation()->root_instruction();
-  ASSERT_EQ(reduce->opcode(), HloOpcode::kReduce);
   const HloInstruction* loop_fusion =
       module->entry_computation()->root_instruction()->operand(0);
   ASSERT_EQ(loop_fusion->fused_expression_root()->opcode(), HloOpcode::kAdd);
-  EXPECT_TRUE(LayoutsAreReduceInputFusionFriendly(*loop_fusion, *reduce));
+  EXPECT_FALSE(IsPhysicallyTransposing(*loop_fusion));
 }
 
 TEST_F(GpuFusibleTest, IsReduceInputFusion_ReductionToVector) {