[XLA:GPU] NFC: Rename `LayoutsAreReduceInputFusionFriendly` to `IsPhysicallyTransposing`, remove a dead parameter, update documentation and tests.
PiperOrigin-RevId: 460641556
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc b/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc
index 7662c9b..7b02a99 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc
@@ -56,26 +56,13 @@
return false;
}
-bool IsPhysicallyTransposing(const HloInstruction* instr) {
- return instr->opcode() == HloOpcode::kCopy ||
- (instr->opcode() == HloOpcode::kTranspose &&
- !ShapeUtil::TransposeIsBitcast(instr->operand(0)->shape(),
- instr->shape(), instr->dimensions()));
-}
-
} // namespace
-bool LayoutsAreReduceInputFusionFriendly(const HloInstruction& producer,
- const HloInstruction& reduce) {
- if (IsPhysicallyTransposing(&producer)) {
- return false;
- }
- if (producer.opcode() == HloOpcode::kFusion) {
- for (const HloInstruction* instr : producer.fused_instructions()) {
- if (IsPhysicallyTransposing(instr)) {
- // Elementwise copies are only inserted in input fusion for
- // transposition, and those are never friendly to the reduction.
- return false;
+bool IsPhysicallyTransposing(const HloInstruction& instr) {
+ if (instr.opcode() == HloOpcode::kFusion) {
+ for (const HloInstruction* fused_instr : instr.fused_instructions()) {
+ if (IsPhysicallyTransposing(*fused_instr)) {
+ return true;
}
}
}
@@ -83,7 +70,10 @@
// A fusion iterates over its output in physically-contiguous order. This
// applies "upwards" to operands. Only an operator that changes an operand's
// physical layout can create a "bad" memory access pattern.
- return true;
+ return instr.opcode() == HloOpcode::kCopy ||
+ (instr.opcode() == HloOpcode::kTranspose &&
+ !ShapeUtil::TransposeIsBitcast(instr.operand(0)->shape(),
+ instr.shape(), instr.dimensions()));
}
bool IsReduceInputFusion(const HloInstruction& instr) {
@@ -228,12 +218,10 @@
return "the fusion would create a nested loop";
}
- // Do not fuse into reduce input fusions if the resulting kernel would suffer
- // from poor data locality (due to unfriendly input layouts).
- if (IsInputFusibleReduction(consumer) &&
- !LayoutsAreReduceInputFusionFriendly(producer, consumer)) {
- return "the producer layout is not fusion-friendly for the consumer "
- "reduction";
+ // Do not fuse into fusions if the resulting kernel would suffer from
+ // uncoalesced reads due to a transposed memory access pattern.
+ if (IsInputFusibleReduction(consumer) && IsPhysicallyTransposing(producer)) {
+ return "fusing the producer would break read coalescing";
}
// Fuse scalar constants into loop fusion nodes. This reduces the number of
@@ -297,7 +285,7 @@
if (!ShapesCompatibleForMultiOutputFusion(producer, consumer)) {
return false;
}
- if (!LayoutsAreReduceInputFusionFriendly(producer, consumer)) {
+ if (IsPhysicallyTransposing(producer)) {
return false;
}
return true;
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible.h b/tensorflow/compiler/xla/service/gpu/gpu_fusible.h
index 8f84d7d..72d6a4d 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_fusible.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible.h
@@ -52,13 +52,9 @@
bool IsLoopFusible(const HloInstruction& instr);
-// The code emitted for reduce-rooted input fusions (EmitReductionToVector)
-// suffers from poor data locality if the layouts of input parameters differ. In
-// such situations it is better not to fuse. Only input params with
-// maximum rank are considered. Params with smaller ranks will be broadcasted
-// and have not been observed to cause data locality issues.
-bool LayoutsAreReduceInputFusionFriendly(const HloInstruction& producer,
- const HloInstruction& reduce);
+// Whether the op transposes the physical data layout. Fusing such ops may lead
+// to uncoalesced data access and may thus not be beneficial.
+bool IsPhysicallyTransposing(const HloInstruction& instr);
// Note that reduction ops are lowered in different ways. Reduce input fusions
// are lowered by IrEmitterUnnested::EmitReductionToVector and must be rooted at
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible_test.cc b/tensorflow/compiler/xla/service/gpu/gpu_fusible_test.cc
index 62f7fa3..7de63f2 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_fusible_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible_test.cc
@@ -33,8 +33,7 @@
ROOT add = f32[] add(lhs, rhs)
})";
-TEST_F(GpuFusibleTest,
- LayoutsAreReduceInputFusionFriendly_ElementwiseProducer) {
+TEST_F(GpuFusibleTest, IsPhysicallyTransposing_ElementwiseProducer) {
auto module = ParseAndReturnVerifiedModule(absl::StrCat(kModulePrefix, R"(
ENTRY entry {
p0 = f32[2,2,2]{2,1,0} parameter(0)
@@ -44,17 +43,13 @@
})"))
.ValueOrDie();
SCOPED_TRACE(module->ToString());
- const HloInstruction* reduce =
- module->entry_computation()->root_instruction();
- ASSERT_EQ(reduce->opcode(), HloOpcode::kReduce);
const HloInstruction* exp =
module->entry_computation()->root_instruction()->operand(0);
ASSERT_EQ(exp->opcode(), HloOpcode::kExp);
- EXPECT_TRUE(LayoutsAreReduceInputFusionFriendly(*exp, *reduce));
+ EXPECT_FALSE(IsPhysicallyTransposing(*exp));
}
-TEST_F(GpuFusibleTest,
- LayoutsAreReduceInputFusionFriendly_MixedLayoutProducer) {
+TEST_F(GpuFusibleTest, IsPhysicallyTransposing_MixedLayoutProducer) {
auto module = ParseAndReturnVerifiedModule(absl::StrCat(kModulePrefix, R"(
mixed_input_layouts_computation {
p0.1 = f16[128,1024,32,32]{1,3,2,0} parameter(0)
@@ -80,19 +75,14 @@
})"))
.ValueOrDie();
SCOPED_TRACE(module->ToString());
- const HloInstruction* reduce_fusion =
- module->entry_computation()->root_instruction()->operand(0);
- ASSERT_EQ(reduce_fusion->fused_expression_root()->opcode(),
- HloOpcode::kReduce);
const HloInstruction* loop_fusion =
module->entry_computation()->root_instruction()->operand(1);
ASSERT_EQ(loop_fusion->fused_expression_root()->opcode(), HloOpcode::kSelect);
- EXPECT_FALSE(
- LayoutsAreReduceInputFusionFriendly(*loop_fusion, *reduce_fusion));
+ EXPECT_TRUE(IsPhysicallyTransposing(*loop_fusion));
}
TEST_F(GpuFusibleTest,
- LayoutsAreReduceInputFusionFriendly_MixedLayoutProducerWithTrivialDim) {
+ IsPhysicallyTransposing_MixedLayoutProducerWithTrivialDim) {
auto module = ParseAndReturnVerifiedModule(absl::StrCat(kModulePrefix, R"(
mixed_input_layouts_computation {
p0.1 = f16[128,1,32,32]{1,3,2,0} parameter(0)
@@ -118,18 +108,13 @@
})"))
.ValueOrDie();
SCOPED_TRACE(module->ToString());
- const HloInstruction* reduce_fusion =
- module->entry_computation()->root_instruction()->operand(0);
- ASSERT_EQ(reduce_fusion->fused_expression_root()->opcode(),
- HloOpcode::kReduce);
const HloInstruction* loop_fusion =
module->entry_computation()->root_instruction()->operand(1);
ASSERT_EQ(loop_fusion->fused_expression_root()->opcode(), HloOpcode::kSelect);
- EXPECT_TRUE(
- LayoutsAreReduceInputFusionFriendly(*loop_fusion, *reduce_fusion));
+ EXPECT_FALSE(IsPhysicallyTransposing(*loop_fusion));
}
-TEST_F(GpuFusibleTest, LayoutsAreReduceInputFusionFriendly_CopyProducer) {
+TEST_F(GpuFusibleTest, IsPhysicallyTransposing_CopyProducer) {
auto module = ParseAndReturnVerifiedModule(absl::StrCat(kModulePrefix, R"(
fused_reduce {
p0.1 = f32[128,1024,32,32]{1,3,2,0} parameter(0)
@@ -143,16 +128,13 @@
})"))
.ValueOrDie();
SCOPED_TRACE(module->ToString());
- const HloInstruction* reduce =
- module->entry_computation()->root_instruction();
- ASSERT_EQ(reduce->fused_expression_root()->opcode(), HloOpcode::kReduce);
const HloInstruction* copy =
module->entry_computation()->root_instruction()->operand(0);
ASSERT_EQ(copy->opcode(), HloOpcode::kCopy);
- EXPECT_FALSE(LayoutsAreReduceInputFusionFriendly(*copy, *reduce));
+ EXPECT_TRUE(IsPhysicallyTransposing(*copy));
}
-TEST_F(GpuFusibleTest, LayoutsAreReduceInputFusionFriendly_PhysicalTranspose) {
+TEST_F(GpuFusibleTest, IsPhysicallyTransposing_PhysicalTranspose) {
auto module = ParseAndReturnVerifiedModule(absl::StrCat(kModulePrefix, R"(
fused_reduce {
p0.1 = f32[1024,128,32,32]{3,2,1,0} parameter(0)
@@ -166,17 +148,13 @@
})"))
.ValueOrDie();
SCOPED_TRACE(module->ToString());
- const HloInstruction* reduce =
- module->entry_computation()->root_instruction();
- ASSERT_EQ(reduce->fused_expression_root()->opcode(), HloOpcode::kReduce);
const HloInstruction* transpose =
module->entry_computation()->root_instruction()->operand(0);
ASSERT_EQ(transpose->opcode(), HloOpcode::kTranspose);
- EXPECT_FALSE(LayoutsAreReduceInputFusionFriendly(*transpose, *reduce));
+ EXPECT_TRUE(IsPhysicallyTransposing(*transpose));
}
-TEST_F(GpuFusibleTest,
- LayoutsAreReduceInputFusionFriendly_LayoutChangingFusionProducer) {
+TEST_F(GpuFusibleTest, IsPhysicallyTransposing_LayoutChangingFusionProducer) {
auto module = ParseAndReturnVerifiedModule(absl::StrCat(kModulePrefix, R"(
layout_changing_computation {
p0.1 = f16[128,1024,32,32]{3,2,1,0} parameter(0)
@@ -201,19 +179,14 @@
})"))
.ValueOrDie();
SCOPED_TRACE(module->ToString());
- const HloInstruction* reduce_fusion =
- module->entry_computation()->root_instruction();
- ASSERT_EQ(reduce_fusion->fused_expression_root()->opcode(),
- HloOpcode::kReduce);
const HloInstruction* loop_fusion =
module->entry_computation()->root_instruction()->operand(0);
ASSERT_EQ(loop_fusion->fused_expression_root()->opcode(), HloOpcode::kCopy);
- EXPECT_FALSE(
- LayoutsAreReduceInputFusionFriendly(*loop_fusion, *reduce_fusion));
+ EXPECT_TRUE(IsPhysicallyTransposing(*loop_fusion));
}
TEST_F(GpuFusibleTest,
- LayoutsAreReduceInputFusionFriendly_ConsiderMaximumTrueRanksParamsOnly) {
+ IsPhysicallyTransposing_ConsiderMaximumTrueRanksParamsOnly) {
auto module = ParseAndReturnVerifiedModule(absl::StrCat(kModulePrefix, R"(
broadcasting_computation {
p0.1 = f32[128,1024,32,32]{1,3,2,0} parameter(0)
@@ -231,13 +204,10 @@
})"))
.ValueOrDie();
SCOPED_TRACE(module->ToString());
- const HloInstruction* reduce =
- module->entry_computation()->root_instruction();
- ASSERT_EQ(reduce->opcode(), HloOpcode::kReduce);
const HloInstruction* loop_fusion =
module->entry_computation()->root_instruction()->operand(0);
ASSERT_EQ(loop_fusion->fused_expression_root()->opcode(), HloOpcode::kAdd);
- EXPECT_TRUE(LayoutsAreReduceInputFusionFriendly(*loop_fusion, *reduce));
+ EXPECT_FALSE(IsPhysicallyTransposing(*loop_fusion));
}
TEST_F(GpuFusibleTest, IsReduceInputFusion_ReductionToVector) {