Move CpuInstructionFusion to the RunHloPassesAfterLayoutAssn function.

This is less confusing: the pass ran after layout assignment but lived in the RunHloPassesThroughLayoutAssn function. It also helps future work that uses MLIR's MHLO dialect to compile XLA:CPU programs, which does not want to rely on XLA's fusions.
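
For context, a rough sketch of the ordering this change produces in RunHloPassesAfterLayoutAssn (function signature, pass arguments, and most other passes elided; only the pieces touched by the diff below are shown):

    HloPassPipeline pipeline("HLO passes after layout assignment");

    // From here on, verification is layout-sensitive.
    pipeline.AddPass<HloPassPipeline>("after layout assignment")
        .AddInvariantCheckerDebug<HloVerifier>(
            /*layout_sensitive=*/true,
            /*allow_mixed_precision=*/false);

    // Fusion now runs here, after layouts are assigned, instead of at the
    // end of RunHloPassesThroughLayoutAssn.
    pipeline.AddPass<CpuInstructionFusion>();

    // Algebraic simplification and CSE then remove any duplicate or NOP
    // kCopy instructions left behind by layout assignment.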

PiperOrigin-RevId: 404236935
Change-Id: I77b061c6eac3e4cf5f8bc56239fe5636755f0031
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
index e166bc0..5e9c150 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -474,8 +474,6 @@
   pipeline.AddPass<CpuLayoutAssignment>(
       module->mutable_entry_computation_layout(), target_machine_features);
 
-  pipeline.AddPass<CpuInstructionFusion>();
-
   return pipeline.Run(module).status();
 }
 
@@ -483,13 +481,16 @@
     HloModule* module, bool is_aot_compile,
     LLVMTargetMachineFeatures* target_machine_features) {
   HloPassPipeline pipeline("HLO passes after layout assignment");
-  // After layout assignment, use a layout-sensitive verifier.
 
+  // After layout assignment, use a layout-sensitive verifier.
   pipeline.AddPass<HloPassPipeline>("after layout assignment")
       .AddInvariantCheckerDebug<HloVerifier>(
           /*layout_sensitive=*/true,
           /*allow_mixed_precision=*/false);
 
+  // Add a fusion pass now that layout assignment is done.
+  pipeline.AddPass<CpuInstructionFusion>();
+
   // The LayoutAssignment pass may leave behind kCopy instructions which are
   // duplicate or NOPs, so remove them with algebraic simplification and CSE.
   // Run this to a fixed point.