[XLA:GPU] Simplify heuristics for promotion to int8x32.

Previously, we would promote to int8x32 only if doing so did not increase the
input/filter/output size of the conv by more than 2.5x.

Now we consider only the input/output sizes.  Sometimes the filter grows by
more than 2.5x and we observe that padding is still worthwhile.  This also
makes more sense conceptually: the increase in the filter size is really a
function of the input/output increases.

As a result, we can switch the heuristic to the more intuitive "don't expand
input/output by 2x or more".  This also lets us drop the kAlwaysOkToPadBytes
carve-out, which existed only to handle tiny tensors like the bias vector and
is unnecessary now that the bias isn't checked at all.

PiperOrigin-RevId: 390144519
Change-Id: I96cdb0a95dc627157cdcd02d4eb1c593cedd148f
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_pad_for_convolutions.cc b/tensorflow/compiler/xla/service/gpu/cudnn_pad_for_convolutions.cc
index 9015390..407bb21 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_pad_for_convolutions.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_pad_for_convolutions.cc
@@ -240,7 +240,7 @@
     // that there's additional room for speedups.  Achieving those speedups
     // without also slowing other things down will likely require a more
     // sophisticated heuristic, possibly some form of auto-tuning.
-    static constexpr double kMaxBytesTouchedIncrease = 1.35;
+    static constexpr double kMaxBytesTouchedBound = 1.35;
 
     // Check that padding wouldn't increase the total bytes read/written by this
     // operation too much.
@@ -248,7 +248,7 @@
                                    const Shape& new_shape) {
       int64_t old_bytes = ShapeUtil::ByteSizeOf(old_shape);
       int64_t new_bytes = ShapeUtil::ByteSizeOf(new_shape);
-      if (new_bytes <= old_bytes * kMaxBytesTouchedIncrease) {
+      if (new_bytes <= old_bytes * kMaxBytesTouchedBound) {
         return true;
       }
       VLOG(3)
@@ -257,7 +257,7 @@
           << ShapeUtil::HumanString(old_shape) << " to "
           << ShapeUtil::HumanString(new_shape) << ", a size increase of "
           << new_bytes / static_cast<double>(old_bytes) << "x > "
-          << kMaxBytesTouchedIncrease << "x: " << conv->ToString();
+          << kMaxBytesTouchedBound << "x: " << conv->ToString();
       return false;
     };
 
@@ -372,8 +372,8 @@
         pad_dim(&new_input_shapes[1], dnums.kernel_output_feature_dimension(),
                 /*cur_vect_size=*/1);
 
-        // Bias.  This ia 1D vector of length batch-size, and it's unclear if we
-        // *have* to pad it.  But hey, we might as well.  cur_vect_size 1
+        // Bias.  This is a 1D vector of length output-depth, and it's unclear
+        // if we *have* to pad it.  But hey, we might as well.  cur_vect_size 1
         // because NCHW_VECT_C doesn't apply here (there is no channels
         // dimension!).
         pad_dim(&new_input_shapes[2], /*dim=*/0, /*cur_vect_size=*/1);
@@ -393,17 +393,15 @@
     }
 
     // We won't pad a conv if doing so increases the total number of bytes in
-    // the lhs, rhs, or result by more than this amount.
+    // the lhs, rhs, or result by a factor of this much or more.
+    //
+    // Note: It's important that this bound is exclusive.  It's a performance
+    // regression to pad and increase input/output size by 2x, so we only pad
+    // strictly less than 2x.
     //
     // TODO(jlebar): This number was tuned experimentally, but without much
     // experimental evidence.
-    static constexpr double kMaxBytesTouchedIncrease = 2.5;
-
-    // It's always OK to pad up to this many bytes, even if it increases us
-    // beyond the kMaxBytesTouchedIncrease factor.  This handles padding very
-    // small vectors, like the bias vector of fused convs (which is just one
-    // float per batch).
-    static constexpr int64_t kAlwaysOkToPadBytes = 4096;
+    static constexpr double kMaxBytesTouchedBound = 2;
 
     // Check that padding wouldn't increase the total bytes read/written by this
     // operation too much.
@@ -411,8 +409,7 @@
                                    const Shape& new_shape) {
       int64_t old_bytes = ShapeUtil::ByteSizeOf(old_shape);
       int64_t new_bytes = ShapeUtil::ByteSizeOf(new_shape);
-      if (new_bytes <= old_bytes * kMaxBytesTouchedIncrease ||
-          new_bytes <= kAlwaysOkToPadBytes) {
+      if (new_bytes < old_bytes * kMaxBytesTouchedBound) {
         return true;
       }
       VLOG(3)
@@ -420,18 +417,17 @@
              "shape from "
           << ShapeUtil::HumanString(old_shape) << " to "
           << ShapeUtil::HumanString(new_shape) << ", a size increase of "
-          << new_bytes / static_cast<double>(old_bytes) << "x > "
-          << kMaxBytesTouchedIncrease << "x: " << conv->ToString();
+          << new_bytes / static_cast<double>(old_bytes)
+          << "x >= " << kMaxBytesTouchedBound << "x: " << conv->ToString();
       return false;
     };
 
-    for (int64_t i = 0; i < conv->operand_count(); ++i) {
-      if (!check_size_increase(conv->operand(i)->shape(),
-                               new_input_shapes[i])) {
-        return false;
-      }
-    }
-    if (!check_size_increase(result_shape, new_result_shape)) {
+    // Check size increase only on the input and output.  No need to check the
+    // filter, since that's determined by the input/output.  The bias (if
+    // present) is tiny (1D array of length output-depth), so padding doesn't
+    // matter.  And the side-input, if present, matches the result shape.
+    if (!check_size_increase(conv->operand(0)->shape(), new_input_shapes[0]) ||
+        !check_size_increase(result_shape, new_result_shape)) {
       return false;
     }
   }
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_pad_for_convolutions_test.cc b/tensorflow/compiler/xla/service/gpu/cudnn_pad_for_convolutions_test.cc
index e9749c4..2597153 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_pad_for_convolutions_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_pad_for_convolutions_test.cc
@@ -364,12 +364,12 @@
     HloModule Test
 
     ENTRY %Test (input: s8[1,3,3,2], filter: s8[3,3,2,5], side_input: s8[1,3,3,5], bias: s8[5]) -> f32[1,3,3,5] {
-    %input = s8[1,3,3,2]{3,2,1,0} parameter(0)
+    %input = s8[1,3,3,3]{3,2,1,0} parameter(0)
     %filter = s8[3,3,2,5]{3,2,1,0} parameter(1)
     %bias = s8[5]{0} parameter(3)
     %convert = f32[5]{0} convert(s8[5]{0} %bias)
     %side_input = f32[1,3,3,5]{3,2,1,0} parameter(2)
-    %custom-call.1 = (f32[1,3,3,5]{3,2,1,0}, u8[0]{0}) custom-call(s8[1,3,3,2]{3,2,1,0} %input, s8[3,3,2,5]{3,2,1,0} %filter, f32[5]{0} %convert, f32[1,3,3,5]{3,2,1,0} %side_input), window={size=3x3 pad=1_1x1_1}, dim_labels=b01f_01io->b01f, custom_call_target="__cudnn$convBiasActivationForward", backend_config="{\"activationMode\":\"2\",\"convResultScale\":1,\"sideInputScale\":1}"
+    %custom-call.1 = (f32[1,3,3,5]{3,2,1,0}, u8[0]{0}) custom-call(s8[1,3,3,3]{3,2,1,0} %input, s8[3,3,2,5]{3,2,1,0} %filter, f32[5]{0} %convert, f32[1,3,3,5]{3,2,1,0} %side_input), window={size=3x3 pad=1_1x1_1}, dim_labels=b01f_01io->b01f, custom_call_target="__cudnn$convBiasActivationForward", backend_config="{\"activationMode\":\"2\",\"convResultScale\":1,\"sideInputScale\":1}"
     ROOT %get-tuple-element.1 = f32[1,3,3,5]{3,2,1,0} get-tuple-element((f32[1,3,3,5]{3,2,1,0}, u8[0]{0}) %custom-call.1), index=0
     })")
                     .ValueOrDie();
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_conv_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/gpu_conv_algorithm_picker.cc
index 3c546ea..8ff434e 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_conv_algorithm_picker.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_conv_algorithm_picker.cc
@@ -449,10 +449,12 @@
                    &scratch_allocator, stream, options);
 
     if (!launch_status.ok()) {
+      VLOG(4) << "Launch failed: " << launch_status;
       continue;
     }
 
     if (!profile_result.is_valid()) {
+      VLOG(4) << "Launch succeeded but profile result is invalid.";
       continue;
     }