[XLA:GPU] Simplify heuristics for promotion to int8x32.
Previously, we would upgrade to int8x32 if doing so did not result in a > 2.5x
increase in the input/filter/output size of the conv.
Now we only consider the input/output sizes. Sometimes the filter increases by
more than 2.5x and we observe that padding is still worthwhile. Anyway, I
think this makes more sense, since the increase in the filter size is really a
function of the input/output increases.
As a result, we're able to switch the heuristic to the more intuitive "don't
expand input/output by 2x or more".
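To make the exclusive bound concrete, here is a minimal standalone sketch (the
constant mirrors kMaxBytesTouchedBound from the diff; WouldPad and the driver
are illustrative stand-ins, not the actual check_size_increase lambda):

    #include <cstdint>
    #include <iostream>

    constexpr double kMaxBytesTouchedBound = 2;  // exclusive bound

    bool WouldPad(int64_t old_bytes, int64_t new_bytes) {
      // Pad only if the increase in bytes touched is *strictly* below 2x.
      return new_bytes < old_bytes * kMaxBytesTouchedBound;
    }

    int main() {
      // Padding 3 channels up to 4 is a ~1.33x increase -> pad.
      std::cout << WouldPad(3, 4) << "\n";  // prints 1
      // Padding 2 channels up to 4 is exactly 2x -> don't pad.
      std::cout << WouldPad(2, 4) << "\n";  // prints 0
    }

This is presumably also why the test below widens its input from 2 to 3
channels: padding 2 channels up to 4 is exactly a 2x increase and is now
rejected, while 3 up to 4 is only ~1.33x.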
PiperOrigin-RevId: 390144519
Change-Id: I96cdb0a95dc627157cdcd02d4eb1c593cedd148f
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_pad_for_convolutions.cc b/tensorflow/compiler/xla/service/gpu/cudnn_pad_for_convolutions.cc
index 9015390..407bb21 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_pad_for_convolutions.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_pad_for_convolutions.cc
@@ -240,7 +240,7 @@
// that there's additional room for speedups. Achieving those speedups
// without also slowing other things down will likely require a more
// sophisticated heuristic, possibly some form of auto-tuning.
- static constexpr double kMaxBytesTouchedIncrease = 1.35;
+ static constexpr double kMaxBytesTouchedBound = 1.35;
// Check that padding wouldn't increase the total bytes read/written by this
// operation too much.
@@ -248,7 +248,7 @@
const Shape& new_shape) {
int64_t old_bytes = ShapeUtil::ByteSizeOf(old_shape);
int64_t new_bytes = ShapeUtil::ByteSizeOf(new_shape);
- if (new_bytes <= old_bytes * kMaxBytesTouchedIncrease) {
+ if (new_bytes <= old_bytes * kMaxBytesTouchedBound) {
return true;
}
VLOG(3)
@@ -257,7 +257,7 @@
<< ShapeUtil::HumanString(old_shape) << " to "
<< ShapeUtil::HumanString(new_shape) << ", a size increase of "
<< new_bytes / static_cast<double>(old_bytes) << "x > "
- << kMaxBytesTouchedIncrease << "x: " << conv->ToString();
+ << kMaxBytesTouchedBound << "x: " << conv->ToString();
return false;
};
@@ -372,8 +372,8 @@
pad_dim(&new_input_shapes[1], dnums.kernel_output_feature_dimension(),
/*cur_vect_size=*/1);
- // Bias. This ia 1D vector of length batch-size, and it's unclear if we
- // *have* to pad it. But hey, we might as well. cur_vect_size 1
+ // Bias. This is a 1D vector of length output-depth, and it's unclear
+ // if we *have* to pad it. But hey, we might as well. cur_vect_size 1
// because NCHW_VECT_C doesn't apply here (there is no channels
// dimension!).
pad_dim(&new_input_shapes[2], /*dim=*/0, /*cur_vect_size=*/1);
@@ -393,17 +393,15 @@
}
// We won't pad a conv if doing so increases the total number of bytes in
- // the lhs, rhs, or result by more than this amount.
+ // the lhs, rhs, or result by a factor of this much or more.
+ //
+ // Note: It's important that this bound is exclusive. It's a performance
+ // regression to pad and increase input/output size by 2x, so we only pad
+ // strictly less than 2x.
//
// TODO(jlebar): This number was tuned experimentally, but without much
// experimental evidence.
- static constexpr double kMaxBytesTouchedIncrease = 2.5;
-
- // It's always OK to pad up to this many bytes, even if it increases us
- // beyond the kMaxBytesTouchedIncrease factor. This handles padding very
- // small vectors, like the bias vector of fused convs (which is just one
- // float per batch).
- static constexpr int64_t kAlwaysOkToPadBytes = 4096;
+ static constexpr double kMaxBytesTouchedBound = 2;
// Check that padding wouldn't increase the total bytes read/written by this
// operation too much.
@@ -411,8 +409,7 @@
const Shape& new_shape) {
int64_t old_bytes = ShapeUtil::ByteSizeOf(old_shape);
int64_t new_bytes = ShapeUtil::ByteSizeOf(new_shape);
- if (new_bytes <= old_bytes * kMaxBytesTouchedIncrease ||
- new_bytes <= kAlwaysOkToPadBytes) {
+ if (new_bytes < old_bytes * kMaxBytesTouchedBound) {
return true;
}
VLOG(3)
@@ -420,18 +417,17 @@
"shape from "
<< ShapeUtil::HumanString(old_shape) << " to "
<< ShapeUtil::HumanString(new_shape) << ", a size increase of "
- << new_bytes / static_cast<double>(old_bytes) << "x > "
- << kMaxBytesTouchedIncrease << "x: " << conv->ToString();
+ << new_bytes / static_cast<double>(old_bytes)
+ << "x >= " << kMaxBytesTouchedBound << "x: " << conv->ToString();
return false;
};
- for (int64_t i = 0; i < conv->operand_count(); ++i) {
- if (!check_size_increase(conv->operand(i)->shape(),
- new_input_shapes[i])) {
- return false;
- }
- }
- if (!check_size_increase(result_shape, new_result_shape)) {
+ // Check size increase only on the input and output. No need to check the
+ // filter, since that's determined by the input/output. The bias (if
+ // present) is tiny (1D array of length output-depth), so padding doesn't
+ // matter. And the side-input, if present, is the same shape as the input.
+ if (!check_size_increase(conv->operand(0)->shape(), new_input_shapes[0]) ||
+ !check_size_increase(result_shape, new_result_shape)) {
return false;
}
}
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_pad_for_convolutions_test.cc b/tensorflow/compiler/xla/service/gpu/cudnn_pad_for_convolutions_test.cc
index e9749c4..2597153 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_pad_for_convolutions_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_pad_for_convolutions_test.cc
@@ -364,12 +364,12 @@
HloModule Test
- ENTRY %Test (input: s8[1,3,3,2], filter: s8[3,3,2,5], side_input: s8[1,3,3,5], bias: s8[5]) -> f32[1,3,3,5] {
+ ENTRY %Test (input: s8[1,3,3,3], filter: s8[3,3,2,5], side_input: s8[1,3,3,5], bias: s8[5]) -> f32[1,3,3,5] {
- %input = s8[1,3,3,2]{3,2,1,0} parameter(0)
+ %input = s8[1,3,3,3]{3,2,1,0} parameter(0)
%filter = s8[3,3,2,5]{3,2,1,0} parameter(1)
%bias = s8[5]{0} parameter(3)
%convert = f32[5]{0} convert(s8[5]{0} %bias)
%side_input = f32[1,3,3,5]{3,2,1,0} parameter(2)
- %custom-call.1 = (f32[1,3,3,5]{3,2,1,0}, u8[0]{0}) custom-call(s8[1,3,3,2]{3,2,1,0} %input, s8[3,3,2,5]{3,2,1,0} %filter, f32[5]{0} %convert, f32[1,3,3,5]{3,2,1,0} %side_input), window={size=3x3 pad=1_1x1_1}, dim_labels=b01f_01io->b01f, custom_call_target="__cudnn$convBiasActivationForward", backend_config="{\"activationMode\":\"2\",\"convResultScale\":1,\"sideInputScale\":1}"
+ %custom-call.1 = (f32[1,3,3,5]{3,2,1,0}, u8[0]{0}) custom-call(s8[1,3,3,3]{3,2,1,0} %input, s8[3,3,2,5]{3,2,1,0} %filter, f32[5]{0} %convert, f32[1,3,3,5]{3,2,1,0} %side_input), window={size=3x3 pad=1_1x1_1}, dim_labels=b01f_01io->b01f, custom_call_target="__cudnn$convBiasActivationForward", backend_config="{\"activationMode\":\"2\",\"convResultScale\":1,\"sideInputScale\":1}"
ROOT %get-tuple-element.1 = f32[1,3,3,5]{3,2,1,0} get-tuple-element((f32[1,3,3,5]{3,2,1,0}, u8[0]{0}) %custom-call.1), index=0
})")
.ValueOrDie();
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_conv_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/gpu_conv_algorithm_picker.cc
index 3c546ea..8ff434e 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_conv_algorithm_picker.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_conv_algorithm_picker.cc
@@ -449,10 +449,12 @@
&scratch_allocator, stream, options);
if (!launch_status.ok()) {
+ VLOG(4) << "Launch failed: " << launch_status;
continue;
}
if (!profile_result.is_valid()) {
+ VLOG(4) << "Launch succeeded but profile result is invalid.";
continue;
}