/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h"
#include <memory>
#include "tensorflow/compiler/xla/layout_util.h"
#include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h"
#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
#include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h"
#include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
#include "tensorflow/compiler/xla/service/hlo_computation.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
#include "tensorflow/compiler/xla/service/hlo_instructions.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/compiler/xla/window_util.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
#include "tensorflow/core/lib/core/errors.h"
namespace xla {
namespace gpu {
using se::dnn::DataLayout;
using se::dnn::FilterLayout;
// Returns (input, filter, output) layouts.
static std::tuple<DataLayout, FilterLayout, DataLayout>
HeuristicLayoutAssignment(const HloInstruction* instr,
se::StreamExecutor* stream_executor) {
// DataLayout and FilterLayout use weird enum names. Translations:
// N <=> Batch or Output
// C <=> Depth or Input
// H <=> Y
// W <=> X
//
// Therefore kOutputInputYX and kBatchDepthYX mean NCHW.
//
// If you have trouble keeping these straight, consider that all that matters
// is the location of the channel dim: Is it major (NCHW), or minor (NHWC)?
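// As an illustration (assuming dimension numbers with batch = 0, feature = 1,
// and spatial dims = {2, 3}), NCHW corresponds to an XLA minor_to_major of
// {3, 2, 1, 0} and NHWC to {1, 3, 2, 0}; the actual translation is performed
// by StreamExecutorConvLayoutsToXlaLayouts below.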
constexpr auto kAllNCHW =
std::make_tuple(DataLayout::kBatchDepthYX, FilterLayout::kOutputInputYX,
DataLayout::kBatchDepthYX);
// kBatchDepthYX4 has the same layout as kBatchDepthYX32; they're both VECT_C
// layouts as far as cudnn is concerned.
constexpr auto kAllNCHW_VECT_C =
std::make_tuple(DataLayout::kBatchDepthYX4, FilterLayout::kOutputInputYX4,
DataLayout::kBatchDepthYX4);
constexpr auto kAllNHWC =
std::make_tuple(DataLayout::kBatchYXDepth, FilterLayout::kOutputYXInput,
DataLayout::kBatchYXDepth);
// Integer convolution must use NHWC or NCHW_VECT_C.
//
// TODO(jlebar): Do non-VECT_C int8_t convs still require NHWC with new
// versions of cudnn?
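// Illustrative note: an int8x4 NCHW_VECT_C conv is represented here with a
// 5-D shape such as s8[N, C/4, H, W, 4], which is why the branch below checks
// for two spatial dimensions and dimensions_size() == 5.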
const ConvolutionDimensionNumbers& dnums =
instr->convolution_dimension_numbers();
Shape input_shape = instr->operand(0)->shape();
PrimitiveType input_ty = instr->operand(0)->shape().element_type();
if (primitive_util::IsIntegralType(input_ty)) {
if (input_ty == S8 && dnums.input_spatial_dimensions_size() == 2 &&
input_shape.dimensions_size() == 5) {
VLOG(2) << "Using NCHW_VECT_C for int8_t conv " << instr->ToString();
return kAllNCHW_VECT_C;
}
VLOG(2) << "Using NHWC for int8_t conv " << instr->ToString();
return kAllNHWC;
}
const DebugOptions& debug_options =
instr->GetModule()->config().debug_options();
if (debug_options.xla_gpu_force_conv_nchw()) {
VLOG(2) << "Overriding layout to NCHW for " << instr->ToString();
return kAllNCHW;
}
if (debug_options.xla_gpu_force_conv_nhwc()) {
VLOG(2) << "Overriding layout to NHWC for " << instr->ToString();
return kAllNHWC;
}
// If this isn't an fp16 conv2D on Volta or newer, the decision is easy: use
// NCHW.
if (input_ty != F16 ||
!stream_executor->GetDeviceDescription()
.cuda_compute_capability()
.IsAtLeast(se::CudaComputeCapability::VOLTA) ||
instr->shape().tuple_shapes(0).dimensions_size() != 4) {
return kAllNCHW;
}
VLOG(2) << "Using heuristic to figure out layouts for " << instr->ToString();
// Empirically we've found with Volta and cudnn <= 7.3 that backward-input
// convs with stride are significantly faster with NCHW layouts.
//
// We could have used a mixed layout combination, e.g. (NHWC, NCHW, NCHW),
// which on paper gives good performance. However, two empirical observations
// argue against it:
// * a mixed layout combination is more prone to cuDNN bugs;
// * with mixed layouts, cuDNN ends up transposing the data back and forth
//   between layouts anyway. If we end up with transposes either way, we
//   prefer to have them in XLA, where they can be fused.
if (auto* dnn = stream_executor->AsDnn()) {
auto version_status = dnn->GetVersion();
if (version_status.ok()) {
auto version = version_status.ConsumeValueOrDie();
if (std::make_tuple(version.major_version(), version.minor_version()) <=
std::make_tuple(7, 3) &&
instr->custom_call_target() == kCudnnConvBackwardInputCallTarget &&
window_util::HasStride(instr->window())) {
return kAllNCHW;
}
}
}
// For other Volta f16 convolutions, use NHWC.
return kAllNHWC;
}
// Adds layout constraints on the cudnn custom-call instruction. The layout
// constraints are represented in terms of minor_to_major fields of both
// operands and the output shape. Depending on the underlying algorithm, one of
// { NCHW, NHWC } ^ 3 = 8 different layout combinations may be chosen.
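// Note that HeuristicLayoutAssignment above currently returns only the three
// uniform combinations (all-NCHW, all-NCHW_VECT_C, or all-NHWC); mixed
// combinations are deliberately avoided for the reasons described there.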
Status GpuLayoutAssignment::AddBackendConstraintsToDnnConvCustomCall(
HloCustomCallInstruction* instr, LayoutConstraints* constraints) {
Shape lhs_shape = instr->operand(0)->shape();
Shape rhs_shape = instr->operand(1)->shape();
Shape result_shape = instr->shape().tuple_shapes(0);
Shape* input_shape;
Shape* filter_shape;
Shape* output_shape;
TF_ASSIGN_OR_RETURN(auto kind, GetCudnnConvKind(instr));
switch (kind) {
case CudnnConvKind::kForward:
case CudnnConvKind::kForwardActivation:
input_shape = &lhs_shape;
filter_shape = &rhs_shape;
output_shape = &result_shape;
break;
case CudnnConvKind::kBackwardInput:
input_shape = &result_shape;
filter_shape = &rhs_shape;
output_shape = &lhs_shape;
break;
case CudnnConvKind::kBackwardFilter:
input_shape = &lhs_shape;
filter_shape = &result_shape;
output_shape = &rhs_shape;
break;
}
{
DataLayout input;
FilterLayout filter;
DataLayout output;
std::tie(input, filter, output) =
HeuristicLayoutAssignment(instr, stream_executor_);
TF_ASSIGN_OR_RETURN(
std::tie(*input_shape->mutable_layout(),
*filter_shape->mutable_layout(),
*output_shape->mutable_layout()),
StreamExecutorConvLayoutsToXlaLayouts(
instr->convolution_dimension_numbers(), input, filter, output));
}
// The custom call returns a tuple of (actual_result, scratch_buffer);
// call_result_buf is the logical buffer for actual_result, the thing that
// contains the result of the conv call.
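// For instance, a forward conv might return a tuple shape along the lines of
// (f32[16,64,32,32], u8[<scratch bytes>]); only the element at tuple index
// {0} is constrained here.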
TF_ASSIGN_OR_RETURN(
const LogicalBuffer* call_result_buf,
points_to_analysis_->GetBufferDefinedAt(instr, /*index=*/{0}));
// Set layouts of the instructions' shapes.
TF_RETURN_IF_ERROR(SetOperandLayout(lhs_shape, instr, 0));
TF_RETURN_IF_ERROR(SetOperandLayout(rhs_shape, instr, 1));
TF_RETURN_IF_ERROR(SetBufferLayout(result_shape.layout(), *call_result_buf));
// instr->operand(2), if it exists, is the bias buffer. There is no need to
// assign a layout to it, as it has only one dimension.
// instr->operand(3), if it exists, is the side-input buffer.
if (instr->operand_count() == 4) {
if (kind != CudnnConvKind::kForwardActivation) {
return InternalError(
"Invalid convolution. Conv has a side input, but kind is not fused "
"conv forward: %s",
instr->ToString());
}
// The side input layout must match the output layout.
TF_RETURN_IF_ERROR(SetOperandLayout(*output_shape, instr, 3));
}
return Status::OK();
}
// Imposes the default layout on `shape`, but with the two minor-most entries
// of minor_to_major swapped, i.e. column-major (Fortran) order in the two
// trailing dimensions.
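// For example, for a rank-2 shape the default minor_to_major {1, 0} becomes
// {0, 1} (column-major), and for a rank-3 shape {2, 1, 0} becomes {1, 2, 0}
// (column-major in the two trailing dimensions, leading dimension still
// major).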
static void SetFortranLayout(Shape* shape) {
LayoutUtil::SetToDefaultLayout(shape);
int n = shape->mutable_layout()->minor_to_major_size();
CHECK_GE(n, 2);
std::swap(shape->mutable_layout()->mutable_minor_to_major()->at(0),
shape->mutable_layout()->mutable_minor_to_major()->at(1));
}
Status GpuLayoutAssignment::AddBackendConstraints(
LayoutConstraints* constraints) {
// Add convolution constraints in reverse postorder so that the earliest
// convolution's layout propagates first. This reduces the likelihood of
// fusion nodes with copies.
auto post_order = constraints->computation()->MakeInstructionPostOrder();
for (auto iterator = post_order.rbegin(); iterator != post_order.rend();
++iterator) {
HloInstruction* instruction = *iterator;
if (IsCustomCallToDnnConvolution(*instruction)) {
TF_RETURN_IF_ERROR(AddBackendConstraintsToDnnConvCustomCall(
Cast<HloCustomCallInstruction>(instruction), constraints));
}
CHECK(!IsCublasGemm(*instruction))
<< "Gemm rewriting should run after layout assignment";
// For unbatched S8xS8->S32 matrix multiplication, enforce a TN layout, which
// allows NVIDIA GPUs to use Tensor Cores.
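// Concretely, for a dot of s8[m, k] with s8[k, n] producing s32[m, n], the
// branch below assigns minor_to_major {1, 0} to the lhs, {0, 1} to the rhs,
// and {1, 0} to the output.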
if (IsMatrixMultiplication(*instruction)) {
Shape output_shape = instruction->shape();
Shape p1_shape = instruction->operand(0)->shape();
Shape p2_shape = instruction->operand(1)->shape();
if (output_shape.element_type() == PrimitiveType::S32 &&
p1_shape.element_type() == PrimitiveType::S8 &&
p2_shape.element_type() == PrimitiveType::S8 &&
output_shape.dimensions_size() == 2 &&
p1_shape.dimensions_size() == 2 && p2_shape.dimensions_size() == 2) {
LayoutUtil::SetToDefaultLayout(&p1_shape);
SetFortranLayout(&p2_shape);
LayoutUtil::SetToDefaultLayout(&output_shape);
TF_RETURN_IF_ERROR(SetOperandLayout(p1_shape, instruction, 0));
TF_RETURN_IF_ERROR(SetOperandLayout(p2_shape, instruction, 1));
TF_RETURN_IF_ERROR(SetInstructionLayout(output_shape, instruction));
continue;
}
}
// For batched dot we require the default layout.
// TODO(b/112111608): This is overly conservative, the only real restriction
// is that batch dimensions must be major.
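// For example, a batched dot producing f32[b, m, n] with batch dimension 0
// satisfies this under the default layout {2, 1, 0}, since dimension 0 is
// major-most.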
if (IsMatrixMultiplication(*instruction) &&
instruction->dot_dimension_numbers().lhs_batch_dimensions_size() > 0) {
// Verify that the batch dims come before the row and col dims.
DotDimensionNumbers dim_nums = instruction->dot_dimension_numbers();
CHECK_EQ(dim_nums.lhs_batch_dimensions_size(),
dim_nums.rhs_batch_dimensions_size());
CHECK_EQ(dim_nums.lhs_batch_dimensions_size() + 2,
instruction->shape().rank());
for (int64_t batch_dim : dim_nums.lhs_batch_dimensions()) {
CHECK_LT(batch_dim, instruction->shape().rank() - 2);
}
// Set both inputs and the output to default layout.
Shape op0_shape = instruction->operand(0)->shape();
LayoutUtil::SetToDefaultLayout(&op0_shape);
Shape op1_shape = instruction->operand(1)->shape();
LayoutUtil::SetToDefaultLayout(&op1_shape);
Shape output_shape = instruction->shape();
LayoutUtil::SetToDefaultLayout(&output_shape);
TF_RETURN_IF_ERROR(SetOperandLayout(op0_shape, instruction, 0));
TF_RETURN_IF_ERROR(SetOperandLayout(op1_shape, instruction, 1));
TF_RETURN_IF_ERROR(SetInstructionLayout(output_shape, instruction));
} else if (instruction->opcode() == HloOpcode::kFft) {
// cuFFT requires a dim0 major layout.
Shape op0_shape = instruction->operand(0)->shape();
LayoutUtil::SetToDefaultLayout(&op0_shape);
Shape output_shape = instruction->shape();
LayoutUtil::SetToDefaultLayout(&output_shape);
TF_RETURN_IF_ERROR(SetOperandLayout(op0_shape, instruction, 0));
TF_RETURN_IF_ERROR(SetInstructionLayout(output_shape, instruction));
} else if (instruction->opcode() == HloOpcode::kSort &&
instruction->operand(0)->shape().rank() > 1) {
// Make sure that all the operands and the output(s) have the same layout.
Shape keys_shape = instruction->operand(0)->shape();
Layout keys_layout =
LayoutUtil::GetDefaultLayoutForRank(keys_shape.rank());
for (int64_t i = 0; i < instruction->operand_count(); ++i) {
Shape shape = instruction->operand(i)->shape();
*shape.mutable_layout() = keys_layout;
TF_RETURN_IF_ERROR(SetOperandLayout(shape, instruction, i));
const LogicalBuffer* output_buffer;
if (instruction->shape().IsArray()) {
TF_ASSIGN_OR_RETURN(
output_buffer,
points_to_analysis_->GetBufferDefinedAt(instruction, {}));
} else {
TF_ASSIGN_OR_RETURN(
output_buffer,
points_to_analysis_->GetBufferDefinedAt(instruction, {i}));
}
TF_RETURN_IF_ERROR(SetBufferLayout(keys_layout, *output_buffer));
}
} else if (instruction->opcode() == HloOpcode::kTriangularSolve) {
// TODO(phawkins): Ideally we would relax this constraint. What we
// actually want is that:
// a) the batch dimensions are major, in no particular order.
// b) the two minor dimensions are in fortran (column-major) order,
// although for the 'a' argument we could potentially accept row-major
// order and fold the transpose into the operator.
Shape op0_shape = instruction->operand(0)->shape();
Shape op1_shape = instruction->operand(1)->shape();
Shape output_shape = instruction->shape();
SetFortranLayout(&op0_shape);
SetFortranLayout(&op1_shape);
SetFortranLayout(&output_shape);
TF_RETURN_IF_ERROR(SetOperandLayout(op0_shape, instruction, 0));
TF_RETURN_IF_ERROR(SetOperandLayout(op1_shape, instruction, 1));
TF_RETURN_IF_ERROR(SetInstructionLayout(output_shape, instruction));
} else if (instruction->opcode() == HloOpcode::kReduceScatter) {
// XLA:GPU can only support reduce-scatter where the scatter dimension
// is the most major dimension in the layout.
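// E.g., a reduce-scatter producing f32[8, 16] with scatter_dimension 1 is
// constrained to layout {0, 1}, making dimension 1 the major-most.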
auto ars = Cast<HloReduceScatterInstruction>(instruction);
TF_RETURN_IF_ERROR(SetInstructionLayout(
ShapeUtil::MoveDimToMajor(ars->shape(), ars->scatter_dimension()),
ars));
} else if (instruction->opcode() == HloOpcode::kAllGather) {
// XLA:GPU can only support all-gathers where the gather dimension is the
// most major dimension in the layout.
auto ag = Cast<HloAllGatherInstruction>(instruction);
TF_RETURN_IF_ERROR(SetInstructionLayout(
ShapeUtil::MoveDimToMajor(ag->shape(), ag->all_gather_dimension()),
ag));
} else if (instruction->opcode() == HloOpcode::kAllToAll &&
instruction->shape().IsArray()) {
// XLA:GPU can only support all-to-all with split dimensions where the
// split dimension is the most major dimension in the layout.
auto* all_to_all = Cast<HloAllToAllInstruction>(instruction);
TF_RETURN_IF_ERROR(SetInstructionLayout(
ShapeUtil::MoveDimToMajor(all_to_all->shape(),
*all_to_all->split_dimension()),
all_to_all));
}
}
return Status::OK();
}
} // namespace gpu
} // namespace xla