/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/compiler/xla/service/gpu/gemm_algorithm_picker.h"
#include <limits>
#include <string>
#include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h"
#include "tensorflow/compiler/xla/service/gpu/buffer_comparator.h"
#include "tensorflow/compiler/xla/service/gpu/gemm_thunk.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_asm_opts_util.h"
#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
#include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h"
#include "tensorflow/compiler/xla/service/hlo_computation.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
#include "tensorflow/compiler/xla/service/hlo_instructions.h"
#include "tensorflow/compiler/xla/service/hlo_opcode.h"
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/platform/logger.h"
#include "tensorflow/core/protobuf/autotuning.pb.h"
#include "tensorflow/core/util/proto/proto_utils.h"
#include "tensorflow/stream_executor/blas.h"
#include "tensorflow/stream_executor/device_memory.h"
#include "tensorflow/stream_executor/device_memory_allocator.h"
#include "tensorflow/stream_executor/gpu/redzone_allocator.h"
namespace xla {
namespace gpu {
using tensorflow::AutotuneResult;
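// Process-wide cache of gemm autotuning results, keyed on the stream executor
// and the lhs/rhs/output shapes plus the serialized GemmBackendConfig. A
// cached value of absl::nullopt means autotuning ran but no algorithm was
// selected.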
using GemmCacheKey =
std::tuple<se::StreamExecutor*, Shape, Shape, Shape, std::string>;
static absl::Mutex autotune_cache_mu(absl::kConstInit);
static auto& autotune_cache ABSL_GUARDED_BY(autotune_cache_mu) =
*new absl::flat_hash_map<GemmCacheKey,
absl::optional<se::blas::AlgorithmType>>();
static int64_t cache_hits ABSL_GUARDED_BY(autotune_cache_mu) = 0;
static int64_t cache_misses ABSL_GUARDED_BY(autotune_cache_mu) = 0;
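// Maps an XLA PrimitiveType to the corresponding tensorflow::DataType, or
// returns an error if there is no equivalent.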
StatusOr<tensorflow::DataType> EncodePrimitiveTypeAsDataType(
PrimitiveType type) {
switch (type) {
case PRED:
return tensorflow::DT_BOOL;
case BF16:
return tensorflow::DT_BFLOAT16;
case F16:
return tensorflow::DT_HALF;
case F32:
return tensorflow::DT_FLOAT;
case F64:
return tensorflow::DT_DOUBLE;
case C64:
return tensorflow::DT_COMPLEX64;
case C128:
return tensorflow::DT_COMPLEX128;
case S8:
return tensorflow::DT_INT8;
case S16:
return tensorflow::DT_INT16;
case S32:
return tensorflow::DT_INT32;
case S64:
return tensorflow::DT_INT64;
case U8:
return tensorflow::DT_UINT8;
case U16:
return tensorflow::DT_UINT16;
case U32:
return tensorflow::DT_UINT32;
case U64:
return tensorflow::DT_UINT64;
default:
return InternalError("Unsupported type in EncodePrimitiveAsDataType.");
}
}
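// Autotunes a cuBLASLt (blas-plans) matmul: profiles each candidate algorithm
// for the plan, optionally checks redzones, and caches the index of the
// fastest valid algorithm in the BlasPlansAutotuneCacheSingleton, keyed by the
// BatchMatmulParameters. If the parameters are already cached, does nothing.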
Status DoBlasPlansAutotune(se::Stream* stream, const HloInstruction* instr,
se::DeviceMemoryAllocator* allocator,
const GemmBackendConfig& gemm_config) {
TF_ASSIGN_OR_RETURN(GemmConfig config, GemmConfig::For(instr));
const HloModuleConfig& hlo_module_config = instr->GetModule()->config();
const int32_t cublas_autotune_level =
hlo_module_config.debug_options().xla_gpu_autotune_level();
const bool init_cublas_data = cublas_autotune_level >= 2;
const bool reinit_cublas_data = cublas_autotune_level >= 3;
const bool check_cublas = cublas_autotune_level >= 4;
const bool crash_on_checking_failure =
hlo_module_config.debug_options()
.xla_gpu_crash_on_verification_failures();
const int64_t redzone_size =
check_cublas ? se::RedzoneAllocator::kDefaultRedzoneSize : 0;
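  // Allocate all buffers through a redzone allocator so that out-of-bounds
  // writes by a candidate algorithm can be detected (autotune level >= 4).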
se::RedzoneAllocator input_output_allocator(
stream, allocator,
PtxOptsFromDebugOptions(hlo_module_config.debug_options()),
/*memory_limit=*/std::numeric_limits<int64_t>::max(),
/*redzone_size=*/redzone_size);
int64_t rng_state = 0;
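  // Allocates a buffer for `op`'s shape and, at autotune level >= 2, fills it
  // with pseudo-random data so every candidate algorithm sees the same inputs.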
auto get_initialized_buffer =
[&](const HloInstruction* op) -> StatusOr<se::DeviceMemoryBase> {
TF_ASSIGN_OR_RETURN(se::DeviceMemoryBase buffer,
input_output_allocator.AllocateBytes(
ShapeUtil::ByteSizeOf(op->shape())));
if (init_cublas_data) {
InitializeBuffer(stream, op->shape().element_type(), &rng_state, buffer);
}
return buffer;
};
TF_ASSIGN_OR_RETURN(se::DeviceMemoryBase lhs_buffer,
get_initialized_buffer(instr->operand(0)));
TF_ASSIGN_OR_RETURN(se::DeviceMemoryBase rhs_buffer,
get_initialized_buffer(instr->operand(1)));
TF_ASSIGN_OR_RETURN(se::DeviceMemoryBase output_buffer,
get_initialized_buffer(instr));
se::blas::MatrixDescriptor lhs = GetMatrixDesc(config.lhs_layout, lhs_buffer);
se::blas::MatrixDescriptor rhs = GetMatrixDesc(config.rhs_layout, rhs_buffer);
se::blas::MatrixDescriptor output =
GetMatrixDesc(config.output_layout, output_buffer);
int64_t batch_size = config.output_layout.batch_size;
// TODO(cjfj): Support transposed output when using cuBLASLt.
MakeBlasGemmCompatible(lhs, rhs, output);
TF_ASSIGN_OR_RETURN(
tensorflow::DataType dtype,
EncodePrimitiveTypeAsDataType(config.output_layout.dtype));
int device_id = stream->parent()->device_ordinal();
bool trans_x = lhs.transpose == se::blas::Transpose::kTranspose;
bool trans_y = rhs.transpose == se::blas::Transpose::kTranspose;
int64_t m = output.num_rows;
int64_t n = output.num_cols;
int64_t k = lhs.reduced_dim();
bool broadcast = batch_size == 1;
VLOG(4) << "matmul params: trans_x " << trans_x << " trans_y " << trans_y
<< " adj_x " << false << " adj_y " << false << " m " << m << " n "
<< n << " k " << k << " batch_size " << batch_size << " broadcast "
<< broadcast << " broadcast " << broadcast << " dtype " << dtype
<< " device_id " << device_id;
se::BatchMatmulParameters matmul_parameters(
trans_x, trans_y, false, false, m, n, k, batch_size,
/*broadcast_a*/ broadcast, /*broadcast_b*/ broadcast, dtype, dtype,
device_id);
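  // Query StreamExecutor for the cuBLASLt matmul plan and the candidate
  // algorithms available for these parameters.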
TF_ASSIGN_OR_RETURN(
const se::blas::PlanAndAlgorithms* plan_and_algorithms,
se::GetPlanAndAlgorithms(stream, matmul_parameters, batch_size, dtype,
lhs, rhs, output));
const std::vector<std::unique_ptr<se::blas::IBlasLtMatmulAlgorithm>>&
algorithms = plan_and_algorithms->algorithms;
// Note that algorithm_config.algorithm() here is used to refer
// to the index within the algorithms vector, not the algorithm
// itself.
se::blas::AlgorithmConfig algorithm_config(se::blas::kNoAlgorithm);
if (!BlasPlansAutotuneCacheSingleton::GetInstance()->Find(
matmul_parameters, &algorithm_config)) {
VLOG(4) << "Autotuning BlasLtMatmul over " << algorithms.size()
<< " algorithms.";
se::blas::ProfileResult best_result;
se::blas::ProfileResult profile_result;
for (size_t i = 0; i != algorithms.size(); ++i) {
// Create a new scratch allocator with every autotuning run so that
// scratch space is deallocated between runs.
se::OwningScratchAllocator<> scratch_allocator(device_id, allocator);
// Make sure the output buffer always has the same value if we use
// the bias parameter.
if (reinit_cublas_data && gemm_config.beta() != 0) {
int64_t rng_state = 0;
InitializeBuffer(stream, instr->shape().element_type(), &rng_state,
output_buffer);
}
TF_RETURN_IF_ERROR(
RunGemm(config, lhs_buffer, rhs_buffer, output_buffer, stream,
/*scratch allocator=*/&scratch_allocator,
/*algorithm_being_profiled=*/algorithms[i].get(),
/*profile_result=*/&profile_result, absl::nullopt));
VLOG(4) << " Autotune algorithm " << i
<< " result: " << profile_result.elapsed_time_in_ms()
<< " ms, valid=" << profile_result.is_valid();
if (profile_result.is_valid() && profile_result.elapsed_time_in_ms() <
best_result.elapsed_time_in_ms()) {
best_result = profile_result;
}
if (!check_cublas) {
continue;
}
TF_ASSIGN_OR_RETURN(
se::RedzoneAllocator::RedzoneCheckStatus rz_check_status,
input_output_allocator.CheckRedzones());
if (!rz_check_status.ok()) {
LOG(ERROR) << "Detected cuBLASLT out-of-bounds write in gemm buffer";
CHECK(!crash_on_checking_failure);
continue;
}
}
if (best_result.is_valid()) {
algorithm_config.set_algorithm(best_result.algorithm());
}
se::blas::AlgorithmType algorithm_idx = algorithm_config.algorithm();
CHECK(algorithm_idx >= 0 && algorithm_idx < algorithms.size())
<< "Missing/invalid BatchMatmul algorithm";
    // We make sure that each matmul parameter set only gets one pass of
    // autotuning. If no algorithm works, we add kNoAlgorithm to the autotune
    // map.
VLOG(4) << "Inserting algorithm id " << algorithm_config.algorithm()
<< " for " << trans_x << " " << trans_y << " " << m << " " << n
<< " " << k << " " << batch_size << " " << broadcast << " "
<< broadcast << " " << dtype << " " << device_id;
BlasPlansAutotuneCacheSingleton::GetInstance()->Insert(matmul_parameters,
algorithm_config);
}
return Status::OK();
}
// Experimentally tries to pick the best algorithm for the given gemm.
//
// This may fail under perfectly normal circumstances. In particular, it will
// fail if the program was built with < CUDA 8 or if we're using a gpu older
// than sm_50 -- in both cases, cublas doesn't support gemm-with-algorithm at
// all.
static StatusOr<absl::optional<se::blas::AlgorithmType>> DoUncachedGemmAutotune(
const HloInstruction* gemm, se::Stream* stream,
se::DeviceMemoryAllocator* allocator) {
if (!stream->parent()->SynchronizeAllActivity()) {
return InternalError("Failed to synchronize GPU for autotuning.");
}
const HloModuleConfig& hlo_module_config = gemm->GetModule()->config();
const int32_t cublas_autotune_level =
hlo_module_config.debug_options().xla_gpu_autotune_level();
const bool init_cublas_data = cublas_autotune_level >= 2;
const bool reinit_cublas_data = cublas_autotune_level >= 3;
const bool check_cublas = cublas_autotune_level >= 4;
const int64_t redzone_size =
check_cublas ? se::RedzoneAllocator::kDefaultRedzoneSize : 0;
se::RedzoneAllocator input_output_allocator(
stream, allocator,
PtxOptsFromDebugOptions(hlo_module_config.debug_options()),
/*memory_limit=*/std::numeric_limits<int64_t>::max(),
/*redzone_size=*/redzone_size);
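  // Compares each algorithm's output against the output of the first valid
  // algorithm to catch numerically wrong results.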
BufferComparator comparator(gemm->shape(), hlo_module_config);
int64_t rng_state = 0;
auto get_initialized_buffer =
[&](const HloInstruction* op) -> StatusOr<se::DeviceMemoryBase> {
TF_ASSIGN_OR_RETURN(se::DeviceMemoryBase buffer,
input_output_allocator.AllocateBytes(
ShapeUtil::ByteSizeOf(op->shape())));
if (init_cublas_data) {
InitializeBuffer(stream, op->shape().element_type(), &rng_state, buffer);
}
return buffer;
};
TF_ASSIGN_OR_RETURN(se::DeviceMemoryBase lhs_buffer,
get_initialized_buffer(gemm->operand(0)));
TF_ASSIGN_OR_RETURN(se::DeviceMemoryBase rhs_buffer,
get_initialized_buffer(gemm->operand(1)));
TF_ASSIGN_OR_RETURN(se::DeviceMemoryBase output_buffer,
get_initialized_buffer(gemm));
TF_ASSIGN_OR_RETURN(se::DeviceMemoryBase reference_result_buffer,
get_initialized_buffer(gemm));
const DebugOptions& debug_options =
gemm->GetModule()->config().debug_options();
const bool crash_on_checking_failure =
debug_options.xla_gpu_crash_on_verification_failures();
GemmBackendConfig backend_config =
gemm->backend_config<GemmBackendConfig>().ValueOrDie();
std::vector<se::blas::AlgorithmType> algorithms;
CHECK(stream->parent()->GetBlasGemmAlgorithms(&algorithms));
absl::optional<se::blas::AlgorithmType> first_algorithm;
std::vector<AutotuneResult> profile_results;
TF_ASSIGN_OR_RETURN(GemmConfig config, GemmConfig::For(gemm));
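  // Profile every cuBLAS gemm algorithm. Runs reported as invalid are skipped;
  // the rest are recorded as AutotuneResult protos and, when checking is
  // enabled, verified against redzones and the reference output.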
for (se::blas::AlgorithmType algorithm : algorithms) {
// Make sure the output buffer always has the same value if we use
// the bias parameter.
if (reinit_cublas_data && backend_config.beta() != 0) {
int64_t rng_state = 0;
InitializeBuffer(stream, gemm->shape().element_type(), &rng_state,
output_buffer);
}
se::blas::ProfileResult profile_result;
    // We expect GemmWithAlgorithm to fail sometimes -- in fact, it will fail
    // for all algorithms if we're targeting < sm_50. But because we pass a
    // non-null ProfileResult, DoGemmWithAlgorithm should always return true,
    // and whether the run actually succeeded is reported in
    // ProfileResult::is_valid.
Status st = RunGemm(config, lhs_buffer, rhs_buffer, output_buffer, stream,
                        /*scratch_allocator=*/nullptr,
                        /*algorithm_being_profiled=*/nullptr,
/*profile_result=*/&profile_result, algorithm);
CHECK(st.ok()) << st.ToString();
if (!profile_result.is_valid()) {
// Unsupported algorithm.
continue;
}
profile_results.emplace_back();
AutotuneResult& result = profile_results.back();
result.mutable_gemm()->set_algorithm(algorithm);
VLOG(2) << "cublas gemm algorithm " << algorithm << " took "
<< profile_result.elapsed_time_in_ms() << "ms" << std::endl;
*result.mutable_run_time() = tensorflow::proto_utils::ToDurationProto(
absl::Milliseconds(profile_result.elapsed_time_in_ms()));
if (!check_cublas) {
continue;
}
TF_ASSIGN_OR_RETURN(
se::RedzoneAllocator::RedzoneCheckStatus rz_check_status,
input_output_allocator.CheckRedzones());
if (!rz_check_status.ok()) {
result.mutable_failure()->set_kind(AutotuneResult::REDZONE_MODIFIED);
*result.mutable_failure()->mutable_msg() =
rz_check_status.RedzoneFailureMsg();
LOG(ERROR) << "Detected cuBLAS out-of-bounds write in gemm buffer";
CHECK(!crash_on_checking_failure);
continue;
}
if (!first_algorithm) {
// First run: set the reference result buffer.
CHECK(reference_result_buffer.size() == output_buffer.size());
stream->ThenMemcpy(&reference_result_buffer, output_buffer,
output_buffer.size());
first_algorithm.emplace(algorithm);
} else {
// Perform the comparison.
TF_ASSIGN_OR_RETURN(bool compare_result,
comparator.CompareEqual(stream, output_buffer,
reference_result_buffer));
if (!compare_result) {
LOG(ERROR) << "Results mismatch between different GEMM algorithms. "
<< "This is likely a bug/unexpected loss of precision "
<< "in cuBLAS.";
CHECK(!crash_on_checking_failure);
result.mutable_failure()->set_kind(AutotuneResult::WRONG_RESULT);
result.mutable_failure()->mutable_reference_gemm()->set_algorithm(
*first_algorithm);
}
}
}
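  // Log all profiling results, then pick the best one. If no algorithm
  // qualifies, return nullopt so the caller falls back to running the gemm
  // without an explicitly selected algorithm.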
tensorflow::AutotuningLog log;
for (const AutotuneResult& profile : profile_results) {
*log.add_results() = profile;
}
if (!crash_on_checking_failure) {
tensorflow::Logger::GetSingleton()->LogProto(log);
}
StatusOr<AutotuneResult> autotune_result =
PickBestResult(profile_results, *gemm);
if (!autotune_result.ok()) {
LOG(WARNING) << "Failed to find best cuBLAS algorithm, GEMM performance "
"might be suboptimal: "
<< autotune_result.status();
return {absl::nullopt};
}
return {autotune_result->gemm().algorithm()};
}
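// Picks an algorithm for `instr`. If cuBLASLt plans are enabled and supported,
// this delegates to DoBlasPlansAutotune (which maintains its own cache) and
// returns kNoAlgorithm; otherwise it consults the in-process autotune cache
// and calls DoUncachedGemmAutotune on a miss.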
static StatusOr<absl::optional<se::blas::AlgorithmType>> DoGemmAutotune(
const HloInstruction* instr, const GemmBackendConfig& gemm_config,
se::DeviceMemoryAllocator* allocator, se::Stream* stream) {
VLOG(3) << "Starting autotune of GemmThunk " << instr->ToString();
const HloInstruction* lhs = instr->operand(0);
const HloInstruction* rhs = instr->operand(1);
TF_ASSIGN_OR_RETURN(GemmConfig config, GemmConfig::For(instr));
// Don't run autotuning concurrently on the same GPU.
absl::MutexLock gpu_lock(&GetGpuMutex(stream->parent()));
if (config.use_cublaslt && stream->parent()->SupportsBlasPlans()) {
TF_RETURN_IF_ERROR(
DoBlasPlansAutotune(stream, instr, allocator, gemm_config));
return {se::blas::kNoAlgorithm};
} else {
GemmCacheKey key =
std::make_tuple(stream->parent(), lhs->shape(), rhs->shape(),
instr->shape(), gemm_config.SerializeAsString());
absl::MutexLock cache_lock(&autotune_cache_mu);
auto it = autotune_cache.find(key);
int64_t autotuning_requests = cache_hits + cache_misses;
if (autotuning_requests && autotuning_requests % 10 == 0) {
VLOG(2) << "Autotuning cache hits/(hits + misses): " << cache_hits << "/"
<< autotuning_requests;
}
if (it != autotune_cache.end()) {
cache_hits++;
VLOG(4) << "Autotuning cache hit, using algorithm: "
<< (it->second.has_value() ? absl::StrCat(*(it->second))
: "<generic>");
return it->second;
}
cache_misses++;
VLOG(4) << "Autotuning cache miss";
// Make sure any previous activity on this executor is done. We don't want
// other work still running on the GPU to interfere with autotuning.
if (!stream->parent()->SynchronizeAllActivity()) {
auto options = HloPrintOptions::Canonical();
options.set_print_backend_config(true);
return InternalError(
"Failed to synchronize GPU for autotuning gemm instruction: %s",
instr->ToString(options));
}
TF_ASSIGN_OR_RETURN(absl::optional<se::blas::AlgorithmType> result,
DoUncachedGemmAutotune(instr, stream, allocator));
CHECK(autotune_cache.emplace(key, result).second);
return result;
}
}
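// Autotunes a single cuBLAS gemm custom call and writes the selected algorithm
// (if any) into its GemmBackendConfig. Returns true if the config changed.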
static StatusOr<bool> RunOnInstruction(HloInstruction* instr,
se::StreamExecutor* executor,
se::DeviceMemoryAllocator* allocator) {
if (allocator == nullptr) {
allocator = executor->GetAllocator();
}
TF_ASSIGN_OR_RETURN(se::Stream* const stream,
allocator->GetStream(executor->device_ordinal()));
GemmBackendConfig gemm_config =
instr->backend_config<GemmBackendConfig>().ValueOrDie();
TF_ASSIGN_OR_RETURN(absl::optional<se::blas::AlgorithmType> gemm_algorithm,
DoGemmAutotune(instr, gemm_config, allocator, stream));
// We update instruction->backend_config(); if no algorithms are supported,
// a different API is used, which does not require specifying an algorithm.
GemmBackendConfig updated_config = gemm_config;
if (gemm_algorithm) {
VLOG(4) << "GEMM autotuning picked algorithm " << *gemm_algorithm << " for "
<< instr->name();
updated_config.set_selected_algorithm(*gemm_algorithm);
}
TF_RETURN_IF_ERROR(instr->set_backend_config(updated_config));
return updated_config.SerializeAsString() != gemm_config.SerializeAsString();
}
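// Runs gemm autotuning on every cuBLAS gemm custom call in `computation`.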
static StatusOr<bool> RunOnComputation(HloComputation* computation,
se::StreamExecutor* se,
se::DeviceMemoryAllocator* allocator) {
bool changed = false;
for (HloInstruction* instr : computation->instructions()) {
if (IsCublasGemm(*instr)) {
TF_ASSIGN_OR_RETURN(bool result, RunOnInstruction(instr, se, allocator));
changed |= result;
}
}
return changed;
}
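// Pass entry point. Autotuning is skipped entirely when
// xla_gpu_autotune_level is 0.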
StatusOr<bool> GemmAlgorithmPicker::Run(HloModule* module) {
XLA_SCOPED_LOGGING_TIMER("GemmAlgorithmPicker");
if (module->config().debug_options().xla_gpu_autotune_level() == 0) {
VLOG(2) << "GEMM auto-tuning disabled, GemmAlgorithmPicker returning early";
return false;
}
bool changed = false;
for (HloComputation* computation : module->MakeNonfusionComputations()) {
TF_ASSIGN_OR_RETURN(
bool result, RunOnComputation(computation, stream_exec_, allocator_));
changed |= result;
}
return changed;
}
} // namespace gpu
} // namespace xla