| /* Copyright 2019 The TensorFlow Authors. All Rights Reserved. |
| |
| Licensed under the Apache License, Version 2.0 (the "License"); |
| you may not use this file except in compliance with the License. |
| You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| ==============================================================================*/ |
| |
| #include "tensorflow/compiler/xla/service/gpu/gemm_algorithm_picker.h" |
| |
| #include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h" |
| #include "tensorflow/compiler/xla/service/gpu/buffer_comparator.h" |
| #include "tensorflow/compiler/xla/service/gpu/gemm_thunk.h" |
| #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" |
| #include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h" |
| #include "tensorflow/compiler/xla/service/hlo_computation.h" |
| #include "tensorflow/compiler/xla/service/hlo_instruction.h" |
| #include "tensorflow/compiler/xla/service/hlo_instructions.h" |
| #include "tensorflow/compiler/xla/service/hlo_opcode.h" |
| #include "tensorflow/compiler/xla/util.h" |
| #include "tensorflow/core/lib/core/errors.h" |
| #include "tensorflow/core/platform/logger.h" |
| #include "tensorflow/core/protobuf/autotuning.pb.h" |
| #include "tensorflow/core/util/proto/proto_utils.h" |
| #include "tensorflow/stream_executor/blas.h" |
| #include "tensorflow/stream_executor/cuda/redzone_allocator.h" |
| #include "tensorflow/stream_executor/device_memory.h" |
| #include "tensorflow/stream_executor/device_memory_allocator.h" |
| |
| namespace xla { |
| namespace gpu { |
| |
| using tensorflow::AutotuneResult; |
| |
| using GemmCacheKey = |
| std::tuple<se::StreamExecutor*, Shape, Shape, Shape, std::string>; |
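| // The cache key identifies a gemm by the executor it runs on, the lhs, rhs, |
| // and output shapes, and the serialized GemmBackendConfig. The cached value |
| // is the chosen algorithm, or absl::nullopt when autotuning picked none and |
| // the generic (algorithm-less) cuBLAS path should be used. |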
| |
| static tensorflow::mutex autotune_cache_mu(tensorflow::LINKER_INITIALIZED); |
| static auto& autotune_cache GUARDED_BY(autotune_cache_mu) = |
| *new absl::flat_hash_map<GemmCacheKey, |
| absl::optional<se::blas::AlgorithmType>>(); |
| static int64 cache_hits GUARDED_BY(autotune_cache_mu) = 0; |
| static int64 cache_misses GUARDED_BY(autotune_cache_mu) = 0; |
| |
| // Experimentally tries to pick the best algorithm for the given gemm. |
| // |
| // This may fail under perfectly normal circumstances. In particular, it will |
| // fail if the program was built with CUDA older than 8.0, or if we're running |
| // on a GPU older than sm_50 -- in both cases, cuBLAS doesn't support |
| // gemm-with-algorithm at all. Returns an empty optional if no algorithm ran |
| // successfully; callers then fall back to the algorithm-less cuBLAS API. |
| static StatusOr<absl::optional<se::blas::AlgorithmType>> DoUncachedGemmAutotune( |
| const HloInstruction* gemm, se::DeviceMemoryBase lhs_buffer, |
| se::DeviceMemoryBase rhs_buffer, se::DeviceMemoryBase output_buffer, |
| se::DeviceMemoryBase reference_result_buffer, se::Stream* stream, |
| const se::cuda::RedzoneAllocator& allocator, |
| const BufferComparator& comparator, bool crash_on_checking_failure) { |
| if (!stream->parent()->SynchronizeAllActivity()) { |
| return InternalError("Failed to synchronize GPU for autotuning."); |
| } |
| |
| VLOG(3) << "Starting autotune of GemmThunk " << gemm->ToString(); |
| |
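| // Ask the StreamExecutor's BLAS layer for every candidate algorithm ID; |
| // each one is profiled in the loop below. |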
| std::vector<se::blas::AlgorithmType> algorithms; |
| CHECK(stream->parent()->GetBlasGemmAlgorithms(&algorithms)); |
| |
| absl::optional<se::blas::AlgorithmType> first_algorithm; |
| std::vector<AutotuneResult> profile_results; |
| |
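| // The first algorithm that produces a valid result becomes the reference: |
| // its output is copied into reference_result_buffer, and every later |
| // algorithm's output is compared against it for correctness. |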
| for (se::blas::AlgorithmType algorithm : algorithms) { |
| // When the bias parameter is in use (beta != 0), the output buffer is also |
| // an input. Reinitialize it before each run so that every algorithm starts |
| // from the same output values. |
| if (gemm->backend_config<GemmBackendConfig>().ValueOrDie().beta() != 0) { |
| int64 rng_state = 0; |
| InitializeFloatBuffer(stream, gemm->shape().element_type(), &rng_state, |
| output_buffer); |
| } |
| se::blas::ProfileResult profile_result; |
| |
| // We expect GemmWithAlgorithm to fail sometimes -- in fact, it will fail |
| // for all algorithms if we're targeting < sm_50. But because we pass a |
| // non-null ProfileResult, RunGemm should always return an OK status, and |
| // the actual success is reported in ProfileResult::is_valid. |
| CHECK(RunGemm(gemm, lhs_buffer, rhs_buffer, output_buffer, stream, |
| /*implements_whole_instruction=*/true, |
| /*profiler=*/nullptr, |
| /*profile_result=*/&profile_result, algorithm) |
| .ok()); |
| |
| if (!profile_result.is_valid()) { |
| // Unsupported algorithm. |
| continue; |
| } |
| |
| profile_results.emplace_back(); |
| AutotuneResult& result = profile_results.back(); |
| result.mutable_gemm()->set_algorithm(algorithm); |
| |
| VLOG(2) << "cublas gemm algorithm " << algorithm << " took " |
| << profile_result.elapsed_time_in_ms() << "ms"; |
| |
| *result.mutable_run_time() = tensorflow::proto_utils::ToDurationProto( |
| absl::Milliseconds(profile_result.elapsed_time_in_ms())); |
| |
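| // The RedzoneAllocator pads every allocation with guard bytes on both |
| // sides; CheckRedzones re-reads those bytes and flags any algorithm that |
| // wrote outside its output buffer. |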
| TF_ASSIGN_OR_RETURN( |
| se::cuda::RedzoneAllocator::RedzoneCheckStatus rz_check_status, |
| allocator.CheckRedzones(stream)); |
| if (!rz_check_status.ok()) { |
| result.mutable_failure()->set_kind(AutotuneResult::REDZONE_MODIFIED); |
| *result.mutable_failure()->mutable_msg() = |
| rz_check_status.RedzoneFailureMsg(); |
| LOG(ERROR) << "Detected cuBLAS out-of-bounds write in gemm buffer"; |
| CHECK(!crash_on_checking_failure); |
| continue; |
| } |
| |
| if (!first_algorithm) { |
| // First run: set the reference result buffer. |
| CHECK(reference_result_buffer.size() == output_buffer.size()); |
| stream->ThenMemcpy(&reference_result_buffer, output_buffer, |
| output_buffer.size()); |
| first_algorithm.emplace(algorithm); |
| } else { |
| // Perform the comparison. |
| TF_ASSIGN_OR_RETURN(bool compare_result, |
| comparator.CompareEqual(stream, output_buffer, |
| reference_result_buffer)); |
| if (!compare_result) { |
| LOG(ERROR) << "Results mismatch between different GEMM algorithms. " |
| << "This is likely a bug/unexpected loss of precision " |
| << "in cuBLAS."; |
| CHECK(!crash_on_checking_failure); |
| |
| result.mutable_failure()->set_kind(AutotuneResult::WRONG_RESULT); |
| result.mutable_failure()->mutable_reference_gemm()->set_algorithm( |
| *first_algorithm); |
| } |
| } |
| } |
| |
| tensorflow::AutotuningLog log; |
| for (const AutotuneResult& profile : profile_results) { |
| *log.add_results() = profile; |
| } |
| if (!crash_on_checking_failure) { |
| tensorflow::Logger::GetSingleton()->LogProto(log); |
| } |
| |
| // Choose the fastest correct GEMM, but tolerate WRONG_RESULT failures: the |
| // reference result is chosen arbitrarily, so a mismatch does not tell us |
| // which algorithm is actually at fault. |
| auto has_failure = [](const AutotuneResult& r) { |
| return r.has_failure() && |
| r.failure().kind() != AutotuneResult::WRONG_RESULT; |
| }; |
| |
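| // Sort by (has_failure, run_time): std::tuple comparison is lexicographic |
| // and false < true, so failure-free results always order before failing |
| // ones, with ties broken by the faster run time. |
| // e.g. (false, 1.2ms) < (false, 3.4ms) < (true, 0.5ms). |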
| auto result_comparison_key = [&has_failure](const AutotuneResult& r) { |
| return std::make_tuple( |
| has_failure(r), |
| tensorflow::proto_utils::FromDurationProto(r.run_time())); |
| }; |
| const auto& best_result = absl::c_min_element( |
| profile_results, |
| [&](const AutotuneResult& lhs, const AutotuneResult& rhs) { |
| return result_comparison_key(lhs) < result_comparison_key(rhs); |
| }); |
| |
| if (best_result != profile_results.end() && !has_failure(*best_result)) { |
| return {best_result->gemm().algorithm()}; |
| } |
| |
| VLOG(1) << "Unable to autotune cuBLAS gemm on stream " << stream |
| << ": none of the " << algorithms.size() << " algorithms ran successfully"; |
| return {absl::nullopt}; |
| } |
| |
| static StatusOr<absl::optional<se::blas::AlgorithmType>> DoGemmAutotune( |
| const HloInstruction* instr, const HloInstruction* lhs, |
| const HloInstruction* rhs, se::DeviceMemoryBase lhs_buffer, |
| se::DeviceMemoryBase rhs_buffer, se::DeviceMemoryBase output_buffer, |
| se::DeviceMemoryBase reference_result_buffer, se::Stream* stream, |
| bool crash_on_checking_failure, const se::cuda::RedzoneAllocator& allocator, |
| const BufferComparator& comparator) { |
| // Don't run autotuning concurrently on the same GPU. |
| tensorflow::mutex_lock gpu_lock = LockGpu(stream->parent()); |
| |
| GemmBackendConfig gemm_config = |
| instr->backend_config<GemmBackendConfig>().ValueOrDie(); |
| |
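| // Serializing the backend config into the key makes gemms that differ only |
| // in config (such as alpha or beta) occupy distinct cache entries even when |
| // all of their shapes match. |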
| GemmCacheKey key = |
| std::make_tuple(stream->parent(), lhs->shape(), rhs->shape(), |
| instr->shape(), gemm_config.SerializeAsString()); |
| |
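| // Note: cache_lock is held for the duration of the (possibly slow) uncached |
| // autotune below, so two threads racing on the same key never both run it. |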
| tensorflow::mutex_lock cache_lock(autotune_cache_mu); |
| auto it = autotune_cache.find(key); |
| int64 autotuning_requests = cache_hits + cache_misses; |
| if (autotuning_requests && autotuning_requests % 10 == 0) { |
| VLOG(2) << "Autotuning cache hits/(hits + misses): " << cache_hits << "/" |
| << autotuning_requests; |
| } |
| |
| if (it != autotune_cache.end()) { |
| cache_hits++; |
| VLOG(4) << "Autotuning cache hit, using algorithm: " |
| << (it->second.has_value() ? absl::StrCat(it->second.value()) |
| : "<generic>"); |
| return it->second; |
| } |
| cache_misses++; |
| VLOG(4) << "Autotuning cache miss"; |
| |
| int64 batch_size = gemm_config.batch_size(); |
| absl::optional<se::blas::AlgorithmType> result; |
| if (batch_size != 1) { |
| // TODO(b/112111608): Implement autotuning for batched gemm. |
| VLOG(2) << "Batch size is not 1, using the generic algorithm"; |
| result = absl::nullopt; |
| } else { |
| TF_ASSIGN_OR_RETURN( |
| result, |
| DoUncachedGemmAutotune(instr, lhs_buffer, rhs_buffer, output_buffer, |
| reference_result_buffer, stream, allocator, |
| comparator, crash_on_checking_failure)); |
| } |
| |
| CHECK(autotune_cache.emplace(key, result).second); |
| return result; |
| } |
| |
| static StatusOr<bool> RunOnInstruction(HloInstruction* instr, |
| se::StreamExecutor* executor, |
| se::DeviceMemoryAllocator* allocator) { |
| se::Stream stream{executor}; |
| stream.Init(); |
| |
| if (allocator == nullptr) { |
| allocator = executor->GetAllocator(); |
| } |
| |
| const HloModuleConfig& hlo_module_config = instr->GetModule()->config(); |
| se::cuda::RedzoneAllocator input_output_allocator( |
| executor->device_ordinal(), allocator, |
| PtxOptsFromConfig(hlo_module_config)); |
| |
| BufferComparator comparator(instr->shape(), hlo_module_config); |
| |
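| // A single rng_state is threaded through all buffer initializations, so |
| // successive buffers continue the same deterministic sequence and no two |
| // buffers receive identical contents. |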
| int64 rng_state = 0; |
| auto get_initialized_buffer = |
| [&](const HloInstruction* op) -> StatusOr<se::DeviceMemoryBase> { |
| TF_ASSIGN_OR_RETURN(se::DeviceMemoryBase buffer, |
| input_output_allocator.AllocateBytes( |
| &stream, ShapeUtil::ByteSizeOf(op->shape()))); |
| InitializeFloatBuffer(&stream, op->shape().element_type(), &rng_state, |
| buffer); |
| return buffer; |
| }; |
| |
| GemmBackendConfig gemm_config = |
| instr->backend_config<GemmBackendConfig>().ValueOrDie(); |
| const HloInstruction* lhs = instr->operand(0); |
| const HloInstruction* rhs = instr->operand(1); |
| |
| TF_ASSIGN_OR_RETURN(se::DeviceMemoryBase lhs_buffer, |
| get_initialized_buffer(lhs)); |
| TF_ASSIGN_OR_RETURN(se::DeviceMemoryBase rhs_buffer, |
| get_initialized_buffer(rhs)); |
| TF_ASSIGN_OR_RETURN(se::DeviceMemoryBase output_buffer, |
| get_initialized_buffer(instr)); |
| TF_ASSIGN_OR_RETURN(se::DeviceMemoryBase reference_result_buffer, |
| get_initialized_buffer(instr)); |
| |
| const DebugOptions& debug_options = |
| instr->GetModule()->config().debug_options(); |
| |
| const bool crash_on_checking_failure = |
| debug_options.xla_gpu_crash_on_verification_failures(); |
| |
| TF_ASSIGN_OR_RETURN(absl::optional<se::blas::AlgorithmType> gemm_algorithm, |
| DoGemmAutotune(instr, lhs, rhs, lhs_buffer, rhs_buffer, |
| output_buffer, reference_result_buffer, |
| &stream, crash_on_checking_failure, |
| input_output_allocator, comparator)); |
| |
| // Update the instruction's backend_config(). If no algorithm was selected, |
| // the selected_algorithm field is left unset and GEMM execution falls back |
| // to a cuBLAS API that does not take an algorithm. |
| GemmBackendConfig updated_config = gemm_config; |
| if (gemm_algorithm) { |
| updated_config.set_selected_algorithm(*gemm_algorithm); |
| } |
| TF_RETURN_IF_ERROR(instr->set_backend_config(updated_config)); |
| return updated_config.SerializeAsString() != gemm_config.SerializeAsString(); |
| } |
| |
| static StatusOr<bool> RunOnComputation(HloComputation* computation, |
| se::StreamExecutor* se, |
| se::DeviceMemoryAllocator* allocator) { |
| bool changed = false; |
| for (HloInstruction* instr : computation->instructions()) { |
| if (IsCublasGemm(*instr)) { |
| TF_ASSIGN_OR_RETURN(bool result, RunOnInstruction(instr, se, allocator)); |
| changed |= result; |
| } |
| } |
| return changed; |
| } |
| |
| StatusOr<bool> GemmAlgorithmPicker::Run(HloModule* module) { |
| XLA_SCOPED_LOGGING_TIMER("GemmAlgorithmPicker"); |
| |
| if (module->config().debug_options().xla_gpu_disable_autotune()) { |
| VLOG(2) << "GEMM auto-tuning disabled, GemmAlgorithmPicker returning early"; |
| return false; |
| } |
| |
| bool changed = false; |
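| // cuBLAS gemms are lowered to custom calls, which never appear inside |
| // fusion computations, so those can be skipped. |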
| for (HloComputation* computation : module->MakeNonfusionComputations()) { |
| TF_ASSIGN_OR_RETURN( |
| bool result, RunOnComputation(computation, stream_exec_, allocator_)); |
| changed |= result; |
| } |
| return changed; |
| } |
| |
| } // namespace gpu |
| } // namespace xla |