| /* Copyright 2018 The TensorFlow Authors. All Rights Reserved. |
| |
| Licensed under the Apache License, Version 2.0 (the "License"); |
| you may not use this file except in compliance with the License. |
| You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| ==============================================================================*/ |
| |
| #include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h" |
| |
| #include "absl/memory/memory.h" |
| #include "tensorflow/compiler/xla/layout_util.h" |
| #include "tensorflow/compiler/xla/service/hlo_instruction.h" |
| #include "tensorflow/compiler/xla/util.h" |
| #include "tensorflow/core/lib/core/errors.h" |
| #include "tensorflow/core/lib/gtl/cleanup.h" |
| #include "tensorflow/core/lib/io/path.h" |
| #include "tensorflow/core/platform/cuda_libdevice_path.h" |
| #include "tensorflow/core/platform/regexp.h" |
| #include "tensorflow/core/platform/subprocess.h" |
| #include "tensorflow/core/platform/tracing.h" |
| #include "tensorflow/core/profiler/lib/traceme.h" |
| #include "tensorflow/core/util/determinism.h" |
| #include "tensorflow/core/util/env_var.h" |
| #include "tensorflow/core/util/proto/proto_utils.h" |
| #include "tensorflow/stream_executor/kernel_spec.h" |
| |
| namespace xla { |
| namespace gpu { |
| |
| namespace { |
| |
| using se::dnn::DataLayout; |
| using se::dnn::DataLayoutString; |
| using se::dnn::FilterLayout; |
| using se::dnn::FilterLayoutString; |
| using tensorflow::AutotuneResult; |
| |
| // Returns the smallest integer >= 0 that's not in the given set of numbers. |
| // |
| // For example, FindMissingDnum({1, 0, 3, 4}) returns 2. |
| // |
| // This is useful for handling DataLayout::kBatchDepthYX4, which repesents a |
| // layout [N, C/k, H, W, k] for some constant k, usually 4 or 32. |
| // ConvolutionDimensionNumbers doesn't explicitly say which dimension is `k`, |
| // but we can infer it by finding the first dnum that isn't otherwise mentioned |
| // in the dnums. |
| int64_t FindMissingDnum(absl::Span<const int64_t> vals) { |
| for (int i = 0; i < vals.size(); i++) { |
| if (!absl::c_linear_search(vals, i)) { |
| return i; |
| } |
| } |
| return vals.size(); |
| } |
| |
| // Returns a mutex that can be used to lock the given stream executor. |
| tensorflow::mutex& GetGpuMutex(const se::StreamExecutor* stream_exec) { |
| static tensorflow::mutex mu(tensorflow::LINKER_INITIALIZED); |
| // se::Platform*s are global singletons guaranteed to live forever. |
| static auto* mutexes = |
| new std::map<std::pair<const se::Platform*, /*device_ordinal*/ int64_t>, |
| tensorflow::mutex>(); |
| |
| tensorflow::mutex_lock global_lock(mu); |
| auto it = mutexes |
| ->emplace(std::piecewise_construct, |
| std::make_tuple(stream_exec->platform(), |
| stream_exec->device_ordinal()), |
| std::make_tuple()) |
| .first; |
| |
| return it->second; |
| } |
| |
| } // anonymous namespace |
| |
| StatusOr<std::tuple<Layout, Layout, Layout>> |
| StreamExecutorConvLayoutsToXlaLayouts(const ConvolutionDimensionNumbers& dnums, |
| DataLayout input, FilterLayout filter, |
| DataLayout output) { |
| std::vector<int64_t> input_layout; |
| switch (input) { |
| case DataLayout::kBatchDepthYX: // NCHW |
| input_layout.push_back(dnums.input_batch_dimension()); |
| input_layout.push_back(dnums.input_feature_dimension()); |
| input_layout.insert(input_layout.end(), |
| dnums.input_spatial_dimensions().begin(), |
| dnums.input_spatial_dimensions().end()); |
| break; |
| case DataLayout::kBatchDepthYX4: // NCHW_VECT_C |
| case DataLayout::kBatchDepthYX32: // NCHW_VECT_C |
| input_layout.push_back(dnums.input_batch_dimension()); |
| input_layout.push_back(dnums.input_feature_dimension()); |
| input_layout.insert(input_layout.end(), |
| dnums.input_spatial_dimensions().begin(), |
| dnums.input_spatial_dimensions().end()); |
| input_layout.push_back(FindMissingDnum(input_layout)); |
| break; |
| case DataLayout::kBatchYXDepth: // NHWC |
| input_layout.push_back(dnums.input_batch_dimension()); |
| input_layout.insert(input_layout.end(), |
| dnums.input_spatial_dimensions().begin(), |
| dnums.input_spatial_dimensions().end()); |
| input_layout.push_back(dnums.input_feature_dimension()); |
| break; |
| default: |
| return InternalError("Invalid input layout %s for conv with dnums %s", |
| DataLayoutString(input), |
| ConvolutionDimensionNumbersToString(dnums)); |
| } |
| |
| std::vector<int64_t> filter_layout; |
| switch (filter) { |
| case FilterLayout::kOutputInputYX: // OIHW |
| filter_layout.push_back(dnums.kernel_output_feature_dimension()); |
| filter_layout.push_back(dnums.kernel_input_feature_dimension()); |
| filter_layout.insert(filter_layout.end(), |
| dnums.kernel_spatial_dimensions().begin(), |
| dnums.kernel_spatial_dimensions().end()); |
| break; |
| case FilterLayout::kOutputInputYX4: // OIHW_VECT_C |
| case FilterLayout::kOutputInputYX32: // OIHW_VECT_C |
| filter_layout.push_back(dnums.kernel_output_feature_dimension()); |
| filter_layout.push_back(dnums.kernel_input_feature_dimension()); |
| filter_layout.insert(filter_layout.end(), |
| dnums.kernel_spatial_dimensions().begin(), |
| dnums.kernel_spatial_dimensions().end()); |
| filter_layout.push_back(FindMissingDnum(filter_layout)); |
| break; |
| case FilterLayout::kOutputYXInput: // OHWI |
| filter_layout.push_back(dnums.kernel_output_feature_dimension()); |
| filter_layout.insert(filter_layout.end(), |
| dnums.kernel_spatial_dimensions().begin(), |
| dnums.kernel_spatial_dimensions().end()); |
| filter_layout.push_back(dnums.kernel_input_feature_dimension()); |
| break; |
| default: |
| return InternalError("Invalid filter layout %s for conv with dnums %s", |
| FilterLayoutString(filter), |
| ConvolutionDimensionNumbersToString(dnums)); |
| } |
| |
| std::vector<int64_t> output_layout; |
| switch (output) { |
| case DataLayout::kBatchDepthYX: // NCHW |
| output_layout.push_back(dnums.output_batch_dimension()); |
| output_layout.push_back(dnums.output_feature_dimension()); |
| output_layout.insert(output_layout.end(), |
| dnums.output_spatial_dimensions().begin(), |
| dnums.output_spatial_dimensions().end()); |
| break; |
| case DataLayout::kBatchDepthYX4: // NCHW_VECT_C |
| case DataLayout::kBatchDepthYX32: // NCHW_VECT_C |
| output_layout.push_back(dnums.output_batch_dimension()); |
| output_layout.push_back(dnums.output_feature_dimension()); |
| output_layout.insert(output_layout.end(), |
| dnums.output_spatial_dimensions().begin(), |
| dnums.output_spatial_dimensions().end()); |
| output_layout.push_back(FindMissingDnum(output_layout)); |
| break; |
| case DataLayout::kBatchYXDepth: // NHWC |
| output_layout.push_back(dnums.output_batch_dimension()); |
| output_layout.insert(output_layout.end(), |
| dnums.output_spatial_dimensions().begin(), |
| dnums.output_spatial_dimensions().end()); |
| output_layout.push_back(dnums.output_feature_dimension()); |
| break; |
| default: |
| return InternalError("Invalid output layout %s for conv with dnums %s", |
| DataLayoutString(output), |
| ConvolutionDimensionNumbersToString(dnums)); |
| } |
| |
| return std::make_tuple(LayoutUtil::MakeLayoutFromMajorToMinor(input_layout), |
| LayoutUtil::MakeLayoutFromMajorToMinor(filter_layout), |
| LayoutUtil::MakeLayoutFromMajorToMinor(output_layout)); |
| } |
| |
// Inverse of StreamExecutorConvLayoutsToXlaLayouts: classifies the layouts of
// the given XLA conv shapes as StreamExecutor (DataLayout, FilterLayout,
// DataLayout) enum values. All three shapes must have layouts. Returns an
// error if a shape's layout matches none of the candidate layouts, or if a
// vectorized (NCHW_VECT_C) shape's most-minor dim is neither 4 nor 32.
StatusOr<std::tuple<DataLayout, FilterLayout, DataLayout>>
XlaConvShapesToStreamExecutorLayouts(const ConvolutionDimensionNumbers& dnums,
                                     const Shape& input, const Shape& filter,
                                     const Shape& output) {
  CHECK(input.has_layout());
  CHECK(filter.has_layout());
  CHECK(output.has_layout());

  // Materialize the XLA layout corresponding to each candidate cudnn layout;
  // each shape below is classified by comparing against these candidates.
  // ConsumeValueOrDie is safe: the enum arguments are all valid, so the
  // translation cannot take its error paths.
  Layout nchw_input, nchw_filter, nchw_output;
  std::tie(nchw_input, nchw_filter, nchw_output) =
      StreamExecutorConvLayoutsToXlaLayouts(dnums, DataLayout::kBatchDepthYX,
                                            FilterLayout::kOutputInputYX,
                                            DataLayout::kBatchDepthYX)
          .ConsumeValueOrDie();

  // NCHW4 and NCHW32 have the same Layout; we disambiguate them below.
  Layout nchw_vect_input, nchw_vect_filter, nchw_vect_output;
  std::tie(nchw_vect_input, nchw_vect_filter, nchw_vect_output) =
      StreamExecutorConvLayoutsToXlaLayouts(dnums, DataLayout::kBatchDepthYX4,
                                            FilterLayout::kOutputInputYX4,
                                            DataLayout::kBatchDepthYX4)
          .ConsumeValueOrDie();

  Layout nhwc_input, nhwc_filter, nhwc_output;
  std::tie(nhwc_input, nhwc_filter, nhwc_output) =
      StreamExecutorConvLayoutsToXlaLayouts(dnums, DataLayout::kBatchYXDepth,
                                            FilterLayout::kOutputYXInput,
                                            DataLayout::kBatchYXDepth)
          .ConsumeValueOrDie();

  DataLayout input_layout;
  if (LayoutUtil::Equal(input.layout(), nchw_input)) {
    input_layout = DataLayout::kBatchDepthYX;
  } else if (LayoutUtil::Equal(input.layout(), nchw_vect_input)) {
    // Differentiate between VECT_4 and VECT_32 by looking at the input shape.
    int64_t vect_size = input.dimensions(input.layout().minor_to_major(0));
    if (vect_size == 4) {
      input_layout = DataLayout::kBatchDepthYX4;
    } else if (vect_size == 32) {
      input_layout = DataLayout::kBatchDepthYX32;
    } else {
      return InternalError(
          "Invalid input shape %s for conv with dnums %s. Most-minor dim "
          "should be 4 or 32, but was %d.",
          ShapeUtil::HumanStringWithLayout(input),
          ConvolutionDimensionNumbersToString(dnums), vect_size);
    }
  } else if (LayoutUtil::Equal(input.layout(), nhwc_input)) {
    input_layout = DataLayout::kBatchYXDepth;
  } else {
    return InternalError("Invalid input layout %s for conv with dnums %s",
                         LayoutUtil::HumanString(input.layout()),
                         ConvolutionDimensionNumbersToString(dnums));
  }

  FilterLayout filter_layout;
  if (LayoutUtil::Equal(filter.layout(), nchw_filter)) {
    filter_layout = FilterLayout::kOutputInputYX;
  } else if (LayoutUtil::Equal(filter.layout(), nchw_vect_filter)) {
    // Same 4-vs-32 disambiguation as for the input, on the filter's
    // most-minor dimension.
    int64_t vect_size = filter.dimensions(filter.layout().minor_to_major(0));
    if (vect_size == 4) {
      filter_layout = FilterLayout::kOutputInputYX4;
    } else if (vect_size == 32) {
      filter_layout = FilterLayout::kOutputInputYX32;
    } else {
      return InternalError(
          "Invalid filter shape %s for conv with dnums %s. Most-minor dim "
          "should be 4 or 32, but was %d.",
          ShapeUtil::HumanStringWithLayout(filter),
          ConvolutionDimensionNumbersToString(dnums), vect_size);
    }
  } else if (LayoutUtil::Equal(filter.layout(), nhwc_filter)) {
    filter_layout = FilterLayout::kOutputYXInput;
  } else {
    return InternalError("Invalid filter layout %s for conv with dnums %s",
                         LayoutUtil::HumanString(filter.layout()),
                         ConvolutionDimensionNumbersToString(dnums));
  }

  DataLayout output_layout;
  if (LayoutUtil::Equal(output.layout(), nchw_output)) {
    output_layout = DataLayout::kBatchDepthYX;
  } else if (LayoutUtil::Equal(output.layout(), nchw_vect_output)) {
    // Same 4-vs-32 disambiguation as above, on the output shape.
    int64_t vect_size = output.dimensions(output.layout().minor_to_major(0));
    if (vect_size == 4) {
      output_layout = DataLayout::kBatchDepthYX4;
    } else if (vect_size == 32) {
      output_layout = DataLayout::kBatchDepthYX32;
    } else {
      return InternalError(
          "Invalid output shape %s for conv with dnums %s. Most-minor dim "
          "should be 4 or 32, but was %d.",
          ShapeUtil::HumanStringWithLayout(output),
          ConvolutionDimensionNumbersToString(dnums), vect_size);
    }
  } else if (LayoutUtil::Equal(output.layout(), nhwc_output)) {
    output_layout = DataLayout::kBatchYXDepth;
  } else {
    return InternalError("Invalid output layout %s for conv with dnums %s",
                         LayoutUtil::HumanString(output.layout()),
                         ConvolutionDimensionNumbersToString(dnums));
  }

  return std::make_tuple(input_layout, filter_layout, output_layout);
}
| |
| // Given unique integers D = {d0, d1, ds...}, finds the first integer less than |
| // `rank` which is not in D. If there is no such number (because all the values |
| // in [0, rank) appear), returns nullopt. |
| // |
| // When D is the set of dimensions in a ConvolutionDimensionNumbers, this finds |
| // the dimension number that corresponds to the vectorized-features dimension in |
| // the convolution. |
| static absl::optional<int64_t> FindVectorizedDim(int64_t rank, int64_t d0, |
| int64_t d1, |
| absl::Span<const int64_t> ds) { |
| for (int64_t i = 0; i < rank; i++) { |
| if (i == d0 || i == d1 || absl::c_linear_search(ds, i)) { |
| continue; |
| } |
| return i; |
| } |
| return absl::nullopt; |
| } |
| |
| std::tuple<absl::optional<int64_t>, absl::optional<int64_t>, |
| absl::optional<int64_t>> |
| FindVectorizedFeatureDims(const ConvolutionDimensionNumbers& dnums, |
| const Shape& input, const Shape& filter, |
| const Shape& output) { |
| return { |
| FindVectorizedDim(input.dimensions_size(), dnums.input_batch_dimension(), |
| dnums.input_feature_dimension(), |
| dnums.input_spatial_dimensions()), |
| FindVectorizedDim(filter.dimensions_size(), |
| dnums.kernel_input_feature_dimension(), |
| dnums.kernel_output_feature_dimension(), |
| dnums.kernel_spatial_dimensions()), |
| FindVectorizedDim( |
| output.dimensions_size(), dnums.output_batch_dimension(), |
| dnums.output_feature_dimension(), dnums.output_spatial_dimensions()), |
| }; |
| } |
| |
| tensorflow::mutex_lock LockGpu(const se::StreamExecutor* stream_exec) { |
| tensorflow::mutex& mu = GetGpuMutex(stream_exec); |
| return tensorflow::mutex_lock{mu}; |
| } |
| |
| tensorflow::tf_shared_lock LockGpuShared( |
| const se::StreamExecutor* stream_exec) { |
| tensorflow::mutex& mu = GetGpuMutex(stream_exec); |
| return tensorflow::tf_shared_lock{mu}; |
| } |
| |
| StatusOr<std::unique_ptr<se::KernelBase>> CreateKernel( |
| absl::string_view kernel_name, uint64 num_args, absl::string_view ptx, |
| absl::Span<const uint8> cubin_data, se::StreamExecutor* stream_exec) { |
| se::MultiKernelLoaderSpec loader_spec(num_args); |
| loader_spec.AddCudaPtxInMemory(ptx, kernel_name); |
| |
| if (!cubin_data.empty()) { |
| loader_spec.AddCudaCubinInMemory( |
| reinterpret_cast<const char*>(cubin_data.data()), kernel_name); |
| } |
| |
| auto kernel_base = absl::make_unique<se::KernelBase>(stream_exec); |
| TF_RETURN_IF_ERROR(stream_exec->GetKernel(loader_spec, kernel_base.get())); |
| return std::move(kernel_base); |
| } |
| |
| Status ExecuteKernelOnStream(const se::KernelBase& kernel, |
| absl::Span<const se::DeviceMemoryBase> args, |
| const LaunchDimensions& dims, se::Stream* stream) { |
| static constexpr int kKernelArgsLimit = 1024; |
| auto kernel_args = absl::make_unique<se::KernelArgsArray<kKernelArgsLimit>>(); |
| for (const se::DeviceMemoryBase& buf : args) { |
| kernel_args->add_device_memory_argument(buf); |
| } |
| LaunchDimensions::Dim3D thread_counts = dims.thread_counts_per_block(); |
| LaunchDimensions::Dim3D block_counts = dims.block_counts(); |
| return stream->parent()->Launch( |
| stream, se::ThreadDim(thread_counts.x, thread_counts.y, thread_counts.z), |
| se::BlockDim(block_counts.x, block_counts.y, block_counts.z), kernel, |
| *kernel_args); |
| } |
| |
// The integral overload is deliberately deleted: integer buffers are filled by
// drawing floats and rounding (see InitializeTypedBuffer), so no direct
// integer distribution is implemented yet.
template <typename T, typename Generator>
static typename std::enable_if<std::is_integral<T>::value, T>::type
UniformDistribution(T lhs, T rhs, Generator* gen) = delete;

// Draws one value uniformly from [lhs, rhs) using the given generator.
template <typename T, typename Generator>
static typename std::enable_if<std::is_floating_point<T>::value, T>::type
UniformDistribution(T lhs, T rhs, Generator* gen) {
  std::uniform_real_distribution<T> dist(lhs, rhs);
  return dist(*gen);
}
| |
// Fills `buffer` with pseudorandom values of type T copied from a fixed,
// default-seeded host-side pool. `*rng_state` is a cursor into the pool,
// measured in elements of T; it is advanced past the data consumed so that
// successive calls fill buffers with different slices of the pool, while
// remaining fully deterministic across runs.
//
// Requires: buffer.size() is a multiple of sizeof(T).
template <typename T>
static void InitializeTypedBuffer(se::Stream* stream,
                                  se::DeviceMemoryBase buffer,
                                  int64_t* rng_state) {
  // Accesses to static variables are not locked, since the caller is already
  // in a critical section.
  static std::vector<T>* host_buffer = [] {
    // Use a large prime number to fragment the accesses.
    auto* ret = new std::vector<T>(10069);
    // Default-seeded random numbers.
    std::mt19937 gen;
    for (auto& element : *ret) {
      // Only double gets random values in double. Other data types get random
      // values in float then cast them to the target data types.
      using RandomFloatingPointType =
          typename std::conditional<std::is_same<T, Eigen::half>::value, float,
                                    T>::type;
      using RandomType =
          typename std::conditional<std::is_integral<T>::value, float,
                                    RandomFloatingPointType>::type;
      // Scale down the values for fp16 to have less overflows.
      auto upper_bound =
          RandomType(std::is_same<T, Eigen::half>::value ? 0.1 : 1.0);
      auto rand_val = UniformDistribution(RandomType(0), upper_bound, &gen);
      // For float or double, it is between [0,1].
      // For fp16, it ranges between [0, 0.1].
      // For integer types, element is either 0 or 1 for less overflows
      // especially for int8.
      element = T(std::is_integral<T>::value ? rand_val + 0.5 : rand_val);
    }
    return ret;
  }();

  // Alias for readability: the cursor lives in caller-owned state.
  int64_t& host_index = *rng_state;

  char* current_addr = static_cast<char*>(buffer.opaque());
  CHECK_EQ(0, buffer.size() % sizeof(T));
  int64_t elements_left = buffer.size() / sizeof(T);
  // Copy chunks of the pool into the device buffer, wrapping the cursor back
  // to the start of the pool whenever it reaches the end.
  while (elements_left > 0) {
    CHECK_LE(host_index, host_buffer->size());
    if (host_buffer->size() == host_index) {
      host_index = 0;
    }
    // Copy at most the remainder of the pool, and at most what's still needed.
    int64_t elements_copied =
        std::min<int64_t>(host_buffer->size() - host_index, elements_left);
    se::DeviceMemoryBase mem(current_addr, elements_copied * sizeof(T));
    // Enqueue the host->device copy on `stream`; completion is governed by
    // the stream's ordering.
    stream->ThenMemcpy(&mem, host_buffer->data() + host_index,
                       elements_copied * sizeof(T));
    current_addr += elements_copied * sizeof(T);
    elements_left -= elements_copied;
    host_index += elements_copied;
  }
}
| |
| void InitializeBuffer(se::Stream* stream, PrimitiveType buffer_type, |
| int64_t* rng_state, se::DeviceMemoryBase buffer) { |
| switch (buffer_type) { |
| case xla::F16: |
| case xla::BF16: |
| // Using F16 for BF16 initialization: it's fine since we only need some |
| // random number there, and random generator is not working for BF16 (not |
| // all required overloads are there). |
| return InitializeTypedBuffer<Eigen::half>(stream, buffer, rng_state); |
| case xla::F32: |
| case xla::C64: |
| return InitializeTypedBuffer<float>(stream, buffer, rng_state); |
| case xla::F64: |
| case xla::C128: |
| return InitializeTypedBuffer<double>(stream, buffer, rng_state); |
| case xla::S8: |
| return InitializeTypedBuffer<int8>(stream, buffer, rng_state); |
| case xla::S32: |
| return InitializeTypedBuffer<int32>(stream, buffer, rng_state); |
| default: |
| LOG(FATAL) << "Unexpected type: " |
| << primitive_util::LowercasePrimitiveTypeName(buffer_type); |
| } |
| } |
| |
| StatusOr<se::dnn::ConvolutionKind> GetDNNConvKindFromCudnnConvKind( |
| CudnnConvKind kind) { |
| switch (kind) { |
| case CudnnConvKind::kBackwardFilter: |
| return se::dnn::BACKWARD_FILTER; |
| case CudnnConvKind::kBackwardInput: |
| return se::dnn::BACKWARD_DATA; |
| case CudnnConvKind::kForward: |
| return se::dnn::FORWARD; |
| case CudnnConvKind::kForwardActivation: |
| return se::dnn::FORWARD_BIAS_ACTIVATION; |
| default: |
| break; |
| } |
| return InternalError("Unexpected convolution kind"); |
| } |
| |
| StatusOr<se::dnn::DataType> GetDNNDataTypeFromPrimitiveType( |
| PrimitiveType type) { |
| switch (type) { |
| case F16: |
| return se::dnn::ToDataType<Eigen::half>::value; |
| case F32: |
| return se::dnn::ToDataType<float>::value; |
| case F64: |
| return se::dnn::ToDataType<double>::value; |
| default: |
| break; |
| } |
| return InternalError("Unsupported convolution datatype"); |
| } |
| |
| bool RequireDeterminism(const HloModuleConfig& config) { |
| static bool require_cudnn_determinism = [] { |
| // TODO(reedwm): Remove the TF_CUDNN_DETERMINISTIC env var. |
| bool cudnn_deterministic = false; |
| TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar("TF_CUDNN_DETERMINISTIC", |
| /*default_val=*/false, |
| &cudnn_deterministic)); |
| return cudnn_deterministic; |
| }(); |
| return tensorflow::OpDeterminismRequired() || require_cudnn_determinism || |
| config.debug_options().xla_gpu_deterministic_ops(); |
| } |
| |
| StatusOr<AutotuneResult> PickBestResult( |
| absl::Span<AutotuneResult const> profile_results, |
| const HloInstruction& instr) { |
| std::vector<AutotuneResult> filtered_results; |
| |
| // For now, we ignore WRONG_RESULT failures because false-positives are |
| // possible (e.g. perhaps the reference algorithm is the one that's |
| // incorrect!). But we don't ignore REDZONE_MODIFIED failures because they're |
| // quite severe and can be detected with high accuracy. |
| absl::c_copy_if( |
| profile_results, std::back_inserter(filtered_results), |
| [](const AutotuneResult& r) { |
| return !(r.has_failure() && |
| r.failure().kind() != AutotuneResult::WRONG_RESULT); |
| }); |
| |
| if (filtered_results.empty()) { |
| return InternalError( |
| "All algorithms tried for %s failed. Falling back to " |
| "default algorithm. ", |
| instr.ToString()); |
| } |
| |
| auto selected_result = filtered_results.begin(); |
| if (!RequireDeterminism(instr.parent()->parent()->config())) { |
| selected_result = absl::c_min_element( |
| filtered_results, |
| [](const AutotuneResult& lhs, const AutotuneResult& rhs) { |
| return tensorflow::proto_utils::FromDurationProto(lhs.run_time()) < |
| tensorflow::proto_utils::FromDurationProto(rhs.run_time()); |
| }); |
| } |
| return *selected_result; |
| } |
| |
| } // namespace gpu |
| } // namespace xla |