/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_NVPTX_COMPILER_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_NVPTX_COMPILER_H_

#include <memory>
#include <string>
#include <vector>

#include "absl/container/node_hash_map.h"
#include "absl/types/optional.h"
#include "absl/types/span.h"
#include "tensorflow/compiler/xla/service/executable.h"
#include "tensorflow/compiler/xla/service/hlo_module.h"
#include "tensorflow/compiler/xla/service/llvm_compiler.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/core/lib/hash/hash.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/core/platform/thread_annotations.h"
#include "tensorflow/stream_executor/stream_executor_pimpl.h"

namespace xla {
namespace gpu {

// Temporarily expose the optimization pipeline for the GPU backend for reuse
// in the MLIR GPU backend.
// TODO(b/137624192): Remove once MLIR backend uses tailored optimizations.
namespace impl {

Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
                         se::DeviceMemoryAllocator* device_allocator);
Status PrepareHloModuleForIrEmitting(HloModule* hlo_module);
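
// A minimal usage sketch for these entry points (hypothetical caller code;
// the error-handling macro and the null allocator are assumptions for
// illustration, not requirements of this header):
//
//   std::unique_ptr<HloModule> module = ...;  // already configured
//   se::StreamExecutor* stream_exec = ...;    // target device
//   TF_RETURN_IF_ERROR(impl::OptimizeHloModule(
//       module.get(), stream_exec, /*device_allocator=*/nullptr));
//   TF_RETURN_IF_ERROR(impl::PrepareHloModuleForIrEmitting(module.get()));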

}  // namespace impl

// The GPU compiler generates efficient GPU executables.
class NVPTXCompiler : public LLVMCompiler {
 public:
  NVPTXCompiler();
  ~NVPTXCompiler() override {}

  // Bring in
  // StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
  //     std::vector<std::unique_ptr<HloModule>> modules,
  //     std::vector<std::vector<se::StreamExecutor*>> stream_execs)
  using LLVMCompiler::Compile;

  StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
      std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
      se::DeviceMemoryAllocator* device_allocator) override;

  StatusOr<std::unique_ptr<Executable>> RunBackend(
      std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
      se::DeviceMemoryAllocator* device_allocator) override;

  StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
  CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
                     AotCompilationOptions const& options) override;

  se::Platform::Id PlatformId() const override;

  HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override {
    // Capture just the pointer size, not the entire NVPTXCompiler object.
    int64 pointer_size = pointer_size_;
    return [pointer_size](const Shape& shape) {
      return ShapeUtil::ByteSizeOf(shape, pointer_size);
    };
  }
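
  // For example, assuming 8-byte pointers (a typical value for pointer_size_,
  // not something this header guarantees), the returned function maps a dense
  // f32[2,3] array shape to 2 * 3 * sizeof(float) = 24 bytes, while a tuple
  // shape is charged pointer_size bytes per top-level element, since a tuple
  // is represented as a table of pointers to its elements.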

 private:
  // The size in bytes of a pointer. Used by ShapeSizeBytesFunction.
  const int64 pointer_size_;

  tensorflow::mutex mutex_;

  // When compiling an HLO module, we need to find a path to the nvvm libdevice
  // files. We search in the module's config.debug_options().cuda_data_dir()
  // and in tensorflow::LibdeviceRoot(), the latter of which is a constant.
  //
  // We cache the cuda_data_dir() and the result of our search, so that if the
  // next module we have to compile has the same cuda_data_dir(), we can skip
  // the search.
  string cached_cuda_data_dir_ GUARDED_BY(mutex_);
  string cached_libdevice_dir_ GUARDED_BY(mutex_);
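
  // Roughly, the lookup could work as sketched below (hypothetical code for
  // illustration; the accessor name and the search step are assumptions, and
  // the real logic lives in the compiler's implementation file):
  //
  //   tensorflow::mutex_lock lock(mutex_);
  //   const string& cuda_data_dir = config.debug_options().cuda_data_dir();
  //   if (cuda_data_dir != cached_cuda_data_dir_) {
  //     cached_cuda_data_dir_ = cuda_data_dir;
  //     cached_libdevice_dir_ =
  //         /* search cuda_data_dir, then tensorflow::LibdeviceRoot() */;
  //   }
  //   return cached_libdevice_dir_;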

  // Tries to compile the given ptx string to cubin. Returns a vector with the
  // compiled cubin. If compilation was unsuccessful, returns an empty vector.
  std::vector<uint8> CompilePtxOrGetCachedResult(
      se::StreamExecutor* stream_exec, const string& ptx, int cc_major,
      int cc_minor, const HloModuleConfig& hlo_module_config);

  // The compilation_cache_ map is a cache from {ptx string, cc_major,
  // cc_minor} -> cubin so we don't recompile the same ptx twice. This is
  // important for some interactive workflows. (We also cache at the HLO
  // level, but sometimes we can't realize that two modules are the same until
  // we lower to ptx.)
  //
  // Compilation of distinct PTX happens in parallel. If more than one thread
  // attempts to compile the same PTX, the first thread to obtain
  // cache_value_->mutex_ performs the compilation. The rest wait() on
  // cache_value_->compilation_done_cv_ until the compilation is done.
  //
  // If compiling the ptx fails, we return an empty cubin, cross our fingers,
  // and leave compilation up to the driver.
  struct CompilationCacheKey {
    CompilationCacheKey(std::string ptx, int cc_major, int cc_minor)
        : ptx(std::move(ptx)), cc_major(cc_major), cc_minor(cc_minor) {}
    string ptx;
    int cc_major;
    int cc_minor;
  };
  struct CompilationCacheHash {
    size_t operator()(const CompilationCacheKey& key) const {
      return tensorflow::Hash64Combine(
          tensorflow::Hash64Combine(tensorflow::Hash64(key.ptx), key.cc_major),
          key.cc_minor);
    }
  };
  struct CompilationCacheEq {
    bool operator()(const CompilationCacheKey& a,
                    const CompilationCacheKey& b) const {
      return a.cc_major == b.cc_major && a.cc_minor == b.cc_minor &&
             a.ptx == b.ptx;
    }
  };
  struct CompilationCacheValue {
    bool compilation_done = false;
    std::vector<uint8> cubin_data;
    // Mutex and condition variable used to signal when compilation is done.
    tensorflow::mutex mutex_;
    tensorflow::condition_variable compilation_done_cv_;
  };

  // Don't even think about switching this to flat_hash_map; iterator stability
  // is critical here.
  absl::node_hash_map<CompilationCacheKey, CompilationCacheValue,
                      CompilationCacheHash, CompilationCacheEq>
      compilation_cache_ GUARDED_BY(mutex_);
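
  // One way the caching and synchronization described above could be
  // structured (a hypothetical sketch, not necessarily the exact
  // implementation; the emplace and ptxas details are assumptions for
  // illustration, and the real logic lives in CompilePtxOrGetCachedResult):
  //
  //   CompilationCacheValue* entry;
  //   bool inserted;
  //   {
  //     tensorflow::mutex_lock lock(mutex_);  // guards the map itself
  //     auto result = compilation_cache_.try_emplace(
  //         CompilationCacheKey(ptx, cc_major, cc_minor));
  //     entry = &result.first->second;  // stable address: node_hash_map
  //     inserted = result.second;
  //   }
  //   tensorflow::mutex_lock lock(entry->mutex_);
  //   if (inserted) {
  //     entry->cubin_data = /* run ptxas on `ptx` */;
  //     entry->compilation_done = true;
  //     entry->compilation_done_cv_.notify_all();
  //   } else {
  //     while (!entry->compilation_done) {
  //       entry->compilation_done_cv_.wait(lock);
  //     }
  //   }
  //   return entry->cubin_data;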

  TF_DISALLOW_COPY_AND_ASSIGN(NVPTXCompiler);
};

}  // namespace gpu
}  // namespace xla

#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_NVPTX_COMPILER_H_