| /* Copyright 2019 The TensorFlow Authors. All Rights Reserved. |
| |
| Licensed under the Apache License, Version 2.0 (the "License"); |
| you may not use this file except in compliance with the License. |
| You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| ==============================================================================*/ |
| |
| #include "tensorflow/stream_executor/gpu/asm_compiler.h" |
| |
| #include "absl/container/flat_hash_map.h" |
| #include "absl/strings/str_format.h" |
| #include "absl/synchronization/mutex.h" |
| #include "tensorflow/core/lib/core/errors.h" |
| #include "tensorflow/core/lib/gtl/cleanup.h" |
| #include "tensorflow/core/lib/io/path.h" |
| #include "tensorflow/core/platform/cuda_libdevice_path.h" |
| #include "tensorflow/core/platform/env.h" |
| #include "tensorflow/core/platform/mutex.h" |
| #include "tensorflow/core/platform/regexp.h" |
| #include "tensorflow/core/platform/subprocess.h" |
| #include "tensorflow/stream_executor/gpu/gpu_driver.h" |
| #include "tensorflow/stream_executor/lib/statusor.h" |
| |
| namespace stream_executor { |
| |
| #if TENSORFLOW_USE_ROCM || defined(PLATFORM_WINDOWS) |
| |
| port::StatusOr<std::vector<uint8>> CompileGpuAsm(int device_ordinal, |
| const char* ptx_contents, |
| GpuAsmOpts options) { |
| // TODO(b/134675935): Subprocess invocation not supported on Windows. |
| return port::InternalError( |
| "Invoking GPU asm compilation is supported on Cuda non-Windows " |
| "platforms only"); |
| } |
| |
| port::StatusOr<absl::Span<const uint8>> CompileGpuAsmOrGetCached( |
| int device_ordinal, const char* ptx, GpuAsmOpts compilation_options) { |
| return CompileGpuAsm(device_ordinal, ptx, compilation_options); |
| } |
| |
| #else |
| |
| // Prints a warning if the ptxas at ptxas_path has known bugs. |
| // |
| // Only prints a warning the first time it's called for a particular value of |
| // ptxas_path. |
| // |
| // Locks on entry. |
| static void WarnIfBadPtxasVersion(const string& ptxas_path) { |
| static tensorflow::mutex mu(tensorflow::LINKER_INITIALIZED); |
| static std::unordered_set<string>* seen_ptxas_paths GUARDED_BY(mu) = |
| new std::unordered_set<string>(); |
| |
| tensorflow::mutex_lock lock(mu); |
| if (!seen_ptxas_paths->insert(ptxas_path).second) { |
| // Already checked this ptx binary, nothing to do. |
| return; |
| } |
| |
| tensorflow::SubProcess ptxas; |
| ptxas.SetProgram(ptxas_path, {ptxas_path, "--version"}); |
| ptxas.SetChannelAction(tensorflow::CHAN_STDOUT, tensorflow::ACTION_PIPE); |
| if (!ptxas.Start()) { |
| LOG(WARNING) << "Couldn't invoke " << ptxas_path << " --version"; |
| return; |
| } |
| |
| string out; |
| int exit_code = ptxas.Communicate(/*stdin_input=*/nullptr, &out, |
| /*stderr_output=*/nullptr); |
| if (exit_code != 0) { |
| LOG(WARNING) << "Running " << ptxas_path << " --version returned " |
| << exit_code; |
| return; |
| } |
| |
| int64 vmaj, vmin, vdot; |
| string vmaj_str, vmin_str, vdot_str; |
| if (!RE2::PartialMatch(out, R"(\bV(\d+)\.(\d+)\.(\d+)\b)", &vmaj_str, |
| &vmin_str, &vdot_str) || |
| !absl::SimpleAtoi(vmaj_str, &vmaj) || |
| !absl::SimpleAtoi(vmin_str, &vmin) || |
| !absl::SimpleAtoi(vdot_str, &vdot)) { |
| LOG(WARNING) << "Couldn't parse ptxas version in output of " << ptxas_path |
| << " --version:\n" |
| << out; |
| return; |
| } |
| |
| // We need ptxas >= 9.0 as a hard requirement, because we compile targeting |
| // PTX 6.0. An older ptxas will just fail to compile any of our code. |
| // |
| // ptxas 9.0 before 9.0.276 and ptxas 9.1 before 9.1.121 miscompile some |
| // address calculations with large offsets (e.g. "load ptr + large_constant"), |
| // b/70245379. |
| // |
| // ptxas 9.1.121 miscompiles some large multioutput fusions, again in a way |
| // that appears related to address calculations, b/111107644. ptxas 9.2.88 |
| // appears to work, as far as we can tell. |
| if (vmaj < 9) { |
| LOG(ERROR) |
| << "You are using ptxas 8.x, but TF requires ptxas 9.x (and strongly " |
| "prefers >= 9.2.88). Compilation of XLA kernels below will likely " |
| "fail.\n\nYou do not need to update CUDA; cherry-picking the ptxas " |
| "binary is sufficient."; |
| } else if (std::make_tuple(vmaj, vmin, vdot) < std::make_tuple(9, 2, 88)) { |
| LOG(WARNING) |
| << "*** WARNING *** You are using ptxas " << vmaj << "." << vmin << "." |
| << vdot |
| << ", which is older than 9.2.88. ptxas 9.x before 9.2.88 is known to " |
| "miscompile XLA code, leading to incorrect results or " |
| "invalid-address errors.\n\nYou do not need to update to CUDA " |
| "9.2.88; cherry-picking the ptxas binary is sufficient."; |
| } |
| } |
| |
| port::StatusOr<absl::Span<const uint8>> CompileGpuAsmOrGetCached( |
| int device_ordinal, const char* ptx, GpuAsmOpts compilation_options) { |
| using PtxCacheKey = std::tuple<int, std::string, GpuAsmOpts::PtxOptionsTuple>; |
| static tensorflow::mutex ptx_cache_mutex(tensorflow::LINKER_INITIALIZED); |
| static auto& ptx_cache GUARDED_BY(ptx_cache_mutex) = |
| *new absl::flat_hash_map<PtxCacheKey, std::vector<uint8>>(); |
| |
| tensorflow::mutex_lock lock(ptx_cache_mutex); |
| PtxCacheKey cache_key{device_ordinal, std::string(ptx), |
| compilation_options.ToTuple()}; |
| auto it = ptx_cache.find(cache_key); |
| if (it == ptx_cache.end()) { |
| TF_ASSIGN_OR_RETURN( |
| std::vector<uint8> compiled, |
| CompileGpuAsm(device_ordinal, ptx, compilation_options)); |
| it = ptx_cache.emplace(cache_key, std::move(compiled)).first; |
| } |
| |
| CHECK(it != ptx_cache.end()); |
| const std::vector<uint8>& compiled = it->second; |
| return absl::MakeSpan(compiled); |
| } |
| |
| port::StatusOr<std::vector<uint8>> CompileGpuAsm(int device_ordinal, |
| const char* ptx_contents, |
| GpuAsmOpts options) { |
| gpu::GpuDeviceHandle handle; |
| TF_RETURN_IF_ERROR(gpu::GpuDriver::GetDevice(device_ordinal, &handle)); |
| int cc_major; |
| int cc_minor; |
| TF_RETURN_IF_ERROR( |
| gpu::GpuDriver::GetComputeCapability(&cc_major, &cc_minor, handle)); |
| |
| string ptxas_path; |
| auto env = tensorflow::Env::Default(); |
| for (const string& cuda_root : |
| tensorflow::CandidateCudaRoots(options.preferred_cuda_dir)) { |
| ptxas_path = tensorflow::io::JoinPath(cuda_root, "bin", "ptxas"); |
| VLOG(2) << "Looking for ptxas at " << ptxas_path; |
| if (env->FileExists(ptxas_path).ok()) { |
| break; |
| } |
| } |
| if (!env->FileExists(ptxas_path).ok()) { |
| // Rely on subprocess invocation to find the correct binary. |
| ptxas_path = "ptxas"; |
| } |
| VLOG(2) << "Using ptxas at " << ptxas_path; |
| |
| WarnIfBadPtxasVersion(ptxas_path); |
| |
| // Write ptx into a temporary file. |
| string ptx_path; |
| if (!env->LocalTempFilename(&ptx_path)) { |
| return port::InternalError("couldn't get temp PTX file name"); |
| } |
| auto ptx_cleaner = tensorflow::gtl::MakeCleanup([&ptx_path] { |
| TF_CHECK_OK(tensorflow::Env::Default()->DeleteFile(ptx_path)); |
| }); |
| |
| TF_RETURN_IF_ERROR( |
| tensorflow::WriteStringToFile(env, ptx_path, ptx_contents)); |
| VLOG(2) << "ptx written to: " << ptx_path; |
| |
| // Invoke ptxas and collect its output. |
| string cubin_path; |
| if (!env->LocalTempFilename(&cubin_path)) { |
| return port::InternalError("couldn't get temp CUBIN file name"); |
| } |
| auto cubin_cleaner = tensorflow::gtl::MakeCleanup([&cubin_path] { |
| // CUBIN file may never be created, so the failure to delete it should not |
| // produce TF error. |
| tensorflow::Env::Default()->DeleteFile(cubin_path).IgnoreError(); |
| }); |
| tensorflow::SubProcess ptxas_info_dumper; |
| std::vector<string> ptxas_args = { |
| ptxas_path, ptx_path, "-o", cubin_path, |
| absl::StrCat("-arch=sm_", cc_major, cc_minor)}; |
| if (VLOG_IS_ON(2)) { |
| ptxas_args.push_back("-v"); |
| } |
| if (options.disable_gpuasm_optimizations) { |
| ptxas_args.push_back("-O0"); |
| } |
| ptxas_info_dumper.SetProgram(ptxas_path, ptxas_args); |
| ptxas_info_dumper.SetChannelAction(tensorflow::CHAN_STDERR, |
| tensorflow::ACTION_PIPE); |
| if (!ptxas_info_dumper.Start()) { |
| return port::InternalError("Failed to launch ptxas"); |
| } |
| string stderr_output; |
| int exit_status = ptxas_info_dumper.Communicate( |
| /*stdin_input=*/nullptr, /*stdout_output=*/nullptr, &stderr_output); |
| if (exit_status != 0) { |
| return port::InternalError( |
| absl::StrFormat("ptxas exited with non-zero error code %d, output: %s", |
| exit_status, stderr_output)); |
| } |
| |
| // Read in the result of compilation and return it as a byte vector. |
| string cubin; |
| TF_RETURN_IF_ERROR(tensorflow::ReadFileToString(tensorflow::Env::Default(), |
| cubin_path, &cubin)); |
| std::vector<uint8> cubin_vector(cubin.begin(), cubin.end()); |
| return cubin_vector; |
| } |
| |
| #endif |
| |
| } // namespace stream_executor |