/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "llvm/Target/TargetMachine.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_compiler.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_device_info.h"
#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h"
#if GOOGLE_CUDA
#include "tensorflow/compiler/xla/service/gpu/nvptx_compiler.h"
#include "tensorflow/compiler/xla/service/gpu/nvptx_helper.h"
#endif
#include "tensorflow/compiler/xla/service/gpu/target_constants.h"
#include "tensorflow/compiler/xla/service/hlo_module.h"
#include "tensorflow/compiler/xla/status.h"
#include "tensorflow/compiler/xla/tools/hlo_module_loader.h"
#include "tensorflow/core/platform/init_main.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/util/command_line_flags.h"
#include "tensorflow/stream_executor/cuda/cuda_platform_id.h"
const char* const kUsage = R"(
This tool reads in an HloModule from a file, compiles it using the NVPTX
compiler and prints out the LLVM IR generated by the IR emitter. The LLVM IR is
not optimized by the LLVM pass pipeline, so this tool can be used to unit test
the XLA GPU IR emitters.
Note that the LLVM IR does not contain the *full* module, but only parts that
will be code generated into PTX. The NVPTX compiler also generates a
GpuExecutable on the side that is not printed.
When passed the parameter `--ptx`, the LLVM IR will be optimized and PTX
will be emitted and printed instead of the non-optimized LLVM.
By default SM 70 is targeted. But this can be changed with `--sm=SM`.)";
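
// Example invocation (the binary name and input path below are illustrative):
//
//   hlo_to_llvm_ir path/to/module.hlo                # unoptimized LLVM IR
//   hlo_to_llvm_ir --ptx --sm=80 path/to/module.hlo  # optimized PTX for SM 8.0
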
namespace {
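
// Compiles `hlo_text` with the XLA GPU backend and prints the unoptimized
// LLVM IR produced by the IR emitters, or, when `generate_ptx` is true,
// optimized PTX targeting compute capability `sm` (e.g. 70 for V100).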
xla::Status CompileAndPrintLlvmIr(const std::string& hlo_text,
bool generate_ptx, int sm) {
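  // Parse the textual HLO into an HloModule.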
TF_ASSIGN_OR_RETURN(
std::unique_ptr<xla::HloModule> hlo_module,
xla::LoadModuleFromData(/*data=*/hlo_text, /*format=*/"hlo"));
llvm::LLVMContext llvm_context;
// TODO(b/234717222): remove and update tests.
llvm_context.setOpaquePointers(false);
// For now we pretend we're compiling for V100. This can be generalized
// later.
xla::gpu::GpuDeviceInfo gpu_device_info{};
gpu_device_info.threads_per_block_limit = 1024;
gpu_device_info.threads_per_warp = 32;
gpu_device_info.shared_memory_per_block = 49152;
gpu_device_info.core_count = 80;
gpu_device_info.threads_per_core_limit = 2048;
gpu_device_info.block_dim_limit_x = 2147483647;
gpu_device_info.block_dim_limit_y = 65535;
gpu_device_info.block_dim_limit_z = 65535;
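  // The --sm value encodes major * 10 + minor, e.g. --sm=70 yields compute
  // capability 7.0 and --sm=86 yields 8.6.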
tensorflow::se::CudaComputeCapability cuda_compute_capability;
cuda_compute_capability.major = sm / 10;
cuda_compute_capability.minor = sm % 10;
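  // CompileModuleToLlvmIr takes both CUDA and ROCm compute capabilities; this
  // tool only targets NVPTX, so the ROCm value below is effectively a
  // placeholder.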
tensorflow::se::RocmComputeCapability rocm_compute_capability("gfx908");
std::string platform_name = "CUDA";
stream_executor::Platform::Id platform_id =
stream_executor::cuda::kCudaPlatformId;
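  // Run the XLA GPU IR emitters on the HLO module; the resulting LLVM IR has
  // not been run through the LLVM optimization pipeline.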
TF_ASSIGN_OR_RETURN(std::unique_ptr<llvm::Module> llvm_module,
xla::gpu::CompileModuleToLlvmIr(
hlo_module.get(), &llvm_context,
/*target_triple=*/xla::gpu::nvptx::TargetTriple(),
/*data_layout=*/xla::gpu::nvptx::DataLayout(),
/*platform_name=*/platform_name,
/*platform_id=*/platform_id, gpu_device_info,
cuda_compute_capability, rocm_compute_capability,
/*pointer_size=*/8));
if (!generate_ptx) {
llvm_module->print(llvm::outs(), nullptr);
} else {
#if GOOGLE_CUDA
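    // libdevice is NVIDIA's bitcode library of device math routines; the PTX
    // compilation path links it in, so locate it first.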
std::string libdevice_dir = xla::gpu::GetLibdeviceDir(hlo_module->config());
TF_ASSIGN_OR_RETURN(std::string ptx,
xla::gpu::nvptx::CompileToPtx(
llvm_module.get(), cuda_compute_capability,
hlo_module->config(), libdevice_dir));
std::cout << ptx << std::endl;
#else
    return {tensorflow::error::UNIMPLEMENTED,
            "--ptx is only supported in CUDA builds"};
#endif
}
return ::tensorflow::OkStatus();
}
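
// Reads `file_name` and compiles each HLO module it contains. Multiple modules
// may be concatenated in a single file, separated by lines containing
// "// -----" (the same separator MLIR uses for split-input-file tests).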
xla::Status CompileAndPrintLlvmIrFromFile(const std::string& file_name,
bool ptx, int sm) {
std::string full_text;
TF_RETURN_IF_ERROR(tensorflow::ReadFileToString(tensorflow::Env::Default(),
file_name, &full_text));
std::vector<std::string> hlo_module_texts =
absl::StrSplit(full_text, "// -----");
for (const std::string& hlo_module_text : hlo_module_texts) {
TF_RETURN_IF_ERROR(CompileAndPrintLlvmIr(hlo_module_text, ptx, sm));
}
return ::tensorflow::OkStatus();
}
}  // namespace

int main(int argc, char** argv) {
bool ptx = false;
int sm = 70;
std::vector<tensorflow::Flag> flag_list;
xla::AppendDebugOptionsFlags(&flag_list);
flag_list.emplace_back("ptx", &ptx,
"Print PTX instead of not optimized LLVM.");
flag_list.emplace_back("sm", &sm,
"Specify the SM to target (useful only with --ptx).");
// The usage string includes the message at the top of the file, the
// DebugOptions flags and the flags defined above.
const std::string kUsageString = absl::StrCat(
kUsage, "\n\n", tensorflow::Flags::Usage(argv[0], flag_list));
bool parse_ok = tensorflow::Flags::Parse(&argc, argv, flag_list);
tensorflow::port::InitMain(kUsageString.c_str(), &argc, &argv);
if (!parse_ok) {
LOG(QFATAL) << kUsageString;
}
QCHECK(argc == 2) << "Must specify a single input file";
TF_CHECK_OK(CompileAndPrintLlvmIrFromFile(argv[1], ptx, sm));
return 0;
}