/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
//===- kernel_creator.cc ----------------------------------------*- C++ -*-===//
//
// This file implements the function to compile a TF kernel function to a GPU
// binary (HSACO for AMD, cubin for NVIDIA) or to a GPU binary together with
// the host-side code that launches it.
//
//===----------------------------------------------------------------------===//
#include "tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.h"
#include "mlir/Conversion/AffineToStandard/AffineToStandard.h" // from @llvm-project
#include "mlir/Conversion/GPUCommon/GPUCommonPass.h" // from @llvm-project
#include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h" // from @llvm-project
#include "mlir/Conversion/SCFToGPU/SCFToGPUPass.h" // from @llvm-project
#include "mlir/Conversion/SCFToStandard/SCFToStandard.h" // from @llvm-project
#include "mlir/Dialect/GPU/GPUDialect.h" // from @llvm-project
#include "mlir/Dialect/GPU/ParallelLoopMapper.h" // from @llvm-project
#include "mlir/Dialect/GPU/Passes.h" // from @llvm-project
#include "mlir/Dialect/LLVMIR/LLVMDialect.h" // from @llvm-project
#include "mlir/Dialect/LLVMIR/NVVMDialect.h" // from @llvm-project
#include "mlir/Dialect/Linalg/Passes.h" // from @llvm-project
#include "mlir/Dialect/SCF/Passes.h" // from @llvm-project
#include "mlir/Dialect/SCF/SCF.h" // from @llvm-project
#include "mlir/Dialect/SCF/Transforms.h" // from @llvm-project
#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project
#include "mlir/Parser.h" // from @llvm-project
#include "mlir/Pass/Pass.h" // from @llvm-project
#include "mlir/Pass/PassManager.h" // from @llvm-project
#include "mlir/Transforms/BufferPlacement.h" // from @llvm-project
#include "mlir/Transforms/DialectConversion.h" // from @llvm-project
#include "mlir/Transforms/Passes.h" // from @llvm-project
#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/passes.h"
#include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h"
#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
#include "tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h"
#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h"
#include "tensorflow/compiler/mlir/xla/transforms/passes.h"
#include "tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.h"
#include "tensorflow/compiler/xla/service/mlir_gpu/passes.h"
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/path.h"
namespace tensorflow {
namespace kernel_gen {
namespace {
using tensorflow::Status;
using xla::InternalError;
using xla::StatusOr;
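// Name of the module attribute under which the compiled GPU binary blob is
// stored.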
constexpr llvm::StringRef kGpuBinaryAttrName = "gpu.binary";
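// Lowers TF dialect ops in `module` through MHLO, LMHLO and Linalg down to
// parallel loops that are mapped onto the GPU dialect. `tile_sizes` and
// `unroll_factors` control the loop tiling performed along the way.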
Status LowerTFtoGPU(mlir::ModuleOp module, bool gpu_binary_only,
llvm::ArrayRef<uint32_t> tile_sizes,
llvm::ArrayRef<uint32_t> unroll_factors) {
mlir::PassManager pm(module.getContext());
applyPassManagerCLOptions(pm);
// TODO(b/169357508): re-enable when the pipeline is serializable.
// SetCrashReproducer(pm);
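// Legalize TensorFlow dialect ops to the MHLO dialect.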
pm.addPass(mlir::mhlo::createLegalizeTFPass(false));
if (gpu_binary_only) {
pm.addNestedPass<mlir::FuncOp>(
mlir::kernel_gen::transforms::CreateMaterializeBroadcastsPass());
pm.addNestedPass<mlir::FuncOp>(
mlir::kernel_gen::transforms::CreateUnfuseBatchNormPass());
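// Transform MHLO operations on tensors to LMHLO operations on buffers.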
pm.addPass(mlir::mhlo::createLegalizeToLhloPass(
/*results_escape_functions=*/true));
// Move `AllocOp`s and insert missing `DeallocOp`s.
pm.addPass(::mlir::createBufferPlacementPass());
pm.addNestedPass<mlir::FuncOp>(mlir::createCopyRemovalPass());
pm.addPass(mlir::kernel_gen::transforms::CreateShapeToDescriptorsPass());
} else {
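// For the full kernel, first expand unranked HLO operations into ranked
// code, then lower shape computations and bufferize.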
pm.addPass(mlir::createTransformUnrankedHloPass());
pm.addPass(mlir::kernel_gen::transforms::CreateShapeToDescriptorsPass());
pm.addPass(mlir::kernel_gen::transforms::CreateBufferizePass());
pm.addPass(mlir::kernel_gen::transforms::CreateParallelLoopsToSequential());
}
// Clean up the IR for further processing.
pm.addPass(mlir::createCanonicalizerPass());
// Unrolling is applied after tiling, so the tile sizes have to be scaled by
// the unroll factors up front to still get the requested tiling after
// unrolling. Compute the adjusted tiling here if needed.
llvm::SmallVector<unsigned, 4> tiling_for_unrolling;
llvm::SmallVector<int64_t, 4> as_int64;
if (!unroll_factors.empty()) {
tiling_for_unrolling.reserve(tile_sizes.size());
for (auto pair : llvm::zip(tile_sizes, unroll_factors)) {
tiling_for_unrolling.push_back(std::get<0>(pair) * std::get<1>(pair));
as_int64.push_back(std::get<1>(pair));
}
} else {
tiling_for_unrolling.append(tile_sizes.begin(), tile_sizes.end());
}
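// Example (illustrative values): tile_sizes = {16, 64} with unroll_factors =
// {4, 2} yields tiling_for_unrolling = {64, 128} and as_int64 = {4, 2}.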
// Transform LHLO operations to Linalg.
pm.addPass(::mlir::lmhlo::createLegalizeLhloToLinalgPass());
// Fuse linalg operations.
pm.addPass(::mlir::lmhlo::createLhloFuseLinalgPass(
/*use_parallel_loops=*/true, tiling_for_unrolling));
// Transform the Linalg operations inside of the loop nest into parallel
// loops.
pm.addPass(::mlir::createConvertLinalgToParallelLoopsPass());
// Canonicalize the code to simplify index computations. This is needed so
// that loop bounds that are equal are also represented by the same value,
// which the loop fusion below relies on.
pm.addNestedPass<::mlir::FuncOp>(::mlir::createCanonicalizerPass());
pm.addNestedPass<::mlir::FuncOp>(::mlir::createCSEPass());
// Fuse the inner-most loops.
pm.addPass(xla::mlir_gpu::createFuseInnerParallelLoopsPass());
// Run CSE to ensure that loads and stores to the same subview get
// recognized as such.
pm.addNestedPass<::mlir::FuncOp>(::mlir::createCSEPass());
// Forward stores to buffers to loads.
pm.addPass(xla::mlir_gpu::createStoreForwardingPass());
// Remove now unused temporary buffers.
pm.addPass(xla::mlir_gpu::createDeadTempBufferRemovalPass());
if (!unroll_factors.empty()) {
pm.addPass(::mlir::createParallelLoopTilingPass(as_int64));
}
// Some basic cleanup.
pm.addNestedPass<::mlir::FuncOp>(::mlir::createCanonicalizerPass());
pm.addNestedPass<::mlir::FuncOp>(::mlir::createCSEPass());
// Greedily map the remaining parallel loops to GPU hardware dimensions.
pm.addPass(xla::mlir_gpu::createMapParallelLoopsPass());
// Apply the mapping.
pm.addPass(mlir::createParallelLoopToGpuPass());
// Embed TF Framework ops.
if (!gpu_binary_only) {
pm.addPass(mlir::kernel_gen::tf_framework::CreateEmbedTFFrameworkPass());
}
// Some basic cleanup.
pm.addNestedPass<::mlir::FuncOp>(::mlir::createCanonicalizerPass());
pm.addNestedPass<::mlir::FuncOp>(::mlir::createCSEPass());
// Rewrite loops with `min` bounds into a conditional plus loops with static
// bounds. Only do this if we unrolled in the first place.
if (!unroll_factors.empty()) {
pm.addNestedPass<::mlir::FuncOp>(mlir::createForLoopSpecializationPass());
}
// Approximate Tanh using standard operations.
pm.addNestedPass<::mlir::FuncOp>(
::mlir::mhlo::createLegalizeTrigonometricToApproximationPass());
// Move scalar operations into the GPU launch region to keep kernel signatures small.
pm.addPass(xla::mlir_gpu::createMoveScalarComputationsIntoGpuLaunchPass());
// Outline GPU launch bodies into separate kernel functions.
pm.addPass(::mlir::createGpuKernelOutliningPass());
if (gpu_binary_only) {
// Make kernel signature deterministic so that we can call it externally.
pm.addPass(xla::mlir_gpu::createRewriteKernelSignaturePass());
}
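// Lower affine constructs and the remaining structured control flow to the
// standard CFG form expected by the subsequent LLVM lowering.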
pm.addPass(::mlir::createLowerAffinePass());
pm.addPass(::mlir::createLowerToCFGPass());
if (failed(pm.run(module))) {
return InternalError("Lowering to GPU kernels failed.");
}
return Status::OK();
}
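// Lowers the GPU-dialect module to LLVM: propagates ABI knowledge to the
// kernels (gpu_binary_only mode), serializes the device code into a binary
// blob, and, for full kernels, lowers the host side to LLVM as well.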
Status LowerGPUToLLVM(mlir::ModuleOp module, bool gpu_binary_only,
llvm::ArrayRef<uint32_t> same_shape,
llvm::StringRef gpu_binary_attr_name,
int32_t architecture) {
mlir::PassManager pm(module.getContext());
applyPassManagerCLOptions(pm);
// TODO(b/169357508): re-enable when the pipeline is serializable.
// SetCrashReproducer(pm);
auto& kernel_pm = pm.nest<mlir::gpu::GPUModuleOp>();
if (gpu_binary_only) {
// Grab the original signature from the single function.
auto func = *module.getBody()->op_begin<mlir::FuncOp>();
kernel_pm.addNestedPass<mlir::LLVM::LLVMFuncOp>(
mlir::kernel_gen::transforms::CreatePropagateTensorFlowABIKnowledgePass(
func.getType(), same_shape));
}
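// Strip debug info and serialize each GPU module's device code into a binary
// blob stored under `gpu_binary_attr_name`.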
kernel_pm.addPass(mlir::createStripDebugInfoPass());
kernel_pm.addPass(mlir::kernel_gen::transforms::CreateGpuKernelToBlobPass(
gpu_binary_attr_name, architecture));
if (!gpu_binary_only) {
pm.addPass(mlir::kernel_gen::transforms::CreateTFKernelToLLVMPass());
pm.addPass(mlir::createCanonicalizerPass());
pm.addPass(mlir::createCSEPass());
}
return failed(pm.run(module)) ? InternalError("Lowering to LLVM IR failed.")
: Status::OK();
}
} // namespace
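// Parses `tf_code` into an MLIR module and lowers it to GPU kernels.
// A minimal usage sketch (the context, flag and architecture values below are
// illustrative only, not taken from this file):
//
//   mlir::MLIRContext context;
//   TF_ASSIGN_OR_RETURN(
//       mlir::OwningModuleRef module,
//       GenerateKernelForTfCode(context, tf_code, /*gpu_binary_only=*/true,
//                               /*architecture=*/75, /*tile_sizes=*/{16, 64},
//                               /*same_shape=*/{}, /*unroll_factors=*/{}));
//   TF_ASSIGN_OR_RETURN(std::string binary, ExtractGpuBinary(module.get()));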
StatusOr<mlir::OwningModuleRef> GenerateKernelForTfCode(
mlir::MLIRContext& context, llvm::StringRef tf_code, bool gpu_binary_only,
int32_t architecture, llvm::ArrayRef<uint32_t> tile_sizes,
llvm::ArrayRef<uint32_t> same_shape,
llvm::ArrayRef<uint32_t> unroll_factors) {
mlir::RegisterAllTensorFlowDialects(context.getDialectRegistry());
mlir::OwningModuleRef module = mlir::parseSourceString(tf_code, &context);
TF_RETURN_IF_ERROR(
LowerTFtoGPU(module.get(), gpu_binary_only, tile_sizes, unroll_factors));
#if !defined(TENSORFLOW_USE_ROCM) && !defined(GOOGLE_CUDA)
return InternalError(
"Neither TENSORFLOW_USE_ROCM nor GOOGLE_CUDA is defined."
" Did you specify either --config=rocm or --config=cuda?");
#endif
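// Lower the GPU kernel bodies to the device-specific LLVM dialect (ROCDL for
// AMD, NVVM for NVIDIA).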
#if TENSORFLOW_USE_ROCM
TF_RETURN_IF_ERROR(xla::mlir_gpu::LowerKernelBodiesToROCDL(module.get()));
#elif GOOGLE_CUDA
TF_RETURN_IF_ERROR(xla::mlir_gpu::LowerKernelBodiesToNVVM(module.get()));
#endif
TF_RETURN_IF_ERROR(LowerGPUToLLVM(module.get(), gpu_binary_only, same_shape,
kGpuBinaryAttrName, architecture));
return module;
}
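// Returns the serialized GPU binary that the lowering above attached to the
// single GPU module.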
StatusOr<std::string> ExtractGpuBinary(mlir::ModuleOp module) {
auto gpu_modules = module.getOps<mlir::gpu::GPUModuleOp>();
if (std::distance(gpu_modules.begin(), gpu_modules.end()) != 1) {
return InternalError("There should be exactly one GPU Module");
}
mlir::gpu::GPUModuleOp gpu_mod = *gpu_modules.begin();
auto blob = gpu_mod.getAttrOfType<mlir::StringAttr>(kGpuBinaryAttrName);
if (blob == nullptr) {
return InternalError("No binary blob found in the module");
}
return blob.getValue().str();
}
} // namespace kernel_gen
} // namespace tensorflow