blob: a69fe81b0d33b7e162c4818b809a892ff23235b9 [file] [log] [blame]
//===- ConvertKernelFuncToCubin.cpp - MLIR GPU lowering passes ------------===//
//
// Copyright 2019 The MLIR Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
//
// This file implements a pass to convert gpu kernel functions into a
// corresponding binary blob that can be executed on a CUDA GPU. Currently
// only translates the function itself but no dependencies.
//
//===----------------------------------------------------------------------===//
#include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/Function.h"
#include "mlir/IR/Module.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Pass/PassRegistry.h"
#include "mlir/Support/LogicalResult.h"
#include "mlir/Target/NVVMIR.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/Twine.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Target/TargetMachine.h"
using namespace mlir;
namespace {
// Name of the function attribute under which this pass stores the generated
// cubin blob (as a string attribute) on each translated kernel function.
// TODO(herhut): Move to shared location.
static constexpr const char *kCubinAnnotation = "nvvm.cubin";
/// Module pass that turns every GPU kernel function that has a body into a
/// cubin blob and records that blob as an attribute on the function.
class GpuKernelToCubinPass : public ModulePass<GpuKernelToCubinPass> {
public:
  /// Builds the pass around `cubinGenerator`, the callback that converts PTX
  /// text into a cubin. When no callback is supplied, a stub that returns a
  /// fixed placeholder blob is used instead.
  GpuKernelToCubinPass(
      CubinGenerator cubinGenerator = compilePtxToCubinForTesting)
      : cubinGenerator(cubinGenerator) {}

  /// Visits all functions of the module and translates the ones that are
  /// kernels with a body. A failed translation marks the pass as failed but
  /// does not stop the remaining functions from being processed.
  void runOnModule() override {
    // Register the NVPTX backend pieces before any target lookup happens.
    LLVMInitializeNVPTXTarget();
    LLVMInitializeNVPTXTargetInfo();
    LLVMInitializeNVPTXTargetMC();
    LLVMInitializeNVPTXAsmPrinter();

    for (auto kernel : getModule().getOps<FuncOp>()) {
      // Only kernels that actually carry a body can be compiled.
      const bool isTranslatable =
          gpu::GPUDialect::isKernel(kernel) && !kernel.isExternal();
      if (!isTranslatable)
        continue;
      if (failed(translateGpuKernelToCubinAnnotation(kernel)))
        signalPassFailure();
    }
  }

private:
  /// Default cubin generator: ignores its inputs and returns a placeholder.
  static OwnedCubin compilePtxToCubinForTesting(const std::string &ptx,
                                                FuncOp &function);

  /// Emits `module` as assembly text using `target_machine`.
  std::string translateModuleToPtx(llvm::Module &module,
                                   llvm::TargetMachine &target_machine);

  /// Lowers `llvmModule` to PTX and passes the result to the cubin generator.
  /// Diagnostics are reported on `function`.
  OwnedCubin convertModuleToCubin(llvm::Module &llvmModule, FuncOp &function);

  /// Translates a single kernel and attaches the resulting cubin to it.
  LogicalResult translateGpuKernelToCubinAnnotation(FuncOp &function);

  /// Callback used to turn PTX into a cubin blob.
  CubinGenerator cubinGenerator;
};
} // anonymous namespace
/// Runs the code generation passes of `target_machine` over `module`,
/// requesting assembly-file output, and returns the emitted text.
std::string GpuKernelToCubinPass::translateModuleToPtx(
    llvm::Module &module, llvm::TargetMachine &target_machine) {
  std::string ptxText;
  {
    // The streams and the pass manager live in their own scope so that all of
    // them are destroyed before `ptxText` is returned (presumably this is what
    // pushes any buffered output into the string).
    llvm::raw_string_ostream ptxStream(ptxText);
    llvm::buffer_ostream bufferedStream(ptxStream);
    llvm::legacy::PassManager codegenPipeline;
    // NOTE(review): the result of addPassesToEmitFile is not inspected here;
    // confirm whether a failure to set up the pipeline needs handling.
    target_machine.addPassesToEmitFile(codegenPipeline, bufferedStream,
                                       nullptr,
                                       llvm::TargetMachine::CGFT_AssemblyFile);
    codegenPipeline.run(module);
  }
  return ptxText;
}
/// Stand-in cubin generator used when no real one is supplied. Both arguments
/// are ignored; the result is always the five bytes "CUBIN" (no terminator).
OwnedCubin
GpuKernelToCubinPass::compilePtxToCubinForTesting(const std::string &ptx,
                                                  FuncOp &function) {
  const std::string placeholder = "CUBIN";
  return std::make_unique<std::vector<char>>(placeholder.begin(),
                                             placeholder.end());
}
/// Compiles `llvmModule` to PTX for the hard-coded CUDA target and hands the
/// PTX to the configured cubin generator.
///
/// `function` is the kernel the module was built from; it is used to report
/// diagnostics and is forwarded to the cubin generator.
///
/// Returns the generator's result, or a null OwnedCubin (after emitting an
/// error on `function`) when the target or its target machine cannot be
/// set up.
OwnedCubin GpuKernelToCubinPass::convertModuleToCubin(llvm::Module &llvmModule,
                                                      FuncOp &function) {
  std::unique_ptr<llvm::TargetMachine> targetMachine;
  {
    std::string error;
    // TODO(herhut): Make triple configurable.
    constexpr const char *cudaTriple = "nvptx64-nvidia-cuda";
    llvm::Triple triple(cudaTriple);
    const llvm::Target *target =
        llvm::TargetRegistry::lookupTarget("", triple, error);
    if (target == nullptr) {
      function.emitError("cannot initialize target triple");
      return {};
    }
    targetMachine.reset(
        target->createTargetMachine(triple.str(), "sm_35", "+ptx60", {}, {}));
  }

  // createTargetMachine may hand back a null pointer; bail out with a
  // diagnostic instead of dereferencing it below.
  if (!targetMachine) {
    function.emitError("cannot create target machine");
    return {};
  }

  // Set the data layout of the llvm module to match what the ptx target needs.
  llvmModule.setDataLayout(targetMachine->createDataLayout());

  auto ptx = translateModuleToPtx(llvmModule, *targetMachine);
  return cubinGenerator(ptx, function);
}
/// Translates the kernel `function` into a cubin and stores the blob on the
/// function as the string attribute named by kCubinAnnotation.
///
/// The function is cloned into a fresh module, that module is translated to
/// NVVM IR and then converted to a cubin. On success the body of `function`
/// is erased and success is returned. If either translation step fails, an
/// error is emitted on `function`, its body is left untouched and failure is
/// returned.
LogicalResult
GpuKernelToCubinPass::translateGpuKernelToCubinAnnotation(FuncOp &function) {
  Builder builder(function.getContext());

  // Work on a clone inside a temporary module so the original module is not
  // modified by the translation itself.
  OwningModuleRef module = ModuleOp::create(function.getLoc());
  // TODO(herhut): Also handle called functions.
  module->push_back(function.clone());

  auto llvmModule = translateModuleToNVVMIR(*module);
  // The translation yields a null module on failure; do not dereference it.
  if (!llvmModule) {
    return function.emitError("translation to NVVM IR failed.");
  }

  auto cubin = convertModuleToCubin(*llvmModule, function);
  if (!cubin) {
    return function.emitError("translation to CUDA binary failed.");
  }

  function.setAttr(kCubinAnnotation,
                   builder.getStringAttr({cubin->data(), cubin->size()}));

  // Remove the body of the kernel function now that it has been translated.
  // The main reason to do this is so that the resulting module no longer
  // contains the NVVM instructions (typically contained in the kernel bodies)
  // and hence can be compiled into host code by a separate pass.
  function.eraseBody();
  return success();
}
/// Factory for the kernel-to-cubin pass. `cubinGenerator` is the callback the
/// created pass uses to turn PTX into a cubin blob.
std::unique_ptr<OpPassBase<ModuleOp>>
mlir::createConvertGPUKernelToCubinPass(CubinGenerator cubinGenerator) {
  std::unique_ptr<OpPassBase<ModuleOp>> cubinPass =
      std::make_unique<GpuKernelToCubinPass>(cubinGenerator);
  return cubinPass;
}
// Registers the pass with the pass registry under the name
// `test-kernel-to-cubin`, together with its one-line description.
static PassRegistration<GpuKernelToCubinPass>
pass("test-kernel-to-cubin",
"Convert all kernel functions to CUDA cubin blobs");