lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp - platform/external/tensorflow - Git at Google

 //===- ConvertKernelFuncToCubin.cpp - MLIR GPU lowering passes ------------===//
 //
 // Copyright 2019 The MLIR Authors.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
 //
 // This file implements a pass to convert gpu kernel functions into a
 // corresponding binary blob that can be executed on a CUDA GPU. Currently
 // only translates the function itself but no dependencies.
 //
 //===----------------------------------------------------------------------===//

 #include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h"

 #include "mlir/Dialect/GPU/GPUDialect.h"
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/Function.h"
 #include "mlir/IR/Module.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Pass/PassRegistry.h"
 #include "mlir/Support/LogicalResult.h"
 #include "mlir/Target/NVVMIR.h"

 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Target/TargetMachine.h"

 using namespace mlir;

 namespace {
 // Name of the function attribute that receives the generated cubin blob.
 // TODO(herhut): Move to shared location.
 static constexpr const char *kCubinAnnotation = "nvvm.cubin";

 /// Module pass that replaces the body of each GPU kernel function with a
 /// string attribute holding its compiled CUDA binary (cubin).
 ///
 /// The PTX-to-cubin step is delegated to a `CubinGenerator` callback; when
 /// none is supplied, the stub `compilePtxToCubinForTesting` is used.
 class GpuKernelToCubinPass : public ModulePass<GpuKernelToCubinPass> {
 public:
   /// Creates the pass. `cubinGenerator` is called with the PTX text and the
   /// kernel function; a null result is reported as a translation failure.
   GpuKernelToCubinPass(
       CubinGenerator cubinGenerator = compilePtxToCubinForTesting)
       : cubinGenerator(cubinGenerator) {}

   /// Converts every non-external kernel function of the module and marks the
   /// pass as failed for each kernel whose conversion does not succeed.
   void runOnModule() override {
     // Make sure the NVPTX target is initialized.
     LLVMInitializeNVPTXTarget();
     LLVMInitializeNVPTXTargetInfo();
     LLVMInitializeNVPTXTargetMC();
     LLVMInitializeNVPTXAsmPrinter();

     for (auto kernelCandidate : getModule().getOps<FuncOp>()) {
       // Only functions tagged as kernels that actually have a body qualify.
       const bool hasKernelBody = gpu::GPUDialect::isKernel(kernelCandidate) &&
                                  !kernelCandidate.isExternal();
       if (hasKernelBody &&
           failed(translateGpuKernelToCubinAnnotation(kernelCandidate)))
         signalPassFailure();
     }
   }

 private:
   /// Stub generator used by default: ignores its inputs and returns the bytes
   /// "CUBIN".
   static OwnedCubin compilePtxToCubinForTesting(const std::string &ptx,
                                                 FuncOp &function);

   /// Runs the code generator of `target_machine` on `module` and returns the
   /// emitted assembly text.
   std::string translateModuleToPtx(llvm::Module &module,
                                    llvm::TargetMachine &target_machine);

   /// Lowers `llvmModule` to PTX and hands the result to `cubinGenerator`.
   /// Errors are reported on `function`.
   OwnedCubin convertModuleToCubin(llvm::Module &llvmModule, FuncOp &function);

   /// Compiles `function` and stores the cubin on it as an attribute.
   LogicalResult translateGpuKernelToCubinAnnotation(FuncOp &function);

   // Callback that turns PTX into a cubin.
   CubinGenerator cubinGenerator;
 };

 } // anonymous namespace

 /// Runs the code-generation pipeline of `target_machine` over `module` and
 /// returns the assembly text it emits.
 std::string GpuKernelToCubinPass::translateModuleToPtx(
     llvm::Module &module, llvm::TargetMachine &target_machine) {
   std::string ptxText;
   {
     // Keep this declaration order: the streams are torn down in reverse, so
     // the buffered stream is destroyed before the string stream beneath it,
     // and both are gone before `ptxText` is returned.
     llvm::raw_string_ostream ptxStream(ptxText);
     llvm::buffer_ostream bufferedStream(ptxStream);
     llvm::legacy::PassManager codegenPipeline;
     // NOTE(review): the result of addPassesToEmitFile is not inspected here.
     target_machine.addPassesToEmitFile(
         codegenPipeline, bufferedStream, nullptr,
         llvm::TargetMachine::CGFT_AssemblyFile);
     codegenPipeline.run(module);
   }

   return ptxText;
 }

 /// Stub cubin generator: ignores both arguments and returns a blob holding
 /// the five characters "CUBIN" (without the terminating NUL).
 OwnedCubin
 GpuKernelToCubinPass::compilePtxToCubinForTesting(const std::string &ptx,
                                                   FuncOp &function) {
   const char placeholder[] = "CUBIN";
   const char *const first = placeholder;
   // sizeof counts the trailing '\0'; leave it out of the blob.
   const char *const last = first + (sizeof(placeholder) - 1);
   return std::make_unique<std::vector<char>>(first, last);
 }

 /// Lowers `llvmModule` to PTX for the hard-coded NVPTX target and passes the
 /// PTX to `cubinGenerator`.
 ///
 /// The module's data layout is overwritten with the one of the target machine.
 /// Returns a null cubin, after emitting an error on `function`, if the target
 /// cannot be looked up or no target machine can be created; otherwise returns
 /// whatever the generator returns.
 OwnedCubin GpuKernelToCubinPass::convertModuleToCubin(llvm::Module &llvmModule,
                                                       FuncOp &function) {
   std::unique_ptr<llvm::TargetMachine> targetMachine;
   {
     std::string error;
     // TODO(herhut): Make triple configurable.
     constexpr const char *cudaTriple = "nvptx64-nvidia-cuda";
     llvm::Triple triple(cudaTriple);
     const llvm::Target *target =
         llvm::TargetRegistry::lookupTarget("", triple, error);
     if (target == nullptr) {
       // Surface the reason reported by the registry instead of dropping it.
       function.emitError(llvm::Twine("cannot initialize target triple: ") +
                          error);
       return {};
     }
     targetMachine.reset(
         target->createTargetMachine(triple.str(), "sm_35", "+ptx60", {}, {}));
   }

   // createTargetMachine hands back a raw pointer that may be null; bail out
   // rather than dereferencing it below.
   if (!targetMachine) {
     function.emitError("cannot create target machine");
     return {};
   }

   // Set the data layout of the llvm module to match what the ptx target needs.
   llvmModule.setDataLayout(targetMachine->createDataLayout());

   auto ptx = translateModuleToPtx(llvmModule, *targetMachine);

   return cubinGenerator(ptx, function);
 }

 /// Compiles `function` to a cubin, stores the blob on the function as the
 /// string attribute named by `kCubinAnnotation`, and erases the function body.
 ///
 /// Only the function itself is translated; functions it calls are not.
 /// Returns failure, after emitting an error on `function`, if either the
 /// translation to NVVM IR or the conversion to a cubin yields nothing. In that
 /// case the function is left unmodified.
 LogicalResult
 GpuKernelToCubinPass::translateGpuKernelToCubinAnnotation(FuncOp &function) {
   // Translate a clone placed in a scratch module of its own.
   OwningModuleRef module = ModuleOp::create(function.getLoc());

   // TODO(herhut): Also handle called functions.
   module->push_back(function.clone());

   auto llvmModule = translateModuleToNVVMIR(*module);
   // The translation result is dereferenced below, so a null module must be
   // reported instead of crashing.
   if (!llvmModule) {
     return function.emitError("translation to NVVM IR failed.");
   }

   auto cubin = convertModuleToCubin(*llvmModule, function);
   if (!cubin) {
     return function.emitError("translation to CUDA binary failed.");
   }

   Builder builder(function.getContext());
   function.setAttr(kCubinAnnotation,
                    builder.getStringAttr({cubin->data(), cubin->size()}));

   // Remove the body of the kernel function now that it has been translated.
   // The main reason to do this is so that the resulting module no longer
   // contains the NVVM instructions (typically contained in the kernel bodies)
   // and hence can be compiled into host code by a separate pass.
   function.eraseBody();

   return success();
 }

 /// Factory for the kernel-to-cubin pass; `cubinGenerator` is forwarded to the
 /// pass constructor unchanged.
 std::unique_ptr<OpPassBase<ModuleOp>>
 mlir::createConvertGPUKernelToCubinPass(CubinGenerator cubinGenerator) {
   std::unique_ptr<OpPassBase<ModuleOp>> cubinPass =
       std::make_unique<GpuKernelToCubinPass>(cubinGenerator);
   return cubinPass;
 }

 // Statically registers GpuKernelToCubinPass under the name
 // "test-kernel-to-cubin" together with its description.
 // NOTE(review): registration presumably default-constructs the pass, in which
 // case the stub generator compilePtxToCubinForTesting is used; confirm.
 static PassRegistration<GpuKernelToCubinPass>
     pass("test-kernel-to-cubin",
          "Convert all kernel functions to CUDA cubin blobs");
	//===- ConvertKernelFuncToCubin.cpp - MLIR GPU lowering passes ------------===//
	//
	// Copyright 2019 The MLIR Authors.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.
	// =============================================================================
	//
	// This file implements a pass to convert gpu kernel functions into a
	// corresponding binary blob that can be executed on a CUDA GPU. Currently
	// only translates the function itself but no dependencies.
	//
	//===----------------------------------------------------------------------===//

	#include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h"

	#include "mlir/Dialect/GPU/GPUDialect.h"
	#include "mlir/IR/Attributes.h"
	#include "mlir/IR/Builders.h"
	#include "mlir/IR/Function.h"
	#include "mlir/IR/Module.h"
	#include "mlir/Pass/Pass.h"
	#include "mlir/Pass/PassRegistry.h"
	#include "mlir/Support/LogicalResult.h"
	#include "mlir/Target/NVVMIR.h"

	#include "llvm/ADT/Optional.h"
	#include "llvm/ADT/Twine.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/LegacyPassManager.h"
	#include "llvm/IR/Module.h"
	#include "llvm/Support/Error.h"
	#include "llvm/Support/TargetRegistry.h"
	#include "llvm/Support/TargetSelect.h"
	#include "llvm/Target/TargetMachine.h"

	using namespace mlir;

	namespace {
	// Name of the function attribute that receives the generated cubin blob.
	// TODO(herhut): Move to shared location.
	static constexpr const char *kCubinAnnotation = "nvvm.cubin";

	/// A pass converting tagged kernel functions to cubin blobs.
	///
	/// The PTX-to-cubin step is delegated to a `CubinGenerator` callback; when
	/// none is supplied, the stub `compilePtxToCubinForTesting` is used.
	class GpuKernelToCubinPass : public ModulePass<GpuKernelToCubinPass> {
	public:
	  /// Creates the pass. `cubinGenerator` is called with the PTX text and the
	  /// kernel function; a null result is reported as a translation failure.
	  GpuKernelToCubinPass(
	      CubinGenerator cubinGenerator = compilePtxToCubinForTesting)
	      : cubinGenerator(cubinGenerator) {}

	  /// Converts every non-external kernel function of the module and marks
	  /// the pass as failed for each kernel whose conversion does not succeed.
	  void runOnModule() override {
	    // Make sure the NVPTX target is initialized.
	    LLVMInitializeNVPTXTarget();
	    LLVMInitializeNVPTXTargetInfo();
	    LLVMInitializeNVPTXTargetMC();
	    LLVMInitializeNVPTXAsmPrinter();

	    for (auto function : getModule().getOps<FuncOp>()) {
	      // Skip functions that are not kernels or that have no body.
	      if (!gpu::GPUDialect::isKernel(function) || function.isExternal()) {
	        continue;
	      }
	      if (failed(translateGpuKernelToCubinAnnotation(function)))
	        signalPassFailure();
	    }
	  }

	private:
	  /// Stub generator used by default: ignores its inputs and returns the
	  /// bytes "CUBIN".
	  static OwnedCubin compilePtxToCubinForTesting(const std::string &ptx,
	                                                FuncOp &function);

	  /// Runs the code generator of `target_machine` on `module` and returns
	  /// the emitted assembly text.
	  std::string translateModuleToPtx(llvm::Module &module,
	                                   llvm::TargetMachine &target_machine);

	  /// Lowers `llvmModule` to PTX and hands the result to `cubinGenerator`.
	  /// Errors are reported on `function`.
	  OwnedCubin convertModuleToCubin(llvm::Module &llvmModule, FuncOp &function);

	  /// Compiles `function` and stores the cubin on it as an attribute.
	  LogicalResult translateGpuKernelToCubinAnnotation(FuncOp &function);

	  // Callback that turns PTX into a cubin.
	  CubinGenerator cubinGenerator;
	};

	} // anonymous namespace

	/// Runs the code-generation pipeline of `target_machine` over `module` and
	/// returns the assembly text it emits.
	std::string GpuKernelToCubinPass::translateModuleToPtx(
	    llvm::Module &module, llvm::TargetMachine &target_machine) {
	  std::string ptxText;
	  {
	    // Keep this declaration order: the streams are torn down in reverse, so
	    // the buffered stream is destroyed before the string stream beneath it,
	    // and both are gone before `ptxText` is returned.
	    llvm::raw_string_ostream ptxStream(ptxText);
	    llvm::buffer_ostream bufferedStream(ptxStream);
	    llvm::legacy::PassManager codegenPipeline;
	    // NOTE(review): the result of addPassesToEmitFile is not inspected here.
	    target_machine.addPassesToEmitFile(
	        codegenPipeline, bufferedStream, nullptr,
	        llvm::TargetMachine::CGFT_AssemblyFile);
	    codegenPipeline.run(module);
	  }

	  return ptxText;
	}

	/// Stub cubin generator: ignores both arguments and returns a blob holding
	/// the five characters "CUBIN" (without the terminating NUL).
	OwnedCubin
	GpuKernelToCubinPass::compilePtxToCubinForTesting(const std::string &ptx,
	                                                  FuncOp &function) {
	  const char placeholder[] = "CUBIN";
	  const char *const first = placeholder;
	  // sizeof counts the trailing '\0'; leave it out of the blob.
	  const char *const last = first + (sizeof(placeholder) - 1);
	  return std::make_unique<std::vector<char>>(first, last);
	}

	/// Lowers `llvmModule` to PTX for the hard-coded NVPTX target and passes the
	/// PTX to `cubinGenerator`.
	///
	/// The module's data layout is overwritten with the one of the target
	/// machine. Returns a null cubin, after emitting an error on `function`, if
	/// the target cannot be looked up or no target machine can be created;
	/// otherwise returns whatever the generator returns.
	OwnedCubin GpuKernelToCubinPass::convertModuleToCubin(llvm::Module &llvmModule,
	                                                      FuncOp &function) {
	  std::unique_ptr<llvm::TargetMachine> targetMachine;
	  {
	    std::string error;
	    // TODO(herhut): Make triple configurable.
	    constexpr const char *cudaTriple = "nvptx64-nvidia-cuda";
	    llvm::Triple triple(cudaTriple);
	    const llvm::Target *target =
	        llvm::TargetRegistry::lookupTarget("", triple, error);
	    if (target == nullptr) {
	      // Surface the reason reported by the registry instead of dropping it.
	      function.emitError(llvm::Twine("cannot initialize target triple: ") +
	                         error);
	      return {};
	    }
	    targetMachine.reset(
	        target->createTargetMachine(triple.str(), "sm_35", "+ptx60", {}, {}));
	  }

	  // createTargetMachine hands back a raw pointer that may be null; bail out
	  // rather than dereferencing it below.
	  if (!targetMachine) {
	    function.emitError("cannot create target machine");
	    return {};
	  }

	  // Set the data layout of the llvm module to match what the ptx target needs.
	  llvmModule.setDataLayout(targetMachine->createDataLayout());

	  auto ptx = translateModuleToPtx(llvmModule, *targetMachine);

	  return cubinGenerator(ptx, function);
	}

	/// Compiles `function` to a cubin, stores the blob on the function as the
	/// string attribute named by `kCubinAnnotation`, and erases the function
	/// body.
	///
	/// Only the function itself is translated; functions it calls are not.
	/// Returns failure, after emitting an error on `function`, if either the
	/// translation to NVVM IR or the conversion to a cubin yields nothing. In
	/// that case the function is left unmodified.
	LogicalResult
	GpuKernelToCubinPass::translateGpuKernelToCubinAnnotation(FuncOp &function) {
	  // Translate a clone placed in a scratch module of its own.
	  OwningModuleRef module = ModuleOp::create(function.getLoc());

	  // TODO(herhut): Also handle called functions.
	  module->push_back(function.clone());

	  auto llvmModule = translateModuleToNVVMIR(*module);
	  // The translation result is dereferenced below, so a null module must be
	  // reported instead of crashing.
	  if (!llvmModule) {
	    return function.emitError("translation to NVVM IR failed.");
	  }

	  auto cubin = convertModuleToCubin(*llvmModule, function);
	  if (!cubin) {
	    return function.emitError("translation to CUDA binary failed.");
	  }

	  Builder builder(function.getContext());
	  function.setAttr(kCubinAnnotation,
	                   builder.getStringAttr({cubin->data(), cubin->size()}));

	  // Remove the body of the kernel function now that it has been translated.
	  // The main reason to do this is so that the resulting module no longer
	  // contains the NVVM instructions (typically contained in the kernel bodies)
	  // and hence can be compiled into host code by a separate pass.
	  function.eraseBody();

	  return success();
	}

	/// Factory for the kernel-to-cubin pass; `cubinGenerator` is forwarded to the
	/// pass constructor unchanged.
	std::unique_ptr<OpPassBase<ModuleOp>>
	mlir::createConvertGPUKernelToCubinPass(CubinGenerator cubinGenerator) {
	  std::unique_ptr<OpPassBase<ModuleOp>> cubinPass =
	      std::make_unique<GpuKernelToCubinPass>(cubinGenerator);
	  return cubinPass;
	}

	// Statically registers GpuKernelToCubinPass under the name
	// "test-kernel-to-cubin" together with its description.
	// NOTE(review): registration presumably default-constructs the pass, in which
	// case the stub generator compilePtxToCubinForTesting is used; confirm.
	static PassRegistration<GpuKernelToCubinPass>
	pass("test-kernel-to-cubin",
	"Convert all kernel functions to CUDA cubin blobs");