// torch/csrc/jit/fuser/compiler.cpp - platform/external/pytorch - Git at Google

 #include <torch/csrc/jit/fuser/compiler.h>

 #include <ATen/ATen.h>
 #include <torch/csrc/jit/assertions.h>
 #include <torch/csrc/jit/code_template.h>
 #include <torch/csrc/jit/fuser/codegen.h>
 #include <torch/csrc/jit/fuser/interface.h>
 #include <torch/csrc/jit/fuser/kernel_cache.h>
 #include <torch/csrc/jit/fuser/tensor_desc.h>
 #include <torch/csrc/jit/ir.h>
 #include <torch/csrc/jit/passes/canonicalize.h>
 #include <torch/csrc/jit/passes/shape_analysis.h>
 #include <torch/csrc/jit/type.h>
 #include "torch/csrc/jit/fuser/interface.h"

 #if USE_CUDA_FUSER
 #include <torch/csrc/jit/fuser/cuda/fused_kernel.h>
 #endif // USE_CUDA_FUSER

 #if USE_CPU_FUSER
 #include <torch/csrc/jit/fuser/cpu/fused_kernel.h>
 #endif // USE_CPU_FUSER

 #include <atomic>
 #include <iostream>
 #include <memory>
 #include <sstream>
 #include <stdexcept>
 #include <string>
 #include <tuple>
 #include <unordered_set>
 #include <utility>

 namespace torch {
 namespace jit {
 namespace fuser {

 // Running count of kernels compiled so far. compileKernel() also uses the
 // current value to build a unique "kernel_<id>" name for each new kernel.
 static std::atomic<size_t> next_kernel_id{0};
 // Cached value of the PYTORCH_FUSION_DEBUG environment variable; a negative
 // value means the variable has not been read yet.
 static int debug_fusion{-1};

 // Returns the current value of the compiled-kernel counter.
 size_t nCompiledKernels() {
   const size_t compiled_so_far = next_kernel_id.load();
   return compiled_so_far;
 }

 // Returns the fuser debug level taken from PYTORCH_FUSION_DEBUG (0 when the
 // variable is unset). The environment is consulted only while the cached
 // value is still negative; afterwards the cached value is returned directly.
 int debugFuser() {
   if (debug_fusion >= 0) {
     return debug_fusion;
   }
   const char* const env_value = getenv("PYTORCH_FUSION_DEBUG");
   if (env_value != nullptr) {
     debug_fusion = atoi(env_value);
   } else {
     debug_fusion = 0;
   }
   return debug_fusion;
 }

 // Returns the prim::ConstantChunk node that is the one and only user of
 // `input`. Yields nullptr when `input` does not have exactly one use, or when
 // its single user is any other kind of node.
 static const Node* usedInFusedChunk(const Value* input) {
   const auto& uses = input->uses();
   if (uses.size() != 1) {
     return nullptr;
   }
   const Node* sole_user = uses[0].user;
   return sole_user->kind() == prim::ConstantChunk ? sole_user : nullptr;
 }

 // Appends one chunk descriptor to spec.inputChunks() for every input of the
 // spec's graph, in input order. An input consumed solely by a
 // prim::ConstantChunk node gets that node's (chunks, dim) attributes; every
 // other input gets the placeholder descriptor (1, 0).
 static void setInputChunkDescriptors(KernelSpec& spec) {
   auto& chunk_descriptors = spec.inputChunks();
   chunk_descriptors.reserve(spec.graph()->inputs().size());
   for (const Value* graph_input : spec.graph()->inputs()) {
     const Node* chunk_node = usedInFusedChunk(graph_input);
     if (chunk_node == nullptr) {
       // Input is not fed into a fused chunk.
       chunk_descriptors.emplace_back(1, 0);
       continue;
     }
     chunk_descriptors.emplace_back(
         chunk_node->i(attr::chunks), chunk_node->i(attr::dim));
   }
 }

 // Run a DFS traversal to find all inputs that affect a given output value
 static std::vector<int64_t> getInputDependencies(const Value* output) {
   std::vector<const Value*> queue{output};
   std::unordered_set<const Value*> inputs;
   std::unordered_set<const Value*> seen;
   while (!queue.empty()) {
     const Value* val = queue.back();
     queue.pop_back();
     const Node* producer = val->node();
     if (producer->kind() == prim::Param) {
       inputs.insert(val);
       continue;
     }
     for (const Value* input : producer->inputs()) {
       if (/*bool inserted = */ seen.insert(input).second) {
         queue.push_back(input);
       }
     }
   }

   // Convert Value* into offsets into the graph's input list
   std::vector<int64_t> offsets;
   offsets.reserve(inputs.size());
   for (const Value* input : inputs) {
     offsets.push_back(input->offset());
   }

   std::sort(offsets.begin(), offsets.end());
   return offsets;
 }

 // Computes the distinct sets of graph inputs that feed each output of the
 // spec's graph and appends them to spec.inputBroadcastGroups(). For an output
 // produced by prim::FusedConcat, each concat input contributes its own group
 // instead of the concatenated output contributing one.
 static void setInputBroadcastGroups(KernelSpec& spec) {
   using InputGroup = std::vector<int64_t>;
   // A set removes groups that are identical (same sorted input offsets).
   std::unordered_set<InputGroup, torch::hash<InputGroup>> unique_groups;
   for (const Value* graph_output : spec.graph()->outputs()) {
     const Node* producer = graph_output->node();
     if (producer->kind() != prim::FusedConcat) {
       unique_groups.insert(getInputDependencies(graph_output));
       continue;
     }
     for (const Value* piece : producer->inputs()) {
       unique_groups.insert(getInputDependencies(piece));
     }
   }

   auto& destination = spec.inputBroadcastGroups();
   for (const auto& group : unique_groups) {
     destination.push_back(group);
   }
 }

 // Performs "upfront" compilation where storage is known but shapes are not.
 // Currently identifies how to expand all tensors so that all intermediate
 // tensors are the same shape, simplifying code generation.
 // Broadcast groups and chunks are identified without shape information
 // using logical properties of how each works. In particular, tensors
 // are always expandable to the outputs of pointwise operations they
 // or their descendants are involved in, which means that in a DAG of
 // pointwise operations all tensors are expandable to the (single) output.
 // Note: The logic is slightly complicated by concatenation and chunking.
 //
 // Concretely, this fills in two pieces of `spec`: its input broadcast groups
 // (setInputBroadcastGroups) and its per-input chunk descriptors
 // (setInputChunkDescriptors). Both helpers append to the spec, so this is
 // meant to run once per newly stored spec (see registerFusion).
 static void upfrontCompilation(KernelSpec& spec) {
   setInputBroadcastGroups(spec);
   setInputChunkDescriptors(spec);
 }

 // Registers the subgraph of `fusion_group` with the kernel cache and returns
 // the key of its spec. A graph that is already cached is not stored again;
 // the key of the existing spec is returned instead.
 int64_t registerFusion(const Node* fusion_group) {
   auto normalized = normalizeGraphForCache(fusion_group->g(attr::Subgraph));

   // Reuse a pre-existing registration when there is one.
   if (const auto cached_spec = lookupGraph(normalized)) {
     return (*cached_spec)->key();
   }

   // The fusion is created and registered unconditionally. This supports the
   // global "disable fusions" flag: code first run with fusions disabled and
   // later run with them enabled must still find a valid spec in the cache,
   // i.e. one that upfrontCompilation has already been run on.
   const auto new_key = store(normalized);
   const auto stored_spec = retrieve(new_key);
   JIT_ASSERT(stored_spec);
   upfrontCompilation(**stored_spec);
   return new_key;
 }

 // Generates and compiles a fused kernel for `spec`, specialized to the input
 // tensor descriptions in `arg_spec` and to `device`.
 //
 // `map_size` is used as the size of every output; for an output produced by
 // prim::FusedConcat the concat dimension is additionally multiplied by the
 // number of concat inputs.
 //
 // The function works on a copy of the spec's graph:
 //   1. input types are set from `arg_spec` and shapes are propagated,
 //   2. chunked inputs are flattened into their chunk outputs,
 //   3. concat outputs are flattened into their concat inputs,
 //   4. code is generated and wrapped in a CUDA or CPU FusedKernel.
 //
 // Throws std::runtime_error when the backend required by `device` was not
 // enabled in this build.
 std::shared_ptr<FusedKernel> compileKernel(
     const KernelSpec& spec,
     const ArgSpec& arg_spec,
     const std::vector<int64_t>& map_size,
     const at::Device device) {
   const std::vector<TensorDesc>& input_desc = arg_spec.descs();

   auto graph = spec.graph()->copy();

   // The first loop below indexes graph->inputs() by every position of
   // input_desc, and the second indexes input_desc once per graph input, so
   // anything other than equal sizes would read out of bounds.
   JIT_ASSERT(input_desc.size() == graph->inputs().size());

   for (size_t i = 0; i < input_desc.size(); i++) {
     const auto& desc = input_desc[i];
     graph->inputs()[i]->setType(TensorType::create(
         desc.scalar_type,
         device,
         desc.nDim())); // TODO: nDim is bad, as it is collapsed
   }

   PropagateInputShapes(graph);

   // Creates chunk and flattened input descriptions
   std::vector<PartitionDesc> chunk_desc;
   std::vector<std::pair<const Value*, const TensorDesc>> flat_inputs;
   {
     size_t input_index = 0;
     for (const auto& p : graph->inputs()) {
       if (const Node* chunk = usedInFusedChunk(p)) {
         // A chunked input is replaced by one flat input per chunk output,
         // each described by the partition's sub-tensor description.
         const int64_t dim = chunk->i(attr::dim);
         const int64_t chunks = chunk->i(attr::chunks);
         chunk_desc.emplace_back(input_desc[input_index++], chunks, dim);
         for (const auto* o : chunk->outputs()) {
           flat_inputs.emplace_back(o, *chunk_desc.back().subTensorDesc());
         }
       } else {
         // Default-constructed partition marks "not chunked".
         chunk_desc.emplace_back();
         flat_inputs.emplace_back(p, input_desc[input_index++]);
       }
     }
   }

   // Creates output, concat, and flattened output descriptions
   std::vector<TensorDesc> output_desc;
   std::vector<PartitionDesc> concat_desc;
   std::vector<std::pair<const Value*, const TensorDesc>> flat_outputs;
   for (const Value* o : graph->outputs()) {
     const Node* producer = o->node();
     const bool is_concat = producer->kind() == prim::FusedConcat;

     // Creates output description
     std::vector<int64_t> sizes = map_size;
     if (is_concat) {
       // at() throws std::out_of_range if the concat dim is not a valid index.
       sizes.at(producer->i(attr::dim)) *= producer->inputs().size();
     }
     auto scalar_type = o->type()->expect<c10::TensorType const>()->scalarType();
     auto type = CompleteTensorType::create(scalar_type, device, sizes);
     output_desc.emplace_back(type);
     const auto& desc = output_desc.back();

     // Creates concat and flattened output descriptions (relies on output desc)
     if (!is_concat) {
       concat_desc.emplace_back();
       flat_outputs.emplace_back(o, desc);
     } else {
       concat_desc.emplace_back(
           desc, producer->inputs().size(), producer->i(attr::dim));
       for (const auto& c : producer->inputs()) {
         flat_outputs.emplace_back(c, *concat_desc.back().subTensorDesc());
       }
     }
   }

   // The post-increment both names this kernel and bumps the global count
   // reported by nCompiledKernels().
   const std::string name = "kernel_" + std::to_string(next_kernel_id++);
   const bool use_cuda = device.is_cuda();
   std::string code = generateKernel(name, *graph, flat_inputs, flat_outputs, use_cuda);
   std::shared_ptr<FusedKernel> fused_kernel;
   if (use_cuda) {
 #if USE_CUDA_FUSER
     fused_kernel = std::make_shared<cuda::FusedKernelCUDA>(
         device.index(),
         name,
         code,
         input_desc,
         output_desc,
         chunk_desc,
         concat_desc,
         spec.hasRandom());
 #else
     throw std::runtime_error("CUDA Fusion is not supported on this build.");
 #endif // USE_CUDA_FUSER
   } else {
 #if USE_CPU_FUSER
     fused_kernel = std::make_shared<cpu::FusedKernelCPU>(
         name,
         code,
         input_desc,
         output_desc,
         chunk_desc,
         concat_desc,
         spec.hasRandom());
 #else
     throw std::runtime_error("CPU Fusion is not supported on this build.");
 #endif // USE_CPU_FUSER
   }

   return fused_kernel;
 }

 } // namespace fuser
 } // namespace jit
 } // namespace torch
	#include <torch/csrc/jit/fuser/compiler.h>

	#include <ATen/ATen.h>
	#include <torch/csrc/jit/assertions.h>
	#include <torch/csrc/jit/code_template.h>
	#include <torch/csrc/jit/fuser/codegen.h>
	#include <torch/csrc/jit/fuser/interface.h>
	#include <torch/csrc/jit/fuser/kernel_cache.h>
	#include <torch/csrc/jit/fuser/tensor_desc.h>
	#include <torch/csrc/jit/ir.h>
	#include <torch/csrc/jit/passes/canonicalize.h>
	#include <torch/csrc/jit/passes/shape_analysis.h>
	#include <torch/csrc/jit/type.h>
	#include "torch/csrc/jit/fuser/interface.h"

	#if USE_CUDA_FUSER
	#include <torch/csrc/jit/fuser/cuda/fused_kernel.h>
	#endif // USE_CUDA_FUSER

	#if USE_CPU_FUSER
	#include <torch/csrc/jit/fuser/cpu/fused_kernel.h>
	#endif // USE_CPU_FUSER

	#include <atomic>
	#include <iostream>
	#include <memory>
	#include <sstream>
	#include <stdexcept>
	#include <string>
	#include <tuple>
	#include <unordered_set>
	#include <utility>

	namespace torch {
	namespace jit {
	namespace fuser {

	// Running count of kernels compiled so far. compileKernel() also uses the
	// current value to build a unique "kernel_<id>" name for each new kernel.
	static std::atomic<size_t> next_kernel_id{0};
	// Cached value of the PYTORCH_FUSION_DEBUG environment variable; a negative
	// value means the variable has not been read yet.
	static int debug_fusion{-1};

	// Returns the current value of the compiled-kernel counter.
	size_t nCompiledKernels() {
	  const size_t compiled_so_far = next_kernel_id.load();
	  return compiled_so_far;
	}

	// Returns the fuser debug level taken from PYTORCH_FUSION_DEBUG (0 when the
	// variable is unset). The environment is consulted only while the cached
	// value is still negative; afterwards the cached value is returned directly.
	int debugFuser() {
	  if (debug_fusion >= 0) {
	    return debug_fusion;
	  }
	  const char* const env_value = getenv("PYTORCH_FUSION_DEBUG");
	  if (env_value != nullptr) {
	    debug_fusion = atoi(env_value);
	  } else {
	    debug_fusion = 0;
	  }
	  return debug_fusion;
	}

	// Returns the prim::ConstantChunk node that is the one and only user of
	// `input`. Yields nullptr when `input` does not have exactly one use, or when
	// its single user is any other kind of node.
	static const Node* usedInFusedChunk(const Value* input) {
	  const auto& uses = input->uses();
	  if (uses.size() != 1) {
	    return nullptr;
	  }
	  const Node* sole_user = uses[0].user;
	  return sole_user->kind() == prim::ConstantChunk ? sole_user : nullptr;
	}

	// Appends one chunk descriptor to spec.inputChunks() for every input of the
	// spec's graph, in input order. An input consumed solely by a
	// prim::ConstantChunk node gets that node's (chunks, dim) attributes; every
	// other input gets the placeholder descriptor (1, 0).
	static void setInputChunkDescriptors(KernelSpec& spec) {
	  auto& chunk_descriptors = spec.inputChunks();
	  chunk_descriptors.reserve(spec.graph()->inputs().size());
	  for (const Value* graph_input : spec.graph()->inputs()) {
	    const Node* chunk_node = usedInFusedChunk(graph_input);
	    if (chunk_node == nullptr) {
	      // Input is not fed into a fused chunk.
	      chunk_descriptors.emplace_back(1, 0);
	      continue;
	    }
	    chunk_descriptors.emplace_back(
	        chunk_node->i(attr::chunks), chunk_node->i(attr::dim));
	  }
	}

	// Run a DFS traversal to find all inputs that affect a given output value
	static std::vector<int64_t> getInputDependencies(const Value* output) {
	std::vector<const Value*> queue{output};
	std::unordered_set<const Value*> inputs;
	std::unordered_set<const Value*> seen;
	while (!queue.empty()) {
	const Value* val = queue.back();
	queue.pop_back();
	const Node* producer = val->node();
	if (producer->kind() == prim::Param) {
	inputs.insert(val);
	continue;
	}
	for (const Value* input : producer->inputs()) {
	if (/bool inserted = / seen.insert(input).second) {
	queue.push_back(input);
	}
	}
	}

	// Convert Value* into offsets into the graph's input list
	std::vector<int64_t> offsets;
	offsets.reserve(inputs.size());
	for (const Value* input : inputs) {
	offsets.push_back(input->offset());
	}

	std::sort(offsets.begin(), offsets.end());
	return offsets;
	}

	// Computes the distinct sets of graph inputs that feed each output of the
	// spec's graph and appends them to spec.inputBroadcastGroups(). For an output
	// produced by prim::FusedConcat, each concat input contributes its own group
	// instead of the concatenated output contributing one.
	static void setInputBroadcastGroups(KernelSpec& spec) {
	  using InputGroup = std::vector<int64_t>;
	  // A set removes groups that are identical (same sorted input offsets).
	  std::unordered_set<InputGroup, torch::hash<InputGroup>> unique_groups;
	  for (const Value* graph_output : spec.graph()->outputs()) {
	    const Node* producer = graph_output->node();
	    if (producer->kind() != prim::FusedConcat) {
	      unique_groups.insert(getInputDependencies(graph_output));
	      continue;
	    }
	    for (const Value* piece : producer->inputs()) {
	      unique_groups.insert(getInputDependencies(piece));
	    }
	  }

	  auto& destination = spec.inputBroadcastGroups();
	  for (const auto& group : unique_groups) {
	    destination.push_back(group);
	  }
	}

	// Performs "upfront" compilation where storage is known but shapes are not.
	// Currently identifies how to expand all tensors so that all intermediate
	// tensors are the same shape, simplifying code generation.
	// Broadcast groups and chunks are identified without shape information
	// using logical properties of how each works. In particular, tensors
	// are always expandable to the outputs of pointwise operations they
	// or their descendants are involved in, which means that in a DAG of
	// pointwise operations all tensors are expandable to the (single) output.
	// Note: The logic is slightly complicated by concatenation and chunking.
	//
	// Concretely, this fills in two pieces of `spec`: its input broadcast groups
	// (setInputBroadcastGroups) and its per-input chunk descriptors
	// (setInputChunkDescriptors). Both helpers append to the spec, so this is
	// meant to run once per newly stored spec (see registerFusion).
	static void upfrontCompilation(KernelSpec& spec) {
	setInputBroadcastGroups(spec);
	setInputChunkDescriptors(spec);
	}

	// Registers the subgraph of `fusion_group` with the kernel cache and returns
	// the key of its spec. A graph that is already cached is not stored again;
	// the key of the existing spec is returned instead.
	int64_t registerFusion(const Node* fusion_group) {
	  auto normalized = normalizeGraphForCache(fusion_group->g(attr::Subgraph));

	  // Reuse a pre-existing registration when there is one.
	  if (const auto cached_spec = lookupGraph(normalized)) {
	    return (*cached_spec)->key();
	  }

	  // The fusion is created and registered unconditionally. This supports the
	  // global "disable fusions" flag: code first run with fusions disabled and
	  // later run with them enabled must still find a valid spec in the cache,
	  // i.e. one that upfrontCompilation has already been run on.
	  const auto new_key = store(normalized);
	  const auto stored_spec = retrieve(new_key);
	  JIT_ASSERT(stored_spec);
	  upfrontCompilation(**stored_spec);
	  return new_key;
	}

	// Generates and compiles a fused kernel for `spec`, specialized to the input
	// tensor descriptions in `arg_spec` and to `device`.
	//
	// `map_size` is used as the size of every output; for an output produced by
	// prim::FusedConcat the concat dimension is additionally multiplied by the
	// number of concat inputs.
	//
	// The function works on a copy of the spec's graph:
	//   1. input types are set from `arg_spec` and shapes are propagated,
	//   2. chunked inputs are flattened into their chunk outputs,
	//   3. concat outputs are flattened into their concat inputs,
	//   4. code is generated and wrapped in a CUDA or CPU FusedKernel.
	//
	// Throws std::runtime_error when the backend required by `device` was not
	// enabled in this build.
	std::shared_ptr<FusedKernel> compileKernel(
	    const KernelSpec& spec,
	    const ArgSpec& arg_spec,
	    const std::vector<int64_t>& map_size,
	    const at::Device device) {
	  const std::vector<TensorDesc>& input_desc = arg_spec.descs();

	  auto graph = spec.graph()->copy();

	  // The first loop below indexes graph->inputs() by every position of
	  // input_desc, and the second indexes input_desc once per graph input, so
	  // anything other than equal sizes would read out of bounds.
	  JIT_ASSERT(input_desc.size() == graph->inputs().size());

	  for (size_t i = 0; i < input_desc.size(); i++) {
	    const auto& desc = input_desc[i];
	    graph->inputs()[i]->setType(TensorType::create(
	        desc.scalar_type,
	        device,
	        desc.nDim())); // TODO: nDim is bad, as it is collapsed
	  }

	  PropagateInputShapes(graph);

	  // Creates chunk and flattened input descriptions
	  std::vector<PartitionDesc> chunk_desc;
	  std::vector<std::pair<const Value*, const TensorDesc>> flat_inputs;
	  {
	    size_t input_index = 0;
	    for (const auto& p : graph->inputs()) {
	      if (const Node* chunk = usedInFusedChunk(p)) {
	        // A chunked input is replaced by one flat input per chunk output,
	        // each described by the partition's sub-tensor description.
	        const int64_t dim = chunk->i(attr::dim);
	        const int64_t chunks = chunk->i(attr::chunks);
	        chunk_desc.emplace_back(input_desc[input_index++], chunks, dim);
	        for (const auto* o : chunk->outputs()) {
	          flat_inputs.emplace_back(o, *chunk_desc.back().subTensorDesc());
	        }
	      } else {
	        // Default-constructed partition marks "not chunked".
	        chunk_desc.emplace_back();
	        flat_inputs.emplace_back(p, input_desc[input_index++]);
	      }
	    }
	  }

	  // Creates output, concat, and flattened output descriptions
	  std::vector<TensorDesc> output_desc;
	  std::vector<PartitionDesc> concat_desc;
	  std::vector<std::pair<const Value*, const TensorDesc>> flat_outputs;
	  for (const Value* o : graph->outputs()) {
	    const Node* producer = o->node();
	    const bool is_concat = producer->kind() == prim::FusedConcat;

	    // Creates output description
	    std::vector<int64_t> sizes = map_size;
	    if (is_concat) {
	      // at() throws std::out_of_range if the concat dim is not a valid index.
	      sizes.at(producer->i(attr::dim)) *= producer->inputs().size();
	    }
	    auto scalar_type = o->type()->expect<c10::TensorType const>()->scalarType();
	    auto type = CompleteTensorType::create(scalar_type, device, sizes);
	    output_desc.emplace_back(type);
	    const auto& desc = output_desc.back();

	    // Creates concat and flattened output descriptions (relies on output desc)
	    if (!is_concat) {
	      concat_desc.emplace_back();
	      flat_outputs.emplace_back(o, desc);
	    } else {
	      concat_desc.emplace_back(
	          desc, producer->inputs().size(), producer->i(attr::dim));
	      for (const auto& c : producer->inputs()) {
	        flat_outputs.emplace_back(c, *concat_desc.back().subTensorDesc());
	      }
	    }
	  }

	  // The post-increment both names this kernel and bumps the global count
	  // reported by nCompiledKernels().
	  const std::string name = "kernel_" + std::to_string(next_kernel_id++);
	  const bool use_cuda = device.is_cuda();
	  std::string code = generateKernel(name, *graph, flat_inputs, flat_outputs, use_cuda);
	  std::shared_ptr<FusedKernel> fused_kernel;
	  if (use_cuda) {
	#if USE_CUDA_FUSER
	    fused_kernel = std::make_shared<cuda::FusedKernelCUDA>(
	        device.index(),
	        name,
	        code,
	        input_desc,
	        output_desc,
	        chunk_desc,
	        concat_desc,
	        spec.hasRandom());
	#else
	    throw std::runtime_error("CUDA Fusion is not supported on this build.");
	#endif // USE_CUDA_FUSER
	  } else {
	#if USE_CPU_FUSER
	    fused_kernel = std::make_shared<cpu::FusedKernelCPU>(
	        name,
	        code,
	        input_desc,
	        output_desc,
	        chunk_desc,
	        concat_desc,
	        spec.hasRandom());
	#else
	    throw std::runtime_error("CPU Fusion is not supported on this build.");
	#endif // USE_CPU_FUSER
	  }

	  return fused_kernel;
	}

	} // namespace fuser
	} // namespace jit
	} // namespace torch