| #pragma once |
#include "torch/csrc/jit/ir.h"
| #include "torch/csrc/utils/disallow_copy.h" |
| #include "ATen/ATen.h" |
| #include <string> |
| #include <algorithm> |
| #include <unordered_map> |
| #include <vector> |
| |
| namespace torch { namespace jit { |
| |
// type information needed by the compiler for inputs/outputs
// contiguity[i] is true if dim i is contiguous with dim i + 1.
// contiguity.back() == true means strides.back() == 1.
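//
// A worked example (hypothetical sizes/strides): for sizes [2, 3, 4] with
// strides [12, 4, 1], every dim is contiguous with the next, so
// contiguity == {true, true, true} and nDim() == 1; with strides [24, 4, 1],
// stride[0] != stride[1] * size[1] (24 != 12), so
// contiguity == {false, true, true} and nDim() == 2.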
| struct TensorDesc { |
| at::ScalarType scalar_type; |
| std::vector<bool> contiguity; |
| |
| TensorDesc(const at::ScalarType& type, const std::vector<bool>& contiguity) |
| : scalar_type(type), contiguity(contiguity) { |
| nDim_ = std::count(contiguity.begin(), contiguity.end(), false) + (lastIsContiguous() ? 1 : 0); |
| } |
| |
| TensorDesc(const at::ScalarType& type, const at::IntList& sizes, const at::IntList& strides) |
| : TensorDesc(type, TensorDesc::findContiguous(sizes, strides)) {} |
| TensorDesc(const at::Tensor& t) |
| : TensorDesc(t.type().scalarType(), t.sizes(), t.strides()) {} |
| TensorDesc(TensorType *type) |
| : TensorDesc(type->scalarType(), type->sizes(), type->strides()) {} |
| |
| // number of dimensions after contiguity compression |
| size_t nDim() const { |
| return nDim_; |
| } |
| |
// is the innermost stride == 1? (trivially true when there are no dims)
bool lastIsContiguous() const {
return contiguity.empty() || contiguity.back();
| } |
| |
| static std::vector<bool> findContiguous( |
| const at::IntList& sizes, |
| const at::IntList& strides); |
| |
| private: |
| size_t nDim_; |
| }; |
| |
// device == kCPUDevice marks a CPU fusion; CUDA fusions use the device index
constexpr int kCPUDevice = -1;
| struct AnnotatedGraph { |
// short-term storage only, so it borrows the Graph rather than owning it.
| AnnotatedGraph(Graph & graph, int device) |
| : graph(&graph), device(device) {} |
| Graph* graph = nullptr; |
| int device = kCPUDevice; |
| std::vector<TensorDesc> input_desc; |
| std::vector<TensorDesc> output_desc; |
| }; |
| |
| struct ConcatDesc { |
size_t nSubtensors; // == 1 for outputs that are not concats, otherwise the number of tensors being concatenated
size_t dim; // dimension along which the concat occurs
std::unique_ptr<TensorDesc> subtensorDesc; // descriptor shared by the subtensors, if this output is a concat
| ConcatDesc() |
| : nSubtensors(1), dim(0) {} |
| ConcatDesc(const TensorDesc & desc, size_t nSubtensors, size_t dim) |
| : nSubtensors(nSubtensors), dim(dim) { |
| JIT_ASSERT(nSubtensors > 1); |
| std::vector<bool> cont = desc.contiguity; |
| if(dim > 0) { |
// when we narrow the concatenated output to recover a subtensor,
// size[dim] shrinks while stride[dim] stays the same, so
// stride[dim - 1] != stride[dim] * size[dim],
// meaning dim - 1 is no longer contiguous with dim
| cont[dim - 1] = false; |
| } |
| subtensorDesc.reset(new TensorDesc(desc.scalar_type, cont)); |
| } |
| }; |
| |
| struct CompiledFusionFunction { |
| TH_DISALLOW_COPY_AND_ASSIGN(CompiledFusionFunction); |
| |
| CompiledFusionFunction(const std::string & name, AnnotatedGraph & agraph); |
| virtual ~CompiledFusionFunction() {} |
| |
| // expects outputs to be pre-allocated |
| void launch_with_tensors(at::ArrayRef<at::Tensor> inputs, at::ArrayRef<at::Tensor> outputs); |
| |
| // creates new tensors for outputs |
| void launch(at::ArrayRef<at::Tensor> inputs, std::vector<at::Tensor> & outputs); |
| const std::vector<TensorDesc> & outputDescriptors() const { |
| return output_desc; |
| } |
| protected: |
| virtual at::Backend backend() const = 0; |
| |
| // arguments is a list of pointers to the arguments for the compiled CUDA/CPU |
| // code. |
| // The format of arguments is suitable for directly passing to a call to |
| // cuLaunchKernel as the kernel arguments. |
| // Currently the first argument is a pointer to numel (for passing to |
| // CUDA code), and the remainder are pointers to the TensorInfo<T> structs |
| // that compiled code uses to load Tensor data. |
| // launch_with_tensors handles packing at::Tensors into this arguments array. |
// CPU code uses the same convention so that launch_with_tensors can be shared.
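//
// A hypothetical packing for one input and one output (sketch only; in_info
// and out_info are illustrative names, and TensorInfo<T> is defined in the
// generated code):
//   uint32_t numel = ...;
//   TensorInfo<float> in_info = ..., out_info = ...;
//   void * arguments[] = { &numel, &in_info, &out_info };
//   launch_raw(numel, arguments);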
| virtual void launch_raw(uint32_t numel, void ** arguments) = 0; |
| std::string name; |
| // We keep these around for debugging |
| std::string compilation_unit; |
| std::vector<TensorDesc> input_desc; |
| std::vector<TensorDesc> output_desc; |
| |
// same size as output_desc; describes whether
// an output is actually a concatenation of
// many subtensors that the fusion group produces
| std::vector<ConcatDesc> concat_desc; |
| }; |
| |
| struct FusionCompilerConfig { |
std::string cxx = "g++"; // path to the C++ compiler used for CPU fusions
bool debug = false; // emit debugging information about fusions
bool openmp = true; // compile CPU fusions with OpenMP parallelization
| }; |
| |
| // caching compiler |
| struct FusionCompiler { |
| TH_DISALLOW_COPY_AND_ASSIGN(FusionCompiler); |
| FusionCompiler(); |
| |
// ignores the types in the graph and uses agraph's explicit contiguity annotations instead
| std::shared_ptr<CompiledFusionFunction> getOrCompile(AnnotatedGraph & agraph); |
// uses the type annotations in fusion_group to create an AnnotatedGraph
| std::shared_ptr<CompiledFusionFunction> getOrCompile(Node * fusion_group); |
| |
// uses inputs/outputs as examples to infer contiguity; does not run the graph
| std::shared_ptr<CompiledFusionFunction> getOrCompile(Graph & graph, |
| int device, |
| at::ArrayRef<at::Tensor> inputs, |
| at::ArrayRef<at::Tensor> outputs); |
// debugging function that lets you do everything from compilation to
// execution in one step.
// It should not be used in the hot path of execution because it has to
// serialize the graph each time it is called.
| void debugLaunchGraph(Graph & graph, int device, at::ArrayRef<at::Tensor> inputs, at::ArrayRef<at::Tensor> outputs); |
| bool canCompileOnCPU() const { |
return !config_.cxx.empty();
| } |
| private: |
| FusionCompilerConfig config_; |
| std::unordered_map<std::string, std::shared_ptr<CompiledFusionFunction>> cache; |
| }; |
| |
| FusionCompiler & sharedFusionCompiler(); |
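
// Typical use (sketch, assuming fusion_group is a fusion Node produced by the
// graph fuser and inputs holds tensors matching the group's inputs):
//   auto fn = sharedFusionCompiler().getOrCompile(fusion_group);
//   std::vector<at::Tensor> outputs;
//   fn->launch(inputs, outputs);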
| |
| }} |