| #pragma once |
| |
| #include <ATen/core/ivalue.h> |
| #include <pybind11/pybind11.h> |
| #include <torch/csrc/jit/api/module.h> |
| #include <torch/csrc/utils/pybind.h> |
| |
| #include <torch/csrc/jit/python/pybind_utils.h> |
| |
| #include <iostream> |
| #include <memory> |
| #include <string> |
| #include <vector> |
| |
| namespace py = pybind11; |
| |
| namespace torch { |
| namespace throughput_benchmark { |
| |
| /**
| * This struct is used to return the results of a benchmark to the caller.
| * In the future, all additional statistics should be added here.
| */
| struct BenchmarkExecutionStats { |
| float latency_avg_ms{-1}; |
| int64_t num_iters{-1}; |
| }; |
| |
| std::ostream& operator<<( |
| std::ostream& os, |
| const BenchmarkExecutionStats& value); |
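| 
| // Example (a minimal sketch; `bench` and `config` here stand for the
| // ThroughputBenchmark and BenchmarkConfig types declared below):
| //
| //   BenchmarkExecutionStats stats = bench.benchmark(config);
| //   std::cout << stats; // prints the collected statistics via the
| //                       // operator<< declared above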
| |
| /**
| * Use this struct in order to configure a throughput benchmark run.
| * This struct should include parameters related to threading, batching, number
| * of iterations, warm-up, etc. More configs can be added as needed.
| * The general rule here is that only things C++ must(!) be aware of should
| * be here. If we can keep other parts in Python, we should keep them there.
| * This is typical for things that are not perf critical and don't affect
| * the execution statistics the benchmark returns.
| */
| struct BenchmarkConfig { |
| public: |
| // Calling threads are those threads that are calling into a module in |
| // parallel. |
| int num_calling_threads{1}; |
| // Worker threads are not supported yet. This is just a placeholder showing
| // that we plan to support some sort of multi-threaded forward calls. We may
| // change this setting in the future to support different intra-op and
| // inter-op parallelism, which is not available in PyTorch yet.
| int num_worker_threads{1}; |
| // Warmup iterations are used to make sure we run the module a few times
| // before actually measuring anything. This way we avoid cold caches and
| // other similar problems.
| int num_warmup_iters{1}; |
| // Number of iterations the benchmark should run for. This number is
| // separate from the warmup iterations.
| int64_t num_iters{100}; |
| // If set, the autograd profiler will be enabled, i.e. the following guard
| // will be created before the main benchmark loop (but after the warmup):
| // RecordProfile guard(profiler_output_path);
| std::string profiler_output_path{""}; |
| }; |
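| 
| // Example (a minimal sketch with illustrative values):
| //
| //   BenchmarkConfig config;
| //   config.num_calling_threads = 8; // emulate 8 concurrent callers
| //   config.num_warmup_iters = 10;   // run 10 unmeasured warmup iterations
| //   config.num_iters = 1000;        // then measure 1000 iterations
| //   // Setting a non-empty path would enable the autograd profiler:
| //   // config.profiler_output_path = "/tmp/profile.trace";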
| |
| namespace detail { |
| |
| /**
| * A helper class to abstract out the different kinds of models whose
| * throughput we test.
| */
| template <class Input, class Output, class Model> |
| class BenchmarkHelper { |
| public: |
| BenchmarkHelper(); |
| // NOLINTNEXTLINE(modernize-pass-by-value) |
| explicit BenchmarkHelper(Model model) : model_(model), initialized_(true) {} |
| |
| // This method is to be used in the benchmark() method.
| // Note that there is no result. This way we don't have to call this under
| // the GIL even when running in nn.Module mode. Otherwise the destructor of
| // the result would race with Python.
| void runOnce(Input&&) const; |
| // This method is to be used when calling from Python directly
| Output runOnce(py::args&&, py::kwargs&&) const; |
| // Aggregate the input in the format Model expects in order to avoid further
| // conversions at benchmark time
| void addInput(py::args&&, py::kwargs&&); |
| void addInput(Input&&); |
| BenchmarkExecutionStats benchmark(const BenchmarkConfig& config) const; |
| |
| bool initialized() const { |
| return initialized_; |
| } |
| |
| // The destructor doesn't require the GIL because it is going to be executed
| // on the Python thread
| std::vector<Input> inputs_; |
| Model model_; |
| bool initialized_{false}; |
| }; |
| |
| struct C10_HIDDEN ModuleInput { |
| ModuleInput(ModuleInput&& other) = default; |
| |
| ModuleInput(const ModuleInput&) = delete; |
| ModuleInput& operator=(const ModuleInput& other) = delete;
| ModuleInput& operator=(ModuleInput&& other) = delete; |
| |
| ModuleInput(py::args&& args, py::kwargs&& kwargs) |
| : args(std::move(args)), kwargs(std::move(kwargs)) {} |
| |
| py::args args; |
| py::kwargs kwargs; |
| }; |
| typedef py::object ModuleOutput; |
| typedef std::vector<at::IValue> ScriptModuleInput; |
| typedef at::IValue ScriptModuleOutput; |
| |
| template <class Input> |
| Input cloneInput(const Input& input); |
| |
| typedef BenchmarkHelper<ScriptModuleInput, at::IValue, jit::Module> |
| ScriptModuleBenchmark; |
| template <> |
| inline BenchmarkHelper<ScriptModuleInput, at::IValue, jit::Module>:: |
| BenchmarkHelper() |
| : model_("Module", std::make_shared<jit::CompilationUnit>()), |
| initialized_(false) {} |
| typedef BenchmarkHelper<ModuleInput, py::object, py::object> ModuleBenchmark; |
| template <> |
| inline BenchmarkHelper<ModuleInput, py::object, py::object>::BenchmarkHelper() |
| : initialized_(false) {} |
| |
| template <> |
| void ScriptModuleBenchmark::runOnce(ScriptModuleInput&& input) const; |
| |
| template <> |
| ScriptModuleOutput ScriptModuleBenchmark::runOnce( |
| py::args&& args, |
| py::kwargs&& kwargs) const; |
| |
| template <> |
| void ModuleBenchmark::runOnce(ModuleInput&& input) const; |
| |
| template <> |
| ModuleOutput ModuleBenchmark::runOnce(py::args&& args, py::kwargs&& kwargs) |
| const; |
| |
| template <> |
| void ScriptModuleBenchmark::addInput(py::args&& args, py::kwargs&& kwargs); |
| template <> |
| void ScriptModuleBenchmark::addInput(ScriptModuleInput&& input); |
| |
| template <> |
| void ModuleBenchmark::addInput(py::args&& args, py::kwargs&& kwargs); |
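| 
| // Example (a minimal sketch; these helpers are normally driven through
| // ThroughputBenchmark below rather than used directly):
| //
| //   ScriptModuleBenchmark helper(std::move(script_module));
| //   helper.addInput(ScriptModuleInput{at::ones({2, 2})});
| //   BenchmarkConfig config;
| //   BenchmarkExecutionStats stats = helper.benchmark(config);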
| |
| } // namespace detail |
| |
| /**
| * This class is a small C++ component responsible for executing a PyTorch
| * module under an inference-server-like load. It can emulate multiple calling
| * threads to a single provided module. In the future we plan to enhance this
| * component to support inter-op and intra-op parallelism as well as multiple
| * models running in a single process.
| *
| * For the currently available configurations, refer to the BenchmarkConfig
| * documentation.
| *
| * The class supports working with either an nn.Module or a ScriptModule.
| * Under the hood it just dispatches to the corresponding specialization of
| * class BenchmarkHelper<Input, Output, Model>
| */
| class C10_HIDDEN ThroughputBenchmark { |
| public: |
| explicit ThroughputBenchmark(jit::Module module); |
| explicit ThroughputBenchmark(py::object module); |
| |
| // Add one more input example. This input example should be in the exact
| // format the module under test expects. It is the responsibility of the
| // module to perform any such format checks; the benchmark doesn't perform
| // any validation of its own.
| void addInput(py::args args, py::kwargs kwargs); |
| |
| // Equivalent to just running the model directly on the given input
| py::object runOnce(py::args&& args, py::kwargs&& kwargs); |
| |
| // The main method of the class. It performs a multi-threaded benchmark and
| // returns a BenchmarkExecutionStats object with a lot of useful statistics
| // about the runtime execution. We can enhance this class in the future to
| // provide more information to the user.
| BenchmarkExecutionStats benchmark(const BenchmarkConfig& config) const; |
| |
| private: |
| detail::ScriptModuleBenchmark script_module_; |
| detail::ModuleBenchmark module_; |
| }; |
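| 
| // Example (a minimal sketch; typical usage is from Python, where this class
| // is exposed as torch.utils.ThroughputBenchmark; the GIL must be held while
| // the py::args below are constructed, and "model.pt" is illustrative):
| //
| //   ThroughputBenchmark bench(jit::load("model.pt"));
| //   py::object input = ...; // e.g. an example tensor obtained from Python
| //   bench.addInput(
| //       py::reinterpret_borrow<py::args>(py::make_tuple(input)),
| //       py::kwargs());
| //   BenchmarkConfig config;
| //   config.num_calling_threads = 4;
| //   BenchmarkExecutionStats stats = bench.benchmark(config);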
| } // namespace throughput_benchmark |
| } // namespace torch |
| |
| #include <torch/csrc/utils/throughput_benchmark-inl.h> |