| #pragma once |
| |
| #include "ChannelType.h" |
| #include "ChannelUtils.hpp" |
| #include "DataChannel.h" |
| #include "Scalar.hpp" |
| #include "init_methods/InitMethod.hpp" |
| |
| #include <ATen/ATen.h> |
| |
| #include <vector> |
| #include <unordered_map> |
| #include <utility> |
| |
| |
| MAKE_HASHABLE(THDReduceOp, static_cast<int>(t)); |
| MAKE_HASHABLE(thd::RPCType, static_cast<char>(t)); |
| MAKE_HASHABLE(at::ScalarType, static_cast<int>(t)); |
| |
| |
| namespace thd { |
| |
| struct DataChannel { |
| |
| struct Request { |
Request() {}
virtual ~Request() {}
| |
| // Checks if request has completed. Non-blocking operation. |
| virtual bool isCompleted() = 0; |
| // Waits until request completes. Blocking operation. |
| virtual void wait() = 0; |
| }; |
| |
| struct Group { |
| Group(); |
| /* |
| * Constructs `Group` from provided `ranks` and checks if all ranks are |
| * in range: [0, `max_rank`]. |
| * |
| * `ranks` vector should have mapping from new ranks to old ranks (global ranks) |
| * eg. ranks = {[0] = 6, [1] = 2} which means that 0 and 1 are new ranks in group |
| * and 6, 2 are global ranks corresponding to 0 and 1 respectively. |
| */ |
| Group(std::vector<rank_type> ranks, rank_type max_rank); |
| virtual ~Group(); |
| |
| rank_type size() const; |
| |
| /* |
| * In contrast to `getGroupRank` this function throws `std::logic_error` |
| * when rank is member of this group. |
| */ |
| rank_type mustGetGroupRank(rank_type global_rank) const; |
| std::pair<rank_type, bool> getGroupRank(rank_type global_rank) const; |
| |
| /* |
| * In contrast to `getGlobalRank` this function throws `std::logic_error` |
| * when provided `group_rank` is not in range of group. |
| */ |
| rank_type mustGetGlobalRank(rank_type group_rank) const; |
| std::pair<rank_type, bool> getGlobalRank(rank_type group_rank) const; |
| |
| private: |
| // maps new group ranks to old ranks (global ranks) |
| std::vector<rank_type> _new2old; |
| |
| // maps old ranks (global ranks) to new group ranks |
| std::unordered_map<rank_type, rank_type> _old2new; |
| }; |
| |
DataChannel() {}
virtual ~DataChannel() {}
| |
| virtual bool init() = 0; |
| |
| /** |
| * This is required for NCCL backend, since the destroy cannot be done before |
| * CUDA is unloaded since DataChannel is a static object. |
| */ |
| virtual void destroy() = 0; |
| |
| virtual rank_type getRank() = 0; |
| virtual rank_type getNumProcesses() = 0; |
| |
| /** |
| * All gather inputs from multiple GPUs, each Tensor in input vector should be |
| * on a separate GPU. |
| * |
| * Also note that the output vector is a 1D vector (flattened from 2D), |
| * with the size of input.size() * world_size. |
| * |
| * For instance, rank i 's input[k] tensor would be in |
| * output[i * input.size() + k]. |
| */ |
| virtual void allGather(std::vector<at::Tensor>& output, |
| std::vector<at::Tensor>& input, |
| THDGroup groupId = THDGroupWORLD) = 0; |
| virtual void allGather(std::vector<at::Tensor>& output, |
| at::Tensor& input, |
| THDGroup group_id = THDGroupWORLD) = 0; |
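
/*
 * Example (sketch) of the flattened output layout described above, assuming
 * world_size == 2 and two input tensors per rank (one per local GPU); the
 * `channel` pointer and tensor names are hypothetical, and the output
 * tensors are assumed to be pre-allocated with matching shapes:
 *
 *   std::vector<at::Tensor> input  = {t_gpu0, t_gpu1};
 *   std::vector<at::Tensor> output = {o0, o1, o2, o3}; // input.size() * world_size
 *   channel->allGather(output, input);
 *   // output[0], output[1] now hold rank 0's tensors,
 *   // output[2], output[3] now hold rank 1's tensors.
 */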
| virtual void gather(std::vector<at::Tensor>& output, |
| at::Tensor& input, |
| rank_type dst_rank, |
| THDGroup group_id = THDGroupWORLD) = 0; |
| virtual void scatter(std::vector<at::Tensor>& input, |
| at::Tensor& output, |
| rank_type src_rank, |
| THDGroup group_id = THDGroupWORLD) = 0; |
// All-reduces `data` across multiple GPUs on a number of nodes
| virtual void allReduce(std::vector<at::Tensor>& data, |
| THDReduceOp operation, |
| THDGroup group_id = THDGroupWORLD) = 0; |
| virtual void allReduce(at::Tensor& data, THDReduceOp operation, |
| THDGroup group_id = THDGroupWORLD) = 0; |
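
/*
 * Example (sketch): summing a tensor across all processes in the default
 * group; `channel` is a hypothetical pointer to a concrete implementation
 * and `grad` a tensor with the same shape on every rank:
 *
 *   channel->allReduce(grad, THDReduceSUM);  // grad now holds the global sum
 */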
| /** |
| * Reduce multiple GPUs on a number of nodes |
| * data[0]'s GPU in dstRank will receive the result |
| */ |
| virtual void reduce(std::vector<at::Tensor>& data, |
| THDReduceOp operation, |
| rank_type dstRank, |
| THDGroup groupId = THDGroupWORLD) = 0; |
| virtual void reduce(at::Tensor& data, |
| THDReduceOp operation, |
| rank_type dst_rank, |
| THDGroup group_id = THDGroupWORLD) = 0; |
| /** |
| * Broadcast multiple GPUs on a number of nodes |
| * data[0]'s GPU in srcRank will be the source to broadcast |
| */ |
| virtual void broadcast(std::vector<at::Tensor>& data, |
| rank_type srcRank, |
| THDGroup groupId = THDGroupWORLD) = 0; |
| virtual void broadcast(at::Tensor& data, |
| rank_type src_rank, |
| THDGroup group_id = THDGroupWORLD) = 0; |
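
/*
 * Example (sketch): broadcasting from the first GPU of rank 0 to every GPU
 * in `data` on every rank; the `channel` pointer and tensor names are
 * hypothetical:
 *
 *   std::vector<at::Tensor> data = {t_gpu0, t_gpu1};  // one tensor per local GPU
 *   channel->broadcast(data, 0);                      // source is data[0] on rank 0
 */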
virtual void send(Scalar& value, rank_type dst_rank) = 0;
| virtual void send(at::Tensor& data, rank_type dst_rank) = 0; |
| virtual void receive(Scalar& value, rank_type src_rank) = 0; |
| virtual rank_type receive(at::Tensor& data) = 0; // receive from any source |
| virtual void receive(at::Tensor& data, rank_type src_rank) = 0; |
| virtual Request* isend(at::Tensor& data, rank_type dst_rank) = 0; |
| virtual Request* ireceive(at::Tensor& data, rank_type src_rank) = 0; |
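
/*
 * Example (sketch): pairing a non-blocking send with a blocking receive;
 * the ranks, tensor `t`, and `channel` pointer are hypothetical:
 *
 *   if (channel->getRank() == 0) {
 *     Request* req = channel->isend(t, 1);  // post send to rank 1
 *     req->wait();                          // block until it completes
 *     delete req;
 *   } else if (channel->getRank() == 1) {
 *     channel->receive(t, 0);               // blocking receive from rank 0
 *   }
 */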
| |
| virtual void barrier(THDGroup group_id = THDGroupWORLD) = 0; |
| |
| virtual THDGroup newGroup(const std::vector<rank_type>& ranks) = 0; |
| virtual void clearGroupCache(THDGroup group_id = THDGroupWORLD) = 0; |
| |
| static DataChannel* newChannel(THDChannelType type, |
| std::string init_method, |
| int world_size, |
| std::string group_name, int rank); |
| }; |
| |
| } // namespace thd |