| /************************************************************************* |
| * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions |
| * are met: |
| * * Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * * Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in the |
| * documentation and/or other materials provided with the distribution. |
| * * Neither the name of NVIDIA CORPORATION nor the names of its |
| * contributors may be used to endorse or promote products derived |
| * from this software without specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY |
| * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
| * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR |
| * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
| * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
| * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
| * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
| * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| ************************************************************************/ |
| |
| #ifndef enqueue_h_ |
| #define enqueue_h_ |
| |
| #include "core.h" |
| |
// Helpers for managing a communicator's EventQueue and ring indexing.
int getRingIndex(const ncclComm_t comm, int device);
void lockEventQueue(EventQueue* eq);
void releaseEventQueue(EventQueue* eq);
void CUDART_CB freeEvent(cudaStream_t stream, cudaError_t status, void* eq_void);
| |
/* Synchronize with the user stream and launch the collective.
 * All work is performed asynchronously with respect to the host thread.
 * The actual collective should be a functor with the
 * following signature; an illustrative functor is sketched after this function.
 *   ncclResult_t collective(const void* sendbuff, void* recvbuff,
 *       int count, ncclDataType_t type, ncclRedOp_t op,
 *       int root, ncclComm_t comm, cudaStream_t stream);
 * Unneeded arguments should be ignored. The collective may
 * assume that the appropriate cuda device has been set. */
| template<typename ColFunc> |
| ncclResult_t enqueue(ColFunc colfunc, |
| const void* sendbuff, |
| void* recvbuff, |
| int count, |
| ncclDataType_t type, |
| ncclRedOp_t op, |
| int root, |
| ncclComm_t comm, |
| cudaStream_t stream) |
| { |
| int curDevice; |
| CUDACHECK( cudaGetDevice(&curDevice) ); |
| |
| // No need for a mutex here because we assume that all enqueue operations happen in a fixed |
| // order on all devices. Thus, thread race conditions SHOULD be impossible. |
| EventQueue* eq = &comm->events; |
| |
| // Ensure that previous collective is complete |
| cudaError_t flag = cudaEventQuery(eq->isDone[eq->back]); |
| if( flag == cudaErrorNotReady ) |
| CUDACHECK( cudaStreamWaitEvent(stream, eq->isDone[eq->back], 0) ); |
| |
| // Launch the collective here |
| ncclResult_t ret = colfunc(sendbuff, recvbuff, count, type, op, root, comm, stream); |
| |
| eq->back = (eq->back + 1) % MAXQUEUE; |
| CUDACHECK( cudaEventRecord(eq->isDone[eq->back], stream) ); |
| return ret; |
| } |
| |
| #endif // End include guard |
| |