/* torch/lib/THNN/generic/VolumetricAveragePooling.c - platform/external/pytorch - Git at Google */

 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/VolumetricAveragePooling.c"
 #else

 /*
  * Validate the arguments of a volumetric (3D) average pooling call.
  *
  *   state       not referenced by this function.
  *   input       4D tensor, or 5D in batch mode; the last three axes are
  *               read as time, height and width.
  *   gradOutput  optional; when non-NULL its slice/time/height/width sizes
  *               must equal the output sizes computed here.
  *   kT,kW,kH    kernel extents, each must be > 0.
  *   dT,dW,dH    strides, each must be > 0.
  *   padT,padW,padH  padding; each may be at most half of the matching
  *               kernel extent.
  *   ceil_mode   round the output extent up instead of down.
  *
  * Violations are reported through THArgCheck / THNN_ARGCHECK / THError.
  */
 static inline void THNN_(VolumetricAveragePooling_shapeCheck)(
                          THNNState *state,
                          THTensor *input,
                          THTensor *gradOutput,
                          int kT,
                          int kW,
                          int kH,
                          int dT,
                          int dW,
                          int dH,
                          int padT,
                          int padW,
                          int padH,
                          bool ceil_mode)
 {
   int64_t nslices;
   int64_t itime;
   int64_t iheight;
   int64_t iwidth;
   int64_t otime;
   int64_t oheight;
   int64_t owidth;
   int ndim = input->nDimension;
   int dimN = 0;
   int dimt = 1;
   int dimh = 2;
   int dimw = 3;

   /* batch mode: every axis of interest moves one position to the right */
   if (input->nDimension == 5)
   {
     dimN++;
     dimt++;
     dimh++;
     dimw++;
   }

   THArgCheck(kT > 0 && kW > 0 && kH > 0, 5,
              "kernel size should be greater than zero, but got kT: %d kH: %d kW: %d",
              kT, kH, kW);
   THArgCheck(dT > 0 && dW > 0 && dH > 0, 8,
              "stride should be greater than zero, but got dT: %d dH: %d dW: %d",
              dT, dH, dW);
   THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
                 "4D or 5D (batch mode) tensor expected for input, but got: %s");

   /* The sizes are 64-bit values, so they are printed with %lld and an
      explicit cast; handing them to %d is undefined behaviour. */
   THArgCheck(input->size[dimw] >= kW && input->size[dimh] >= kH
              && input->size[dimt] >= kT, 2,
              "input image (T: %lld H: %lld W: %lld) smaller than "
              "kernel size (kT: %d kH: %d kW: %d)",
              (long long)input->size[dimt], (long long)input->size[dimh],
              (long long)input->size[dimw],
              kT, kH, kW);

   // The second argument is argNumber... here is the index of padH.
   THArgCheck(kT/2 >= padT && kW/2 >= padW && kH/2 >= padH, 11,
             "pad should not be greater than half of kernel size, but got "
             "padT = %d, padW = %d, padH = %d, kT = %d, kW = %d, kH = %d",
             padT, padW, padH, kT, kW, kH);

   /* sizes */
   nslices = input->size[dimN];
   itime   = input->size[dimt];
   iheight = input->size[dimh];
   iwidth  = input->size[dimw];

   if (ceil_mode) {
     otime   = (int64_t)(ceil((float)(itime   - kT + 2*padT) / dT)) + 1;
     oheight = (int64_t)(ceil((float)(iheight - kH + 2*padH) / dH)) + 1;
     owidth  = (int64_t)(ceil((float)(iwidth  - kW + 2*padW) / dW)) + 1;
   }
   else
   {
     otime   = (int64_t)(floor((float)(itime   - kT + 2*padT) / dT)) + 1;
     oheight = (int64_t)(floor((float)(iheight - kH + 2*padH) / dH)) + 1;
     owidth  = (int64_t)(floor((float)(iwidth  - kW + 2*padW) / dW)) + 1;
   }

   if (padT || padW || padH)
   {
     // ensure that the last pooling starts inside the image
     // needed to avoid problems in ceil mode
     if ((otime   - 1)*dT >= itime   + padT)
       --otime;
     if ((oheight - 1)*dH >= iheight + padH)
       --oheight;
     if ((owidth  - 1)*dW >= iwidth  + padW)
       --owidth;
   }

   if (otime < 1 || owidth < 1 || oheight < 1)
     THError("Given input size: (%lldx%lldx%lldx%lld). "
             "Calculated output size: (%lldx%lldx%lldx%lld). Output size is too small",
             (long long)nslices, (long long)itime, (long long)iheight, (long long)iwidth,
             (long long)nslices, (long long)otime, (long long)oheight, (long long)owidth);

   /* only the backward pass supplies gradOutput */
   if (gradOutput != NULL) {
     THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimN, nslices);
     THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimt, otime);
     THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, oheight);
     THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, owidth);
   }
 }

 /*
  * Average-pool `nslices` volumes stored back to back.
  *
  *   input_p   nslices * itime * iheight * iwidth values; within a volume
  *             the element (z, y, x) lives at z*iwidth*iheight + y*iwidth + x.
  *   output_p  nslices * otime * oheight * owidth values, written in
  *             (time, height, width) order.
  *   kT,kW,kH / dT,dW,dH / padT,padW,padH  kernel, stride and padding.
  *   count_include_pad  true: divide by the size of the window inside the
  *             padded volume; false: divide by the number of input elements
  *             actually summed.
  *
  * All window arithmetic is done on int64_t; routing the indices through
  * fminf/fmaxf would round them to float precision.
  */
 static void THNN_(VolumetricAveragePooling_updateOutput_frame)(
           real *input_p,
           real *output_p,
           int64_t nslices,
           int64_t itime,
           int64_t iwidth,
           int64_t iheight,
           int64_t otime,
           int64_t owidth,
           int64_t oheight,
           int kT,
           int kW,
           int kH,
           int dT,
           int dW,
           int dH,
           int padT,
           int padW,
           int padH,
           bool count_include_pad)
 {
   int64_t k;
 #pragma omp parallel for private(k)
   for (k = 0; k < nslices; k++)
   {
     int64_t i, j, ti;

     /* local pointers. */
     real *ip = input_p + k * itime * iwidth * iheight;
     real *op = output_p + k * otime * owidth * oheight;
     for (i = 0; i < otime * oheight * owidth; ++i)
       *(op + i) = 0;

     /* loop over output */
     for (ti = 0; ti < otime; ti++)
     {
       for (i = 0; i < oheight; i++)
       {
         for (j = 0; j < owidth; j++)
         {
           /* window in padded coordinates, clipped to the padded volume */
           int64_t tstart = ti * dT - padT;
           int64_t hstart = i  * dH - padH;
           int64_t wstart = j  * dW - padW;
           int64_t tend = tstart + kT;
           int64_t hend = hstart + kH;
           int64_t wend = wstart + kW;
           if (tend > itime   + padT) tend = itime   + padT;
           if (hend > iheight + padH) hend = iheight + padH;
           if (wend > iwidth  + padW) wend = iwidth  + padW;
           int64_t pool_size = (tend - tstart) * (hend - hstart) * (wend - wstart);

           /* clip to the real input so only valid elements are read */
           if (tstart < 0) tstart = 0;
           if (hstart < 0) hstart = 0;
           if (wstart < 0) wstart = 0;
           if (tend > itime)   tend = itime;
           if (hend > iheight) hend = iheight;
           if (wend > iwidth)  wend = iwidth;

           int64_t divide_factor;
           if (count_include_pad)
             divide_factor = pool_size;
           else
             divide_factor = (tend - tstart) * (hend - hstart) * (wend - wstart);

           /* compute local sum: */
           real sum = 0.0;
           int64_t x, y, z;

           for (z = tstart; z < tend; z++)
           {
             for (y = hstart; y < hend; y++)
             {
               for (x = wstart; x < wend; x++)
               {
                 sum +=  *(ip + z * iwidth * iheight + y * iwidth + x);
               }
             }
           }

           /* set output to local average */
           *op++ += sum / divide_factor;
         }
       }
     }
   }
 }

 /*
  * Forward pass of volumetric average pooling.
  *
  * Checks the arguments, derives the output extents from the input sizes,
  * kernel, stride, padding and ceil_mode, resizes `output` to match (4D, or
  * 5D in batch mode) and runs the per-frame kernel on a contiguous copy of
  * `input`, once per batch entry.
  */
 void THNN_(VolumetricAveragePooling_updateOutput)(
           THNNState *state,
           THTensor *input,
           THTensor *output,
           int kT,
           int kW,
           int kH,
           int dT,
           int dW,
           int dH,
           int padT,
           int padW,
           int padH,
           bool ceil_mode,
           bool count_include_pad)
 {
   THNN_(VolumetricAveragePooling_shapeCheck)(
         state, input, NULL, kT, kW, kH,
         dT, dW, dH, padT, padW, padH, ceil_mode);

   /* In batch mode (5D) the slice/time/height/width axes sit one later. */
   const int shift = (input->nDimension == 5) ? 1 : 0;

   const int64_t nslices = input->size[0 + shift];
   const int64_t itime   = input->size[1 + shift];
   const int64_t iheight = input->size[2 + shift];
   const int64_t iwidth  = input->size[3 + shift];

   int64_t otime;
   int64_t oheight;
   int64_t owidth;

   if (!ceil_mode)
   {
     otime   = (int64_t)(floor((float)(itime   - kT + 2*padT) / dT)) + 1;
     oheight = (int64_t)(floor((float)(iheight - kH + 2*padH) / dH)) + 1;
     owidth  = (int64_t)(floor((float)(iwidth  - kW + 2*padW) / dW)) + 1;
   }
   else
   {
     otime   = (int64_t)(ceil((float)(itime   - kT + 2*padT) / dT)) + 1;
     oheight = (int64_t)(ceil((float)(iheight - kH + 2*padH) / dH)) + 1;
     owidth  = (int64_t)(ceil((float)(iwidth  - kW + 2*padW) / dW)) + 1;
   }

   /* With padding, drop a trailing window that would start past the
      padded input (this can happen in ceil mode). */
   if (padT != 0 || padH != 0 || padW != 0)
   {
     if ((otime - 1) * dT >= itime + padT)
       otime -= 1;
     if ((oheight - 1) * dH >= iheight + padH)
       oheight -= 1;
     if ((owidth - 1) * dW >= iwidth + padW)
       owidth -= 1;
   }

   /* The frame kernel indexes raw memory, so work on a contiguous input. */
   input = THTensor_(newContiguous)(input);

   if (input->nDimension != 4)
   {
     /* batch mode: one frame call per batch entry */
     int64_t b;
     const int64_t nBatch = input->size[0];
     const int64_t in_step  = nslices * itime * iwidth * iheight;
     const int64_t out_step = nslices * otime * owidth * oheight;

     THTensor_(resize5d)(output, nBatch, nslices, otime, oheight, owidth);

     real *src = THTensor_(data)(input);
     real *dst = THTensor_(data)(output);

 #pragma omp parallel for private(b)
     for (b = 0; b < nBatch; b++)
     {
       THNN_(VolumetricAveragePooling_updateOutput_frame)(
         src + b * in_step, dst + b * out_step, nslices,
         itime, iwidth, iheight,
         otime, owidth, oheight,
         kT, kW, kH,
         dT, dW, dH,
         padT, padW, padH,
         count_include_pad
       );
     }
   }
   else
   {
     /* non-batch mode: the whole tensor is a single frame */
     THTensor_(resize4d)(output, nslices, otime, oheight, owidth);

     real *src = THTensor_(data)(input);
     real *dst = THTensor_(data)(output);

     THNN_(VolumetricAveragePooling_updateOutput_frame)(
       src, dst, nslices,
       itime, iwidth, iheight,
       otime, owidth, oheight,
       kT, kW, kH,
       dT, dW, dH,
       padT, padW, padH,
       count_include_pad
     );
   }

   /* release the reference taken by newContiguous */
   THTensor_(free)(input);
 }

 /*
  * Backward kernel of volumetric average pooling for `nslices` volumes
  * stored back to back.
  *
  *   gradInput_p   nslices * itime * iheight * iwidth values; zeroed here,
  *                 then accumulated into. Element (z, y, x) of a volume
  *                 lives at z*iheight*iwidth + y*iwidth + x.
  *   gradOutput_p  nslices * otime * oheight * owidth values, read in
  *                 (time, height, width) order.
  *   count_include_pad  selects the divisor exactly as in the forward pass:
  *                 padded window size (true) or number of in-bounds
  *                 elements (false).
  *
  * Each output gradient is divided by its window's divisor and added to
  * every input position the window covers. Window bounds are clamped with
  * int64_t arithmetic rather than fminf/fmaxf so indices are never rounded
  * to float precision.
  */
 static void THNN_(VolumetricAveragePooling_updateGradInput_frame)(
           real *gradInput_p,
           real *gradOutput_p,
           int64_t nslices,
           int64_t itime,
           int64_t iwidth,
           int64_t iheight,
           int64_t otime,
           int64_t owidth,
           int64_t oheight,
           int kT,
           int kW,
           int kH,
           int dT,
           int dW,
           int dH,
           int padT,
           int padW,
           int padH,
           bool count_include_pad)
 {
   int64_t k;
 #pragma omp parallel for private(k)
   for (k = 0; k < nslices; k++)
   {
     int64_t i, j, ti;

     /* local pointers */
     real *ip = gradInput_p + k * itime * iwidth * iheight;
     real *op = gradOutput_p + k * otime * owidth * oheight;
     for (i = 0; i < itime*iwidth*iheight; i++)
       *(ip + i) = 0;

     /* loop over output */
     for (ti = 0; ti < otime; ti++)
     {
       for (i = 0; i < oheight; i++)
       {
         for (j = 0; j < owidth; j++)
         {
           /* window in padded coordinates, clipped to the padded volume */
           int64_t tstart = ti * dT - padT;
           int64_t hstart = i  * dH - padH;
           int64_t wstart = j  * dW - padW;
           int64_t tend = tstart + kT;
           int64_t hend = hstart + kH;
           int64_t wend = wstart + kW;
           if (tend > itime   + padT) tend = itime   + padT;
           if (hend > iheight + padH) hend = iheight + padH;
           if (wend > iwidth  + padW) wend = iwidth  + padW;
           int64_t pool_size = (tend - tstart) * (hend - hstart) * (wend - wstart);

           /* clip to the real input so only valid elements are written */
           if (tstart < 0) tstart = 0;
           if (hstart < 0) hstart = 0;
           if (wstart < 0) wstart = 0;
           if (tend > itime)   tend = itime;
           if (hend > iheight) hend = iheight;
           if (wend > iwidth)  wend = iwidth;

           int64_t divide_factor;
           if (count_include_pad)
             divide_factor = pool_size;
           else
             divide_factor = (tend - tstart) * (hend - hstart) * (wend - wstart);

           /* scatter gradients out to footprint: */
           real val  = *op++;
           /* the same share goes to every covered element; compute it once */
           real share = val / divide_factor;

           int64_t x,y,z;
           for (z = tstart; z < tend; z++)
           {
             for (y = hstart; y < hend; y++)
             {
               for (x = wstart; x < wend; x++)
               {
                 *(ip + z * iheight * iwidth + y * iwidth + x) += share;
               }
             }
           }
         }
       }
     }
   }
 }

 /*
  * Backward pass of volumetric average pooling.
  *
  * Checks the arguments (including that gradOutput has the expected output
  * sizes), resizes gradInput like `input` and zeroes it, then runs the
  * per-frame backward kernel on a contiguous copy of gradOutput, once per
  * batch entry when `input` is 5D.
  */
 void THNN_(VolumetricAveragePooling_updateGradInput)(
           THNNState *state,
           THTensor *input,
           THTensor *gradOutput,
           THTensor *gradInput,
           int kT,
           int kW,
           int kH,
           int dT,
           int dW,
           int dH,
           int padT,
           int padW,
           int padH,
           bool ceil_mode,
           bool count_include_pad)
 {
   THNN_(VolumetricAveragePooling_shapeCheck)(
         state, input, gradOutput, kT, kW, kH,
         dT, dW, dH, padT, padW, padH, ceil_mode);

   /* The frame kernel reads raw memory, so use a contiguous gradOutput. */
   gradOutput = THTensor_(newContiguous)(gradOutput);

   /* gradInput takes the shape of input and starts out as all zeros. */
   THTensor_(resizeAs)(gradInput, input);
   THTensor_(zero)(gradInput);

   /* In batch mode (5D) the slice/time/height/width axes sit one later. */
   const int shift = (input->nDimension == 5) ? 1 : 0;

   const int64_t nslices = input->size[0 + shift];
   const int64_t itime   = input->size[1 + shift];
   const int64_t iheight = input->size[2 + shift];
   const int64_t iwidth  = input->size[3 + shift];
   const int64_t otime   = gradOutput->size[1 + shift];
   const int64_t oheight = gradOutput->size[2 + shift];
   const int64_t owidth  = gradOutput->size[3 + shift];

   real *gin  = THTensor_(data)(gradInput);
   real *gout = THTensor_(data)(gradOutput);

   if (input->nDimension != 4)
   {
     /* batch mode: one frame call per batch entry */
     int64_t b;
     const int64_t nBatch = input->size[0];
     const int64_t in_step  = nslices * itime * iwidth * iheight;
     const int64_t out_step = nslices * otime * owidth * oheight;

 #pragma omp parallel for private(b)
     for (b = 0; b < nBatch; b++)
     {
       THNN_(VolumetricAveragePooling_updateGradInput_frame)(
         gin + b * in_step, gout + b * out_step, nslices,
         itime, iwidth, iheight,
         otime, owidth, oheight,
         kT, kW, kH,
         dT, dW, dH,
         padT, padW, padH,
         count_include_pad
       );
     }
   }
   else
   {
     /* non-batch mode: the whole tensor is a single frame */
     THNN_(VolumetricAveragePooling_updateGradInput_frame)(
       gin, gout, nslices,
       itime, iwidth, iheight,
       otime, owidth, oheight,
       kT, kW, kH,
       dT, dW, dH,
       padT, padW, padH,
       count_include_pad
     );
   }

   /* release the reference taken by newContiguous */
   THTensor_(free)(gradOutput);
 }

 #endif
	#ifndef TH_GENERIC_FILE
	#define TH_GENERIC_FILE "generic/VolumetricAveragePooling.c"
	#else

	/*
	 * Validate the arguments of a volumetric (3D) average pooling call.
	 *
	 *   state       not referenced by this function.
	 *   input       4D tensor, or 5D in batch mode; the last three axes are
	 *               read as time, height and width.
	 *   gradOutput  optional; when non-NULL its slice/time/height/width sizes
	 *               must equal the output sizes computed here.
	 *   kT,kW,kH    kernel extents, each must be > 0.
	 *   dT,dW,dH    strides, each must be > 0.
	 *   padT,padW,padH  padding; each may be at most half of the matching
	 *               kernel extent.
	 *   ceil_mode   round the output extent up instead of down.
	 *
	 * Violations are reported through THArgCheck / THNN_ARGCHECK / THError.
	 */
	static inline void THNN_(VolumetricAveragePooling_shapeCheck)(
	    THNNState *state,
	    THTensor *input,
	    THTensor *gradOutput,
	    int kT,
	    int kW,
	    int kH,
	    int dT,
	    int dW,
	    int dH,
	    int padT,
	    int padW,
	    int padH,
	    bool ceil_mode)
	{
	  int64_t nslices;
	  int64_t itime;
	  int64_t iheight;
	  int64_t iwidth;
	  int64_t otime;
	  int64_t oheight;
	  int64_t owidth;
	  int ndim = input->nDimension;
	  int dimN = 0;
	  int dimt = 1;
	  int dimh = 2;
	  int dimw = 3;

	  /* batch mode: every axis of interest moves one position to the right */
	  if (input->nDimension == 5)
	  {
	    dimN++;
	    dimt++;
	    dimh++;
	    dimw++;
	  }

	  THArgCheck(kT > 0 && kW > 0 && kH > 0, 5,
	             "kernel size should be greater than zero, but got kT: %d kH: %d kW: %d",
	             kT, kH, kW);
	  THArgCheck(dT > 0 && dW > 0 && dH > 0, 8,
	             "stride should be greater than zero, but got dT: %d dH: %d dW: %d",
	             dT, dH, dW);
	  THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
	                "4D or 5D (batch mode) tensor expected for input, but got: %s");

	  /* The sizes are 64-bit values, so they are printed with %lld and an
	     explicit cast; handing them to %d is undefined behaviour. */
	  THArgCheck(input->size[dimw] >= kW && input->size[dimh] >= kH
	             && input->size[dimt] >= kT, 2,
	             "input image (T: %lld H: %lld W: %lld) smaller than "
	             "kernel size (kT: %d kH: %d kW: %d)",
	             (long long)input->size[dimt], (long long)input->size[dimh],
	             (long long)input->size[dimw],
	             kT, kH, kW);

	  // The second argument is argNumber... here is the index of padH.
	  THArgCheck(kT/2 >= padT && kW/2 >= padW && kH/2 >= padH, 11,
	             "pad should not be greater than half of kernel size, but got "
	             "padT = %d, padW = %d, padH = %d, kT = %d, kW = %d, kH = %d",
	             padT, padW, padH, kT, kW, kH);

	  /* sizes */
	  nslices = input->size[dimN];
	  itime = input->size[dimt];
	  iheight = input->size[dimh];
	  iwidth = input->size[dimw];

	  if (ceil_mode) {
	    otime = (int64_t)(ceil((float)(itime - kT + 2*padT) / dT)) + 1;
	    oheight = (int64_t)(ceil((float)(iheight - kH + 2*padH) / dH)) + 1;
	    owidth = (int64_t)(ceil((float)(iwidth - kW + 2*padW) / dW)) + 1;
	  }
	  else
	  {
	    otime = (int64_t)(floor((float)(itime - kT + 2*padT) / dT)) + 1;
	    oheight = (int64_t)(floor((float)(iheight - kH + 2*padH) / dH)) + 1;
	    owidth = (int64_t)(floor((float)(iwidth - kW + 2*padW) / dW)) + 1;
	  }

	  if (padT || padW || padH)
	  {
	    // ensure that the last pooling starts inside the image
	    // needed to avoid problems in ceil mode
	    if ((otime - 1)*dT >= itime + padT)
	      --otime;
	    if ((oheight - 1)*dH >= iheight + padH)
	      --oheight;
	    if ((owidth - 1)*dW >= iwidth + padW)
	      --owidth;
	  }

	  if (otime < 1 || owidth < 1 || oheight < 1)
	    THError("Given input size: (%lldx%lldx%lldx%lld). "
	            "Calculated output size: (%lldx%lldx%lldx%lld). Output size is too small",
	            (long long)nslices, (long long)itime, (long long)iheight, (long long)iwidth,
	            (long long)nslices, (long long)otime, (long long)oheight, (long long)owidth);

	  /* only the backward pass supplies gradOutput */
	  if (gradOutput != NULL) {
	    THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimN, nslices);
	    THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimt, otime);
	    THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, oheight);
	    THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, owidth);
	  }
	}

	/*
	 * Average-pool `nslices` volumes stored back to back.
	 *
	 *   input_p   nslices * itime * iheight * iwidth values; within a volume
	 *             the element (z, y, x) lives at z*iwidth*iheight + y*iwidth + x.
	 *   output_p  nslices * otime * oheight * owidth values, written in
	 *             (time, height, width) order.
	 *   kT,kW,kH / dT,dW,dH / padT,padW,padH  kernel, stride and padding.
	 *   count_include_pad  true: divide by the size of the window inside the
	 *             padded volume; false: divide by the number of input elements
	 *             actually summed.
	 *
	 * All window arithmetic is done on int64_t; routing the indices through
	 * fminf/fmaxf would round them to float precision.
	 */
	static void THNN_(VolumetricAveragePooling_updateOutput_frame)(
	    real *input_p,
	    real *output_p,
	    int64_t nslices,
	    int64_t itime,
	    int64_t iwidth,
	    int64_t iheight,
	    int64_t otime,
	    int64_t owidth,
	    int64_t oheight,
	    int kT,
	    int kW,
	    int kH,
	    int dT,
	    int dW,
	    int dH,
	    int padT,
	    int padW,
	    int padH,
	    bool count_include_pad)
	{
	  int64_t k;
	#pragma omp parallel for private(k)
	  for (k = 0; k < nslices; k++)
	  {
	    int64_t i, j, ti;

	    /* local pointers. */
	    real *ip = input_p + k * itime * iwidth * iheight;
	    real *op = output_p + k * otime * owidth * oheight;
	    for (i = 0; i < otime * oheight * owidth; ++i)
	      *(op + i) = 0;

	    /* loop over output */
	    for (ti = 0; ti < otime; ti++)
	    {
	      for (i = 0; i < oheight; i++)
	      {
	        for (j = 0; j < owidth; j++)
	        {
	          /* window in padded coordinates, clipped to the padded volume */
	          int64_t tstart = ti * dT - padT;
	          int64_t hstart = i * dH - padH;
	          int64_t wstart = j * dW - padW;
	          int64_t tend = tstart + kT;
	          int64_t hend = hstart + kH;
	          int64_t wend = wstart + kW;
	          if (tend > itime + padT) tend = itime + padT;
	          if (hend > iheight + padH) hend = iheight + padH;
	          if (wend > iwidth + padW) wend = iwidth + padW;
	          int64_t pool_size = (tend - tstart) * (hend - hstart) * (wend - wstart);

	          /* clip to the real input so only valid elements are read */
	          if (tstart < 0) tstart = 0;
	          if (hstart < 0) hstart = 0;
	          if (wstart < 0) wstart = 0;
	          if (tend > itime) tend = itime;
	          if (hend > iheight) hend = iheight;
	          if (wend > iwidth) wend = iwidth;

	          int64_t divide_factor;
	          if (count_include_pad)
	            divide_factor = pool_size;
	          else
	            divide_factor = (tend - tstart) * (hend - hstart) * (wend - wstart);

	          /* compute local sum: */
	          real sum = 0.0;
	          int64_t x, y, z;

	          for (z = tstart; z < tend; z++)
	          {
	            for (y = hstart; y < hend; y++)
	            {
	              for (x = wstart; x < wend; x++)
	              {
	                sum += *(ip + z * iwidth * iheight + y * iwidth + x);
	              }
	            }
	          }

	          /* set output to local average */
	          *op++ += sum / divide_factor;
	        }
	      }
	    }
	  }
	}

	/*
	 * Forward pass of volumetric average pooling.
	 *
	 * Checks the arguments, derives the output extents from the input sizes,
	 * kernel, stride, padding and ceil_mode, resizes `output` to match (4D, or
	 * 5D in batch mode) and runs the per-frame kernel on a contiguous copy of
	 * `input`, once per batch entry.
	 */
	void THNN_(VolumetricAveragePooling_updateOutput)(
	    THNNState *state,
	    THTensor *input,
	    THTensor *output,
	    int kT,
	    int kW,
	    int kH,
	    int dT,
	    int dW,
	    int dH,
	    int padT,
	    int padW,
	    int padH,
	    bool ceil_mode,
	    bool count_include_pad)
	{
	  int64_t nslices;
	  int64_t itime;
	  int64_t iheight;
	  int64_t iwidth;
	  int64_t otime;
	  int64_t oheight;
	  int64_t owidth;
	  real *input_data;
	  real *output_data;

	  THNN_(VolumetricAveragePooling_shapeCheck)(
	      state, input, NULL, kT, kW, kH,
	      dT, dW, dH, padT, padW, padH, ceil_mode);

	  int dimN = 0;
	  int dimt = 1;
	  int dimh = 2;
	  int dimw = 3;

	  /* batch mode: every axis of interest moves one position to the right */
	  if (input->nDimension == 5)
	  {
	    dimN++;
	    dimt++;
	    dimh++;
	    dimw++;
	  }

	  /* sizes */
	  nslices = input->size[dimN];
	  itime = input->size[dimt];
	  iheight = input->size[dimh];
	  iwidth = input->size[dimw];
	  if (ceil_mode)
	  {
	    otime = (int64_t)(ceil((float)(itime - kT + 2*padT) / dT)) + 1;
	    oheight = (int64_t)(ceil((float)(iheight - kH + 2*padH) / dH)) + 1;
	    owidth = (int64_t)(ceil((float)(iwidth - kW + 2*padW) / dW)) + 1;
	  }
	  else
	  {
	    otime = (int64_t)(floor((float)(itime - kT + 2*padT) / dT)) + 1;
	    oheight = (int64_t)(floor((float)(iheight - kH + 2*padH) / dH)) + 1;
	    owidth = (int64_t)(floor((float)(iwidth - kW + 2*padW) / dW)) + 1;
	  }
	  if (padT || padH || padW)
	  {
	    // ensure that the last pooling starts inside the image
	    // needed to avoid problems in ceil mode
	    if ((otime - 1)*dT >= itime + padT)
	      --otime;
	    if ((oheight - 1)*dH >= iheight + padH)
	      --oheight;
	    if ((owidth - 1)*dW >= iwidth + padW)
	      --owidth;
	  }

	  /* get contiguous input: the frame kernel indexes raw memory */
	  input = THTensor_(newContiguous)(input);

	  if (input->nDimension == 4) /* non-batch mode */
	  {
	    /* resize output */
	    THTensor_(resize4d)(output, nslices, otime, oheight, owidth);

	    input_data = THTensor_(data)(input);
	    output_data = THTensor_(data)(output);

	    THNN_(VolumetricAveragePooling_updateOutput_frame)(
	      input_data, output_data, nslices,
	      itime, iwidth, iheight,
	      otime, owidth, oheight,
	      kT, kW, kH,
	      dT, dW, dH,
	      padT, padW, padH,
	      count_include_pad
	    );
	  }
	  else /* batch mode */
	  {
	    int64_t p;
	    int64_t nBatch = input->size[0];

	    /* element counts of one batch entry in input and output */
	    int64_t istride = nslices * itime * iwidth * iheight;
	    int64_t ostride = nslices * otime * owidth * oheight;

	    /* resize output */
	    THTensor_(resize5d)(output, nBatch, nslices, otime, oheight, owidth);

	    input_data = THTensor_(data)(input);
	    output_data = THTensor_(data)(output);

	#pragma omp parallel for private(p)
	    for (p=0; p < nBatch; p++)
	    {
	      THNN_(VolumetricAveragePooling_updateOutput_frame)(
	        input_data + p * istride, output_data + p * ostride, nslices,
	        itime, iwidth, iheight,
	        otime, owidth, oheight,
	        kT, kW, kH,
	        dT, dW, dH,
	        padT, padW, padH,
	        count_include_pad
	      );
	    }
	  }

	  /* cleanup: release the reference taken by newContiguous */
	  THTensor_(free)(input);
	}

	/*
	 * Backward kernel of volumetric average pooling for `nslices` volumes
	 * stored back to back.
	 *
	 *   gradInput_p   nslices * itime * iheight * iwidth values; zeroed here,
	 *                 then accumulated into. Element (z, y, x) of a volume
	 *                 lives at z*iheight*iwidth + y*iwidth + x.
	 *   gradOutput_p  nslices * otime * oheight * owidth values, read in
	 *                 (time, height, width) order.
	 *   count_include_pad  selects the divisor exactly as in the forward pass:
	 *                 padded window size (true) or number of in-bounds
	 *                 elements (false).
	 *
	 * Each output gradient is divided by its window's divisor and added to
	 * every input position the window covers. Window bounds are clamped with
	 * int64_t arithmetic rather than fminf/fmaxf so indices are never rounded
	 * to float precision.
	 */
	static void THNN_(VolumetricAveragePooling_updateGradInput_frame)(
	    real *gradInput_p,
	    real *gradOutput_p,
	    int64_t nslices,
	    int64_t itime,
	    int64_t iwidth,
	    int64_t iheight,
	    int64_t otime,
	    int64_t owidth,
	    int64_t oheight,
	    int kT,
	    int kW,
	    int kH,
	    int dT,
	    int dW,
	    int dH,
	    int padT,
	    int padW,
	    int padH,
	    bool count_include_pad)
	{
	  int64_t k;
	#pragma omp parallel for private(k)
	  for (k = 0; k < nslices; k++)
	  {
	    int64_t i, j, ti;

	    /* local pointers */
	    real *ip = gradInput_p + k * itime * iwidth * iheight;
	    real *op = gradOutput_p + k * otime * owidth * oheight;
	    for (i = 0; i < itime*iwidth*iheight; i++)
	      *(ip + i) = 0;

	    /* loop over output */
	    for (ti = 0; ti < otime; ti++)
	    {
	      for (i = 0; i < oheight; i++)
	      {
	        for (j = 0; j < owidth; j++)
	        {
	          /* window in padded coordinates, clipped to the padded volume */
	          int64_t tstart = ti * dT - padT;
	          int64_t hstart = i * dH - padH;
	          int64_t wstart = j * dW - padW;
	          int64_t tend = tstart + kT;
	          int64_t hend = hstart + kH;
	          int64_t wend = wstart + kW;
	          if (tend > itime + padT) tend = itime + padT;
	          if (hend > iheight + padH) hend = iheight + padH;
	          if (wend > iwidth + padW) wend = iwidth + padW;
	          int64_t pool_size = (tend - tstart) * (hend - hstart) * (wend - wstart);

	          /* clip to the real input so only valid elements are written */
	          if (tstart < 0) tstart = 0;
	          if (hstart < 0) hstart = 0;
	          if (wstart < 0) wstart = 0;
	          if (tend > itime) tend = itime;
	          if (hend > iheight) hend = iheight;
	          if (wend > iwidth) wend = iwidth;

	          int64_t divide_factor;
	          if (count_include_pad)
	            divide_factor = pool_size;
	          else
	            divide_factor = (tend - tstart) * (hend - hstart) * (wend - wstart);

	          /* scatter gradients out to footprint: */
	          real val = *op++;
	          /* the same share goes to every covered element; compute it once */
	          real share = val / divide_factor;

	          int64_t x,y,z;
	          for (z = tstart; z < tend; z++)
	          {
	            for (y = hstart; y < hend; y++)
	            {
	              for (x = wstart; x < wend; x++)
	              {
	                *(ip + z * iheight * iwidth + y * iwidth + x) += share;
	              }
	            }
	          }
	        }
	      }
	    }
	  }
	}

	/*
	 * Backward pass of volumetric average pooling.
	 *
	 * Checks the arguments (including that gradOutput has the expected output
	 * sizes), resizes gradInput like `input` and zeroes it, then runs the
	 * per-frame backward kernel on a contiguous copy of gradOutput, once per
	 * batch entry when `input` is 5D.
	 */
	void THNN_(VolumetricAveragePooling_updateGradInput)(
	    THNNState *state,
	    THTensor *input,
	    THTensor *gradOutput,
	    THTensor *gradInput,
	    int kT,
	    int kW,
	    int kH,
	    int dT,
	    int dW,
	    int dH,
	    int padT,
	    int padW,
	    int padH,
	    bool ceil_mode,
	    bool count_include_pad)
	{
	  THNN_(VolumetricAveragePooling_shapeCheck)(
	      state, input, gradOutput, kT, kW, kH,
	      dT, dW, dH, padT, padW, padH, ceil_mode);

	  /* The frame kernel reads raw memory, so use a contiguous gradOutput. */
	  gradOutput = THTensor_(newContiguous)(gradOutput);

	  /* gradInput takes the shape of input and starts out as all zeros. */
	  THTensor_(resizeAs)(gradInput, input);
	  THTensor_(zero)(gradInput);

	  /* In batch mode (5D) the slice/time/height/width axes sit one later. */
	  const int shift = (input->nDimension == 5) ? 1 : 0;

	  const int64_t nslices = input->size[0 + shift];
	  const int64_t itime   = input->size[1 + shift];
	  const int64_t iheight = input->size[2 + shift];
	  const int64_t iwidth  = input->size[3 + shift];
	  const int64_t otime   = gradOutput->size[1 + shift];
	  const int64_t oheight = gradOutput->size[2 + shift];
	  const int64_t owidth  = gradOutput->size[3 + shift];

	  real *gin  = THTensor_(data)(gradInput);
	  real *gout = THTensor_(data)(gradOutput);

	  if (input->nDimension != 4)
	  {
	    /* batch mode: one frame call per batch entry */
	    int64_t b;
	    const int64_t nBatch = input->size[0];
	    const int64_t in_step  = nslices * itime * iwidth * iheight;
	    const int64_t out_step = nslices * otime * owidth * oheight;

	#pragma omp parallel for private(b)
	    for (b = 0; b < nBatch; b++)
	    {
	      THNN_(VolumetricAveragePooling_updateGradInput_frame)(
	        gin + b * in_step, gout + b * out_step, nslices,
	        itime, iwidth, iheight,
	        otime, owidth, oheight,
	        kT, kW, kH,
	        dT, dW, dH,
	        padT, padW, padH,
	        count_include_pad
	      );
	    }
	  }
	  else
	  {
	    /* non-batch mode: the whole tensor is a single frame */
	    THNN_(VolumetricAveragePooling_updateGradInput_frame)(
	      gin, gout, nslices,
	      itime, iwidth, iheight,
	      otime, owidth, oheight,
	      kT, kW, kH,
	      dT, dW, dH,
	      padT, padW, padH,
	      count_include_pad
	    );
	  }

	  /* release the reference taken by newContiguous */
	  THTensor_(free)(gradOutput);
	}

	#endif