| #include "THCTensorMath.h" |
| #include "THCGeneral.h" |
| #include "THCBlas.h" |
| #include "THCTensorRandom.h" |
| |
| #include <thrust/fill.h> |
| #include <thrust/functional.h> |
| #include <thrust/reduce.h> |
| #include <thrust/inner_product.h> |
| |
| #define NB_THREADS_PER_BLOCK 256 |
| |
| #ifndef DIVUP |
| #define DIVUP(x, y) (((x) + (y) - 1) / (y)) |
| #endif |
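| /* e.g. DIVUP(10, 4) == 3; used below to round grid dimensions up */ |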
| |
| void THCudaTensor_fill(THCudaTensor *self_, float value) |
| { |
| THCudaTensor *self = THCudaTensor_newContiguous(self_); |
| thrust::device_ptr<float> self_data(THCudaTensor_data(self)); |
| |
| thrust::fill(self_data, self_data+THCudaTensor_nElement(self), value); |
| |
| THCudaTensor_freeCopyTo(self, self_); |
| } |
| |
| void THCudaTensor_zero(THCudaTensor *self_) |
| { |
| THCudaTensor *self = THCudaTensor_newContiguous(self_); |
| THCudaCheck(cudaMemset(THCudaTensor_data(self), 0, sizeof(float)*THCudaTensor_nElement(self))); |
| THCudaTensor_freeCopyTo(self, self_); |
| } |
| |
| void THCudaTensor_zeros(THCudaTensor *r_, THLongStorage *size) |
| { |
| THCudaTensor_resize(r_, size, NULL); |
| THCudaTensor_zero(r_); |
| } |
| |
| void THCudaTensor_ones(THCudaTensor *r_, THLongStorage *size) |
| { |
| THCudaTensor_resize(r_, size, NULL); |
| THCudaTensor_fill(r_, 1); |
| } |
| |
| void THCudaTensor_reshape(THCudaTensor *r_, THCudaTensor *t, THLongStorage *size) |
| { |
| THCudaTensor_resize(r_, size, NULL); |
| THCudaTensor_copy(r_, t); |
| } |
| |
| long THCudaTensor_numel(THCudaTensor *t) |
| { |
| return THCudaTensor_nElement(t); |
| } |
| |
| |
| struct addvalue_functor |
| { |
| const float value; |
| |
| addvalue_functor(float value_) : value(value_) {} |
| |
| __host__ __device__ float operator()(const float& x) const |
| { |
| return (x+value); |
| } |
| }; |
| |
| void THCudaTensor_add(THCudaTensor *self_, THCudaTensor *src_, float value) |
| { |
| THCudaTensor_resizeAs(self_, src_); |
| THCudaTensor *self = THCudaTensor_newContiguous(self_); |
| THCudaTensor *src = THCudaTensor_newContiguous(src_); |
| long size = THCudaTensor_nElement(self); |
| thrust::device_ptr<float> self_data(THCudaTensor_data(self)); |
| thrust::device_ptr<float> src_data(THCudaTensor_data(src)); |
| |
| thrust::transform(src_data, src_data+size, self_data, addvalue_functor(value)); |
| |
| THCudaTensor_free(src); |
| THCudaTensor_freeCopyTo(self, self_); |
| } |
| |
| struct mulvalue_functor |
| { |
| const float value; |
| mulvalue_functor(float value_) : value(value_) {} |
| __host__ __device__ float operator()(const float& x) const |
| { |
| return (x*value); |
| } |
| }; |
| |
| void THCudaTensor_mul(THCudaTensor *self_, THCudaTensor *src_, float value) |
| { |
| THCudaTensor_resizeAs(self_, src_); |
| THCudaTensor *self = THCudaTensor_newContiguous(self_); |
| THCudaTensor *src = THCudaTensor_newContiguous(src_); |
| long size = THCudaTensor_nElement(self); |
| thrust::device_ptr<float> self_data(THCudaTensor_data(self)); |
| thrust::device_ptr<float> src_data(THCudaTensor_data(src)); |
| |
| thrust::transform(src_data, src_data+size, self_data, mulvalue_functor(value)); |
| |
| THCudaTensor_free(src); |
| THCudaTensor_freeCopyTo(self, self_); |
| } |
| |
| struct divvalue_functor |
| { |
| const float value; |
| divvalue_functor(float value_) : value(value_) {} |
| __host__ __device__ float operator()(const float& x) const |
| { |
| return (x/value); |
| } |
| }; |
| |
| void THCudaTensor_div(THCudaTensor *self_, THCudaTensor *src_, float value) |
| { |
| THCudaTensor_resizeAs(self_, src_); |
| THCudaTensor *self = THCudaTensor_newContiguous(self_); |
| THCudaTensor *src = THCudaTensor_newContiguous(src_); |
| long size = THCudaTensor_nElement(self); |
| thrust::device_ptr<float> self_data(THCudaTensor_data(self)); |
| thrust::device_ptr<float> src_data(THCudaTensor_data(src)); |
| |
| thrust::transform(src_data, src_data+size, self_data, divvalue_functor(value)); |
| |
| THCudaTensor_free(src); |
| THCudaTensor_freeCopyTo(self, self_); |
| } |
| |
| void THCudaTensor_cadd(THCudaTensor *self_, THCudaTensor* src1, float value, THCudaTensor *src2) |
| { |
| THCudaTensor_resizeAs(self_, src1); |
| THArgCheck(THCudaTensor_nElement(src1) == THCudaTensor_nElement(src2), 3, "sizes do not match"); |
| { |
| THCudaTensor *self = THCudaTensor_newContiguous(self_); |
| |
| if (self_ != src1) { |
| src1 = THCudaTensor_newContiguous(src1); |
| THCudaTensor_copy(self, src1); |
| THCudaTensor_free(src1); |
| } |
| |
| src2 = THCudaTensor_newContiguous(src2); |
| |
| THCudaBlas_axpy(THCudaTensor_nElement(self), value, THCudaTensor_data(src2), 1, THCudaTensor_data(self), 1); |
| |
| THCudaTensor_free(src2); |
| THCudaTensor_freeCopyTo(self, self_); |
| } |
| } |
| |
| void THCudaTensor_cmul(THCudaTensor *self_, THCudaTensor *src1, THCudaTensor *src2) |
| { |
| THCudaTensor_resizeAs(self_, src1); |
| THArgCheck(THCudaTensor_nElement(src1) == THCudaTensor_nElement(src2), 3, "sizes do not match"); |
| { |
| THCudaTensor *self = THCudaTensor_newContiguous(self_); |
| long size = THCudaTensor_nElement(self); |
| src1 = THCudaTensor_newContiguous(src1); |
| src2 = THCudaTensor_newContiguous(src2); |
| thrust::device_ptr<float> self_data(THCudaTensor_data(self)); |
| thrust::device_ptr<float> src1_data(THCudaTensor_data(src1)); |
| thrust::device_ptr<float> src2_data(THCudaTensor_data(src2)); |
| |
| thrust::transform(src2_data, src2_data+size, src1_data, self_data, thrust::multiplies<float>()); |
| |
| THCudaTensor_free(src1); |
| THCudaTensor_free(src2); |
| THCudaTensor_freeCopyTo(self, self_); |
| } |
| } |
| |
| void THCudaTensor_cdiv(THCudaTensor *self_, THCudaTensor *src1, THCudaTensor *src2) |
| { |
| THCudaTensor_resizeAs(self_, src1); |
| THArgCheck(THCudaTensor_nElement(src1) == THCudaTensor_nElement(src2), 3, "sizes do not match"); |
| { |
| THCudaTensor *self = THCudaTensor_newContiguous(self_); |
| long size = THCudaTensor_nElement(self); |
| src1 = THCudaTensor_newContiguous(src1); |
| src2 = THCudaTensor_newContiguous(src2); |
| thrust::device_ptr<float> self_data(THCudaTensor_data(self)); |
| thrust::device_ptr<float> src1_data(THCudaTensor_data(src1)); |
| thrust::device_ptr<float> src2_data(THCudaTensor_data(src2)); |
| |
| thrust::transform(src1_data, src1_data+size, src2_data, self_data, thrust::divides<float>()); |
| |
| THCudaTensor_free(src1); |
| THCudaTensor_free(src2); |
| THCudaTensor_freeCopyTo(self, self_); |
| } |
| } |
| |
| __global__ void THCudaTensor_kernel_addcmul(float *data, float value, float *src1, float *src2, long size) |
| { |
| long k = (((blockIdx.y * gridDim.x) + blockIdx.x) * blockDim.x) + threadIdx.x; |
| |
| if(k < size) |
| data[k] += value*src1[k]*src2[k]; |
| } |
| |
| |
| void THCudaTensor_addcmul(THCudaTensor *self_, THCudaTensor *t, float value, THCudaTensor *src1, THCudaTensor *src2) |
| { |
| if(self_ != t) |
| { |
| THCudaTensor_resizeAs(self_, t); |
| THCudaTensor_copy(self_, t); |
| } |
| THCudaTensor_resizeAs(self_, src1); |
| THArgCheck(THCudaTensor_nElement(src1) == THCudaTensor_nElement(src2), 3, "sizes do not match"); |
| { |
| THCudaTensor *self = THCudaTensor_newContiguous(self_); |
| long size = THCudaTensor_nElement(self); |
| src1 = THCudaTensor_newContiguous(src1); |
| src2 = THCudaTensor_newContiguous(src2); |
| |
| int nBlockPerRow, nBlockPerColumn, nThreadPerBlock; |
| THCudaGetGridSize(&nBlockPerRow, &nBlockPerColumn, &nThreadPerBlock, size); |
| dim3 threads(nThreadPerBlock); |
| dim3 grid(nBlockPerRow, nBlockPerColumn); |
| |
| THCudaTensor_kernel_addcmul<<<grid, threads>>>(THCudaTensor_data(self), value, THCudaTensor_data(src1), THCudaTensor_data(src2), size); |
| |
| cudaError errcode = cudaGetLastError(); |
| if(errcode != cudaSuccess) |
| THError(cudaGetErrorString(errcode)); |
| |
| THCudaTensor_free(src1); |
| THCudaTensor_free(src2); |
| THCudaTensor_freeCopyTo(self, self_); |
| } |
| } |
| |
| __global__ void THCudaTensor_kernel_addcdiv(float *data, float value, float *src1, float *src2, long size) |
| { |
| long k = (((blockIdx.y * gridDim.x) + blockIdx.x) * blockDim.x) + threadIdx.x; |
| |
| if(k < size) |
| data[k] += value*src1[k]/src2[k]; |
| } |
| |
| |
| void THCudaTensor_addcdiv(THCudaTensor *self_, THCudaTensor *t, float value, THCudaTensor *src1, THCudaTensor *src2) |
| { |
| if(self_ != t) |
| { |
| THCudaTensor_resizeAs(self_, t); |
| THCudaTensor_copy(self_, t); |
| } |
| |
| THCudaTensor_resizeAs(self_, src1); |
| THArgCheck(THCudaTensor_nElement(src1) == THCudaTensor_nElement(src2), 3, "sizes do not match"); |
| { |
| THCudaTensor *self = THCudaTensor_newContiguous(self_); |
| long size = THCudaTensor_nElement(self); |
| src1 = THCudaTensor_newContiguous(src1); |
| src2 = THCudaTensor_newContiguous(src2); |
| |
| int nBlockPerRow, nBlockPerColumn, nThreadPerBlock; |
| THCudaGetGridSize(&nBlockPerRow, &nBlockPerColumn, &nThreadPerBlock, size); |
| dim3 threads(nThreadPerBlock); |
| dim3 grid(nBlockPerRow, nBlockPerColumn); |
| |
| THCudaTensor_kernel_addcdiv<<<grid, threads>>>(THCudaTensor_data(self), value, THCudaTensor_data(src1), THCudaTensor_data(src2), size); |
| |
| cudaError errcode = cudaGetLastError(); |
| if(errcode != cudaSuccess) |
| THError(cudaGetErrorString(errcode)); |
| |
| THCudaTensor_free(src1); |
| THCudaTensor_free(src2); |
| THCudaTensor_freeCopyTo(self, self_); |
| } |
| } |
| |
| float THCudaTensor_dot(THCudaTensor *self, THCudaTensor *src) |
| { |
| THArgCheck(THCudaTensor_nElement(self) == THCudaTensor_nElement(src), 2, "sizes do not match"); |
| |
| { |
| self = THCudaTensor_newContiguous(self); |
| src = THCudaTensor_newContiguous(src); |
| |
| float result = THCudaBlas_dot(THCudaTensor_nElement(self), |
| THCudaTensor_data(self), 1, |
| THCudaTensor_data(src), 1); |
| THCudaTensor_free(src); |
| THCudaTensor_free(self); |
| |
| return result; |
| } |
| } |
| |
| float THCudaTensor_minall(THCudaTensor *self) |
| { |
| self = THCudaTensor_newContiguous(self); |
| thrust::device_ptr<float> self_data(THCudaTensor_data(self)); |
| |
| float result = thrust::reduce(self_data, self_data+THCudaTensor_nElement(self), (float)(THInf), thrust::minimum<float>()); |
| |
| THCudaTensor_free(self); |
| return result; |
| } |
| |
| float THCudaTensor_maxall(THCudaTensor *self) |
| { |
| self = THCudaTensor_newContiguous(self); |
| thrust::device_ptr<float> self_data(THCudaTensor_data(self)); |
| |
| float result = thrust::reduce(self_data, self_data+THCudaTensor_nElement(self), (float)(-THInf), thrust::maximum<float>()); |
| |
| THCudaTensor_free(self); |
| return result; |
| } |
| |
| float THCudaTensor_sumall(THCudaTensor *self) |
| { |
| self = THCudaTensor_newContiguous(self); |
| thrust::device_ptr<float> self_data(THCudaTensor_data(self)); |
| |
| float result = thrust::reduce(self_data, self_data+THCudaTensor_nElement(self), (float)(0), thrust::plus<float>()); |
| |
| THCudaTensor_free(self); |
| return result; |
| } |
| |
| float THCudaTensor_prodall(THCudaTensor *self) |
| { |
| self = THCudaTensor_newContiguous(self); |
| thrust::device_ptr<float> self_data(THCudaTensor_data(self)); |
| |
| float result = thrust::reduce(self_data, self_data+THCudaTensor_nElement(self), (float)(1), thrust::multiplies<float>()); |
| |
| THCudaTensor_free(self); |
| return result; |
| } |
| |
| |
| |
| /* Fixed-size array of four unsigned values, used to pass tensor sizes and |
| * strides to kernels by value. */ |
| struct dim4 { |
| unsigned arr[4]; |
| |
| __host__ dim4(unsigned init=0) { |
| for(unsigned i=0; i<4; i++) { arr[i] = init; } |
| } |
| |
| __host__ __device__ unsigned& operator[](const unsigned& idx) { return arr[idx]; } |
| }; |
| |
| |
| |
| /* Reduce one of the outer dimensions of a tensor |
| * |
| * For an n-d tensor (n <= 4) where the reduction is *not* along the innermost |
| * dimension: |
| * |
| * - block.x and grid.x make up the innermost dimension; |
| * - the reduced dimension is looped over inside a block; |
| * - grid.y and grid.z are the remaining two dimensions (if any); and |
| * - block.y and block.z are not used, as we're limited to 512 or 1024 threads |
| * per block. |
| * |
| * In the size/stride arrays, index 3 is the reduced dimension and index 0 is the |
| * innermost dimension; the other dimensions occupy the indices in between. |
| * |
| * Reduction along the innermost dimension is handled in a separate kernel. |
| */ |
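| /* For example, a contiguous 2x3x4x5 tensor reduced along dimension 1 is remapped |
| * by THCudaTensor_transformReduceOuterDim (below) to |
| * |
| * size = { 5, 4, 2, 3 } (index 0 = innermost, index 3 = reduced) |
| * src_stride = { 1, 5, 60, 20 } |
| * |
| * so each thread accumulates size[3] = 3 elements spaced src_stride[3] = 20 |
| * floats apart and writes one output element per (col, y, z) coordinate. |
| */ |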
| template<class UnaryFunction, class BinaryFunction> |
| __global__ void THCudaTensor_kernel_transformReduceOuterDim(float *tgt, float *src_, |
| dim4 src_stride, dim4 tgt_stride, dim4 size, |
| UnaryFunction unary_op, float init, BinaryFunction binary_op) |
| { |
| const size_t reduce = 3; |
| |
| for(unsigned z = blockIdx.z; z < size[2] ; z += gridDim.z) |
| for(unsigned y = blockIdx.y; y < size[1] ; y += gridDim.y) |
| for(unsigned col = blockIdx.x * blockDim.x + threadIdx.x; col < size[0]; col += blockDim.x * gridDim.x) { |
| float *src = src_ + z * src_stride[2] + y * src_stride[1] + col; |
| float acc = init; |
| for(unsigned i=0; i < size[reduce]; i++) { |
| acc = binary_op(acc, unary_op(*src)); |
| src += src_stride[reduce]; |
| } |
| tgt[z * tgt_stride[2] + y * tgt_stride[1] + col] = float(acc); |
| } |
| } |
| |
| |
| |
| template<class UnaryFunction, class BinaryFunction> |
| __host__ void THCudaTensor_transformReduceOuterDim(THCudaTensor *tgt, THCudaTensor *src, |
| long rdim, UnaryFunction unary_op, float init, BinaryFunction binary_op) |
| { |
| const size_t reduce = 3; |
| dim4 src_stride(0); |
| dim4 tgt_stride(0); |
| dim4 size(1); |
| |
| unsigned ndim = THCudaTensor_nDimension(src); |
| for(unsigned idim=0, o=ndim-2; idim < ndim; idim++) { |
| unsigned odim = idim == rdim ? reduce : o--; |
| src_stride[odim] = THCudaTensor_stride(src, idim); |
| tgt_stride[odim] = THCudaTensor_stride(tgt, idim); |
| size[odim] = THCudaTensor_size(src, idim); |
| } |
| |
| const unsigned nThreadPerBlock = 256; |
| unsigned nBlockPerColumn = DIVUP(size[0], nThreadPerBlock); |
| dim3 threads(nThreadPerBlock); |
| unsigned maxGridDim = 1024; // anything < 64k is fine. The choice has no impact on performance. |
| dim3 grid(min(maxGridDim, nBlockPerColumn), min(maxGridDim, size[1]), min(maxGridDim, size[2])); |
| |
| THCudaTensor_kernel_transformReduceOuterDim<<<grid, threads>>>(THCudaTensor_data(tgt), |
| THCudaTensor_data(src), src_stride, tgt_stride, size, unary_op, init, binary_op); |
| cudaError errcode = cudaGetLastError(); |
| if(errcode != cudaSuccess) { |
| THError(cudaGetErrorString(errcode)); |
| } |
| } |
| |
| |
| |
| /* Reduce the innermost dimension of a tensor |
| * |
| * For an n-d tensor (n <= 4) where the reduction is along the innermost dimension: |
| * |
| * - block.x is the innermost dimension, i.e. dimension 0; |
| * - block.y and grid.y make up dimension 1; and |
| * - grid.x and grid.z are the remaining two outer dimensions (if any). |
| * |
| * Reduction along other dimensions is handled in a separate kernel. |
| */ |
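| /* For example, a contiguous 100x57 tensor reduced along its last dimension is |
| * remapped by THCudaTensor_transformReduceInnermostDim (below) to |
| * |
| * size = { 57, 100, 1, 1 } |
| * src_stride = { 1, 57, 0, 0 } |
| * |
| * Each 32x16 block then reduces 16 rows at a time, sweeping the 57 columns in |
| * 32-wide chunks and combining partials with the shared-memory tree reduction. |
| */ |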
| template<class UnaryFunction, class BinaryFunction> |
| __global__ void THCudaTensor_kernel_transformReduceInnermostDim(float *tgt, float *src_, |
| dim4 src_stride, dim4 tgt_stride, dim4 size, UnaryFunction unary_op, float init, BinaryFunction binary_op) |
| { |
| __shared__ float sbuf[16][32]; // 16*32 floats = 2kB |
| |
| for(unsigned z = blockIdx.z; z < size[3] ; z += gridDim.z) |
| for(unsigned x = blockIdx.x; x < size[2] ; x += gridDim.x) |
| for(unsigned bRow = blockIdx.y * blockDim.y; bRow < size[1]; bRow += blockDim.y * gridDim.y) { |
| |
| float acc = init; |
| unsigned row = bRow + threadIdx.y; |
| float *src = src_ + z * src_stride[3] + x * src_stride[2] + row * src_stride[1]; |
| bool reducing = threadIdx.x < blockDim.y && bRow + threadIdx.x < size[1] && threadIdx.y == 0; |
| |
| for(unsigned bCol=0; bCol < size[0]; bCol += blockDim.x) { |
| |
| sbuf[threadIdx.y][threadIdx.x] = init; |
| unsigned col = bCol + threadIdx.x; |
| if(row < size[1] && col < size[0]) { |
| sbuf[threadIdx.y][threadIdx.x] = unary_op(src[col]); |
| } |
| __syncthreads(); |
| |
| float* line = &sbuf[threadIdx.y][0]; |
| for(unsigned s = 16; s > 1; s >>= 1) { |
| if(row < size[1] && threadIdx.x < s) { |
| line[threadIdx.x] = binary_op(line[threadIdx.x], line[threadIdx.x + s]); |
| } |
| __syncthreads(); |
| } |
| if(reducing) { |
| sbuf[threadIdx.x][0] = binary_op(sbuf[threadIdx.x][0], sbuf[threadIdx.x][1]); |
| acc = binary_op(acc, sbuf[threadIdx.x][0]); |
| } |
| __syncthreads(); |
| } |
| |
| if(reducing) { |
| unsigned row = bRow + threadIdx.x; |
| unsigned tgt_offset = z * tgt_stride[3] + x * tgt_stride[2]; |
| tgt[tgt_offset + row] = acc; |
| } |
| } |
| } |
| |
| template<class UnaryFunction, class BinaryFunction> |
| __host__ void THCudaTensor_transformReduceInnermostDim(THCudaTensor *tgt, THCudaTensor *src, |
| UnaryFunction unary_op, float init, BinaryFunction binary_op) |
| { |
| dim4 src_stride(0); |
| dim4 tgt_stride(0); |
| dim4 size(1); |
| |
| unsigned ndim = THCudaTensor_nDimension(src); |
| for(unsigned dim=0; dim < ndim; dim++) { |
| unsigned odim = ndim - 1 - dim; |
| src_stride[odim] = THCudaTensor_stride(src, dim); |
| tgt_stride[odim] = THCudaTensor_stride(tgt, dim); |
| size[odim] = THCudaTensor_size(src, dim); |
| } |
| |
| dim3 threads(32, 16); |
| unsigned nBlockPerRow = DIVUP(size[1], threads.y); |
| unsigned maxGridDim = 1024; // anything < 64k is fine. The choice has no impact on performance. |
| dim3 grid(min(maxGridDim, size[2]), min(maxGridDim, nBlockPerRow), min(maxGridDim, size[3])); |
| |
| THCudaTensor_kernel_transformReduceInnermostDim<<<grid, threads>>>(THCudaTensor_data(tgt), |
| THCudaTensor_data(src), src_stride, tgt_stride, size, unary_op, init, binary_op); |
| cudaError errcode = cudaGetLastError(); |
| if(errcode != cudaSuccess) { |
| THError(cudaGetErrorString(errcode)); |
| } |
| } |
| |
| |
| template<class UnaryFunction, class BinaryFunction> |
| void THCudaTensor_transformReduceDim(THCudaTensor *self_, THCudaTensor *src, |
| long dimension, UnaryFunction unary_op, float init, BinaryFunction binary_op) |
| { |
| THArgCheck(dimension >= 0 && dimension < THCudaTensor_nDimension(src), 3, "dimension out of range"); |
| THArgCheck(THCudaTensor_nDimension(src) <= 4, 2, "too many dimensions (>4)"); |
| |
| THLongStorage *dim = THCudaTensor_newSizeOf(src); |
| THLongStorage_set(dim, dimension, 1); |
| THCudaTensor_resize(self_, dim, NULL); |
| THLongStorage_free(dim); |
| |
| THCudaTensor *self = THCudaTensor_newContiguous(self_); |
| src = THCudaTensor_newContiguous(src); |
| |
| if(dimension == THCudaTensor_nDimension(src)-1) { |
| THCudaTensor_transformReduceInnermostDim(self, src, unary_op, init, binary_op); |
| } else { |
| THCudaTensor_transformReduceOuterDim(self, src, dimension, unary_op, init, binary_op); |
| } |
| |
| THCudaTensor_free(src); |
| THCudaTensor_freeCopyTo(self, self_); |
| } |
| |
| |
| template<class BinaryFunction> |
| void THCudaTensor_reduceDim(THCudaTensor *self_, THCudaTensor *src, long dimension, float init, BinaryFunction binary_op) |
| { |
| THCudaTensor_transformReduceDim(self_, src, dimension, thrust::identity<float>(), init, binary_op); |
| } |
| |
| |
| void THCudaTensor_sum(THCudaTensor *self, THCudaTensor *src, long dimension) |
| { |
| return THCudaTensor_reduceDim(self, src, dimension, 0.0f, thrust::plus<float>()); |
| } |
| |
| void THCudaTensor_prod(THCudaTensor *self, THCudaTensor *src, long dimension) |
| { |
| return THCudaTensor_reduceDim(self, src, dimension, 1.0f, thrust::multiplies<float>()); |
| } |
| |
| /* A set of reduction kernels that take binary ops on thrust pairs of (value, index). |
| These are useful when you not only have to do a reduction, but also need to |
| preserve the index at which the reduced value occurred (for example, min/max |
| operations). |
| */ |
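| /* For example, reducing the 1-D tensor {3, 9, 4} with maxvalue_functor (defined |
| further below) and init = (-inf, -1) yields the pair (9, 2): the maximum value |
| together with its 1-based index. |
| */ |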
| template<class BinaryFunction> |
| __global__ void THCudaTensor_kernel_transformReduceOuterDimIndex(float *tgt1, float *tgt2, |
| float *src_, |
| dim4 src_stride, |
| dim4 tgt1_stride, |
| dim4 tgt2_stride, |
| dim4 size, |
| thrust::pair<float,float> init, |
| BinaryFunction binary_op) |
| { |
| const size_t reduce = 3; |
| |
| for(unsigned z = blockIdx.z; z < size[2] ; z += gridDim.z) |
| for(unsigned y = blockIdx.y; y < size[1] ; y += gridDim.y) |
| for(unsigned col = blockIdx.x * blockDim.x + threadIdx.x; col < size[0]; col += blockDim.x * gridDim.x) { |
| float *src = src_ + z * src_stride[2] + y * src_stride[1] + col; |
| thrust::pair<float,float> acc = init; |
| for(unsigned i=0; i < size[reduce]; i++) { |
| acc = binary_op(thrust::make_pair(*src, i+1), acc); // i+1 for 1-indexing |
| src += src_stride[reduce]; |
| } |
| tgt1[z * tgt1_stride[2] + y * tgt1_stride[1] + col] = acc.first; |
| tgt2[z * tgt2_stride[2] + y * tgt2_stride[1] + col] = acc.second; |
| } |
| } |
| |
| template<class BinaryFunction> |
| __host__ void THCudaTensor_transformReduceOuterDimIndex(THCudaTensor *tgt1, THCudaTensor *tgt2, |
| THCudaTensor *src, |
| long rdim, thrust::pair<float,float> init, |
| BinaryFunction binary_op) |
| { |
| const size_t reduce = 3; |
| dim4 src_stride(0); |
| dim4 tgt1_stride(0); |
| dim4 tgt2_stride(0); |
| dim4 size(1); |
| |
| unsigned ndim = THCudaTensor_nDimension(src); |
| for(unsigned idim=0, o=ndim-2; idim < ndim; idim++) { |
| unsigned odim = idim == rdim ? reduce : o--; |
| src_stride[odim] = THCudaTensor_stride(src, idim); |
| tgt1_stride[odim] = THCudaTensor_stride(tgt1, idim); |
| tgt2_stride[odim] = THCudaTensor_stride(tgt2, idim); |
| size[odim] = THCudaTensor_size(src, idim); |
| } |
| |
| const unsigned nThreadPerBlock = 256; |
| unsigned nBlockPerColumn = DIVUP(size[0], nThreadPerBlock); |
| dim3 threads(nThreadPerBlock); |
| unsigned maxGridDim = 1024; // anything < 64k is fine. The choice has no impact on performance. |
| dim3 grid(min(maxGridDim, nBlockPerColumn), min(maxGridDim, size[1]), min(maxGridDim, size[2])); |
| |
| THCudaTensor_kernel_transformReduceOuterDimIndex<<<grid, threads>>>( |
| THCudaTensor_data(tgt1), THCudaTensor_data(tgt2), |
| THCudaTensor_data(src), src_stride, tgt1_stride, tgt2_stride, size, init, binary_op); |
| cudaError errcode = cudaGetLastError(); |
| if(errcode != cudaSuccess) { |
| THError(cudaGetErrorString(errcode)); |
| } |
| } |
| |
| /* Reduce the innermost dimension of a tensor, using binary ops on thrust::pair (value, index) accumulators |
| * |
| * For an n-d tensor (n <= 4) where the reduction is along the innermost dimension: |
| * |
| * - block.x is the innermost dimension, i.e. dimension 0; |
| * - block.y and grid.y make up dimension 1; and |
| * - grid.x and grid.z are the remaining two outer dimensions (if any). |
| * |
| * Reduction along other dimensions is handled in a separate kernel. |
| */ |
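| /* For example, taking the max along the last dimension of the contiguous 2x3 |
| * tensor {{3, 9, 4}, {7, 2, 8}} writes the row maxima {9, 8} to tgt1 and their |
| * 1-based column indices {2, 3} to tgt2 (both of shape 2x1). |
| */ |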
| template<class BinaryFunction> |
| __global__ void THCudaTensor_kernel_transformReduceInnermostDimIndex( |
| float *tgt1, float* tgt2, float *src_, |
| dim4 src_stride, dim4 tgt1_stride, dim4 tgt2_stride, |
| dim4 size, thrust::pair<float,float> init, BinaryFunction binary_op) |
| { |
| __shared__ float sbuf[16][32]; // 16*32 floats = 2kB |
| __shared__ float ibuf[16][32]; // 16*32 floats = 2kB |
| |
| for(unsigned z = blockIdx.z; z < size[3] ; z += gridDim.z) |
| for(unsigned x = blockIdx.x; x < size[2] ; x += gridDim.x) |
| for(unsigned bRow = blockIdx.y * blockDim.y; bRow < size[1]; bRow += blockDim.y * gridDim.y) { |
| |
| thrust::pair<float,float> acc = init; |
| unsigned row = bRow + threadIdx.y; |
| float *src = src_ + z * src_stride[3] + x * src_stride[2] + row * src_stride[1]; |
| bool reducing = threadIdx.x < blockDim.y && bRow + threadIdx.x < size[1] && threadIdx.y == 0; |
| |
| for(unsigned bCol=0; bCol < size[0]; bCol += blockDim.x) { |
| |
| sbuf[threadIdx.y][threadIdx.x] = init.first; |
| ibuf[threadIdx.y][threadIdx.x] = init.second; |
| unsigned col = bCol + threadIdx.x; |
| if(row < size[1] && col < size[0]) { |
| sbuf[threadIdx.y][threadIdx.x] = src[col]; |
| ibuf[threadIdx.y][threadIdx.x] = col+1; // +1 for 1-indexing |
| } |
| __syncthreads(); |
| |
| float* sline = &sbuf[threadIdx.y][0]; |
| float* iline = &ibuf[threadIdx.y][0]; |
| for(unsigned s = 16; s > 1; s >>= 1) { |
| if(row < size[1] && threadIdx.x < s) { |
| thrust::pair<float,float> arg1 = thrust::make_pair<float,float>(sline[threadIdx.x], iline[threadIdx.x]); |
| thrust::pair<float,float> arg2 = thrust::make_pair<float,float>(sline[threadIdx.x + s], iline[threadIdx.x + s]); |
| thrust::pair<float,float> res = binary_op(arg1, arg2); |
| sline[threadIdx.x] = res.first; |
| iline[threadIdx.x] = res.second; |
| } |
| __syncthreads(); |
| } |
| if(reducing) { |
| thrust::pair<float,float> res = binary_op(thrust::make_pair<float,float>(sbuf[threadIdx.x][0], ibuf[threadIdx.x][0]), |
| thrust::make_pair<float,float>(sbuf[threadIdx.x][1], ibuf[threadIdx.x][1])); |
| sbuf[threadIdx.x][0] = res.first; |
| ibuf[threadIdx.x][0] = res.second; |
| acc = binary_op(acc, res); |
| } |
| __syncthreads(); |
| } |
| |
| if(reducing) { |
| unsigned row = bRow + threadIdx.x; |
| unsigned tgt1_offset = z * tgt1_stride[3] + x * tgt1_stride[2]; |
| unsigned tgt2_offset = z * tgt2_stride[3] + x * tgt2_stride[2]; |
| tgt1[tgt1_offset + row] = acc.first; |
| tgt2[tgt2_offset + row] = acc.second; |
| } |
| } |
| } |
| |
| template<class BinaryFunction> |
| __host__ void THCudaTensor_transformReduceInnermostDimIndex( |
| THCudaTensor *tgt1, THCudaTensor *tgt2, THCudaTensor *src, |
| thrust::pair<float,float> init, BinaryFunction binary_op) |
| { |
| dim4 src_stride(0); |
| dim4 tgt1_stride(0); |
| dim4 tgt2_stride(0); |
| dim4 size(1); |
| |
| unsigned ndim = THCudaTensor_nDimension(src); |
| for(unsigned dim=0; dim < ndim; dim++) { |
| unsigned odim = ndim - 1 - dim; |
| src_stride[odim] = THCudaTensor_stride(src, dim); |
| tgt1_stride[odim] = THCudaTensor_stride(tgt1, dim); |
| tgt2_stride[odim] = THCudaTensor_stride(tgt2, dim); |
| size[odim] = THCudaTensor_size(src, dim); |
| } |
| |
| dim3 threads(32, 16); |
| unsigned nBlockPerRow = DIVUP(size[1], threads.y); |
| unsigned maxGridDim = 1024; // anything < 64k is fine. The choice has no impact on performance. |
| dim3 grid(min(maxGridDim, size[2]), min(maxGridDim, nBlockPerRow), min(maxGridDim, size[3])); |
| |
| THCudaTensor_kernel_transformReduceInnermostDimIndex<<<grid, threads>>>( |
| THCudaTensor_data(tgt1), THCudaTensor_data(tgt2), |
| THCudaTensor_data(src), src_stride, tgt1_stride, tgt2_stride, size, init, binary_op); |
| cudaError errcode = cudaGetLastError(); |
| if(errcode != cudaSuccess) { |
| THError(cudaGetErrorString(errcode)); |
| } |
| } |
| |
| template<class BinaryFunction> |
| void THCudaTensor_reduceDimIndex(THCudaTensor *tgt1_, THCudaTensor *tgt2_, THCudaTensor *src, |
| long dimension, thrust::pair<float,float> init, |
| BinaryFunction binary_op) |
| { |
| THArgCheck(dimension >= 0 && dimension < THCudaTensor_nDimension(src), 3, "dimension out of range"); |
| THArgCheck(THCudaTensor_nDimension(src) <= 4, 2, "too many dimensions (>4)"); |
| |
| THLongStorage *dim = THCudaTensor_newSizeOf(src); |
| THLongStorage_set(dim, dimension, 1); |
| THCudaTensor_resize(tgt1_, dim, NULL); |
| THCudaTensor_resize(tgt2_, dim, NULL); |
| THLongStorage_free(dim); |
| |
| THCudaTensor *tgt1 = THCudaTensor_newContiguous(tgt1_); |
| THCudaTensor *tgt2 = THCudaTensor_newContiguous(tgt2_); |
| src = THCudaTensor_newContiguous(src); |
| |
| if(dimension == THCudaTensor_nDimension(src)-1) { |
| THCudaTensor_transformReduceInnermostDimIndex(tgt1, tgt2, src, init, binary_op); |
| } else { |
| THCudaTensor_transformReduceOuterDimIndex(tgt1, tgt2, src, dimension, init, binary_op); |
| } |
| |
| THCudaTensor_free(src); |
| THCudaTensor_freeCopyTo(tgt1, tgt1_); |
| THCudaTensor_freeCopyTo(tgt2, tgt2_); |
| } |
| |
| struct maxvalue_functor |
| { |
| __host__ __device__ thrust::pair<float,float> operator()(const thrust::pair<float,float> &a, |
| const thrust::pair<float,float> &b) |
| { |
| if (a.first > b.first) return a; |
| else return b; |
| } |
| }; |
| |
| void THCudaTensor_max(THCudaTensor *values, THCudaTensor *indices, THCudaTensor *src, long dimension) |
| { |
| const float minfloat32 = -3.402823466e+38f; |
| thrust::pair<float,float> init = thrust::make_pair<float,float>(minfloat32, -1); |
| return THCudaTensor_reduceDimIndex(values, indices, src, dimension, init, |
| maxvalue_functor()); |
| } |
| |
| struct minvalue_functor |
| { |
| __host__ __device__ thrust::pair<float,float> operator()(const thrust::pair<float,float> &a, |
| const thrust::pair<float,float> &b) |
| { |
| if (a.first < b.first) return a; |
| else return b; |
| } |
| }; |
| |
| void THCudaTensor_min(THCudaTensor *values, THCudaTensor *indices, THCudaTensor *src, long dimension) |
| { |
| const float maxfloat32 = 3.402823466e+38f; |
| thrust::pair<float,float> init = thrust::make_pair<float,float>(maxfloat32, -1); |
| return THCudaTensor_reduceDimIndex(values, indices, src, dimension, init, |
| minvalue_functor()); |
| } |
| |
| |
| void THCudaTensor_addmv(THCudaTensor *r_, float beta, THCudaTensor *t, float alpha, THCudaTensor *mat, THCudaTensor *vec) |
| { |
| if( (mat->nDimension != 2) || (vec->nDimension != 1) ) |
| THError("matrix and vector expected"); |
| |
| if( mat->size[1] != vec->size[0] ) |
| THError("size mismatch"); |
| |
| if(t->nDimension != 1) |
| THError("size mismatch"); |
| |
| if(t->size[0] != mat->size[0]) |
| THError("size mismatch"); |
| |
| if(r_ != t) |
| { |
| THCudaTensor_resizeAs(r_, t); |
| THCudaTensor_copy(r_, t); |
| } |
| |
| if(mat->stride[0] == 1) |
| { |
| THCudaBlas_gemv('n', mat->size[0], mat->size[1], |
| alpha, THCudaTensor_data(mat), mat->stride[1], |
| THCudaTensor_data(vec), vec->stride[0], |
| beta, THCudaTensor_data(r_), r_->stride[0]); |
| } |
| else if(mat->stride[1] == 1) |
| { |
| THCudaBlas_gemv('t', mat->size[1], mat->size[0], |
| alpha, THCudaTensor_data(mat), mat->stride[0], |
| THCudaTensor_data(vec), vec->stride[0], |
| beta, THCudaTensor_data(r_), r_->stride[0]); |
| } |
| else |
| { |
| THCudaTensor *cmat = THCudaTensor_newContiguous(mat); |
| |
| THCudaBlas_gemv('t', mat->size[1], mat->size[0], |
| alpha, THCudaTensor_data(cmat), cmat->stride[0], |
| THCudaTensor_data(vec), vec->stride[0], |
| beta, THCudaTensor_data(r_), r_->stride[0]); |
| |
| THCudaTensor_free(cmat); |
| } |
| } |
| |
| /* r_ = beta*t + alpha*(m1*m2), computed with gemm. The code below chooses |
| * transpose flags and leading dimensions so that operands that already have unit |
| * stride in the right dimension are passed to gemm directly; otherwise contiguous |
| * temporaries are created (and copied back into r_ at the end). */ |
| void THCudaTensor_addmm(THCudaTensor *r_, float beta, THCudaTensor *t, float alpha, THCudaTensor *m1, THCudaTensor *m2) |
| { |
| char transpose_r, transpose_m1, transpose_m2; |
| THCudaTensor *r__, *m1_, *m2_; |
| |
| if( (m1->nDimension != 2) || (m2->nDimension != 2) ) |
| THError("matrix and matrix expected"); |
| |
| if(t->nDimension != 2) |
| THError("size mismatch"); |
| |
| if( (t->size[0] != m1->size[0]) || (t->size[1] != m2->size[1]) || (m1->size[1] != m2->size[0]) ) |
| THError("size mismatch"); |
| |
| if(t != r_) |
| { |
| THCudaTensor_resizeAs(r_, t); |
| THCudaTensor_copy(r_, t); |
| } |
| |
| /* r_ */ |
| if(r_->stride[0] == 1) |
| { |
| transpose_r = 'n'; |
| r__ = r_; |
| } |
| else if(r_->stride[1] == 1) |
| { |
| THCudaTensor *swap = m2; |
| m2 = m1; |
| m1 = swap; |
| transpose_r = 't'; |
| r__ = r_; |
| } |
| else |
| { |
| transpose_r = 'n'; |
| |
| r__ = THCudaTensor_newWithSize2d(r_->size[1], r_->size[0]); |
| THCudaTensor_copy(r__, r_); |
| THCudaTensor_transpose(r__, NULL, 0, 1); |
| } |
| |
| /* m1 */ |
| if(m1->stride[(transpose_r == 'n' ? 0 : 1)] == 1) |
| { |
| transpose_m1 = 'n'; |
| m1_ = m1; |
| } |
| else if(m1->stride[(transpose_r == 'n' ? 1 : 0)] == 1) |
| { |
| transpose_m1 = 't'; |
| m1_ = m1; |
| } |
| else |
| { |
| transpose_m1 = (transpose_r == 'n' ? 't' : 'n'); |
| m1_ = THCudaTensor_newContiguous(m1); |
| } |
| |
| /* m2 */ |
| if(m2->stride[(transpose_r == 'n' ? 0 : 1)] == 1) |
| { |
| transpose_m2 = 'n'; |
| m2_ = m2; |
| } |
| else if(m2->stride[(transpose_r == 'n' ? 1 : 0)] == 1) |
| { |
| transpose_m2 = 't'; |
| m2_ = m2; |
| } |
| else |
| { |
| transpose_m2 = (transpose_r == 'n' ? 't' : 'n'); |
| m2_ = THCudaTensor_newContiguous(m2); |
| } |
| |
| /* do the operation */ |
| THCudaBlas_gemm(transpose_m1, |
| transpose_m2, |
| r__->size[(transpose_r == 'n' ? 0 : 1)], |
| r__->size[(transpose_r == 'n' ? 1 : 0)], |
| m1_->size[(transpose_r == 'n' ? 1 : 0)], |
| alpha, |
| THCudaTensor_data(m1_), |
| (transpose_m1 == 'n' ? m1_->stride[(transpose_r == 'n' ? 1 : 0)] : m1_->stride[(transpose_r == 'n' ? 0 : 1)]), |
| THCudaTensor_data(m2_), |
| (transpose_m2 == 'n' ? m2_->stride[(transpose_r == 'n' ? 1 : 0)] : m2_->stride[(transpose_r == 'n' ? 0 : 1)]), |
| beta, |
| THCudaTensor_data(r__), |
| r__->stride[(transpose_r == 'n' ? 1 : 0)]); |
| |
| /* free intermediate variables */ |
| if(m1_ != m1) |
| THCudaTensor_free(m1_); |
| |
| if(m2_ != m2) |
| THCudaTensor_free(m2_); |
| |
| if(r__ != r_) |
| THCudaTensor_freeCopyTo(r__, r_); |
| } |
| |
| void THCudaTensor_addr(THCudaTensor *r_, float beta, THCudaTensor *t, float alpha, THCudaTensor *vec1, THCudaTensor *vec2) |
| { |
| if( (vec1->nDimension != 1) || (vec2->nDimension != 1) ) |
| THError("vector and vector expected"); |
| |
| if(t->nDimension != 2) |
| THError("size mismatch"); |
| |
| if( (t->size[0] != vec1->size[0]) || (t->size[1] != vec2->size[0]) ) |
| THError("size mismatch"); |
| |
| if(r_ != t) |
| { |
| THCudaTensor_resizeAs(r_, t); |
| THCudaTensor_copy(r_, t); |
| } |
| |
| if(beta != 1) |
| THCudaTensor_mul(r_, r_, beta); |
| |
| if(r_->stride[0] == 1) |
| { |
| THCudaBlas_ger(vec1->size[0], vec2->size[0], |
| alpha, THCudaTensor_data(vec1), vec1->stride[0], |
| THCudaTensor_data(vec2), vec2->stride[0], |
| THCudaTensor_data(r_), r_->stride[1]); |
| } |
| else if(r_->stride[1] == 1) |
| { |
| THCudaBlas_ger(vec2->size[0], vec1->size[0], |
| alpha, THCudaTensor_data(vec2), vec2->stride[0], |
| THCudaTensor_data(vec1), vec1->stride[0], |
| THCudaTensor_data(r_), r_->stride[0]); |
| } |
| else |
| { |
| THCudaTensor *cr = THCudaTensor_newClone(r_); |
| |
| THCudaBlas_ger(vec2->size[0], vec1->size[0], |
| alpha, THCudaTensor_data(vec2), vec2->stride[0], |
| THCudaTensor_data(vec1), vec1->stride[0], |
| THCudaTensor_data(cr), cr->stride[0]); |
| |
| THCudaTensor_freeCopyTo(cr, r_); |
| } |
| } |
| |
| #define IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(NAME, CFUNC) \ |
| struct NAME##_functor \ |
| { \ |
| __host__ __device__ float operator()(const float& x) const \ |
| { \ |
| return CFUNC(x); \ |
| } \ |
| }; \ |
| \ |
| void THCudaTensor_##NAME(THCudaTensor *self_, THCudaTensor *src) \ |
| { \ |
| THCudaTensor_resizeAs(self_, src); \ |
| THCudaTensor *self = THCudaTensor_newContiguous(self_); \ |
| src = THCudaTensor_newContiguous(src); \ |
| long size = THCudaTensor_nElement(self); \ |
| thrust::device_ptr<float> self_data(THCudaTensor_data(self)); \ |
| thrust::device_ptr<float> src_data(THCudaTensor_data(src)); \ |
| \ |
| thrust::transform(src_data, src_data+size, self_data, NAME##_functor()); \ |
| \ |
| THCudaTensor_free(src); \ |
| THCudaTensor_freeCopyTo(self, self_); \ |
| } |
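| /* Each IMPLEMENT_CUDA_TENSOR_BASIC_FUNC invocation below defines an elementwise |
| * unary op THCudaTensor_NAME(self, src) that applies CFUNC to every element of |
| * src via thrust::transform. */ |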
| |
| IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(log, log) |
| IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(log1p, log1p) |
| IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(exp, exp) |
| IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(cos, cos) |
| IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(acos, acos) |
| IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(cosh, cosh) |
| IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(sin, sin) |
| IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(asin, asin) |
| IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(sinh, sinh) |
| IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(tan, tan) |
| IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(atan, atan) |
| IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(tanh, tanh) |
| IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(sqrt, sqrt) |
| IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(ceil, ceil) |
| IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(floor, floor) |
| IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(abs, fabs) |
| IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(round, roundf) |
| |
| struct pow_functor |
| { |
| const float value; |
| |
| pow_functor(float value_) : value(value_) {} |
| |
| __host__ __device__ float operator()(const float& x) const |
| { |
| return pow(x, value); |
| } |
| }; |
| |
| void THCudaTensor_pow(THCudaTensor *self_, THCudaTensor *src, float value) |
| { |
| THCudaTensor_resizeAs(self_, src); |
| THCudaTensor *self = THCudaTensor_newContiguous(self_); |
| src = THCudaTensor_newContiguous(src); |
| long size = THCudaTensor_nElement(self); |
| thrust::device_ptr<float> self_data(THCudaTensor_data(self)); |
| thrust::device_ptr<float> src_data(THCudaTensor_data(src)); |
| |
| thrust::transform(src_data, src_data+size, self_data, pow_functor(value)); |
| |
| THCudaTensor_free(src); |
| THCudaTensor_freeCopyTo(self, self_); |
| } |
| |
| struct atan2_functor |
| { |
| __host__ __device__ float operator()(const float& x, const float& y) const |
| { |
| return atan2f(x, y); |
| } |
| }; |
| |
| void THCudaTensor_atan2(THCudaTensor *self_, THCudaTensor *tx, THCudaTensor *ty) |
| { |
| THCudaTensor_resizeAs(self_, tx); |
| THCudaTensor *self = THCudaTensor_newContiguous(self_); |
| tx = THCudaTensor_newContiguous(tx); |
| ty = THCudaTensor_newContiguous(ty); |
| long size = THCudaTensor_nElement(self); |
| thrust::device_ptr<float> self_data(THCudaTensor_data(self)); |
| thrust::device_ptr<float> tx_data(THCudaTensor_data(tx)); |
| thrust::device_ptr<float> ty_data(THCudaTensor_data(ty)); |
| |
| thrust::transform(tx_data, tx_data+size, ty_data, self_data, atan2_functor()); |
| |
| THCudaTensor_free(tx); |
| THCudaTensor_free(ty); |
| THCudaTensor_freeCopyTo(self, self_); |
| } |
| |
| |
| struct clamp_functor |
| { |
| const float min_value; |
| const float max_value; |
| |
| clamp_functor(float min_value_, float max_value_) : min_value(min_value_), max_value(max_value_) {} |
| |
| __host__ __device__ float operator()(const float& x) const |
| { |
| if (x < min_value) { |
| return min_value; |
| } |
| if (x > max_value) { |
| return max_value; |
| } |
| return x; |
| } |
| }; |
| |
| void THCudaTensor_clamp(THCudaTensor *self_, THCudaTensor *src, float min_value, |
| float max_value) |
| { |
| THArgCheck(THCudaTensor_nElement(self_) == THCudaTensor_nElement(src), 2, "sizes do not match"); |
| THCudaTensor *self = THCudaTensor_newContiguous(self_); |
| src = THCudaTensor_newContiguous(src); |
| long size = THCudaTensor_nElement(self); |
| thrust::device_ptr<float> self_data(THCudaTensor_data(self)); |
| thrust::device_ptr<float> src_data(THCudaTensor_data(src)); |
| |
| thrust::transform(src_data, src_data+size, self_data, clamp_functor(min_value, |
| max_value)); |
| |
| THCudaTensor_free(src); |
| THCudaTensor_freeCopyTo(self, self_); |
| } |
| |
| |
| struct sign_functor |
| { |
| __device__ float operator()(const float &v) const { |
| return (v > 0) - (v < 0); |
| } |
| }; |
| |
| |
| void THCudaTensor_sign(THCudaTensor *self_, THCudaTensor *src) |
| { |
| THCudaTensor_resizeAs(self_, src); |
| THCudaTensor *self = THCudaTensor_newContiguous(self_); |
| long size = THCudaTensor_nElement(self); |
| src = THCudaTensor_newContiguous(src); |
| thrust::device_ptr<float> self_data(THCudaTensor_data(self)); |
| thrust::device_ptr<float> src_data(THCudaTensor_data(src)); |
| |
| thrust::transform(src_data, src_data+size, self_data, sign_functor()); |
| |
| THCudaTensor_free(src); |
| THCudaTensor_freeCopyTo(self, self_); |
| } |
| |
| float THCudaTensor_meanall(THCudaTensor *self) |
| { |
| THArgCheck(self->nDimension > 0, 1, "empty Tensor"); |
| return THCudaTensor_sumall(self)/THCudaTensor_nElement(self); |
| } |
| |
| void |
| THCudaTensor_mean(THCudaTensor *self, THCudaTensor *src, long dim) |
| { |
| THCudaTensor_sum(self, src, dim); |
| THCudaTensor_div(self, self, THCudaTensor_size(src, dim)); |
| } |
| |
| struct square_functor |
| { |
| const float mean; |
| |
| square_functor(float mean_) : mean(mean_) {} |
| |
| __host__ __device__ float operator()(const float& x) const |
| { |
| return (x-mean)*(x-mean); |
| } |
| }; |
| |
| float THCudaTensor_varall(THCudaTensor *self) |
| { |
| self = THCudaTensor_newContiguous(self); |
| long size = THCudaTensor_nElement(self); |
| thrust::device_ptr<float> self_data(THCudaTensor_data(self)); |
| |
| float mean = THCudaTensor_meanall(self); |
| float result = thrust::transform_reduce(self_data, self_data+size, square_functor(mean), (float)0, thrust::plus<float>()); |
| |
| result = result/(THCudaTensor_nElement(self)-1); |
| |
| THCudaTensor_free(self); |
| return result; |
| } |
| |
| float THCudaTensor_stdall(THCudaTensor *self) |
| { |
| return sqrt(THCudaTensor_varall(self)); |
| } |
| |
| |
| |
| template<class Op> |
| void THCudaTensor_logicalValue(THCudaTensor *self_, THCudaTensor *src, Op op) |
| { |
| THCudaTensor_resizeAs(self_, src); |
| |
| THCudaTensor *self = THCudaTensor_newContiguous(self_); |
| long size = THCudaTensor_nElement(self); |
| src = THCudaTensor_newContiguous(src); |
| thrust::device_ptr<float> self_data(THCudaTensor_data(self)); |
| thrust::device_ptr<float> src_data(THCudaTensor_data(src)); |
| |
| thrust::transform(src_data, src_data+size, self_data, op); |
| |
| THCudaTensor_free(src); |
| THCudaTensor_freeCopyTo(self, self_); |
| } |
| |
| |
| struct partial_less_functor |
| { |
| const float rhs; |
| partial_less_functor(float rhs) : rhs(rhs) {} |
| __host__ __device__ bool operator()(const float &lhs) const {return lhs < rhs;} |
| }; |
| |
| |
| void THCudaTensor_ltValue(THCudaTensor *self_, THCudaTensor *src, float value) |
| { |
| THCudaTensor_logicalValue(self_, src, partial_less_functor(value)); |
| } |
| |
| |
| struct partial_greater_functor |
| { |
| const float rhs; |
| partial_greater_functor(float rhs) : rhs(rhs) {} |
| __host__ __device__ bool operator()(const float &lhs) const {return lhs > rhs;} |
| }; |
| |
| |
| void THCudaTensor_gtValue(THCudaTensor *self_, THCudaTensor *src, float value) |
| { |
| THCudaTensor_logicalValue(self_, src, partial_greater_functor(value)); |
| } |
| |
| |
| struct partial_less_equal_functor |
| { |
| const float rhs; |
| partial_less_equal_functor(float rhs) : rhs(rhs) {} |
| __host__ __device__ bool operator()(const float &lhs) const {return lhs <= rhs;} |
| }; |
| |
| |
| void THCudaTensor_leValue(THCudaTensor *self_, THCudaTensor *src, float value) |
| { |
| THCudaTensor_logicalValue(self_, src, partial_less_equal_functor(value)); |
| } |
| |
| |
| struct partial_greater_equal_functor |
| { |
| const float rhs; |
| partial_greater_equal_functor(float rhs) : rhs(rhs) {} |
| __host__ __device__ bool operator()(const float &lhs) const {return lhs >= rhs;} |
| }; |
| |
| |
| void THCudaTensor_geValue(THCudaTensor *self_, THCudaTensor *src, float value) |
| { |
| THCudaTensor_logicalValue(self_, src, partial_greater_equal_functor(value)); |
| } |
| |
| |
| struct partial_equal_functor |
| { |
| const float rhs; |
| partial_equal_functor(float rhs) : rhs(rhs) {} |
| __host__ __device__ bool operator()(const float &lhs) const {return lhs == rhs;} |
| }; |
| |
| |
| void THCudaTensor_eqValue(THCudaTensor *self_, THCudaTensor *src, float value) |
| { |
| THCudaTensor_logicalValue(self_, src, partial_equal_functor(value)); |
| } |
| |
| |
| struct partial_not_equal_functor |
| { |
| const float rhs; |
| partial_not_equal_functor(float rhs) : rhs(rhs) {} |
| __host__ __device__ bool operator()(const float &lhs) const {return lhs != rhs;} |
| }; |
| |
| |
| void THCudaTensor_neValue(THCudaTensor *self_, THCudaTensor *src, float value) |
| { |
| THCudaTensor_logicalValue(self_, src, partial_not_equal_functor(value)); |
| } |
| |
| |
| template<class Op> |
| void THCudaTensor_logicalTensor(THCudaTensor *self_, THCudaTensor *src1, THCudaTensor *src2, Op op) |
| { |
| THCudaTensor_resizeAs(self_, src1); |
| THArgCheck(THCudaTensor_nElement(src1) == THCudaTensor_nElement(src2), 3, "sizes do not match"); |
| |
| THCudaTensor *self = THCudaTensor_newContiguous(self_); |
| long size = THCudaTensor_nElement(self); |
| src1 = THCudaTensor_newContiguous(src1); |
| src2 = THCudaTensor_newContiguous(src2); |
| thrust::device_ptr<float> self_data(THCudaTensor_data(self)); |
| thrust::device_ptr<float> src1_data(THCudaTensor_data(src1)); |
| thrust::device_ptr<float> src2_data(THCudaTensor_data(src2)); |
| |
| thrust::transform(src1_data, src1_data+size, src2_data, self_data, op); |
| |
| THCudaTensor_free(src1); |
| THCudaTensor_free(src2); |
| THCudaTensor_freeCopyTo(self, self_); |
| } |
| |
| |
| void THCudaTensor_ltTensor(THCudaTensor *self_, THCudaTensor *src1, THCudaTensor *src2) |
| { |
| THCudaTensor_logicalTensor(self_, src1, src2, thrust::less<float>()); |
| } |
| |
| |
| void THCudaTensor_gtTensor(THCudaTensor *self_, THCudaTensor *src1, THCudaTensor *src2) |
| { |
| THCudaTensor_logicalTensor(self_, src1, src2, thrust::greater<float>()); |
| } |
| |
| |
| void THCudaTensor_leTensor(THCudaTensor *self_, THCudaTensor *src1, THCudaTensor *src2) |
| { |
| THCudaTensor_logicalTensor(self_, src1, src2, thrust::less_equal<float>()); |
| } |
| |
| |
| void THCudaTensor_geTensor(THCudaTensor *self_, THCudaTensor *src1, THCudaTensor *src2) |
| { |
| THCudaTensor_logicalTensor(self_, src1, src2, thrust::greater_equal<float>()); |
| } |
| |
| |
| void THCudaTensor_eqTensor(THCudaTensor *self_, THCudaTensor *src1, THCudaTensor *src2) |
| { |
| THCudaTensor_logicalTensor(self_, src1, src2, thrust::equal_to<float>()); |
| } |
| |
| |
| void THCudaTensor_neTensor(THCudaTensor *self_, THCudaTensor *src1, THCudaTensor *src2) |
| { |
| THCudaTensor_logicalTensor(self_, src1, src2, thrust::not_equal_to<float>()); |
| } |
| |
| |
| struct norm_functor |
| { |
| const float exponent; |
| |
| norm_functor(float exponent_) : exponent(exponent_) {} |
| |
| __host__ __device__ float operator()(const float& x) const |
| { |
| return pow(fabs(x), exponent); |
| } |
| }; |
| |
| |
| float THCudaTensor_normall(THCudaTensor *self, float value) |
| { |
| self = THCudaTensor_newContiguous(self); |
| long size = THCudaTensor_nElement(self); |
| thrust::device_ptr<float> self_data(THCudaTensor_data(self)); |
| |
| float result; |
| if(value == 0.0f) { |
| result = thrust::transform_reduce(self_data, self_data+size, partial_not_equal_functor(0.0f), (float)0, thrust::plus<float>()); |
| } else { |
| result = thrust::transform_reduce(self_data, self_data+size, norm_functor(value), (float)0, thrust::plus<float>()); |
| result = pow(result, (float)1.0/value); |
| } |
| |
| THCudaTensor_free(self); |
| return result; |
| } |
| |
| void THCudaTensor_norm(THCudaTensor* self, THCudaTensor* src, float value, long dimension) |
| { |
| if (value == 0.0f) { |
| THCudaTensor_transformReduceDim(self, src, dimension, partial_not_equal_functor(0.0f), (float)0, thrust::plus<float>()); |
| } else { |
| THCudaTensor_transformReduceDim(self, src, dimension, norm_functor(value), (float)0, thrust::plus<float>()); |
| THCudaTensor_pow(self, self, 1/value); |
| } |
| } |
| |
| /* One block per slice along the renormalized dimension: each block computes the |
| * slice's p-norm (p = value) with a shared-memory reduction and, if that norm |
| * exceeds maxnorm, rescales the slice in place. */ |
| __global__ void THCudaTensor_kernel_renorm(float *data, const float value, const long size, const float maxnorm) |
| { |
| __shared__ float buffer[32]; |
| long tx = threadIdx.x; |
| long bx = blockIdx.x; |
| long step = blockDim.x; |
| float *row = data + size*bx; |
| |
| buffer[tx] = 0; |
| |
| // get norm of axis |
| for (long i=tx; i<size; i+=step) |
| { |
| buffer[tx] += pow(fabs(row[i]), value); |
| } |
| // add (reduce) |
| for (unsigned int stride = blockDim.x >> 1; stride > 0; stride >>= 1) |
| { |
| __syncthreads(); |
| if (tx < stride) |
| buffer[tx] += buffer[tx+stride]; |
| } |
| // clip norms |
| __syncthreads(); |
| float norm = pow(buffer[0], 1/value); |
| if (norm > maxnorm) |
| { |
| norm = maxnorm / (norm + 1e-7); |
| // renormalize |
| for (long i=tx; i<size; i+=step) |
| { |
| row[i] *= norm; |
| } |
| } |
| } |
| |
| void THCudaTensor_renorm(THCudaTensor* self, THCudaTensor* src, float value, long dimension, float maxnorm) |
| { |
| THCudaTensor *self_; |
| /* validate arguments before creating temporaries */ |
| THArgCheck(dimension >= 0 && dimension < THCudaTensor_nDimension(src), 3, "invalid dimension"); |
| THArgCheck(value > 0, 2, "non-positive-norm not supported"); |
| THArgCheck(THCudaTensor_nDimension(src) > 1, 1, "need at least 2 dimensions"); |
| |
| THCudaTensor *src_ = THCudaTensor_newTranspose(src, dimension, 0); |
| THCudaTensor *data = THCudaTensor_newClone(src_); |
| long size = THCudaTensor_nElement(data)/data->size[0]; |
| |
| dim3 grid(data->size[0]); |
| dim3 threads(32); |
| |
| THCudaTensor_kernel_renorm<<<grid, threads>>>(THCudaTensor_data(data), value, size, maxnorm); |
| |
| cudaError errcode = cudaGetLastError(); |
| if(errcode != cudaSuccess) |
| THError(cudaGetErrorString(errcode)); |
| |
| THCudaTensor_free(src_); |
| self_ = THCudaTensor_newTranspose(data, dimension, 0); |
| THCudaTensor_resizeAs(self, self_); |
| THCudaTensor_freeCopyTo(self_, self); |
| THCudaTensor_free(data); |
| } |
| |
| struct dist_functor |
| { |
| const float exponent; |
| |
| dist_functor(float exponent_) : exponent(exponent_) {} |
| |
| __host__ __device__ float operator()(const float& x, const float& y) const |
| { |
| return pow(fabs(x-y), exponent); |
| } |
| }; |
| |
| float THCudaTensor_dist(THCudaTensor *self, THCudaTensor *src, float value) |
| { |
| self = THCudaTensor_newContiguous(self); |
| long size = THCudaTensor_nElement(self); |
| src = THCudaTensor_newContiguous(src); |
| thrust::device_ptr<float> self_data(THCudaTensor_data(self)); |
| thrust::device_ptr<float> src_data(THCudaTensor_data(src)); |
| |
| float result = thrust::inner_product(self_data, self_data+size, src_data, (float) 0,thrust::plus<float>(), dist_functor(value)); |
| |
| THCudaTensor_free(src); |
| THCudaTensor_free(self); |
| |
| return pow(result, (float)1.0/value); |
| } |
| |
| void THCudaTensor_rand(THCudaRNGState* rng_state, THCudaTensor *r_, THLongStorage *size) |
| { |
| THCudaTensor_resize(r_, size, NULL); |
| THCudaTensor_uniform(rng_state, r_, 0, 1); |
| } |
| |
| void THCudaTensor_randn(THCudaRNGState* rng_state, THCudaTensor *r_, THLongStorage *size) |
| { |
| THCudaTensor_resize(r_, size, NULL); |
| THCudaTensor_normal(rng_state, r_, 0, 1); |
| } |
| |
| /* One thread per position in the non-indexed dimensions: each thread decodes its |
| * flat id into per-dimension offsets via the strides, then loops over the |
| * (1-based) indices and writes val at the corresponding offsets along dim. */ |
| __global__ void THCudaTensor_kernel_indexFill( |
| float *tensor, long* stride, float *index, long src_nDim, |
| int dim, long idx_size, long tensor_size, long size_dim, float val |
| ) |
| { |
| int thread_idx = blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x; |
| |
| long flat_size = tensor_size / idx_size; |
| |
| if (thread_idx < flat_size) |
| { |
| long coeff = 0; |
| for (int i=0; i<idx_size; i++) |
| { |
| int leftover = thread_idx; |
| int srcIdx = 0; |
| for (int d=0; d<src_nDim; d++) |
| { |
| if (d < dim) |
| { |
| coeff = leftover / (stride[d] / size_dim); |
| leftover -= coeff * (stride[d] / size_dim); |
| srcIdx += coeff * stride[d]; |
| } |
| else if (d > dim) |
| { |
| coeff = leftover / stride[d]; |
| leftover -= coeff * stride[d]; |
| srcIdx += coeff * stride[d]; |
| } |
| } |
| tensor[srcIdx + (int)((index[i])-1)*stride[dim]] = val; |
| } |
| } |
| } |
| |
| /* One thread per position in the non-indexed dimensions: each thread loops over |
| * the (1-based) indices and copies the matching elements of the contiguous src |
| * into res along dimension dim. */ |
| __global__ void THCudaTensor_kernel_indexCopy( |
| float *res, float *src, long* res_stride, float *index, |
| long res_nDim, int dim, long idx_size, long src_size, long size_dim |
| ) |
| { |
| int thread_idx = blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x; |
| |
| long flat_size = src_size / idx_size; |
| |
| if (thread_idx < flat_size) |
| { |
| long coeff = 0; |
| for (int i=0; i<idx_size; i++) |
| { |
| int leftover = thread_idx; |
| int targetIdx = 0; |
| int resIdx = 0; |
| for (int d=0; d<res_nDim; d++) |
| { |
| if (d < dim) |
| { |
| long stride_d = res_stride[d] / size_dim; |
| coeff = leftover / stride_d; |
| leftover -= coeff * stride_d; |
| targetIdx += coeff * stride_d * idx_size; |
| resIdx += coeff * res_stride[d]; |
| } |
| else if (d > dim) |
| { |
| coeff = leftover / res_stride[d]; |
| leftover -= coeff * res_stride[d]; |
| targetIdx += coeff * res_stride[d]; |
| resIdx += coeff * res_stride[d]; |
| } |
| } |
| res[resIdx + ((int)(index[i])-1)*res_stride[dim]] = src[targetIdx + i*res_stride[dim]]; |
| } |
| } |
| } |
| |
| void THCudaTensor_indexCopy(THCudaTensor *res_, int dim, THLongTensor *indices, THCudaTensor *src) |
| { |
| THCudaTensor *indices_; |
| long *stride_; |
| long nIndex = indices->size[0]; |
| long nRes; |
| |
| THArgCheck(indices->nDimension == 1, 3, "expecting vector of indices"); |
| THArgCheck(dim < src->nDimension, 4, "Indexing dim is out of bounds"); |
| THArgCheck(src->nDimension > 0, 2, "Source tensor is empty"); |
| THArgCheck(nIndex == src->size[dim], 4, "src->size[dim] does not match the number of indices"); |
| |
| src = THCudaTensor_newContiguous(src); |
| indices_ = THCudaTensor_newWithSize1d(nIndex); |
| THCudaTensor_copyLong(indices_, indices); |
| |
| nRes = THCudaTensor_nElement(res_); |
| dim3 nthreads(16, 16); |
| dim3 nblocks(ceil((float)nRes / nIndex / (16*16))); |
| |
| THCudaCheck(cudaMalloc((void**)&stride_, res_->nDimension * sizeof(long))); |
| THCudaCheck(cudaMemcpy(stride_, res_->stride, res_->nDimension * sizeof(long), cudaMemcpyHostToDevice)); |
| |
| THCudaTensor_kernel_indexCopy<<<nblocks, nthreads>>>( |
| THCudaTensor_data(res_), THCudaTensor_data(src), |
| stride_, THCudaTensor_data(indices_), |
| res_->nDimension, dim, nIndex, |
| THCudaTensor_nElement(src), res_->size[dim] |
| ); |
| |
| THCudaCheck(cudaFree(stride_)); |
| THCudaTensor_free(indices_); |
| THCudaTensor_free(src); |
| } |
| |
| |
| void THCudaTensor_indexFill(THCudaTensor *res_, int dim, THLongTensor *indices, float val) |
| { |
| THCudaTensor *indices_; |
| long *stride_; |
| long nIndex = indices->size[0]; |
| long nRes; |
| |
| THArgCheck(indices->nDimension == 1, 3, "Index is supposed to be a vector"); |
| THArgCheck(dim < res_->nDimension,4,"Indexing dim is out of bounds"); |
| THArgCheck(res_->nDimension > 0, 2, "Result tensor is empty"); |
| |
| indices_ = THCudaTensor_newWithSize1d(nIndex); |
| THCudaTensor_copyLong(indices_, indices); |
| |
| nRes = THCudaTensor_nElement(res_) / res_->size[dim] * nIndex; |
| |
| |
| dim3 nthreads(16, 16); |
| dim3 nblocks(ceil((float)nRes / nIndex / (16*16))); |
| |
| THCudaCheck(cudaMalloc((void**)&stride_, res_->nDimension * sizeof(long))); |
| THCudaCheck(cudaMemcpy(stride_, res_->stride, res_->nDimension * sizeof(long), cudaMemcpyHostToDevice)); |
| |
| THCudaTensor_kernel_indexFill<<<nblocks, nthreads>>>( |
| THCudaTensor_data(res_), stride_, THCudaTensor_data(indices_), |
| res_->nDimension, dim, nIndex, nRes, res_->size[dim], val |
| ); |
| |
| THCudaCheck(cudaFree(stride_)); |
| THCudaTensor_free(indices_); |
| } |
| |
| /* One thread per position in the non-indexed dimensions: each thread loops over |
| * the (1-based) indices and gathers the selected elements of src along dimension |
| * dim into the result tensor. */ |
| __global__ void THCudaTensor_kernel_indexSelect( |
| float *tensor, float *src, long* src_stride, float *index, |
| long src_nDim, int dim, long idx_size, long tensor_size, long size_dim |
| ) |
| { |
| int thread_idx = blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x; |
| |
| long flat_size = tensor_size / idx_size; |
| |
| if (thread_idx < flat_size) |
| { |
| long coeff = 0; |
| for (int i=0; i<idx_size; i++) |
| { |
| int leftover = thread_idx; |
| int targetIdx = 0; |
| int srcIdx = 0; |
| for (int d=0; d<src_nDim; d++) |
| { |
| if (d < dim) |
| { |
| long stride_d = src_stride[d] / size_dim; |
| coeff = leftover / stride_d; |
| leftover -= coeff * stride_d; |
| targetIdx += coeff * stride_d * idx_size; |
| srcIdx += coeff * src_stride[d]; |
| } |
| else if (d > dim) |
| { |
| coeff = leftover / src_stride[d]; |
| leftover -= coeff * src_stride[d]; |
| targetIdx += coeff * src_stride[d]; |
| srcIdx += coeff * src_stride[d]; |
| } |
| } |
| tensor[targetIdx + i*src_stride[dim]] = src[srcIdx + ((int)(index[i])-1)*src_stride[dim]]; |
| } |
| } |
| } |
| |
| |
| void THCudaTensor_indexSelect(THCudaTensor *res_, THCudaTensor *src, int dim, THLongTensor *indices) |
| { |
| THLongStorage *newSize; |
| THCudaTensor *indices_; |
| long *stride_; |
| long nIndex = indices->size[0]; |
| long nRes; |
| |
| THArgCheck(indices->nDimension == 1, 3, "expecting vector of indices"); |
| THArgCheck(dim < src->nDimension, 4, "Indexing dim is out of bounds"); |
| THArgCheck(src->nDimension > 0, 2, "Source tensor is empty"); |
| |
| newSize = THLongStorage_newWithSize(src->nDimension); |
| THLongStorage_rawCopy(newSize, src->size); |
| newSize->data[dim] = nIndex; |
| THCudaTensor_resize(res_, newSize, NULL); |
| THLongStorage_free(newSize); |
| |
| indices_ = THCudaTensor_newWithSize1d(nIndex); |
| THCudaTensor_copyLong(indices_, indices); |
| |
| nRes = THCudaTensor_nElement(res_); |
| dim3 nthreads(16, 16); |
| dim3 nblocks(ceil((float)nRes / nIndex / (16*16))); |
| |
| THCudaCheck(cudaMalloc((void**)&stride_, src->nDimension * sizeof(long))); |
| THCudaCheck(cudaMemcpy(stride_, src->stride, src->nDimension * sizeof(long), cudaMemcpyHostToDevice)); |
| |
| THCudaTensor_kernel_indexSelect<<<nblocks, nthreads>>>( |
| THCudaTensor_data(res_), THCudaTensor_data(src), |
| stride_, THCudaTensor_data(indices_), |
| src->nDimension, dim, indices->size[0], nRes, src->size[dim] |
| ); |
| |
| THCudaCheck(cudaFree(stride_)); |
| THCudaTensor_free(indices_); |
| } |