#ifndef THC_APPLY_INC
#define THC_APPLY_INC
#include "THCTensorCopy.h"
#include "THCReduceApplyUtils.cuh"
#include "THCTensorTypeUtils.cuh"
//
// This file contains pointwise operation functions and kernels that
// work on both contiguous and non-contiguous tensor arguments of
// arbitrary dimensionality (up to MAX_CUTORCH_DIMS) without copying
// or allocating temporary storage.
//
// Threads per block for our apply kernel
// FIXME: use occupancy calculator instead
#define THC_APPLY_THREADS_PER_BLOCK (32 * 16)
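
// The kernels below iterate with a grid-stride loop, so any grid size covers
// all elements. The `Dims` template arguments pick an IndexToOffset
// specialization (see the HANDLE_*_CASE dispatch macros further down):
// -2 is used for tensors that are contiguous after collapseDims(), 1 and 2
// handle tensors collapsed to that many dimensions with a statically
// unrolled offset computation, and -1 is the fully general fallback.
// (This is a descriptive summary; IndexToOffset itself lives in the utility
// headers included above.)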
template <typename Op,
typename Ta,
typename IndexType,
int ADims>
#if __CUDA_ARCH__ >= 350
__launch_bounds__(32 * 16, 4)
#endif
__global__ void
kernelPointwiseApply1(TensorInfo<Ta, IndexType> a,
IndexType totalElements,
Op op) {
for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
linearIndex < totalElements;
linearIndex += gridDim.x * blockDim.x) {
// Convert `linearIndex` into an offset of `a`
const IndexType aOffset =
IndexToOffset<Ta, IndexType, ADims>::get(linearIndex, a);
op(&a.data[aOffset]);
}
}
template <typename Op,
typename Ta, typename Tb,
typename IndexType,
int ADims, int BDims>
#if __CUDA_ARCH__ >= 350
__launch_bounds__(32 * 16, 4)
#endif
__global__ void
kernelPointwiseApply2(TensorInfo<Ta, IndexType> a,
TensorInfo<Tb, IndexType> b,
IndexType totalElements,
Op op) {
for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
linearIndex < totalElements;
linearIndex += gridDim.x * blockDim.x) {
// Convert `linearIndex` into an offset of `a`
const IndexType aOffset =
IndexToOffset<Ta, IndexType, ADims>::get(linearIndex, a);
// Convert `linearIndex` into an offset of `b`
const IndexType bOffset =
IndexToOffset<Tb, IndexType, BDims>::get(linearIndex, b);
op(&a.data[aOffset], &b.data[bOffset]);
}
}
template <typename Op,
typename Ta, typename Tb, typename Tc,
typename IndexType,
int ADims, int BDims, int CDims>
#if __CUDA_ARCH__ >= 350
__launch_bounds__(32 * 16, 4)
#endif
__global__ void
kernelPointwiseApply3(TensorInfo<Ta, IndexType> a,
TensorInfo<Tb, IndexType> b,
TensorInfo<Tc, IndexType> c,
IndexType totalElements,
Op op) {
for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
linearIndex < totalElements;
linearIndex += gridDim.x * blockDim.x) {
// Convert `linearIndex` into an offset of `a`
const IndexType aOffset =
IndexToOffset<Ta, IndexType, ADims>::get(linearIndex, a);
// Convert `linearIndex` into an offset of `b`
const IndexType bOffset =
IndexToOffset<Tb, IndexType, BDims>::get(linearIndex, b);
// Convert `linearIndex` into an offset of `c`
const IndexType cOffset =
IndexToOffset<Tc, IndexType, CDims>::get(linearIndex, c);
op(&a.data[aOffset], &b.data[bOffset], &c.data[cOffset]);
}
}
inline dim3 getApplyBlock() {
return dim3(THC_APPLY_THREADS_PER_BLOCK);
}
inline bool getApplyGrid(THCState* state, ptrdiff_t totalElements, dim3& grid) {
  int curDevice = -1;
  if (cudaGetDevice(&curDevice) != cudaSuccess) {
    return false;
  }
  if (THCState_getCurrentDeviceProperties(state)->major < 3) {
    // Older devices (compute capability < 3.0) limit gridDim.x to 65535.
    grid = dim3(min((int64_t) THCCeilDiv(totalElements,
                                         (ptrdiff_t) THC_APPLY_THREADS_PER_BLOCK),
                    (int64_t) (64 * 1024 - 1)));
    return true;
  }

  grid = dim3((int64_t) THCCeilDiv(totalElements,
                                   (ptrdiff_t) THC_APPLY_THREADS_PER_BLOCK));
return true;
}
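
// For example, with totalElements == 1,000,000 and THC_APPLY_THREADS_PER_BLOCK
// == 512, getApplyGrid above yields ceil(1000000 / 512) == 1954 blocks in x,
// further clamped to 65535 (the gridDim.x limit) on devices with compute
// capability < 3.0.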
template <typename TensorTypeA,
typename Op>
bool THC_pointwiseApply1(THCState* state,
TensorTypeA* a,
const Op& op,
TensorArgType aType = ReadWrite) {
if (TensorUtils<TensorTypeA>::getDims(state, a) > MAX_CUTORCH_DIMS) {
return false;
}
if (TensorUtils<TensorTypeA>::getDims(state, a) == 0) {
// Zero-dim tensor; do nothing
return true;
}
const dim3 block = getApplyBlock();
dim3 grid;
ptrdiff_t totalElements = TensorUtils<TensorTypeA>::getNumElements(state, a);
if (!getApplyGrid(state, totalElements, grid)) {
return false;
}
// If tensor args have overlapping indices and are read/write, then
// we must expand the tensor to a contiguous form first, since
// otherwise there are conflicting writes. Upon copying back to the
// non-contiguous form, there will be conflicting writes, but at
// least with copy, one of the updaters will win atomically. This is
// a sketchy property of the old system as well (writing into all
// indices of a tensor with overlapping indices should probably be
// an error, since it is unclear which one should win), but we will
// preserve this last-writer-wins (in arbitrary copy order) behavior.
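  // (Overlapping indices arise, for example, when a dimension of size > 1
  // has stride 0, as produced by expanding a tensor along that dimension.)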
TensorTypeA* oldA = NULL;
if (aType == ReadWrite &&
TensorUtils<TensorTypeA>::overlappingIndices(state, a)) {
// Must perform in contiguous space
oldA = a;
a = TensorUtils<TensorTypeA>::newContiguous(state, a);
}
  // It is possible that the tensor dimensions can be collapsed, and thus
  // we can reduce the actual code complexity of the copy by exploiting
  // this knowledge statically, since the div/mod is the most expensive
  // part of the operation, more so than the memory accesses.
  // For instance, when copying a non-contiguous tensor to a contiguous
  // tensor (or vice versa), the contiguous tensor can be collapsed to one
  // dimension, and the loop to translate the linear index to the array
  // index can be similarly collapsed. That is what this unrolling is for.
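  // As a concrete example, a contiguous tensor of sizes {4, 5, 6} and
  // strides {30, 6, 1} collapses to a single dimension of size 120 with
  // stride 1, so the specialized contiguous (-2) offset path can be used.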
#define HANDLE_CASE(TYPE, A) \
kernelPointwiseApply1<Op, \
typename TensorUtils<TensorTypeA>::DataType, \
TYPE, A> \
<<<grid, block, 0, THCState_getCurrentStream(state)>>>( \
aInfo, (TYPE) totalElements, op);
#define HANDLE_A_CASE(TYPE, A) \
{ \
if (aInfo.isContiguous()) { \
HANDLE_CASE(TYPE, -2); \
} else { \
switch (A) { \
case 1: \
HANDLE_CASE(TYPE, 1); \
break; \
case 2: \
HANDLE_CASE(TYPE, 2); \
break; \
default: \
HANDLE_CASE(TYPE, -1); \
break; \
} \
} \
}
  // Can we use 32-bit integer math in the kernel? That is, are the linear
  // ID for the copy and the resulting non-linear offset all computable
  // using 32-bit math? We also use unsigned index math in the kernel, as
  // signed div/mod has additional overhead.
if (TensorUtils<TensorTypeA>::canUse32BitIndexMath(state, a)) {
TensorInfo<typename TensorUtils<TensorTypeA>::DataType, unsigned int> aInfo =
getTensorInfo<TensorTypeA, unsigned int>(state, a);
aInfo.collapseDims();
HANDLE_A_CASE(unsigned int, aInfo.dims);
} else {
TensorInfo<typename TensorUtils<TensorTypeA>::DataType, uint64_t> aInfo =
getTensorInfo<TensorTypeA, uint64_t>(state, a);
aInfo.collapseDims();
// For large tensors, we only compile the completely contiguous
// version and the completely generic version, to reduce
// compilation time.
if (aInfo.isContiguous()) {
kernelPointwiseApply1<Op,
typename TensorUtils<TensorTypeA>::DataType,
uint64_t, -2>
<<<grid, block, 0, THCState_getCurrentStream(state)>>>(
aInfo, (uint64_t) totalElements, op);
} else {
kernelPointwiseApply1<Op,
typename TensorUtils<TensorTypeA>::DataType,
uint64_t, -1>
<<<grid, block, 0, THCState_getCurrentStream(state)>>>(
aInfo, (uint64_t) totalElements, op);
}
}
#undef HANDLE_CASE
#undef HANDLE_A_CASE
if (oldA) {
    // Ignore overlaps when copying back; if we use THCTensor_copy
    // instead, it will recursively try to invoke ourselves to make
    // oldA contiguous.
TensorUtils<TensorTypeA>::copyIgnoringOverlaps(state, oldA, a);
TensorUtils<TensorTypeA>::free(state, a);
a = oldA;
}
return true;
}
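
// Example usage of THC_pointwiseApply1 / THC_pointwiseApply2. This is an
// illustrative sketch, not code from this file: the functor and the
// `dst`/`src` THCudaTensor* variables are hypothetical, but the operator()
// signatures match how kernelPointwiseApply1/2 invoke the op.
//
//   struct AddConstantOp {
//     AddConstantOp(float v) : val(v) {}
//     // One-tensor form, called as op(&a.data[aOffset])
//     __device__ __forceinline__ void operator()(float* v) {
//       *v += val;
//     }
//     // Two-tensor form, called as op(&a.data[aOffset], &b.data[bOffset])
//     __device__ __forceinline__ void operator()(float* out, float* in) {
//       *out = *in + val;
//     }
//     float val;
//   };
//
//   THC_pointwiseApply1(state, dst, AddConstantOp(2.0f));       // dst += 2
//   THC_pointwiseApply2(state, dst, src, AddConstantOp(2.0f));  // dst = src + 2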
template <typename TensorTypeA,
typename TensorTypeB,
typename Op>
bool THC_pointwiseApply2(THCState* state,
TensorTypeA* a,
TensorTypeB* b,
const Op& op,
TensorArgType aType = ReadWrite,
TensorArgType bType = ReadOnly) {
ptrdiff_t totalElements = TensorUtils<TensorTypeA>::getNumElements(state, a);
if (totalElements != TensorUtils<TensorTypeB>::getNumElements(state, b)) {
return false;
}
if (TensorUtils<TensorTypeA>::getDims(state, a) > MAX_CUTORCH_DIMS ||
TensorUtils<TensorTypeB>::getDims(state, b) > MAX_CUTORCH_DIMS) {
return false;
}
if (TensorUtils<TensorTypeA>::getDims(state, a) == 0) {
// Zero-dim tensor; do nothing
return true;
}
const dim3 block = getApplyBlock();
dim3 grid;
if (!getApplyGrid(state, totalElements, grid)) {
return false;
}
// If tensor args have overlapping indices and are read/write, then
// we must expand the tensor to a contiguous form first, since
// otherwise there are conflicting writes. Upon copying back to the
// non-contiguous form, there will be conflicting writes, but at
// least with copy, one of the updaters will win atomically. This is
// a sketchy property of the old system as well (writing into all
// indices of a tensor with overlapping indices should probably be
// an error, since it is unclear which one should win), but we will
// preserve this last-writer-wins (in arbitrary copy order) behavior.
TensorTypeA* oldA = NULL;
TensorTypeB* oldB = NULL;
if (aType == ReadWrite &&
TensorUtils<TensorTypeA>::overlappingIndices(state, a)) {
// Must perform in contiguous space
oldA = a;
a = TensorUtils<TensorTypeA>::newContiguous(state, a);
}
if (bType == ReadWrite &&
TensorUtils<TensorTypeB>::overlappingIndices(state, b)) {
// Must perform in contiguous space
oldB = b;
b = TensorUtils<TensorTypeB>::newContiguous(state, b);
}
  // It is possible that the tensor dimensions can be collapsed, and thus
  // we can reduce the actual code complexity of the copy by exploiting
  // this knowledge statically, since the div/mod is the most expensive
  // part of the operation, more so than the memory accesses.
  // For instance, when copying a non-contiguous tensor to a contiguous
  // tensor (or vice versa), the contiguous tensor can be collapsed to one
  // dimension, and the loop to translate the linear index to the array
  // index can be similarly collapsed. That is what this unrolling is for.
#define HANDLE_CASE(TYPE, A, B) \
kernelPointwiseApply2<Op, \
typename TensorUtils<TensorTypeA>::DataType, \
typename TensorUtils<TensorTypeB>::DataType, \
TYPE, A, B> \
<<<grid, block, 0, THCState_getCurrentStream(state)>>>( \
aInfo, bInfo, (TYPE) totalElements, op);
#define HANDLE_B_CASE(TYPE, A, B) \
{ \
if (bInfo.isContiguous()) { \
HANDLE_CASE(TYPE, A, -2); \
} else { \
switch (B) { \
case 1: \
HANDLE_CASE(TYPE, A, 1); \
break; \
case 2: \
HANDLE_CASE(TYPE, A, 2); \
break; \
default: \
HANDLE_CASE(TYPE, A, -1); \
break; \
} \
} \
}
#define HANDLE_A_CASE(TYPE, A, B) \
{ \
if (aInfo.isContiguous()) { \
HANDLE_B_CASE(TYPE, -2, B); \
} else { \
switch (A) { \
case 1: \
HANDLE_B_CASE(TYPE, 1, B); \
break; \
case 2: \
HANDLE_B_CASE(TYPE, 2, B); \
break; \
default: \
HANDLE_B_CASE(TYPE, -1, B); \
break; \
} \
} \
}
if (TensorUtils<TensorTypeA>::canUse32BitIndexMath(state, a) &&
TensorUtils<TensorTypeB>::canUse32BitIndexMath(state, b)) {
TensorInfo<typename TensorUtils<TensorTypeA>::DataType, unsigned int> aInfo =
getTensorInfo<TensorTypeA, unsigned int>(state, a);
aInfo.collapseDims();
TensorInfo<typename TensorUtils<TensorTypeB>::DataType, unsigned int> bInfo =
getTensorInfo<TensorTypeB, unsigned int>(state, b);
bInfo.collapseDims();
HANDLE_A_CASE(unsigned int, aInfo.dims, bInfo.dims);
} else {
TensorInfo<typename TensorUtils<TensorTypeA>::DataType, uint64_t> aInfo =
getTensorInfo<TensorTypeA, uint64_t>(state, a);
aInfo.collapseDims();
TensorInfo<typename TensorUtils<TensorTypeB>::DataType, uint64_t> bInfo =
getTensorInfo<TensorTypeB, uint64_t>(state, b);
bInfo.collapseDims();
// For large tensors, we only compile the completely contiguous
// version and the completely generic version, to reduce
// compilation time.
if (aInfo.isContiguous() && bInfo.isContiguous()) {
kernelPointwiseApply2<Op,
typename TensorUtils<TensorTypeA>::DataType,
typename TensorUtils<TensorTypeB>::DataType,
uint64_t, -2, -2>
<<<grid, block, 0, THCState_getCurrentStream(state)>>>(
aInfo, bInfo, (uint64_t) totalElements, op);
} else {
kernelPointwiseApply2<Op,
typename TensorUtils<TensorTypeA>::DataType,
typename TensorUtils<TensorTypeB>::DataType,
uint64_t, -1, -1>
<<<grid, block, 0, THCState_getCurrentStream(state)>>>(
aInfo, bInfo, (uint64_t) totalElements, op);
}
}
#undef HANDLE_CASE
#undef HANDLE_B_CASE
#undef HANDLE_A_CASE
if (oldA) {
    // Ignore overlaps when copying back; if we use THCTensor_copy
    // instead, it will recursively try to invoke ourselves to make
    // oldA contiguous.
TensorUtils<TensorTypeA>::copyIgnoringOverlaps(state, oldA, a);
TensorUtils<TensorTypeA>::free(state, a);
a = oldA;
}
if (oldB) {
    // Ignore overlaps when copying back; if we use THCTensor_copy
    // instead, it will recursively try to invoke ourselves to make
    // oldB contiguous.
TensorUtils<TensorTypeB>::copyIgnoringOverlaps(state, oldB, b);
TensorUtils<TensorTypeB>::free(state, b);
b = oldB;
}
return true;
}
template <typename TensorTypeA,
typename TensorTypeB,
typename TensorTypeC,
typename Op>
bool THC_pointwiseApply3(THCState* state,
TensorTypeA* a,
TensorTypeB* b,
TensorTypeC* c,
const Op& op,
TensorArgType aType = ReadWrite,
TensorArgType bType = ReadOnly,
TensorArgType cType = ReadOnly) {
ptrdiff_t totalElements = TensorUtils<TensorTypeA>::getNumElements(state, a);
if (totalElements != TensorUtils<TensorTypeB>::getNumElements(state, b) ||
totalElements != TensorUtils<TensorTypeC>::getNumElements(state, c)) {
return false;
}
if (TensorUtils<TensorTypeA>::getDims(state, a) > MAX_CUTORCH_DIMS ||
TensorUtils<TensorTypeB>::getDims(state, b) > MAX_CUTORCH_DIMS ||
TensorUtils<TensorTypeC>::getDims(state, c) > MAX_CUTORCH_DIMS) {
return false;
}
if (TensorUtils<TensorTypeA>::getDims(state, a) == 0) {
// Zero-dim tensor; do nothing
return true;
}
const dim3 block = getApplyBlock();
dim3 grid;
if (!getApplyGrid(state, totalElements, grid)) {
return false;
}
// If tensor args have overlapping indices and are read/write, then
// we must expand the tensor to a contiguous form first, since
// otherwise there are conflicting writes. Upon copying back to the
// non-contiguous form, there will be conflicting writes, but at
// least with copy, one of the updaters will win atomically. This is
// a sketchy property of the old system as well (writing into all
// indices of a tensor with overlapping indices should probably be
// an error, since it is unclear which one should win), but we will
// preserve this last-writer-wins (in arbitrary copy order) behavior.
TensorTypeA* oldA = NULL;
TensorTypeB* oldB = NULL;
TensorTypeC* oldC = NULL;
if (aType == ReadWrite &&
TensorUtils<TensorTypeA>::overlappingIndices(state, a)) {
// Must perform in contiguous space
oldA = a;
a = TensorUtils<TensorTypeA>::newContiguous(state, a);
}
if (bType == ReadWrite &&
TensorUtils<TensorTypeB>::overlappingIndices(state, b)) {
// Must perform in contiguous space
oldB = b;
b = TensorUtils<TensorTypeB>::newContiguous(state, b);
}
if (cType == ReadWrite &&
TensorUtils<TensorTypeC>::overlappingIndices(state, c)) {
// Must perform in contiguous space
oldC = c;
c = TensorUtils<TensorTypeC>::newContiguous(state, c);
}
#define HANDLE_CASE(TYPE, A, B, C) \
kernelPointwiseApply3<Op, \
typename TensorUtils<TensorTypeA>::DataType, \
typename TensorUtils<TensorTypeB>::DataType, \
typename TensorUtils<TensorTypeC>::DataType, \
TYPE, A, B, C> \
<<<grid, block, 0, THCState_getCurrentStream(state)>>>( \
aInfo, bInfo, cInfo, (TYPE) totalElements, op);
#define HANDLE_C_CASE(TYPE, A, B, C) \
{ \
if (cInfo.isContiguous()) { \
HANDLE_CASE(TYPE, A, B, -2); \
} else { \
switch (C) { \
case 1: \
HANDLE_CASE(TYPE, A, B, 1); \
break; \
case 2: \
HANDLE_CASE(TYPE, A, B, 2); \
break; \
default: \
HANDLE_CASE(TYPE, A, B, -1); \
break; \
} \
} \
}
#define HANDLE_B_CASE(TYPE, A, B, C) \
{ \
if (bInfo.isContiguous()) { \
HANDLE_C_CASE(TYPE, A, -2, C); \
} else { \
switch (B) { \
case 1: \
HANDLE_C_CASE(TYPE, A, 1, C); \
break; \
case 2: \
HANDLE_C_CASE(TYPE, A, 2, C); \
break; \
default: \
HANDLE_C_CASE(TYPE, A, -1, C); \
break; \
} \
} \
}
#define HANDLE_A_CASE(TYPE, A, B, C) \
{ \
if (aInfo.isContiguous()) { \
HANDLE_B_CASE(TYPE, -2, B, C); \
} else { \
switch (A) { \
case 1: \
HANDLE_B_CASE(TYPE, 1, B, C); \
break; \
case 2: \
HANDLE_B_CASE(TYPE, 2, B, C); \
break; \
default: \
HANDLE_B_CASE(TYPE, -1, B, C); \
break; \
} \
} \
}
if (TensorUtils<TensorTypeA>::canUse32BitIndexMath(state, a) &&
TensorUtils<TensorTypeB>::canUse32BitIndexMath(state, b) &&
TensorUtils<TensorTypeC>::canUse32BitIndexMath(state, c)) {
TensorInfo<typename TensorUtils<TensorTypeA>::DataType, unsigned int> aInfo =
getTensorInfo<TensorTypeA, unsigned int>(state, a);
aInfo.collapseDims();
TensorInfo<typename TensorUtils<TensorTypeB>::DataType, unsigned int> bInfo =
getTensorInfo<TensorTypeB, unsigned int>(state, b);
bInfo.collapseDims();
TensorInfo<typename TensorUtils<TensorTypeC>::DataType, unsigned int> cInfo =
getTensorInfo<TensorTypeC, unsigned int>(state, c);
cInfo.collapseDims();
HANDLE_A_CASE(unsigned int, aInfo.dims, bInfo.dims, cInfo.dims);
} else {
TensorInfo<typename TensorUtils<TensorTypeA>::DataType, uint64_t> aInfo =
getTensorInfo<TensorTypeA, uint64_t>(state, a);
aInfo.collapseDims();
TensorInfo<typename TensorUtils<TensorTypeB>::DataType, uint64_t> bInfo =
getTensorInfo<TensorTypeB, uint64_t>(state, b);
bInfo.collapseDims();
TensorInfo<typename TensorUtils<TensorTypeC>::DataType, uint64_t> cInfo =
getTensorInfo<TensorTypeC, uint64_t>(state, c);
cInfo.collapseDims();
// For large tensors, we only compile the completely contiguous
// version and the completely generic version, to reduce
// compilation time.
if (aInfo.isContiguous() && bInfo.isContiguous() && cInfo.isContiguous()) {
kernelPointwiseApply3<Op,
typename TensorUtils<TensorTypeA>::DataType,
typename TensorUtils<TensorTypeB>::DataType,
typename TensorUtils<TensorTypeC>::DataType,
uint64_t, -2, -2, -2>
<<<grid, block, 0, THCState_getCurrentStream(state)>>>(
aInfo, bInfo, cInfo, (uint64_t) totalElements, op);
} else {
kernelPointwiseApply3<Op,
typename TensorUtils<TensorTypeA>::DataType,
typename TensorUtils<TensorTypeB>::DataType,
typename TensorUtils<TensorTypeC>::DataType,
uint64_t, -1, -1, -1>
<<<grid, block, 0, THCState_getCurrentStream(state)>>>(
aInfo, bInfo, cInfo, (uint64_t) totalElements, op);
}
}
#undef HANDLE_CASE
#undef HANDLE_C_CASE
#undef HANDLE_B_CASE
#undef HANDLE_A_CASE
if (oldA) {
    // Ignore overlaps when copying back; if we use THCTensor_copy
    // instead, it will recursively try to invoke ourselves to make
    // oldA contiguous.
TensorUtils<TensorTypeA>::copyIgnoringOverlaps(state, oldA, a);
TensorUtils<TensorTypeA>::free(state, a);
a = oldA;
}
if (oldB) {
    // Ignore overlaps when copying back; if we use THCTensor_copy
    // instead, it will recursively try to invoke ourselves to make
    // oldB contiguous.
TensorUtils<TensorTypeB>::copyIgnoringOverlaps(state, oldB, b);
TensorUtils<TensorTypeB>::free(state, b);
b = oldB;
}
if (oldC) {
    // Ignore overlaps when copying back; if we use THCTensor_copy
    // instead, it will recursively try to invoke ourselves to make
    // oldC contiguous.
TensorUtils<TensorTypeC>::copyIgnoringOverlaps(state, oldC, c);
TensorUtils<TensorTypeC>::free(state, c);
c = oldC;
}
return true;
}
#undef THC_APPLY_THREADS_PER_BLOCK
#endif // THC_APPLY_INC