| #include "THC.h" |
| #include "THCTensorMath.h" |
| #include "THCGeneral.h" |
| #include "THCBlas.h" |
| #include "THCTensorCopy.h" |
| #include "THCTensorRandom.h" |
| #include "THCApply.cuh" |
| #include "THCReduce.cuh" |
| #include "THCDeviceUtils.cuh" |
| #include <algorithm> // for std::min |

// We prefer this kernel when the number of indices is small, since it
// avoids reloading the index points.
// This kernel in fact works for all problem sizes, but if the number
// of indices chosen is large, the indexCopyLargeIndex kernel is a
// better choice because it exposes more parallelism.
template <typename T, typename IndexType, int DstDim, int SrcDim, int IdxDim>
__global__ void indexCopySmallIndex(TensorInfo<T, IndexType> dst,
                                    TensorInfo<T, IndexType> src,
                                    TensorInfo<long, IndexType> indices,
                                    int dstCopyDim,
                                    int srcCopyDim,
                                    IndexType innerSize,
                                    long dstCopyDimSize) {
  // To avoid reloading an index point, each index is read once and then
  // reused for all of the elements it selects. This kernel is chosen when
  // the number of chosen indices is small, since re-reading the indices
  // tensor in addition to the src elements can be slow.
  for (IndexType srcIndex = 0; srcIndex < indices.sizes[0]; ++srcIndex) {
    // Lua indices begin at 1
    IndexType dstIndex =
      indices.data[IndexToOffset<long, IndexType, IdxDim>::get(srcIndex, indices)] - TH_INDEX_BASE;

    if (dstIndex < dstCopyDimSize) {
      // We stride over the output ignoring the indexed dimension
      // (innerSize), whose offset calculation is handled differently
      for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
           linearIndex < innerSize;
           linearIndex += gridDim.x * blockDim.x) {
        IndexType dstOffset =
          IndexToOffset<T, IndexType, DstDim>::get(linearIndex, dst);

        dstOffset += dstIndex * dst.strides[dstCopyDim];

        IndexType srcOffset =
          IndexToOffset<T, IndexType, SrcDim>::get(linearIndex, src);
        srcOffset += srcIndex * src.strides[srcCopyDim];

        dst.data[dstOffset] = src.data[srcOffset];
      }
    }
  }
}
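
// A worked sketch of the offset arithmetic above (illustration only; the
// concrete shapes are assumptions, not taken from the dispatch code):
// suppose dst is a contiguous 2D tensor of size [N][M] indexed along
// dimension 0, so dst.strides[dstCopyDim] == M, and suppose the TensorInfo
// handed to the kernel has the indexed dimension reduced to size 1, as the
// host-side launcher is expected to arrange. Then
//   IndexToOffset<T, IndexType, DstDim>::get(linearIndex, dst) == linearIndex
// for linearIndex in [0, M), and
//   dstOffset = linearIndex + dstIndex * M
// which is exactly row-major addressing of dst[dstIndex][linearIndex]. The
// src offset is formed the same way from srcIndex and src.strides[srcCopyDim].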

// We prefer this kernel when there is a large number of indices, since it
// balances parallelism across the index points.
// This kernel in fact works for all problem sizes, but if the number of
// indices chosen is small, the indexCopySmallIndex kernel is a better
// choice because it reduces memory accesses.
template <typename T, typename IndexType, int DstDim, int SrcDim, int IdxDim>
__global__ void indexCopyLargeIndex(TensorInfo<T, IndexType> dst,
                                    TensorInfo<T, IndexType> src,
                                    TensorInfo<long, IndexType> indices,
                                    int dstCopyDim,
                                    int srcCopyDim,
                                    IndexType innerSize,
                                    long dstCopyDimSize) {
  // We stride over the output including the indexed dimension
  // (innerSize * indices.sizes[0] elements in total); the source slice is
  // the quotient of the linear index, and the destination slice is looked
  // up in `indices`.
  for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
       linearIndex < innerSize * indices.sizes[0];
       linearIndex += gridDim.x * blockDim.x) {
    IndexType srcIndex = linearIndex / innerSize;
    IndexType elementInSlice = linearIndex % innerSize;

    // Lua indices begin at 1
    IndexType dstIndex =
      indices.data[IndexToOffset<long, IndexType, IdxDim>::get(srcIndex, indices)] - TH_INDEX_BASE;

    if (dstIndex < dstCopyDimSize) {
      IndexType dstOffset =
        IndexToOffset<T, IndexType, DstDim>::get(elementInSlice, dst);
      dstOffset += dstIndex * dst.strides[dstCopyDim];

      IndexType srcOffset =
        IndexToOffset<T, IndexType, SrcDim>::get(elementInSlice, src);
      srcOffset += srcIndex * src.strides[srcCopyDim];

      dst.data[dstOffset] = src.data[srcOffset];
    }
  }
}
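
// Illustration of the linear-index decomposition used above (the numbers
// are made up for the example): with innerSize == 3 and
// indices.sizes[0] == 4, the thread that lands on linearIndex == 7 computes
//   srcIndex       = 7 / 3 == 2   (the third slice named by `indices`)
//   elementInSlice = 7 % 3 == 1   (the second element within that slice)
// so each of the 3 * 4 == 12 copied elements is owned by exactly one value
// of linearIndex. The indexAdd/indexFill/indexSelect *LargeIndex kernels
// below use the same decomposition.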

// We prefer this kernel when the number of indices is small, since it
// avoids reloading the index points.
// This kernel in fact works for all problem sizes, but if the number
// of indices chosen is large, the indexAddLargeIndex kernel is a
// better choice because it exposes more parallelism.
template <typename T, typename IndexType, int DstDim, int SrcDim, int IdxDim>
__global__ void indexAddSmallIndex(TensorInfo<T, IndexType> dst,
                                   TensorInfo<T, IndexType> src,
                                   TensorInfo<long, IndexType> indices,
                                   int dstAddDim,
                                   int srcAddDim,
                                   IndexType innerSize,
                                   long dstAddDimSize) {
  // To avoid reloading an index point, each index is read once and then
  // reused for all of the elements it selects. This kernel is chosen when
  // the number of chosen indices is small, since re-reading the indices
  // tensor in addition to the src elements can be slow.
  for (IndexType srcIndex = 0; srcIndex < indices.sizes[0]; ++srcIndex) {
    // Lua indices begin at 1
    IndexType dstIndex =
      indices.data[IndexToOffset<long, IndexType, IdxDim>::get(srcIndex, indices)] - TH_INDEX_BASE;

    if (dstIndex < dstAddDimSize) {
      // We stride over the output ignoring the indexed dimension
      // (innerSize), whose offset calculation is handled differently
      for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
           linearIndex < innerSize;
           linearIndex += gridDim.x * blockDim.x) {
        IndexType dstOffset =
          IndexToOffset<T, IndexType, DstDim>::get(linearIndex, dst);
        dstOffset += dstIndex * dst.strides[dstAddDim];

        IndexType srcOffset =
          IndexToOffset<T, IndexType, SrcDim>::get(linearIndex, src);
        srcOffset += srcIndex * src.strides[srcAddDim];

        atomicAdd(&dst.data[dstOffset], src.data[srcOffset]);
      }
    }
  }
}

// We prefer this kernel when there is a large number of indices, since it
// balances parallelism across the index points.
// This kernel in fact works for all problem sizes, but if the number of
// indices chosen is small, the indexAddSmallIndex kernel is a better
// choice because it reduces memory accesses.
template <typename T, typename IndexType, int DstDim, int SrcDim, int IdxDim>
__global__ void indexAddLargeIndex(TensorInfo<T, IndexType> dst,
                                   TensorInfo<T, IndexType> src,
                                   TensorInfo<long, IndexType> indices,
                                   int dstAddDim,
                                   int srcAddDim,
                                   IndexType innerSize,
                                   long dstAddDimSize) {
  // We stride over the output including the indexed dimension
  // (innerSize * indices.sizes[0] elements in total); the source slice is
  // the quotient of the linear index, and the destination slice is looked
  // up in `indices`.
  for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
       linearIndex < innerSize * indices.sizes[0];
       linearIndex += gridDim.x * blockDim.x) {
    IndexType srcIndex = linearIndex / innerSize;
    IndexType elementInSlice = linearIndex % innerSize;

    // Lua indices begin at 1
    IndexType dstIndex =
      indices.data[IndexToOffset<long, IndexType, IdxDim>::get(srcIndex, indices)] - TH_INDEX_BASE;

    if (dstIndex < dstAddDimSize) {
      IndexType dstOffset =
        IndexToOffset<T, IndexType, DstDim>::get(elementInSlice, dst);
      dstOffset += dstIndex * dst.strides[dstAddDim];

      IndexType srcOffset =
        IndexToOffset<T, IndexType, SrcDim>::get(elementInSlice, src);
      srcOffset += srcIndex * src.strides[srcAddDim];

      atomicAdd(&dst.data[dstOffset], src.data[srcOffset]);
    }
  }
}
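
// Why atomicAdd in the two kernels above: `indices` may name the same
// destination slice more than once. For example (a hypothetical input),
// with indices == {1, 1} two different src slices both accumulate into dst
// slice 1, so threads with different srcIndex values can hit the same
// dstOffset concurrently; plain stores would race and drop contributions,
// while atomicAdd serializes the colliding updates. For element types
// without a native atomicAdd, the overloads are expected to come from
// THCAtomics.cuh (included above).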

// We prefer this kernel when the number of indices is small, since it
// avoids reloading the index points.
// This kernel in fact works for all problem sizes, but if the number
// of indices chosen is large, the indexFillLargeIndex kernel is a
// better choice because it exposes more parallelism.
template <typename T, typename IndexType, int DstDim, int IdxDim>
__global__ void indexFillSmallIndex(TensorInfo<T, IndexType> dst,
                                    TensorInfo<long, IndexType> indices,
                                    int dstFillDim,
                                    IndexType innerSize,
                                    long dstFillDimSize,
                                    T val) {
  // To avoid reloading an index point, each index is read once and then
  // reused for all of the elements it selects. This kernel is chosen when
  // the number of chosen indices is small, since re-reading the indices
  // tensor in addition to the dst elements can be slow.
  for (IndexType dstIndex = 0; dstIndex < indices.sizes[0]; ++dstIndex) {
    // Lua indices begin at 1
    IndexType dstIndex_ =
      indices.data[IndexToOffset<long, IndexType, IdxDim>::get(dstIndex, indices)] - TH_INDEX_BASE;

    if (dstIndex_ < dstFillDimSize) {
      // We stride over the output ignoring the indexed dimension
      // (innerSize), whose offset calculation is handled differently
      for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
           linearIndex < innerSize;
           linearIndex += gridDim.x * blockDim.x) {
        IndexType dstOffset =
          IndexToOffset<T, IndexType, DstDim>::get(linearIndex, dst);
        dstOffset += dstIndex_ * dst.strides[dstFillDim];

        dst.data[dstOffset] = val;
      }
    }
  }
}

// We prefer this kernel when there is a large number of indices, since it
// balances parallelism across the index points.
// This kernel in fact works for all problem sizes, but if the number of
// indices chosen is small, the indexFillSmallIndex kernel is a better
// choice because it reduces memory accesses.
template <typename T, typename IndexType, int DstDim, int IdxDim>
__global__ void indexFillLargeIndex(TensorInfo<T, IndexType> dst,
                                    TensorInfo<long, IndexType> indices,
                                    int dstFillDim,
                                    IndexType innerSize,
                                    long dstFillDimSize,
                                    T val) {
  // We stride over the output including the indexed dimension
  // (innerSize * indices.sizes[0] elements in total); the position in
  // `indices` is the quotient of the linear index, and the destination
  // slice is looked up from it.
  for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
       linearIndex < innerSize * indices.sizes[0];
       linearIndex += gridDim.x * blockDim.x) {
    IndexType dstIndex = linearIndex / innerSize;
    IndexType elementInSlice = linearIndex % innerSize;

    // Lua indices begin at 1
    IndexType dstIndex_ =
      indices.data[IndexToOffset<long, IndexType, IdxDim>::get(dstIndex, indices)] - TH_INDEX_BASE;

    if (dstIndex_ < dstFillDimSize) {
      IndexType dstOffset =
        IndexToOffset<T, IndexType, DstDim>::get(elementInSlice, dst);
      dstOffset += dstIndex_ * dst.strides[dstFillDim];

      dst.data[dstOffset] = val;
    }
  }
}
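
// A small worked example for the two fill kernels above (the input values
// are hypothetical): for a dst with 5 slices along dstFillDim, indices ==
// {2, 0} and val == 7, every element of dst slices 2 and 0 is set to 7 and
// the other slices are left untouched. An index value >= dstFillDimSize
// (after the TH_INDEX_BASE adjustment) is skipped by the bounds check
// rather than written out of range.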

// We prefer this kernel when the number of indices is small, since it
// avoids reloading the index points.
// This kernel in fact works for all problem sizes, but if the number
// of indices chosen is large, the indexSelectLargeIndex kernel is a
// better choice because it exposes more parallelism.
template <typename T, typename IndexType, int DstDim, int SrcDim, int IdxDim>
__global__ void indexSelectSmallIndex(TensorInfo<T, IndexType> dst,
                                      TensorInfo<T, IndexType> src,
                                      TensorInfo<long, IndexType> indices,
                                      int dstSelectDim,
                                      int srcSelectDim,
                                      IndexType innerSize,
                                      long srcSelectDimSize) {
  // To avoid reloading an index point, each index is read once and then
  // reused for all of the elements it selects. This kernel is chosen when
  // the number of chosen indices is small, since re-reading the indices
  // tensor in addition to the src elements can be slow.
  for (IndexType dstIndex = 0; dstIndex < indices.sizes[0]; ++dstIndex) {
    // Lua indices begin at 1
    IndexType srcIndex =
      indices.data[IndexToOffset<long, IndexType, IdxDim>::get(dstIndex, indices)] - TH_INDEX_BASE;

    if (srcIndex < srcSelectDimSize) {
      // We stride over the output ignoring the indexed dimension
      // (innerSize), whose offset calculation is handled differently
      for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
           linearIndex < innerSize;
           linearIndex += gridDim.x * blockDim.x) {
        IndexType dstOffset =
          IndexToOffset<T, IndexType, DstDim>::get(linearIndex, dst);
        dstOffset += dstIndex * dst.strides[dstSelectDim];

        IndexType srcOffset =
          IndexToOffset<T, IndexType, SrcDim>::get(linearIndex, src);
        srcOffset += srcIndex * src.strides[srcSelectDim];

        dst.data[dstOffset] = src.data[srcOffset];
      }
    }
  }
}
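
// Direction of the index lookup (a contrast worth spelling out; the example
// values are made up): indexSelect reads `indices` to find the *source*
// slice, so with indices == {4, 2} we get dst slice 0 <- src slice 4 and
// dst slice 1 <- src slice 2. indexCopy above is the mirror image: the
// position in `indices` names the src slice and the looked-up value names
// the dst slice.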

// We prefer this kernel when there is a large number of indices, since it
// balances parallelism across the index points.
// This kernel in fact works for all problem sizes, but if the number of
// indices chosen is small, the indexSelectSmallIndex kernel is a better
// choice because it reduces memory accesses.
template <typename T, typename IndexType, int DstDim, int SrcDim, int IdxDim>
__global__ void indexSelectLargeIndex(TensorInfo<T, IndexType> dst,
                                      TensorInfo<T, IndexType> src,
                                      TensorInfo<long, IndexType> indices,
                                      int dstSelectDim,
                                      int srcSelectDim,
                                      IndexType totalSize,
                                      IndexType innerSize,
                                      long srcSelectDimSize) {
  // We stride over the output including the indexed dimension
  // (totalSize), and calculate the destination index point based on that
  for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
       linearIndex < totalSize;
       linearIndex += gridDim.x * blockDim.x) {
    IndexType dstIndex = linearIndex / innerSize;
    IndexType elementInSlice = linearIndex % innerSize;

    // Lua indices begin at 1
    IndexType srcIndex =
      indices.data[IndexToOffset<long, IndexType, IdxDim>::get(dstIndex, indices)] - TH_INDEX_BASE;

    if (srcIndex < srcSelectDimSize) {
      IndexType dstOffset =
        IndexToOffset<T, IndexType, DstDim>::get(elementInSlice, dst);
      dstOffset += dstIndex * dst.strides[dstSelectDim];

      IndexType srcOffset =
        IndexToOffset<T, IndexType, SrcDim>::get(elementInSlice, src);
      srcOffset += srcIndex * src.strides[srcSelectDim];

      dst.data[dstOffset] = src.data[srcOffset];
    }
  }
}
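
// A minimal host-side launch sketch for the kernel above (illustration
// only: the block/grid sizing, the template arguments, and the *Info
// variables are assumptions; the real dispatch lives in
// generic/THCTensorIndex.cu and is driven by macros there). Assuming 2-D
// dst/src TensorInfos prepared as the kernel expects and a 1-D indices
// TensorInfo:
//
//   dim3 block(128);
//   dim3 grid(THCCeilDiv(totalSize, (unsigned int) 128));
//   indexSelectLargeIndex<float, unsigned int, 2, 2, 1>
//     <<<grid, block, 0, THCState_getCurrentStream(state)>>>(
//       dstInfo, srcInfo, indicesInfo,
//       dstSelectDim, srcSelectDim,
//       totalSize, innerSize, srcSelectDimSize);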

#include "generic/THCTensorIndex.cu"
#include "THCGenerateAllTypes.h"