#include "THC.h"
#include "THCTensorMath.h"
#include "THCGeneral.h"
#include "THCBlas.h"
#include "THCTensorCopy.h"
#include "THCTensorRandom.h"
#include "THCApply.cuh"
#include "THCReduce.cuh"
#include "THCDeviceUtils.cuh"
#include <algorithm> // for std::min
// We prefer this kernel to avoid reloading index points if the number
// of indices is small.
// This kernel in fact works for all choices of problem size, but if
// the number of indices chosen is large, then the
// indexCopyLargeIndex kernel is a better choice to increase
// parallelism.
template <typename T, typename IndexType, int DstDim, int SrcDim, int IdxDim>
__global__ void indexCopySmallIndex(TensorInfo<T, IndexType> dst,
TensorInfo<T, IndexType> src,
TensorInfo<long, IndexType> indices,
int dstCopyDim,
int srcCopyDim,
IndexType innerSize,
long dstCopyDimSize) {
// In order to avoid reloading the index that we are copying, load
// it once to handle all of the points that are being selected, so
// it can be reused as much as possible. This kernel is chosen when
// this is a good choice (small number of chosen indices), since
// re-accessing indices in addition to src elements can be slow.
for (IndexType srcIndex = 0; srcIndex < indices.sizes[0]; ++srcIndex) {
// Lua indices begin at 1
IndexType dstIndex =
indices.data[IndexToOffset<long, IndexType, IdxDim>::get(srcIndex, indices)] - TH_INDEX_BASE;
if (dstIndex < dstCopyDimSize) {
// We stride over the output ignoring the indexed dimension
// (innerSize), whose offset calculation is handled differently
for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
linearIndex < innerSize;
linearIndex += gridDim.x * blockDim.x) {
IndexType dstOffset =
IndexToOffset<T, IndexType, DstDim>::get(linearIndex, dst);
dstOffset += dstIndex * dst.strides[dstCopyDim];
IndexType srcOffset =
IndexToOffset<T, IndexType, SrcDim>::get(linearIndex, src);
srcOffset += srcIndex * src.strides[srcCopyDim];
dst.data[dstOffset] = src.data[srcOffset];
}
}
}
}
// We prefer this kernel to balance parallelism across index points,
// if there are a large number of indices.
// This kernel in fact works for all choices of problem size, but if
// the number of indices chosen is small, then the
// indexCopySmallIndex kernel is a better choice to reduce memory
// accesses.
template <typename T, typename IndexType, int DstDim, int SrcDim, int IdxDim>
__global__ void indexCopyLargeIndex(TensorInfo<T, IndexType> dst,
TensorInfo<T, IndexType> src,
TensorInfo<long, IndexType> indices,
int dstCopyDim,
int srcCopyDim,
IndexType innerSize,
long dstCopyDimSize) {
  // We stride over the source including the indexed dimension
  // (innerSize x number of indices), and calculate the destination
  // index point based on that
for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
linearIndex < innerSize * indices.sizes[0];
linearIndex += gridDim.x * blockDim.x) {
IndexType srcIndex = linearIndex / innerSize;
IndexType elementInSlice = linearIndex % innerSize;
// Lua indices begin at 1
IndexType dstIndex =
indices.data[IndexToOffset<long, IndexType, IdxDim>::get(srcIndex, indices)] - TH_INDEX_BASE;
if (dstIndex < dstCopyDimSize) {
IndexType dstOffset =
IndexToOffset<T, IndexType, DstDim>::get(elementInSlice, dst);
dstOffset += dstIndex * dst.strides[dstCopyDim];
IndexType srcOffset =
IndexToOffset<T, IndexType, SrcDim>::get(elementInSlice, src);
srcOffset += srcIndex * src.strides[srcCopyDim];
dst.data[dstOffset] = src.data[srcOffset];
}
}
}
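// Illustrative sketch only: one possible host-side heuristic for choosing
// between the two copy kernels above. The real dispatch logic lives in the
// generic file included at the bottom of this file; the threshold of 16
// indices, the block size of 128, and the lack of any grid-size clamping
// here are assumptions made purely for illustration.
template <typename T, typename IndexType, int DstDim, int SrcDim, int IdxDim>
void indexCopyDispatchSketch(cudaStream_t stream,
                             TensorInfo<T, IndexType> dst,
                             TensorInfo<T, IndexType> src,
                             TensorInfo<long, IndexType> indices,
                             int dstCopyDim,
                             int srcCopyDim,
                             IndexType innerSize,
                             long dstCopyDimSize) {
  const IndexType numIndices = indices.sizes[0];
  const IndexType block = 128;
  if (numIndices <= 16) {
    // Few indices: each index is loaded once per grid pass and reused for a
    // whole slice of innerSize elements.
    dim3 grid(THCCeilDiv(innerSize, block));
    indexCopySmallIndex<T, IndexType, DstDim, SrcDim, IdxDim>
      <<<grid, block, 0, stream>>>(
        dst, src, indices, dstCopyDim, srcCopyDim, innerSize, dstCopyDimSize);
  } else {
    // Many indices: spread threads over all (index, element-in-slice) pairs
    // to expose more parallelism.
    dim3 grid(THCCeilDiv(innerSize * numIndices, block));
    indexCopyLargeIndex<T, IndexType, DstDim, SrcDim, IdxDim>
      <<<grid, block, 0, stream>>>(
        dst, src, indices, dstCopyDim, srcCopyDim, innerSize, dstCopyDimSize);
  }
}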
// We prefer this kernel to avoid reloading index points if the number
// of indices is small.
// This kernel in fact works for all choices of problem size, but if
// the number of indices chosen is large, then the
// indexAddLargeIndex kernel is a better choice to increase
// parallelism.
template <typename T, typename IndexType, int DstDim, int SrcDim, int IdxDim>
__global__ void indexAddSmallIndex(TensorInfo<T, IndexType> dst,
TensorInfo<T, IndexType> src,
TensorInfo<long, IndexType> indices,
int dstAddDim,
int srcAddDim,
IndexType innerSize,
long dstAddDimSize) {
// In order to avoid reloading the index that we are copying, load
// it once to handle all of the points that are being selected, so
// it can be reused as much as possible. This kernel is chosen when
// this is a good choice (small number of chosen indices), since
// re-accessing indices in addition to src elements can be slow.
for (IndexType srcIndex = 0; srcIndex < indices.sizes[0]; ++srcIndex) {
// Lua indices begin at 1
IndexType dstIndex =
indices.data[IndexToOffset<long, IndexType, IdxDim>::get(srcIndex, indices)] - TH_INDEX_BASE;
if (dstIndex < dstAddDimSize) {
// We stride over the output ignoring the indexed dimension
// (innerSize), whose offset calculation is handled differently
for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
linearIndex < innerSize;
linearIndex += gridDim.x * blockDim.x) {
IndexType dstOffset =
IndexToOffset<T, IndexType, DstDim>::get(linearIndex, dst);
dstOffset += dstIndex * dst.strides[dstAddDim];
IndexType srcOffset =
IndexToOffset<T, IndexType, SrcDim>::get(linearIndex, src);
srcOffset += srcIndex * src.strides[srcAddDim];
atomicAdd(&dst.data[dstOffset], src.data[srcOffset]);
}
}
}
}
// We prefer this kernel to balance parallelism across index points,
// if there are a large number of indices.
// This kernel in fact works for all choices of problem size, but if
// the number of indices chosen is small, then the
// indexAddSmallIndex kernel is a better choice to reduce memory
// accesses.
template <typename T, typename IndexType, int DstDim, int SrcDim, int IdxDim>
__global__ void indexAddLargeIndex(TensorInfo<T, IndexType> dst,
TensorInfo<T, IndexType> src,
TensorInfo<long, IndexType> indices,
int dstAddDim,
int srcAddDim,
IndexType innerSize,
long dstAddDimSize) {
  // We stride over the source including the indexed dimension
  // (innerSize x number of indices), and calculate the destination
  // index point based on that
for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
linearIndex < innerSize * indices.sizes[0];
linearIndex += gridDim.x * blockDim.x) {
IndexType srcIndex = linearIndex / innerSize;
IndexType elementInSlice = linearIndex % innerSize;
// Lua indices begin at 1
IndexType dstIndex =
indices.data[IndexToOffset<long, IndexType, IdxDim>::get(srcIndex, indices)] - TH_INDEX_BASE;
if (dstIndex < dstAddDimSize) {
IndexType dstOffset =
IndexToOffset<T, IndexType, DstDim>::get(elementInSlice, dst);
dstOffset += dstIndex * dst.strides[dstAddDim];
IndexType srcOffset =
IndexToOffset<T, IndexType, SrcDim>::get(elementInSlice, src);
srcOffset += srcIndex * src.strides[srcAddDim];
atomicAdd(&dst.data[dstOffset], src.data[srcOffset]);
}
}
}
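// Note on the add kernels above: because they accumulate with atomicAdd,
// duplicate entries in `indices` are handled correctly (every contribution
// is added), but for floating-point types the order of the additions is
// unspecified, so results may differ slightly from run to run.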
// We prefer this kernel to avoid reloading index points if the number
// of indices is small.
// This kernel in fact works for all choices of problem size, but if
// the number of indices chosen is large, then the
// indexFillLargeIndex kernel is a better choice to increase
// parallelism.
template <typename T, typename IndexType, int DstDim, int IdxDim>
__global__ void indexFillSmallIndex(TensorInfo<T, IndexType> dst,
TensorInfo<long, IndexType> indices,
int dstFillDim,
IndexType innerSize,
long dstFillDimSize,
T val) {
  // In order to avoid reloading the index that we are filling, load
  // it once to handle all of the points that are being selected, so
  // it can be reused as much as possible. This kernel is chosen when
  // this is a good choice (small number of chosen indices), since
  // re-accessing indices can be slow.
for (IndexType dstIndex = 0; dstIndex < indices.sizes[0]; ++dstIndex) {
// Lua indices begin at 1
IndexType dstIndex_ =
indices.data[IndexToOffset<long, IndexType, IdxDim>::get(dstIndex, indices)] - TH_INDEX_BASE;
    // Bounds-check the loaded destination index, not the loop counter
    if (dstIndex_ < dstFillDimSize) {
// We stride over the output ignoring the indexed dimension
// (innerSize), whose offset calculation is handled differently
for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
linearIndex < innerSize;
linearIndex += gridDim.x * blockDim.x) {
IndexType dstOffset =
IndexToOffset<T, IndexType, DstDim>::get(linearIndex, dst);
dstOffset += dstIndex_ * dst.strides[dstFillDim];
dst.data[dstOffset] = val;
}
}
}
}
// We prefer this kernel to balance parallelism across index points,
// if there are a large number of indices.
// This kernel in fact works for all choices of problem size, but if
// the number of indices chosen is small, then the
// indexFillSmallIndex kernel is a better choice to reduce memory
// accesses.
template <typename T, typename IndexType, int DstDim, int IdxDim>
__global__ void indexFillLargeIndex(TensorInfo<T, IndexType> dst,
TensorInfo<long, IndexType> indices,
int dstFillDim,
IndexType innerSize,
long dstFillDimSize,
T val) {
  // We stride over the selected slices of the destination, including the
  // indexed dimension (innerSize x number of indices), and calculate the
  // destination index point based on that
for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
linearIndex < innerSize * indices.sizes[0];
linearIndex += gridDim.x * blockDim.x) {
IndexType dstIndex = linearIndex / innerSize;
IndexType elementInSlice = linearIndex % innerSize;
// Lua indices begin at 1
IndexType dstIndex_ =
indices.data[IndexToOffset<long, IndexType, IdxDim>::get(dstIndex, indices)] - TH_INDEX_BASE;
if (dstIndex_ < dstFillDimSize) {
IndexType dstOffset =
IndexToOffset<T, IndexType, DstDim>::get(elementInSlice, dst);
dstOffset += dstIndex_ * dst.strides[dstFillDim];
dst.data[dstOffset] = val;
}
}
}
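// Note on the fill kernels above: unlike the copy/add kernels they read no
// source tensor, so each thread only computes a destination offset. Indices
// that fall outside [0, dstFillDimSize) after the TH_INDEX_BASE adjustment
// are skipped by the bounds check rather than written.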
// We prefer this kernel to avoid reloading index points if the number
// of indices is small.
// This kernel in fact works for all choices of problem size, but if
// the number of indices chosen is large, then the
// indexSelectLargeIndex kernel is a better choice to increase
// parallelism.
template <typename T, typename IndexType, int DstDim, int SrcDim, int IdxDim>
__global__ void indexSelectSmallIndex(TensorInfo<T, IndexType> dst,
TensorInfo<T, IndexType> src,
TensorInfo<long, IndexType> indices,
int dstSelectDim,
int srcSelectDim,
IndexType innerSize,
long srcSelectDimSize) {
// In order to avoid reloading the index that we are copying, load
// it once to handle all of the points that are being selected, so
// it can be reused as much as possible. This kernel is chosen when
// this is a good choice (small number of chosen indices), since
// re-accessing indices in addition to src elements can be slow.
for (IndexType dstIndex = 0; dstIndex < indices.sizes[0]; ++dstIndex) {
// Lua indices begin at 1
IndexType srcIndex =
indices.data[IndexToOffset<long, IndexType, IdxDim>::get(dstIndex, indices)] - TH_INDEX_BASE;
if (srcIndex < srcSelectDimSize) {
// We stride over the output ignoring the indexed dimension
// (innerSize), whose offset calculation is handled differently
for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
linearIndex < innerSize;
linearIndex += gridDim.x * blockDim.x) {
IndexType dstOffset =
IndexToOffset<T, IndexType, DstDim>::get(linearIndex, dst);
dstOffset += dstIndex * dst.strides[dstSelectDim];
IndexType srcOffset =
IndexToOffset<T, IndexType, SrcDim>::get(linearIndex, src);
srcOffset += srcIndex * src.strides[srcSelectDim];
dst.data[dstOffset] = src.data[srcOffset];
}
}
}
}
// We prefer this kernel to balance parallelism across index points,
// if there are a large number of indices.
// This kernel in fact works for all choices of problem size, but if
// the number of indices chosen is small, then the
// indexSelectSmallIndex kernel is a better choice to reduce memory
// accesses.
template <typename T, typename IndexType, int DstDim, int SrcDim, int IdxDim>
__global__ void indexSelectLargeIndex(TensorInfo<T, IndexType> dst,
TensorInfo<T, IndexType> src,
TensorInfo<long, IndexType> indices,
int dstSelectDim,
int srcSelectDim,
IndexType totalSize,
IndexType innerSize,
long srcSelectDimSize) {
// We stride over the output including the indexed dimension
// (totalSize), and calculate the destination index point based on that
for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
linearIndex < totalSize;
linearIndex += gridDim.x * blockDim.x) {
IndexType dstIndex = linearIndex / innerSize;
IndexType elementInSlice = linearIndex % innerSize;
// Lua indices begin at 1
IndexType srcIndex =
indices.data[IndexToOffset<long, IndexType, IdxDim>::get(dstIndex, indices)] - TH_INDEX_BASE;
if (srcIndex < srcSelectDimSize) {
IndexType dstOffset =
IndexToOffset<T, IndexType, DstDim>::get(elementInSlice, dst);
dstOffset += dstIndex * dst.strides[dstSelectDim];
IndexType srcOffset =
IndexToOffset<T, IndexType, SrcDim>::get(elementInSlice, src);
srcOffset += srcIndex * src.strides[srcSelectDim];
dst.data[dstOffset] = src.data[srcOffset];
}
}
}
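// Illustrative sketch only: how indexSelectLargeIndex might be launched. It
// differs from the other large-index kernels in taking the total number of
// output elements (innerSize times the number of indices) explicitly as
// totalSize. The block size of 128 used here is an assumption for
// illustration; the real launch configuration is chosen by the generic
// dispatch code included below.
template <typename T, typename IndexType, int DstDim, int SrcDim, int IdxDim>
void indexSelectLargeLaunchSketch(cudaStream_t stream,
                                  TensorInfo<T, IndexType> dst,
                                  TensorInfo<T, IndexType> src,
                                  TensorInfo<long, IndexType> indices,
                                  int dstSelectDim,
                                  int srcSelectDim,
                                  IndexType innerSize,
                                  long srcSelectDimSize) {
  const IndexType totalSize = innerSize * indices.sizes[0];
  const IndexType block = 128;
  dim3 grid(THCCeilDiv(totalSize, block));
  indexSelectLargeIndex<T, IndexType, DstDim, SrcDim, IdxDim>
    <<<grid, block, 0, stream>>>(dst, src, indices, dstSelectDim, srcSelectDim,
                                 totalSize, innerSize, srcSelectDimSize);
}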
#include "generic/THCTensorIndex.cu"
#include "THCGenerateAllTypes.h"