blob: d59972285250a73261f2fabe32a42dcc4cccae12 [file] [log] [blame]
#include "THCGeneral.h"
#include "THCTensorMath.h"
#include "THCTensorCopy.h"
#include <algorithm>
#ifdef USE_MAGMA
#include <magma.h>
#else
#include "THCBlas.h"
#endif
#ifndef DIVUP
#define DIVUP(x, y) (((x) + (y) - 1) / (y))
#endif
#define NoMagma(name) "No CUDA implementation of '" #name "'. Install MAGMA and rebuild cutorch (http://icl.cs.utk.edu/magma/)"
void THCMagma_init(THCState *state)
{
#ifdef USE_MAGMA
magma_init();
#endif
}
#ifdef USE_MAGMA
static inline float* th_magma_smalloc_pinned(size_t n)
{
float* ptr;
if (MAGMA_SUCCESS != magma_smalloc_pinned(&ptr, n))
THError("$ Torch: not enough memory: you tried to allocate %dGB. Buy new RAM!", n/268435456);
return ptr;
}
static inline int* th_magma_imalloc_pinned(size_t n)
{
int* ptr;
if (MAGMA_SUCCESS != magma_imalloc_pinned(&ptr, n))
THError("$ Torch: not enough memory: you tried to allocate %dGB. Buy new RAM!", n/268435456);
return ptr;
}
static void THCudaTensor_copyArray1d(THCState *state, THCudaTensor *self, float *src, int k)
{
long size[1] = { k };
long stride[1] = { 1 };
THCudaTensor_rawResize(state, self, 1, size, stride);
size_t len = k * sizeof(float);
THCudaCheck(cudaMemcpy(self->storage->data + self->storageOffset, src, len, cudaMemcpyHostToDevice));
}
static void THCudaTensor_copyArray2d(THCState *state, THCudaTensor *self, float *src, int m, int n)
{
long size[2] = { m, n };
long stride[2] = { 1, m };
THCudaTensor_rawResize(state, self, 2, size, stride);
size_t len = m * n * sizeof(float);
THCudaCheck(cudaMemcpy(self->storage->data + self->storageOffset, src, len, cudaMemcpyHostToDevice));
}
static void THCudaTensor_copyTensor2d(THCState *state, float *dst, THCudaTensor *self)
{
THAssert(self->nDimension == 2);
size_t len = THCudaTensor_nElement(state, self)*sizeof(float);
THCudaTensor *temp = THCudaTensor_newTranspose(state, self, 0, 1);
THCudaTensor *selfc = THCudaTensor_newContiguous(state, temp);
THCudaCheck(cudaMemcpy(dst, selfc->storage->data + selfc->storageOffset, len, cudaMemcpyDeviceToHost));
THCudaTensor_free(state, temp);
THCudaTensor_free(state, selfc);
}
#endif
static THCudaTensor* THCudaTensor_newColumnMajor(THCState *state, THCudaTensor *self, THCudaTensor *src)
{
THAssert(src->nDimension == 2);
if (self == src && self->stride[0] == 1 && self->stride[1] == self->size[0])
{
THCudaTensor_retain(state, self);
return self;
}
if (self == src)
self = THCudaTensor_new(state);
else
THCudaTensor_retain(state, self);
long size[2] = { src->size[0], src->size[1] };
long stride[2] = { 1, src->size[0] };
THCudaTensor_rawResize(state, self, 2, size, stride);
THCudaTensor_copy(state, self, src);
return self;
}
void THCudaTensor_gesv(THCState *state, THCudaTensor *rb_, THCudaTensor *ra_, THCudaTensor *b_, THCudaTensor *a_)
{
#ifdef USE_MAGMA
THArgCheck(a_->nDimension == 2, 1, "A should be 2 dimensional");
THArgCheck(b_->nDimension == 2, 2, "b should be 2 dimensional");
THArgCheck(a_->size[0] == a_->size[1], 1, "A should be square");
THArgCheck(b_->size[0] == a_->size[0], 2, "A,b size incompatible");
int n = a_->size[0];
int nrhs = b_->size[1];
THCudaTensor *a = THCudaTensor_newColumnMajor(state, ra_, a_);
THCudaTensor *b = THCudaTensor_newColumnMajor(state, rb_, b_);
float *a_data = THCudaTensor_data(state, a);
float *b_data = THCudaTensor_data(state, b);
int *ipiv = th_magma_imalloc_pinned(n);
int info;
magma_sgesv_gpu(n, nrhs, a_data, n, ipiv, b_data, n, &info);
if (info < 0)
THError("MAGMA gesv : Argument %d : illegal value", -info);
else if (info > 0)
THError("MAGMA gesv : U(%d,%d) is zero, singular U.", info, info);
magma_free_pinned(ipiv);
THCudaTensor_freeCopyTo(state, a, ra_);
THCudaTensor_freeCopyTo(state, b, rb_);
#else
THError(NoMagma(gesv));
#endif
}
void THCudaTensor_gels(THCState *state, THCudaTensor *rb_, THCudaTensor *ra_, THCudaTensor *b_, THCudaTensor *a_)
{
#ifdef USE_MAGMA
THArgCheck(a_->nDimension == 2, 1, "A should be 2 dimensional");
THArgCheck(b_->nDimension == 2, 1, "b should be 2 dimensional");
THArgCheck(a_->size[0] == b_->size[0], 2, "size incompatible A,b");
THArgCheck(a_->size[0] >= a_->size[1], 2, "A should have m >= n");
THCudaTensor *a = THCudaTensor_newColumnMajor(state, ra_, a_);
THCudaTensor *b = THCudaTensor_newColumnMajor(state, rb_, b_);
float *a_data = THCudaTensor_data(state, a);
float *b_data = THCudaTensor_data(state, b);
int m = a->size[0];
int n = a->size[1];
int nrhs = b->size[1];
float wkopt;
int info;
magma_sgels_gpu(MagmaNoTrans, m, n, nrhs, a_data, m, b_data, m, &wkopt, -1, &info);
float *hwork = th_magma_smalloc_pinned((size_t)wkopt);
magma_sgels_gpu(MagmaNoTrans, m, n, nrhs, a_data, m, b_data, m, hwork, (int)wkopt, &info);
magma_free_pinned(hwork);
if (info != 0)
THError("MAGMA gels : Argument %d : illegal value", -info);
THCudaTensor_freeCopyTo(state, a, ra_);
THCudaTensor_freeCopyTo(state, b, rb_);
#else
THError(NoMagma(gels));
#endif
}
void THCudaTensor_syev(THCState *state, THCudaTensor *re_, THCudaTensor *rv_, THCudaTensor *a, const char *jobzs, const char *uplos)
{
#ifdef USE_MAGMA
int n = a->size[0];
int lda = n;
magma_uplo_t uplo = uplos[0] == 'U' ? MagmaUpper : MagmaLower;
magma_vec_t jobz = jobzs[0] == 'N' ? MagmaNoVec : MagmaVec;
THCudaTensor *input = THCudaTensor_newColumnMajor(state, rv_, a);
float *input_data = THCudaTensor_data(state, input);
// eigen values and workspace
float *w = th_magma_smalloc_pinned(n);
float *wA = th_magma_smalloc_pinned(lda);
// compute optimal size of work array
int info;
float lwork;
int liwork;
magma_ssyevd_gpu(jobz, uplo, n, input_data, lda, w, wA, n, &lwork, -1, &liwork, -1, &info);
float *work = th_magma_smalloc_pinned((size_t)lwork);
int *iwork = th_magma_imalloc_pinned(liwork);
// compute eigenvalues and, optionally, eigenvectors
magma_ssyevd_gpu(jobz, uplo, n, input_data, lda, w, wA, n, work, (int) lwork, iwork, liwork, &info);
// copy eigen values from w to re_
if (info == 0)
THCudaTensor_copyArray1d(state, re_, w, n);
magma_free_pinned(iwork);
magma_free_pinned(work);
magma_free_pinned(wA);
magma_free_pinned(w);
// check error value
if (info > 0)
THError("MAGMA syev : Failed to converge. %d off-diagonal elements of an didn't converge to zero", info);
else if (info < 0)
THError("MAGMA syev : Argument %d : illegal value", -info);
THCudaTensor_freeCopyTo(state, input, rv_);
#else
THError(NoMagma(syev));
#endif
}
void THCudaTensor_geev(THCState *state, THCudaTensor *re_, THCudaTensor *rv_, THCudaTensor *a_, const char *jobvrs)
{
#ifdef USE_MAGMA
THArgCheck(a_->nDimension == 2, 3, "A should be 2 dimensional");
THArgCheck(a_->size[0] == a_->size[1], 3, "A should be square");
magma_vec_t jobvr = jobvrs[0] == 'N' ? MagmaNoVec : MagmaVec;
int n = a_->size[0];
float *a_data = th_magma_smalloc_pinned(n * n);
THCudaTensor_copyTensor2d(state, a_data, a_);
float *wr = th_magma_smalloc_pinned(n);
float *wi = th_magma_smalloc_pinned(n);
float *vr_data = NULL;
int ldvr = 1;
if (jobvr == MagmaVec)
{
vr_data = th_magma_smalloc_pinned(n * n);
ldvr = n;
}
float wkopt;
int info;
magma_sgeev(MagmaNoVec, jobvr, n, a_data, n, wr, wi, NULL, 1, vr_data, ldvr, &wkopt, -1, &info);
int lwork = (int) wkopt;
float *work_data = th_magma_smalloc_pinned(lwork);
magma_sgeev(MagmaNoVec, jobvr, n, a_data, n, wr, wi, NULL, 1, vr_data, ldvr, work_data, lwork, &info);
if (info > 0)
THError("MAGMA geev : Failed to converge. %d off-diagonal elements of an didn't converge to zero", info);
else if (info < 0)
THError("MAGMA geev : Argument %d : illegal value", -info);
{
THCudaTensor_resize2d(state, re_, 2, n);
THCudaTensor *re = THCudaTensor_newContiguous(state, re_);
THCudaCheck(cudaMemcpy(re->storage->data + re->storageOffset, wr, n*sizeof(float), cudaMemcpyHostToDevice));
THCudaCheck(cudaMemcpy(re->storage->data + re->storageOffset + n, wi, n*sizeof(float), cudaMemcpyHostToDevice));
THCudaTensor_freeCopyTo(state, re, re_);
THCudaTensor_transpose(state, re_, NULL, 0, 1);
}
if (jobvr == MagmaVec)
THCudaTensor_copyArray2d(state, rv_, vr_data, n, n);
magma_free_pinned(work_data);
magma_free_pinned(vr_data);
magma_free_pinned(wi);
magma_free_pinned(wr);
magma_free_pinned(a_data);
#else
THError(NoMagma(geev));
#endif
}
void THCudaTensor_gesvd(THCState *state, THCudaTensor *ru_, THCudaTensor *rs_, THCudaTensor *rv_, THCudaTensor *a, const char *jobu)
{
#ifdef USE_MAGMA
THCudaTensor *ra_ = THCudaTensor_new(state);
THCudaTensor_gesvd2(state, ru_, rs_, rv_, ra_, a, jobu);
THCudaTensor_free(state, ra_);
#else
THError(NoMagma(gesvd));
#endif
}
void THCudaTensor_gesvd2(THCState *state, THCudaTensor *ru_, THCudaTensor *rs_, THCudaTensor *rv_, THCudaTensor *ra_, THCudaTensor *a, const char *jobus)
{
#ifdef USE_MAGMA
THArgCheck(a->nDimension == 2, 2, "A should be 2 dimensional");
magma_vec_t jobu = jobus[0] == 'A' ? MagmaAllVec : jobus[0] == 'S' ? MagmaSomeVec : jobus[0] == 'O' ? MagmaOverwriteVec : MagmaNoVec;
magma_vec_t jobvt = jobu;
int m = a->size[0];
int n = a->size[1];
int k = m < n ? m : n;
int j = (jobu == MagmaAllVec) ? m : k;
float *a_data = th_magma_smalloc_pinned(m * n);
THCudaTensor_copyTensor2d(state, a_data, a);
float *rs_data = th_magma_smalloc_pinned(k);
float *ru_data = th_magma_smalloc_pinned(m * j);
float *rv_data = th_magma_smalloc_pinned(n * n);
float wkopt;
int info;
magma_sgesvd(jobu, jobvt, m, n, a_data, m, rs_data, ru_data, m, rv_data, n, &wkopt, -1, &info);
int lwork = (int) wkopt;
float *work_data = th_magma_smalloc_pinned(lwork);
magma_sgesvd(jobu, jobvt, m, n, a_data, m, rs_data, ru_data, m, rv_data, n, work_data, lwork, &info);
if (info > 0)
THError("MAGMA gesvd : %d superdiagonals failed to converge", info);
else if (info < 0)
THError("MAGMA gesvd : Argument %d : illegal value", -info);
THCudaTensor_copyArray2d(state, rv_, rv_data, n, n);
THCudaTensor_transpose(state, rv_, NULL, 0, 1);
THCudaTensor_copyArray2d(state, ru_, ru_data, m, j);
THCudaTensor_copyArray1d(state, rs_, rs_data, k);
THCudaTensor_copyArray2d(state, ra_, a_data, m, n);
magma_free_pinned(work_data);
magma_free_pinned(rv_data);
magma_free_pinned(ru_data);
magma_free_pinned(rs_data);
magma_free_pinned(a_data);
#else
THError(NoMagma(gesvd2));
#endif
}
void THCudaTensor_getri(THCState *state, THCudaTensor *ra_, THCudaTensor *a)
{
#ifdef USE_MAGMA
THArgCheck(a->nDimension == 2, 2, "A should be 2 dimensional");
THArgCheck(a->size[0] == a->size[1], 2, "A should be square");
int info;
int n = a->size[0];
int lwork = n * magma_get_sgetri_nb(n);
THCudaTensor *input = THCudaTensor_newColumnMajor(state, ra_, a);
float *input_data = THCudaTensor_data(state, input);
int *ipiv = th_magma_imalloc_pinned(n);
THCudaTensor *work = THCudaTensor_newWithSize1d(state, lwork);
float *work_data = THCudaTensor_data(state, work);
// Run LU
magma_sgetrf_gpu(n, n, input_data, n, ipiv, &info);
if (info > 0)
THError("MAGMA getrf : U(%d,%d) is 0, U is singular", info, info);
else if (info < 0)
THError("MAGMA getrf : Argument %d : illegal value", -info);
// Inverse
magma_sgetri_gpu(n, input_data, n, ipiv, work_data, lwork, &info);
if (info > 0)
THError("MAGMA getri : U(%d,%d) is 0, U is singular", info, info);
else if (info < 0)
THError("MAGMA getri : Argument %d : illegal value", -info);
THCudaTensor_free(state, work);
magma_free_pinned(ipiv);
THCudaTensor_freeCopyTo(state, input, ra_);
#else
THArgCheck(a->nDimension == 2, 2, "A should be 2 dimensional");
THArgCheck(a->size[0] == a->size[1], 2, "A should be square");
int n = a->size[0];
// input
THCudaTensor *input = THCudaTensor_newColumnMajor(state, ra_, a);
// output
THCudaTensor *output = THCudaTensor_newColumnMajor(state, ra_, a);
size_t matrices_size = sizeof(float*);
float **matrices1 = (float **)THAlloc(matrices_size);
const float **matrices1_const = (const float **)THAlloc(matrices_size);
float **matrices2 = (float **)THAlloc(matrices_size);
matrices1[0] = THCudaTensor_data(state, input);
matrices1_const[0] = THCudaTensor_data(state, input);
matrices2[0] = THCudaTensor_data(state, output);
// Copy pointers to device.
float **d_matrices1, **d_matrices2;
const float **d_matrices1_const;
THCudaCheck(THCudaMalloc(state, (void**)&d_matrices1, matrices_size));
THCudaCheck(THCudaMalloc(state, (void**)&d_matrices1_const, matrices_size));
THCudaCheck(THCudaMalloc(state, (void**)&d_matrices2, matrices_size));
THCudaCheck(cudaMemcpyAsync(d_matrices1, matrices1, matrices_size,
cudaMemcpyHostToDevice, THCState_getCurrentStream(state)));
THCudaCheck(cudaMemcpyAsync(d_matrices1_const, matrices1_const, matrices_size,
cudaMemcpyHostToDevice, THCState_getCurrentStream(state)));
THCudaCheck(cudaMemcpyAsync(d_matrices2, matrices2, matrices_size,
cudaMemcpyHostToDevice, THCState_getCurrentStream(state)));
int info;
int *info_gpu;
THCudaCheck(THCudaMalloc(state, (void**)&info_gpu, sizeof(int)));
int *ipiv_gpu;
THCudaCheck(THCudaMalloc(state, (void**)&ipiv_gpu, n * sizeof(int)));
// Run LU
THCudaBlas_Sgetrf(state, n, d_matrices1, n, ipiv_gpu, info_gpu, 1);
THCudaCheck(cudaMemcpy(&info, info_gpu, sizeof(int), cudaMemcpyDeviceToHost));
if (info > 0)
THError("CUBLAS getrf : U(%d,%d) is 0, U is singular", info, info);
else if (info < 0)
THError("CUBLAS getrf : Argument %d : illegal value", -info);
// Inverse
THCudaBlas_Sgetri(state, n, d_matrices1_const, n, ipiv_gpu, d_matrices2, n, info_gpu, 1);
if (info > 0)
THError("CUBLAS getri : U(%d,%d) is 0, U is singular", info, info);
else if (info < 0)
THError("CUBLAS getri : Argument %d : illegal value", -info);
THCudaCheck(THCudaFree(state, ipiv_gpu));
THCudaCheck(THCudaFree(state, info_gpu));
THCudaTensor_freeCopyTo(state, output, input);
#endif
}
__global__ void THCudaTensor_copyUpperSymmetric(float *input, int n, int len)
{
for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < len; idx += 65535) {
const int r = idx % n;
const int c = idx / n;
if (r > c) {
input[idx] = input[r*n + c];
}
}
}
void THCudaTensor_potri(THCState *state, THCudaTensor *ra_, THCudaTensor *a)
{
#ifdef USE_MAGMA
THArgCheck(a->nDimension == 2, 2, "A should be 2 dimensional");
THArgCheck(a->size[0] == a->size[1], 2, "A should be square");
int n = a->size[0];
THCudaTensor *input = THCudaTensor_newColumnMajor(state, ra_, a);
float *input_data = THCudaTensor_data(state, input);
int info;
magma_spotrf_gpu(MagmaUpper, n, input_data, n, &info);
if (info > 0)
THError("MAGMA potrf : A(%d,%d) is 0, A cannot be factorized", info, info);
else if (info < 0)
THError("MAGMA potrf : Argument %d : illegal value", -info);
magma_spotri_gpu(MagmaUpper, n, input_data, n, &info);
if (info > 0)
THError("MAGMA potri : A(%d,%d) is 0, A cannot be factorized", info, info);
else if (info < 0)
THError("MAGMA potri : Argument %d : illegal value", -info);
cudaStream_t stream = THCState_getCurrentStream(state);
const int len = n*n;
dim3 blocks(std::min(DIVUP(len, 128), 65535));
dim3 threads(128);
THCudaTensor_copyUpperSymmetric<<<blocks, threads, 0, stream>>>(input_data, n, len);
THCudaTensor_freeCopyTo(state, input, ra_);
#else
THError(NoMagma(potri));
#endif
}
void THCudaTensor_potrf(THCState *state, THCudaTensor *ra_, THCudaTensor *a)
{
#ifdef USE_MAGMA
THArgCheck(a->nDimension == 2, 2, "A should be 2 dimensional");
THArgCheck(a->size[0] == a->size[1], 2, "A should be square");
int n = a->size[0];
THCudaTensor *input = THCudaTensor_newColumnMajor(state, ra_, a);
float *input_data = THCudaTensor_data(state, input);
int info;
magma_spotrf_gpu(MagmaUpper, n, input_data, n, &info);
// check error value
if (info > 0)
THError("MAGMA potrf : A(%d,%d) is 0, A cannot be factorized", info, info);
else if (info < 0)
THError("MAGMA potrf : Argument %d : illegal value", -info);
THCudaTensor_triu(state, ra_, input, 0);
THCudaTensor_free(state, input);
#else
THError(NoMagma(potrf));
#endif
}
void THCudaTensor_potrs(THCState *state, THCudaTensor *rb_, THCudaTensor *b, THCudaTensor *a)
{
#ifdef USE_MAGMA
THArgCheck(a->size[0] == a->size[1], 2, "A should be square");
int n = a->size[0];
int nrhs = b->size[1];
THCudaTensor *b_ = THCudaTensor_newColumnMajor(state, rb_, b);
float *b_data = THCudaTensor_data(state, b_);
THCudaTensor *a_ = THCudaTensor_newColumnMajor(state, a, a);
float *a_data = THCudaTensor_data(state, a_);
int info;
magma_spotrs_gpu(MagmaUpper, n, nrhs, a_data, n, b_data, n, &info);
// check error value
if (info < 0)
THError("MAGMA potrs : Argument %d : illegal value", -info);
THCudaTensor_freeCopyTo(state, b_, rb_);
THCudaTensor_free(state, a_);
#else
THError(NoMagma(potrs));
#endif
}
void THCudaTensor_qr(THCState *state, THCudaTensor *rq_, THCudaTensor *rr_, THCudaTensor *a_)
{
#ifdef USE_MAGMA
THArgCheck(a_->nDimension == 2, 2, "A should be 2 dimensional");
THCudaTensor *a = THCudaTensor_newColumnMajor(state, rr_, a_);
int m = a->size[0];
int n = a->size[1];
int k = (m < n ? m : n);
#ifdef MAGMA_V2
int nb = magma_get_sgeqrf_nb(m, n);
#else
int nb = magma_get_sgeqrf_nb(m);
#endif
float *a_data = THCudaTensor_data(state, a);
float *tau_data = th_magma_smalloc_pinned(n*n);
THCudaTensor *work = THCudaTensor_newWithSize1d(state, (2*k + ((n+31)/32)*32)*nb);
float *work_data = THCudaTensor_data(state, work);
int info;
magma_sgeqrf_gpu(m, n, a_data, m, tau_data, work_data, &info);
if (info != 0)
THError("MAGMA geqrf : Argument %d : illegal value.", -info);
THCudaTensor *q = THCudaTensor_newColumnMajor(state, rq_, a);
float *q_data = THCudaTensor_data(state, q);
THCudaTensor_narrow(state, a, a, 0, 0, k);
THCudaTensor_triu(state, rr_, a, 0);
THCudaTensor_free(state, a);
magma_sorgqr_gpu(m, n, k, q_data, m, tau_data, work_data, nb, &info);
if (info != 0)
THError("MAGMA orgqr : Argument %d : illegal value.", -info);
THCudaTensor_free(state, work);
magma_free_pinned(tau_data);
THCudaTensor_narrow(state, q, q, 1, 0, k);
THCudaTensor_freeCopyTo(state, q, rq_);
#else
THError(NoMagma(qr));
#endif
}