| #include "THCTensorMath.h" |
| #include "THCGeneral.h" |
| #include "THCBlas.h" |
| #include "THCTensorCopy.h" |
| #include "THCTensorRandom.h" |
| #include "THCApply.cuh" |
| #include "THCReduce.cuh" |
| |
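/* Dot product of two tensors with the same number of elements: both operands
   are flattened into contiguous copies, passed to THCudaBlas_dot with unit
   increments, and the temporaries are freed before returning the result. */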
float THCudaTensor_dot(THCState *state, THCudaTensor *self, THCudaTensor *src)
{
  THAssert(THCudaTensor_checkGPU(state, 2, self, src));
  THArgCheck(THCudaTensor_nElement(state, self) == THCudaTensor_nElement(state, src), 2, "sizes do not match");

  {
    self = THCudaTensor_newContiguous(state, self);
    src = THCudaTensor_newContiguous(state, src);

    float result = THCudaBlas_dot(state,
                                  THCudaTensor_nElement(state, self),
                                  THCudaTensor_data(state, self), 1,
                                  THCudaTensor_data(state, src), 1);
    THCudaTensor_free(state, src);
    THCudaTensor_free(state, self);

    return result;
  }
}

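/* r_ = beta * t + alpha * (mat * vec).  The shape checks below require mat
   to be 2D of size [r x c], vec to be 1D of length c, and t to be 1D of
   length r; when r_ and t are distinct, t is first copied into r_. */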
void THCudaTensor_addmv(THCState *state, THCudaTensor *r_, float beta, THCudaTensor *t, float alpha, THCudaTensor *mat, THCudaTensor *vec)
{
  THAssert(THCudaTensor_checkGPU(state, 4, r_, t, mat, vec));
  if( (mat->nDimension != 2) || (vec->nDimension != 1) )
    THError("matrix and vector expected");

  if( mat->size[1] != vec->size[0] )
    THError("size mismatch");

  if(t->nDimension != 1)
    THError("size mismatch");

  if(t->size[0] != mat->size[0])
    THError("size mismatch");

  if(r_ != t)
  {
    THCudaTensor_resizeAs(state, r_, t);
    THCudaTensor_copy(state, r_, t);
  }

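  /* cuBLAS GEMV expects a column-major matrix, so dispatch on mat's layout:
     'n' when mat is already column-major (stride[0] == 1), 't' on the swapped
     sizes when it is row-major (stride[1] == 1), and otherwise a row-major
     contiguous copy handled with 't'. */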
  if(mat->stride[0] == 1)
  {
    THCudaBlas_gemv(state, 'n', mat->size[0], mat->size[1],
                    alpha, THCudaTensor_data(state, mat), mat->stride[1],
                    THCudaTensor_data(state, vec), vec->stride[0],
                    beta, THCudaTensor_data(state, r_), r_->stride[0]);
  }
  else if(mat->stride[1] == 1)
  {
    THCudaBlas_gemv(state, 't', mat->size[1], mat->size[0],
                    alpha, THCudaTensor_data(state, mat), mat->stride[0],
                    THCudaTensor_data(state, vec), vec->stride[0],
                    beta, THCudaTensor_data(state, r_), r_->stride[0]);
  }
  else
  {
    THCudaTensor *cmat = THCudaTensor_newContiguous(state, mat);

    THCudaBlas_gemv(state, 't', mat->size[1], mat->size[0],
                    alpha, THCudaTensor_data(state, cmat), cmat->stride[0],
                    THCudaTensor_data(state, vec), vec->stride[0],
                    beta, THCudaTensor_data(state, r_), r_->stride[0]);

    THCudaTensor_free(state, cmat);
  }
}

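/* r_ = beta * t + alpha * (m1 * m2) for 2D tensors, mapped onto a single
   GEMM call.  Because GEMM writes a column-major result, the code either
   uses r_ directly, computes the transposed product (m2^T * m1^T) when r_ is
   row-major, or stages the result in a transposed temporary that is copied
   back at the end. */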
void THCudaTensor_addmm(THCState *state, THCudaTensor *r_, float beta, THCudaTensor *t, float alpha, THCudaTensor *m1, THCudaTensor *m2)
{
  THAssert(THCudaTensor_checkGPU(state, 4, r_, t, m1, m2));
  char transpose_r, transpose_m1, transpose_m2;
  THCudaTensor *r__, *m1_, *m2_;

  if( (m1->nDimension != 2) || (m2->nDimension != 2) )
    THError("matrix and matrix expected");

  if(t->nDimension != 2)
    THError("size mismatch");

  if( (t->size[0] != m1->size[0]) || (t->size[1] != m2->size[1]) || (m1->size[1] != m2->size[0]) )
    THError("size mismatch");

  if(t != r_)
  {
    THCudaTensor_resizeAs(state, r_, t);
    THCudaTensor_copy(state, r_, t);
  }

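  /* The three blocks below pick transpose flags and, where necessary,
     contiguous temporaries so that each operand has a unit-stride leading
     dimension as seen by GEMM.  All stride/size indices are chosen relative
     to transpose_r, i.e. relative to how r_ itself is presented to GEMM. */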
  /* r_ */
  if(r_->stride[0] == 1 &&
     r_->stride[1] != 0)
  {
    transpose_r = 'n';
    r__ = r_;
  }
  else if(r_->stride[1] == 1 &&
          r_->stride[0] != 0)
  {
    THCudaTensor *swap = m2;
    m2 = m1;
    m1 = swap;
    transpose_r = 't';
    r__ = r_;
  }
  else
  {
    transpose_r = 'n';

    r__ = THCudaTensor_newWithSize2d(state, r_->size[1], r_->size[0]);
    THCudaTensor_copy(state, r__, r_);
    THCudaTensor_transpose(state, r__, NULL, 0, 1);
  }

  /* m1 */
  if(m1->stride[(transpose_r == 'n' ? 0 : 1)] == 1 &&
     m1->stride[(transpose_r == 'n' ? 1 : 0)] != 0)
  {
    transpose_m1 = 'n';
    m1_ = m1;
  }
  else if(m1->stride[(transpose_r == 'n' ? 1 : 0)] == 1 &&
          m1->stride[(transpose_r == 'n' ? 0 : 1)] != 0)
  {
    transpose_m1 = 't';
    m1_ = m1;
  }
  else
  {
    transpose_m1 = (transpose_r == 'n' ? 't' : 'n');
    m1_ = THCudaTensor_newContiguous(state, m1);
  }

  /* m2 */
  if(m2->stride[(transpose_r == 'n' ? 0 : 1)] == 1 &&
     m2->stride[(transpose_r == 'n' ? 1 : 0)] != 0)
  {
    transpose_m2 = 'n';
    m2_ = m2;
  }
  else if(m2->stride[(transpose_r == 'n' ? 1 : 0)] == 1 &&
          m2->stride[(transpose_r == 'n' ? 0 : 1)] != 0)
  {
    transpose_m2 = 't';
    m2_ = m2;
  }
  else
  {
    transpose_m2 = (transpose_r == 'n' ? 't' : 'n');
    m2_ = THCudaTensor_newContiguous(state, m2);
  }

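  /* Issue the single GEMM.  Sizes and leading dimensions are expressed in the
     frame chosen for r__ above: m is the rows of the result as GEMM sees it,
     n its columns, and k the shared inner dimension of m1_ and m2_; the
     leading dimensions come from the corresponding strides. */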
  /* do the operation */
  THCudaBlas_gemm(state,
                  transpose_m1,
                  transpose_m2,
                  r__->size[(transpose_r == 'n' ? 0 : 1)],
                  r__->size[(transpose_r == 'n' ? 1 : 0)],
                  m1_->size[(transpose_r == 'n' ? 1 : 0)],
                  alpha,
                  THCudaTensor_data(state, m1_),
                  (transpose_m1 == 'n' ? m1_->stride[(transpose_r == 'n' ? 1 : 0)] : m1_->stride[(transpose_r == 'n' ? 0 : 1)]),
                  THCudaTensor_data(state, m2_),
                  (transpose_m2 == 'n' ? m2_->stride[(transpose_r == 'n' ? 1 : 0)] : m2_->stride[(transpose_r == 'n' ? 0 : 1)]),
                  beta,
                  THCudaTensor_data(state, r__),
                  r__->stride[(transpose_r == 'n' ? 1 : 0)]);

  /* free intermediate variables */
  if(m1_ != m1)
    THCudaTensor_free(state, m1_);

  if(m2_ != m2)
    THCudaTensor_free(state, m2_);

  if(r__ != r_)
    THCudaTensor_freeCopyTo(state, r__, r_);
}

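/* r_ = beta * t + alpha * (vec1 outer vec2), i.e. a rank-1 update.  BLAS GER
   has no beta parameter, so r_ is pre-scaled by beta; the update is then
   issued directly when r_ is column-major, with vec1/vec2 swapped when r_ is
   row-major (updating r_^T instead), and on a contiguous clone otherwise,
   which is copied back afterwards. */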
void THCudaTensor_addr(THCState *state, THCudaTensor *r_, float beta, THCudaTensor *t, float alpha, THCudaTensor *vec1, THCudaTensor *vec2)
{
  THAssert(THCudaTensor_checkGPU(state, 4, r_, t, vec1, vec2));
  if( (vec1->nDimension != 1) || (vec2->nDimension != 1) )
    THError("vector and vector expected");

  if(t->nDimension != 2)
    THError("size mismatch");

  if( (t->size[0] != vec1->size[0]) || (t->size[1] != vec2->size[0]) )
    THError("size mismatch");

  if(r_ != t)
  {
    THCudaTensor_resizeAs(state, r_, t);
    THCudaTensor_copy(state, r_, t);
  }

  if(beta != 1)
    THCudaTensor_mul(state, r_, r_, beta);

  if(r_->stride[0] == 1)
  {
    THCudaBlas_ger(state, vec1->size[0], vec2->size[0],
                   alpha, THCudaTensor_data(state, vec1), vec1->stride[0],
                   THCudaTensor_data(state, vec2), vec2->stride[0],
                   THCudaTensor_data(state, r_), r_->stride[1]);
  }
  else if(r_->stride[1] == 1)
  {
    THCudaBlas_ger(state, vec2->size[0], vec1->size[0],
                   alpha, THCudaTensor_data(state, vec2), vec2->stride[0],
                   THCudaTensor_data(state, vec1), vec1->stride[0],
                   THCudaTensor_data(state, r_), r_->stride[0]);
  }
  else
  {
    THCudaTensor *cr = THCudaTensor_newClone(state, r_);

    THCudaBlas_ger(state, vec2->size[0], vec1->size[0],
                   alpha, THCudaTensor_data(state, vec2), vec2->stride[0],
                   THCudaTensor_data(state, vec1), vec1->stride[0],
                   THCudaTensor_data(state, cr), cr->stride[0]);

    THCudaTensor_freeCopyTo(state, cr, r_);
  }
}

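/* Batched matrix-matrix multiply: result[i] = beta * t[i] + alpha *
   (batch1[i] * batch2[i]) for every i along dimension 0.  As in addmm, each
   operand is presented to BLAS with a unit-stride leading dimension, then a
   single batched GEMM is launched over per-matrix device pointers. */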
void THCudaTensor_baddbmm(THCState *state, THCudaTensor *result, float beta, THCudaTensor *t,
                          float alpha, THCudaTensor *batch1, THCudaTensor *batch2) {
  THAssert(THCudaTensor_checkGPU(state, 4, result, t, batch1, batch2));
  THArgCheck(THCudaTensor_nDimension(state, t) == 3, 4, "expected 3D tensor");
  THArgCheck(THCudaTensor_nDimension(state, batch1) == 3, 6, "expected 3D tensor");
  THArgCheck(THCudaTensor_nDimension(state, batch2) == 3, 7, "expected 3D tensor");
  THArgCheck(THCudaTensor_size(state, t, 0) == THCudaTensor_size(state, batch1, 0), 6,
             "equal number of batches expected");
  THArgCheck(THCudaTensor_size(state, t, 0) == THCudaTensor_size(state, batch2, 0), 7,
             "equal number of batches expected");
  THArgCheck(THCudaTensor_size(state, t, 1) == THCudaTensor_size(state, batch1, 1), 6,
             "wrong matrix size");
  THArgCheck(THCudaTensor_size(state, t, 2) == THCudaTensor_size(state, batch2, 2), 7,
             "wrong matrix size");
  THArgCheck(THCudaTensor_size(state, batch1, 2) == THCudaTensor_size(state, batch2, 1), 6,
             "wrong matrix size");

  if (t != result) {
    THCudaTensor_resizeAs(state, result, t);
    THCudaTensor_copy(state, result, t);
  }

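  /* Choose how the result and both batch operands are handed to the batched
     GEMM: use them in place when dimension 1 or 2 already has unit stride
     (transposing and/or swapping batch1/batch2 as needed), otherwise fall
     back to contiguous or transposed temporaries.  This mirrors the 2D logic
     in THCudaTensor_addmm. */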
  bool transpose_result;
  char transpose_batch1, transpose_batch2;
  long lda, ldb, ldc;
  THCudaTensor *result_, *batch1_, *batch2_;
  if (result->stride[1] == 1)
  {
    transpose_result = false;
    result_ = result;
    ldc = result_->stride[2];
  }
  else if (result->stride[2] == 1)
  {
    transpose_result = true;

    THCudaTensor *swap = batch2;
    batch2 = batch1;
    batch1 = swap;

    result_ = result;
    ldc = result_->stride[1];
  }
  else
  {
    transpose_result = false;

    result_ = THCudaTensor_newWithSize3d(state, result->size[0], result->size[2], result->size[1]);
    THCudaTensor_copy(state, result_, result);
    THCudaTensor_transpose(state, result_, NULL, 1, 2);

    ldc = result_->stride[2];
  }

  if (batch1->stride[transpose_result ? 2 : 1] == 1)
  {
    transpose_batch1 = 'n';
    batch1_ = batch1;
    lda = batch1_->stride[transpose_result ? 1 : 2];
  }
  else if (batch1->stride[transpose_result ? 1 : 2] == 1)
  {
    transpose_batch1 = 't';
    batch1_ = batch1;
    lda = batch1_->stride[transpose_result ? 2 : 1];
  }
  else
  {
    transpose_batch1 = transpose_result ? 'n' : 't';
    batch1_ = THCudaTensor_newContiguous(state, batch1);
    lda = batch1_->stride[1];
  }

  if (batch2->stride[transpose_result ? 2 : 1] == 1)
  {
    transpose_batch2 = 'n';
    batch2_ = batch2;
    ldb = batch2_->stride[transpose_result ? 1 : 2];
  }
  else if (batch2->stride[transpose_result ? 1 : 2] == 1)
  {
    transpose_batch2 = 't';
    batch2_ = batch2;
    ldb = batch2_->stride[transpose_result ? 2 : 1];
  }
  else
  {
    transpose_batch2 = transpose_result ? 'n' : 't';
    batch2_ = THCudaTensor_newContiguous(state, batch2);
    ldb = batch2_->stride[1];
  }

  // Compute pointers to matrices in each batch.
  long num_batches = result_->size[0];
  size_t matrices_size = num_batches * sizeof(float*);
  const float **matrices1 = (const float **)THAlloc(matrices_size);
  const float **matrices2 = (const float **)THAlloc(matrices_size);
  float **result_matrices = (float **)THAlloc(matrices_size);
  for (int i = 0; i < num_batches; ++i)
  {
    matrices1[i] = THCudaTensor_data(state, batch1_) + i * batch1_->stride[0];
    matrices2[i] = THCudaTensor_data(state, batch2_) + i * batch2_->stride[0];
    result_matrices[i] = THCudaTensor_data(state, result_) + i * result_->stride[0];
  }

  // Copy pointers to device.
  const float **d_matrices1, **d_matrices2;
  float **d_result_matrices;
  THCudaCheck(THCudaMalloc(state, (void**)&d_matrices1, matrices_size));
  THCudaCheck(THCudaMalloc(state, (void**)&d_matrices2, matrices_size));
  THCudaCheck(THCudaMalloc(state, (void**)&d_result_matrices, matrices_size));

  THCudaCheck(cudaMemcpyAsync(d_matrices1, matrices1, matrices_size,
                              cudaMemcpyHostToDevice, THCState_getCurrentStream(state)));
  THCudaCheck(cudaMemcpyAsync(d_matrices2, matrices2, matrices_size,
                              cudaMemcpyHostToDevice, THCState_getCurrentStream(state)));
  THCudaCheck(cudaMemcpyAsync(d_result_matrices, result_matrices, matrices_size,
                              cudaMemcpyHostToDevice, THCState_getCurrentStream(state)));

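  // One batched GEMM over all matrices.  As in addmm, m, n and k are read
  // off result_ and batch1_ in the frame selected by transpose_result, and
  // lda, ldb, ldc come from the strides chosen above.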
  THCudaBlas_gemmBatched(
      state,
      transpose_batch1,
      transpose_batch2,
      result_->size[transpose_result ? 2 : 1],
      result_->size[transpose_result ? 1 : 2],
      batch1_->size[transpose_result ? 1 : 2],
      alpha,
      d_matrices1, lda,
      d_matrices2, ldb,
      beta,
      d_result_matrices, ldc,
      num_batches);

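  // Release the device pointer arrays, the host staging buffers, and any
  // contiguous temporaries; if the result was staged in a transposed
  // temporary, copy it back into the caller's tensor.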
  THCudaFree(state, d_matrices1);
  THCudaFree(state, d_matrices2);
  THCudaFree(state, d_result_matrices);
  THFree(matrices1);
  THFree(matrices2);
  THFree(result_matrices);

  if (batch1_ != batch1)
    THCudaTensor_free(state, batch1_);

  if (batch2_ != batch2)
    THCudaTensor_free(state, batch2_);

  if (result_ != result)
    THCudaTensor_freeCopyTo(state, result_, result);
}