| #ifndef TH_GENERIC_FILE |
| #define TH_GENERIC_FILE "generic/SparseLinear.c" |
| #else |
| |
| #ifdef _OPENMP |
| #include <omp.h> |
| #endif |
| |
/* Address of the data of `tensor` advanced by `row` steps of its dimension-0 stride. */
#define ROW_PTR2(tensor, row) \
  (THTensor_(data)(tensor) + (row) * (tensor)->stride[0])
/* Address of the data of `tensor` advanced by `col` steps of its dimension-1 stride. */
#define COL_PTR2(tensor, col) \
  (THTensor_(data)(tensor) + (col) * (tensor)->stride[1])
| |
| static bool THNN_(checkLegacyInput)(THTensor* t) |
| { |
| return t->nDimension == 3 && t->size[2] == 2; |
| } |
| |
| static bool THNN_(checkInput)(THTensor* t) |
| { |
| return t->nDimension == 2 && t->size[1] == 3; |
| } |
| |
| static bool THNN_(checkSize2D)(THTensor* t, long size0, long size1) |
| { |
| return t->nDimension == 2 && t->size[0] == size0 && t->size[1] == size1; |
| } |
| |
| static bool THNN_(checkSize1D)(THTensor* t, long size0) |
| { |
| return t->nDimension == 1 && t->size[0] == size0; |
| } |
| |
| static void THNN_(set1d)(THTensor *t, long x0, real value) { |
| THStorage_(set)(t->storage, t->storageOffset + x0*t->stride[0], value); |
| } |
| static real THNN_(get3d)(const THTensor *t, long x0, long x1, long x2) { |
| return THStorage_(get)(t->storage, t->storageOffset + |
| x0*t->stride[0] + x1*t->stride[1] + x2*t->stride[2]); |
| } |
| static real THNN_(get2d)(const THTensor *t, long x0, long x1) { |
| return THStorage_(get)(t->storage, t->storageOffset + |
| x0*t->stride[0] + x1*t->stride[1]); |
| } |
| |
| void THNN_(SparseLinear_updateOutput)( |
| THNNState *state, |
| THTensor *input, |
| THTensor *output, |
| THTensor *weight, |
| THTensor *bias) |
| { |
| long h, i, j, hp0, hp1; |
| long outDim = THTensor_(size)(weight, 0); |
| long inDim = THTensor_(size)(weight, 1); |
| long batchSize = THTensor_(size)(output, 0); |
| |
| THArgCheck(THNN_(checkInput)(input), 2, "input must be in coo format, nnz x 3"); |
| THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous"); |
| THArgCheck(THNN_(checkSize1D)(bias, outDim), 5, "bias size wrong"); |
| |
| long nnz = THTensor_(size)(input, 0); |
| |
| THLongTensor * csr = THLongTensor_newWithSize1d(batchSize+1); |
| THLongTensor_zero(csr); |
| |
| weight = THTensor_(newContiguous)(weight); |
| |
| //#pragma omp parallel for private(i, h, hp0, hp1) schedule(static) if (nnz > 10000) |
| for (i=0; i<nnz; i++) { |
| hp0 = (long)(THNN_(get2d)(input, i, 0)) - 1; |
| hp1 = (i+1 == nnz) ? |
| batchSize : |
| (long)(THNN_(get2d)(input, i+1, 0)) - 1; |
| if (hp0 != hp1) for (h = hp0; h < hp1; h++) { |
| THLongTensor_set1d(csr, h+1, i+1); |
| } |
| } |
| |
| |
| // output = weight * input + bias |
| THTensor_(zero)(output); |
| #pragma omp parallel for private(h, i) schedule(static) if (nnz > 10000) |
| for (h = 0; h < batchSize; h++) { |
| long i_start = THLongTensor_get1d(csr, h); |
| long i_end = THLongTensor_get1d(csr, h+1); |
| for (i = i_start; i < i_end; i++) { |
| real val = THNN_(get2d)(input, i, 2); |
| if (val == 0) { |
| continue; |
| } |
| |
| long offset = (long)(THNN_(get2d)(input, i, 1)) - 1; |
| if (offset >= 0 && offset < inDim) { |
| THBlas_(axpy)(outDim, |
| val, |
| COL_PTR2(weight, offset), weight->stride[0], |
| ROW_PTR2(output, h), output->stride[1]); |
| } else { |
| THError("index out of bound. updateOutput: %d not between 1 and %d", |
| offset + 1, inDim); |
| } |
| } |
| } |
| |
| THTensor* output_row = THTensor_(new)(); |
| for (h = 0; h < batchSize; h++) { |
| THTensor_(select)(output_row, output, 0, h); |
| THTensor_(cadd)(output_row, bias, 1.0, output_row); |
| } |
| THTensor_(free)(output_row); |
| THLongTensor_free(csr); |
| THTensor_(free)(weight); |
| } |
| |
| void THNN_(SparseLinear_legacyUpdateOutput)( |
| THNNState *state, |
| THTensor *input, |
| THTensor *output, |
| THTensor *weight, |
| THTensor *bias) |
| { |
| long h, i; |
| long outDim = THTensor_(size)(weight, 0); |
| long inDim = THTensor_(size)(weight, 1); |
| |
| THArgCheck(THNN_(checkLegacyInput)(input), 2, "input size must be batchsize x nnz x 2"); |
| THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous"); |
| THArgCheck(THNN_(checkSize1D)(bias, outDim), 5, "bias size wrong"); |
| |
| weight = THTensor_(newContiguous)(weight); |
| |
| long batchSize = THTensor_(size)(input, 0); |
| long nnz = THTensor_(size)(input, 1); |
| THTensor_(resize2d)(output, batchSize, outDim); |
| |
| // output = weight * input + bias |
| THTensor_(zero)(output); |
| #pragma omp parallel for private(h, i) schedule(static) if ( \ |
| batchSize > 1 && batchSize * nnz * outDim > 10000) |
| for (h = 0; h < batchSize; h++) { |
| for (i = 0; i < nnz; i++) { |
| real val = THNN_(get3d)(input, h, i, 1); |
| if (val == 0) { |
| continue; |
| } |
| |
| long offset = (long)(THNN_(get3d)(input, h, i, 0)) - 1; |
| if (offset >= 0 && offset < inDim) { |
| THBlas_(axpy)(outDim, |
| val, |
| COL_PTR2(weight, offset), weight->stride[0], |
| ROW_PTR2(output, h), output->stride[1]); |
| } else { |
| THError("index out of bound. updateOutput: %d not between 1 and %d", |
| offset + 1, inDim); |
| } |
| } |
| } |
| |
| THTensor* output_row = THTensor_(new)(); |
| for (h = 0; h < batchSize; h++) { |
| THTensor_(select)(output_row, output, 0, h); |
| THTensor_(cadd)(output_row, bias, 1.0, output_row); |
| } |
| THTensor_(free)(output_row); |
| THTensor_(free)(weight); |
| } |
| |
| void THNN_(SparseLinear_accGradParameters)( |
| THNNState *state, |
| THTensor *input, |
| THTensor *gradOutput, |
| THTensor *gradWeight, |
| THTensor *gradBias, |
| THTensor *weight, |
| THTensor *bias, |
| accreal weightDecay_, |
| accreal scale_) |
| { |
| real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_); |
| real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); |
| long h, i, col, hp0, hp1; |
| long outDim = THTensor_(size)(weight, 0); |
| long inDim = THTensor_(size)(weight, 1); |
| |
| THArgCheck(THNN_(checkInput)(input), 2, |
| "input must be in coo format, nnz x 3"); |
| THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4, |
| "gradWeight size wrong"); |
| THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, |
| "gradBias size wrong"); |
| THArgCheck(THTensor_(isContiguous)(gradOutput), 1, |
| "gradOutput must be contiguous"); |
| |
| long nnz = THTensor_(size)(input, 0); |
| |
| THLongTensor* csc = THLongTensor_newWithSize1d(inDim+1); |
| THLongTensor_zero(csc); |
| weight = THTensor_(newContiguous)(weight); |
| |
| #pragma omp parallel for private(i, h, hp0, hp1) schedule(static) if (nnz > 10000) |
| for (i = 0; i < nnz; i++) { |
| hp0 = (long)(THNN_(get2d)(input, i, 1)) - 1; |
| hp1 = (i+1 == nnz) ? |
| inDim : |
| (long)(THNN_(get2d)(input, i+1, 1)) - 1; |
| if (hp0 != hp1) for (h = hp0; h < hp1; h++) { |
| THLongTensor_set1d(csc, h+1, i+1); |
| } |
| } |
| |
| // gradWeight += gradOutput * input |
| #pragma omp parallel for private(h, i, col) schedule(static) if (nnz > 10000) |
| for (col = 0; col < inDim; col++) { |
| long i_start = THLongTensor_get1d(csc, col); |
| long i_end = THLongTensor_get1d(csc, col+1); |
| for (i = i_start; i < i_end; i++) { |
| real val = scale * THNN_(get2d)(input, i, 2); |
| |
| h = (long)(THNN_(get2d)(input, i, 0)) - 1; |
| long offset = (long)(THNN_(get2d)(input, i, 1)) - 1; |
| if (offset >= 0 && offset < inDim) { |
| THBlas_(axpy)(outDim, |
| val, |
| ROW_PTR2(gradOutput, h), gradOutput->stride[1], |
| COL_PTR2(gradWeight, offset), gradWeight->stride[0]); |
| } else { |
| THError( |
| "index out of bound. accGradParameters: %d not between 1 and %d", |
| offset + 1, |
| inDim); |
| } |
| } |
| } |
| |
| // gradBias += gradOutput |
| THTensor* buf = THTensor_(new)(); |
| THTensor_(sum)(buf, gradOutput, 0); |
| THTensor_(cadd)(gradBias, gradBias, scale, buf); |
| THTensor_(free)(buf); |
| THLongTensor_free(csc); |
| |
| if (weightDecay != 0) { |
| THTensor_(cadd)(gradWeight, gradWeight, weightDecay, weight); |
| } |
| THTensor_(free)(weight); |
| } |
| |
| void THNN_(SparseLinear_legacyAccGradParameters)( |
| THNNState *state, |
| THTensor *input, |
| THTensor *gradOutput, |
| THTensor *gradWeight, |
| THTensor *gradBias, |
| THTensor *weight, |
| THTensor *bias, |
| accreal weightDecay_, |
| accreal scale_) |
| { |
| real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_); |
| real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); |
| long h, i; |
| long outDim = THTensor_(size)(weight, 0); |
| long inDim = THTensor_(size)(weight, 1); |
| |
| THArgCheck(THNN_(checkLegacyInput)(input), 2, |
| "input size must be batchsize x nnz x 2"); |
| THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4, |
| "gradWeight size wrong"); |
| THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, |
| "gradBias size wrong"); |
| THArgCheck(THTensor_(isContiguous)(gradOutput), 1, |
| "gradOutput must be contiguous"); |
| |
| long batchSize = THTensor_(size)(input, 0); |
| long nnz = THTensor_(size)(input, 1); |
| THTensor_(resize2d)(gradOutput, batchSize, outDim); |
| |
| // gradWeight += gradOutput * input |
| #pragma omp parallel for private(h, i) schedule(static) if (\ |
| batchSize * nnz * outDim > 10000) |
| for (i = 0; i < nnz; i++) { |
| for (h = 0; h < batchSize; h++) { |
| real val = scale * THNN_(get3d)(input, h, i, 1); |
| if (val == 0) { |
| continue; |
| } |
| |
| long offset = (long)(THNN_(get3d)(input, h, i, 0)) - 1; |
| if (offset >= 0 && offset < inDim) { |
| THBlas_(axpy)(outDim, |
| val, |
| ROW_PTR2(gradOutput, h), gradOutput->stride[1], |
| COL_PTR2(gradWeight, offset), gradWeight->stride[0]); |
| } else { |
| THError( |
| "index out of bound. accGradParameters: %d not between 1 and %d", |
| offset + 1, |
| inDim); |
| } |
| } |
| } |
| |
| // gradBias += gradOutput |
| THTensor* gradOutput_row = THTensor_(new)(); |
| for (h = 0; h < batchSize; h++) { |
| THTensor_(select)(gradOutput_row, gradOutput, 0, h); |
| THTensor_(cadd)(gradBias, gradBias, scale, gradOutput_row); |
| } |
| THTensor_(free)(gradOutput_row); |
| |
| if (weightDecay != 0) { |
| THTensor_(cadd)(gradWeight, gradWeight, weightDecay, weight); |
| } |
| } |
| |
| void THNN_(SparseLinear_updateParameters)( |
| THNNState *state, |
| THTensor *weight, |
| THTensor *bias, |
| THTensor *gradWeight, |
| THTensor *gradBias, |
| THTensor *lastInput, |
| accreal learningRate_) |
| { |
| real learningRate = TH_CONVERT_ACCREAL_TO_REAL(learningRate_); |
| long h, i; |
| long outDim = weight->size[0]; |
| long inDim = weight->size[1]; |
| |
| THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4, |
| "gradWeight size wrong"); |
| THArgCheck(THNN_(checkSize1D)(bias, outDim), 3, "bias size wrong"); |
| THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong"); |
| THArgCheck(THNN_(checkInput)(lastInput), 6, |
| "input must be in coo format, nnz x 3"); |
| |
| |
| long nnz = THTensor_(size)(lastInput, 0); |
| |
| // collect unique offsets of non-0 val in input |
| THTensor* offsets = THTensor_(newWithSize1d)(nnz); |
| long cnt = 0; |
| for (i = 0; i < nnz; i++) { |
| real val = THNN_(get2d)(lastInput, i, 2); |
| if (val == 0) { |
| continue; |
| } |
| long offset = (long)(THNN_(get2d)(lastInput, i, 1)) - 1; |
| if (offset >= 0 && offset < inDim) { |
| THNN_(set1d)(offsets, cnt++, offset); |
| } else { |
| THError( |
| "index out of bound. updateParameters: %d not between 1 and %d", |
| offset + 1, |
| inDim); |
| } |
| } |
| if (cnt == 0) return; |
| THTensor_(resize1d)(offsets, cnt); |
| |
| THTensor* uniqueOffsets = THTensor_(new)(); |
| THLongTensor* ri = THLongTensor_new(); |
| THTensor_(sort)(uniqueOffsets, ri, offsets, 0, 0); |
| THLongTensor_free(ri); |
| THTensor_(free)(offsets); |
| |
| cnt = 1; |
| real* uniqueOffsets_p = THTensor_(data)(uniqueOffsets); |
| for (i = 1; i < THTensor_(size)(uniqueOffsets, 0); i++) { |
| if (uniqueOffsets_p[i] != uniqueOffsets_p[i - 1]) { |
| uniqueOffsets_p[cnt++] = uniqueOffsets_p[i]; |
| } |
| } |
| THTensor_(resize1d)(uniqueOffsets, cnt); |
| |
| // weight += -learningRate * gradWeight |
| THTensor_(cadd)(bias, bias, -learningRate, gradBias); |
| #pragma omp parallel for private(i) schedule(static) if (cnt * outDim > 10000) |
| for (i = 0; i < cnt; i++) { |
| long offset = (long)uniqueOffsets_p[i]; |
| THBlas_(axpy)(outDim, |
| -learningRate, |
| COL_PTR2(gradWeight, offset), gradWeight->stride[0], |
| COL_PTR2(weight, offset), weight->stride[0]); |
| } |
| |
| THTensor_(free)(uniqueOffsets); |
| } |
| |
| void THNN_(SparseLinear_legacyUpdateParameters)( |
| THNNState *state, |
| THTensor *weight, |
| THTensor *bias, |
| THTensor *gradWeight, |
| THTensor *gradBias, |
| THTensor *lastInput, |
| accreal learningRate_) |
| { |
| real learningRate = TH_CONVERT_ACCREAL_TO_REAL(learningRate_); |
| long h, i; |
| long outDim = weight->size[0]; |
| long inDim = weight->size[1]; |
| |
| THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4, |
| "gradWeight size wrong"); |
| THArgCheck(THNN_(checkSize1D)(bias, outDim), 3, "bias size wrong"); |
| THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong"); |
| THArgCheck(THNN_(checkLegacyInput)(lastInput), 6, |
| "input size must be batchsize x nnz x 2"); |
| |
| |
| long batchSize = THTensor_(size)(lastInput, 0); |
| long nnz = THTensor_(size)(lastInput, 1); |
| |
| // collect unique offsets of non-0 val in input |
| THTensor* offsets = THTensor_(newWithSize1d)(batchSize * nnz); |
| long cnt = 0; |
| for (h = 0; h < batchSize; h++) { |
| for (i = 0; i < nnz; i++) { |
| real val = THNN_(get3d)(lastInput, h, i, 1); |
| if (val == 0 ) { |
| continue; |
| } |
| long offset = (long)(THNN_(get3d)(lastInput, h, i, 0)) - 1; |
| if (offset >= 0 && offset < inDim) { |
| THNN_(set1d)(offsets, cnt++, offset); |
| } else { |
| THError( |
| "index out of bound. updateParameters: %d not between 1 and %d", |
| offset + 1, |
| inDim); |
| } |
| } |
| } |
| THTensor_(resize1d)(offsets, cnt); |
| |
| THTensor* uniqueOffsets = THTensor_(new)(); |
| THLongTensor* ri = THLongTensor_new(); |
| THTensor_(sort)(uniqueOffsets, ri, offsets, 0, 0); |
| THLongTensor_free(ri); |
| THTensor_(free)(offsets); |
| |
| cnt = 1; |
| real* uniqueOffsets_p = THTensor_(data)(uniqueOffsets); |
| for (i = 1; i < THTensor_(size)(uniqueOffsets, 0); i++) { |
| if (uniqueOffsets_p[i] != uniqueOffsets_p[i - 1]) { |
| uniqueOffsets_p[cnt++] = uniqueOffsets_p[i]; |
| } |
| } |
| THTensor_(resize1d)(uniqueOffsets, cnt); |
| |
| // weight += -learningRate * gradWeight |
| THTensor_(cadd)(bias, bias, -learningRate, gradBias); |
| #pragma omp parallel for private(i) schedule(static) if (cnt * outDim > 10000) |
| for (i = 0; i < cnt; i++) { |
| long offset = (long)uniqueOffsets_p[i]; |
| THBlas_(axpy)(outDim, |
| -learningRate, |
| COL_PTR2(gradWeight, offset), gradWeight->stride[0], |
| COL_PTR2(weight, offset), weight->stride[0]); |
| } |
| |
| THTensor_(free)(uniqueOffsets); |
| } |
| |
| void THNN_(SparseLinear_zeroGradParameters)( |
| THNNState *state, |
| THTensor *gradWeight, |
| THTensor *gradBias, |
| THTensor *lastInput) |
| { |
| long h, i, j; |
| |
| long outDim = gradWeight->size[0]; |
| long inDim = gradWeight->size[1]; |
| |
| THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 3, "gradBias size wrong"); |
| THArgCheck(THNN_(checkInput)(lastInput), 4, |
| "input must be in coo format, nnz x 3"); |
| |
| THTensor_(zero)(gradBias); |
| |
| long nnz = THTensor_(size)(lastInput, 0); |
| |
| #pragma omp parallel for private(i, j) schedule(static) if ( \ |
| nnz * outDim > 10000) |
| for (i = 0; i < nnz; i++) { |
| if (THNN_(get2d)(lastInput, i, 2) == 0 ) { |
| continue; |
| } |
| |
| long offset = (long)(THNN_(get2d)(lastInput, i, 1)) - 1; |
| if (offset >= 0 && offset < inDim) { |
| real* pGradWeight = COL_PTR2(gradWeight, offset); |
| if (gradWeight->stride[0] == 1) { |
| THVector_(fill)(pGradWeight, 0, outDim); |
| } else { |
| long stride = gradWeight->stride[0]; |
| for (j = 0; j < outDim; ++j) { |
| pGradWeight[j * stride] = 0; |
| } |
| } |
| } else { |
| THError( |
| "index out of bound. zeroGradParameters: %d not between 1 and %d", |
| offset + 1, |
| inDim); |
| } |
| } |
| } |
| |
| void THNN_(SparseLinear_legacyZeroGradParameters)( |
| THNNState *state, |
| THTensor *gradWeight, |
| THTensor *gradBias, |
| THTensor *lastInput) |
| { |
| long h, i, j; |
| |
| long outDim = gradWeight->size[0]; |
| long inDim = gradWeight->size[1]; |
| |
| THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 3, "gradBias size wrong"); |
| THArgCheck(THNN_(checkLegacyInput)(lastInput), 4, |
| "input size must be batchsize x nnz x 2"); |
| |
| THTensor_(zero)(gradBias); |
| |
| long batchSize = THTensor_(size)(lastInput, 0); |
| long nnz = THTensor_(size)(lastInput, 1); |
| |
| #pragma omp parallel for private(h, i, j) schedule(static) if ( \ |
| batchSize > 1 && batchSize * nnz * outDim > 10000) |
| for (h = 0; h < batchSize; h++) { |
| for (i = 0; i < nnz; i++) { |
| if (THNN_(get3d)(lastInput, h, i, 1) == 0 ) { |
| continue; |
| } |
| |
| long offset = (long)(THNN_(get3d)(lastInput, h, i, 0)) - 1; |
| if (offset >= 0 && offset < inDim) { |
| real* pGradWeight = COL_PTR2(gradWeight, offset); |
| if (gradWeight->stride[0] == 1) { |
| THVector_(fill)(pGradWeight, 0, outDim); |
| } else { |
| long stride = gradWeight->stride[0]; |
| for (j = 0; j < outDim; ++j) { |
| pGradWeight[j * stride] = 0; |
| } |
| } |
| } else { |
| THError( |
| "index out of bound. zeroGradParameters: %d not between 1 and %d", |
| offset + 1, |
| inDim); |
| } |
| } |
| } |
| } |
| |
| #undef ROW_PTR2 |
| #undef COL_PTR2 |
| |
| #endif |