blob: d9cec8cf3a791f723cfc1f4efa8040f08ee4c831 [file] [log] [blame]
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/SparseLinear.c"
#else
#ifdef _OPENMP
#include <omp.h>
#endif
#define ROW_PTR2(t, r) (THTensor_(data)(t) + (r) * (t)->stride[0])
#define COL_PTR2(t, c) (THTensor_(data)(t) + (c) * (t)->stride[1])
/* True iff t has the legacy sparse layout: batchSize x nnz x 2. */
static bool THNN_(checkLegacyInput)(THTensor* t)
{
  if (t->nDimension != 3) {
    return false;
  }
  return t->size[2] == 2;
}
/* True iff t is an nnz x 3 matrix (COO sparse format). */
static bool THNN_(checkInput)(THTensor* t)
{
  if (t->nDimension != 2) {
    return false;
  }
  return t->size[1] == 3;
}
/* True iff t is a 2-D tensor of exactly size0 x size1. */
static bool THNN_(checkSize2D)(THTensor* t, long size0, long size1)
{
  if (t->nDimension != 2) {
    return false;
  }
  return t->size[0] == size0 && t->size[1] == size1;
}
/* True iff t is a 1-D tensor of exactly size0 elements. */
static bool THNN_(checkSize1D)(THTensor* t, long size0)
{
  if (t->nDimension != 1) {
    return false;
  }
  return t->size[0] == size0;
}
/* Write value into 1-D tensor t at logical index x0 (stride-aware). */
static void THNN_(set1d)(THTensor *t, long x0, real value) {
  long idx = t->storageOffset + x0 * t->stride[0];
  THStorage_(set)(t->storage, idx, value);
}
/* Read element (x0, x1, x2) of a 3-D tensor t (stride-aware). */
static real THNN_(get3d)(const THTensor *t, long x0, long x1, long x2) {
  long idx = t->storageOffset
      + x0 * t->stride[0]
      + x1 * t->stride[1]
      + x2 * t->stride[2];
  return THStorage_(get)(t->storage, idx);
}
/* Read element (x0, x1) of a 2-D tensor t (stride-aware). */
static real THNN_(get2d)(const THTensor *t, long x0, long x1) {
  long idx = t->storageOffset
      + x0 * t->stride[0]
      + x1 * t->stride[1];
  return THStorage_(get)(t->storage, idx);
}
/*
 * Forward pass: output[h] = weight * x[h] + bias, where the sparse batch x
 * is given in COO format as an nnz x 3 tensor of
 * (1-based sample index, 1-based feature index, value) rows.
 * Assumes rows are sorted by sample index (the CSR build below relies on it
 * -- TODO confirm against callers). batchSize is taken from output's first
 * dimension; output must be preallocated and contiguous.
 * Raises via THError on a feature index outside [1, inDim].
 */
void THNN_(SparseLinear_updateOutput)(
          THNNState *state,
          THTensor *input,
          THTensor *output,
          THTensor *weight,
          THTensor *bias)
{
  long h, i, hp0, hp1;
  long outDim = THTensor_(size)(weight, 0);
  long inDim = THTensor_(size)(weight, 1);
  long batchSize = THTensor_(size)(output, 0);

  THArgCheck(THNN_(checkInput)(input), 2, "input must be in coo format, nnz x 3");
  THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous");
  THArgCheck(THNN_(checkSize1D)(bias, outDim), 5, "bias size wrong");

  long nnz = THTensor_(size)(input, 0);

  // csr[h] .. csr[h+1] delimits the nnz entries that belong to sample h.
  THLongTensor* csr = THLongTensor_newWithSize1d(batchSize+1);
  THLongTensor_zero(csr);
  weight = THTensor_(newContiguous)(weight);

  for (i = 0; i < nnz; i++) {
    hp0 = (long)(THNN_(get2d)(input, i, 0)) - 1;
    hp1 = (i+1 == nnz) ?
            batchSize :
            (long)(THNN_(get2d)(input, i+1, 0)) - 1;
    if (hp0 != hp1) for (h = hp0; h < hp1; h++) {
      THLongTensor_set1d(csr, h+1, i+1);
    }
  }

  // output = weight * input + bias
  THTensor_(zero)(output);
  #pragma omp parallel for private(h, i) schedule(static) if (nnz > 10000)
  for (h = 0; h < batchSize; h++) {
    long i_start = THLongTensor_get1d(csr, h);
    long i_end = THLongTensor_get1d(csr, h+1);
    for (i = i_start; i < i_end; i++) {
      real val = THNN_(get2d)(input, i, 2);
      if (val == 0) {
        continue;
      }

      long offset = (long)(THNN_(get2d)(input, i, 1)) - 1;
      if (offset >= 0 && offset < inDim) {
        // output[h] += val * weight[:, offset]
        THBlas_(axpy)(outDim,
            val,
            COL_PTR2(weight, offset), weight->stride[0],
            ROW_PTR2(output, h), output->stride[1]);
      } else {
        THError("index out of bound. updateOutput: %ld not between 1 and %ld",
            offset + 1, inDim);
      }
    }
  }

  // Add bias to every output row.
  THTensor* output_row = THTensor_(new)();
  for (h = 0; h < batchSize; h++) {
    THTensor_(select)(output_row, output, 0, h);
    THTensor_(cadd)(output_row, bias, 1.0, output_row);
  }
  THTensor_(free)(output_row);
  THLongTensor_free(csr);
  THTensor_(free)(weight);
}
/*
 * Legacy forward pass: input is batchSize x nnz x 2, where input[h][i] is
 * (1-based feature index, value). Resizes output to batchSize x outDim and
 * computes output[h] = weight * x[h] + bias.
 * Raises via THError on a feature index outside [1, inDim].
 */
void THNN_(SparseLinear_legacyUpdateOutput)(
          THNNState *state,
          THTensor *input,
          THTensor *output,
          THTensor *weight,
          THTensor *bias)
{
  long h, i;
  long outDim = THTensor_(size)(weight, 0);
  long inDim = THTensor_(size)(weight, 1);

  THArgCheck(THNN_(checkLegacyInput)(input), 2, "input size must be batchsize x nnz x 2");
  THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous");
  THArgCheck(THNN_(checkSize1D)(bias, outDim), 5, "bias size wrong");

  weight = THTensor_(newContiguous)(weight);

  long batchSize = THTensor_(size)(input, 0);
  long nnz = THTensor_(size)(input, 1);
  THTensor_(resize2d)(output, batchSize, outDim);

  // output = weight * input + bias; rows are independent, so parallelize
  // over the batch dimension.
  THTensor_(zero)(output);
  #pragma omp parallel for private(h, i) schedule(static) if ( \
      batchSize > 1 && batchSize * nnz * outDim > 10000)
  for (h = 0; h < batchSize; h++) {
    for (i = 0; i < nnz; i++) {
      real val = THNN_(get3d)(input, h, i, 1);
      if (val == 0) {
        continue;
      }

      long offset = (long)(THNN_(get3d)(input, h, i, 0)) - 1;
      if (offset >= 0 && offset < inDim) {
        // output[h] += val * weight[:, offset]
        THBlas_(axpy)(outDim,
            val,
            COL_PTR2(weight, offset), weight->stride[0],
            ROW_PTR2(output, h), output->stride[1]);
      } else {
        THError("index out of bound. updateOutput: %ld not between 1 and %ld",
            offset + 1, inDim);
      }
    }
  }

  // Add bias to every output row.
  THTensor* output_row = THTensor_(new)();
  for (h = 0; h < batchSize; h++) {
    THTensor_(select)(output_row, output, 0, h);
    THTensor_(cadd)(output_row, bias, 1.0, output_row);
  }
  THTensor_(free)(output_row);
  THTensor_(free)(weight);
}
/*
 * Backward (parameter gradients) for COO input (nnz x 3):
 *   gradWeight += scale * gradOutput^T * x   (touched columns only)
 *   gradBias   += scale * sum(gradOutput, 0)
 *   gradWeight += weightDecay * weight       (when weightDecay != 0)
 * Assumes input rows are sorted by feature index (the CSC build below
 * relies on it -- TODO confirm against callers).
 * Raises via THError on a feature index outside [1, inDim].
 */
void THNN_(SparseLinear_accGradParameters)(
          THNNState *state,
          THTensor *input,
          THTensor *gradOutput,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *weight,
          THTensor *bias,
          accreal weightDecay_,
          accreal scale_)
{
  real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_);
  real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
  long h, i, col, hp0, hp1;
  long outDim = THTensor_(size)(weight, 0);
  long inDim = THTensor_(size)(weight, 1);

  THArgCheck(THNN_(checkInput)(input), 2,
             "input must be in coo format, nnz x 3");
  THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4,
             "gradWeight size wrong");
  THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5,
             "gradBias size wrong");
  THArgCheck(THTensor_(isContiguous)(gradOutput), 1,
             "gradOutput must be contiguous");

  long nnz = THTensor_(size)(input, 0);

  // csc[c] .. csc[c+1] delimits the nnz entries whose feature index is c.
  THLongTensor* csc = THLongTensor_newWithSize1d(inDim+1);
  THLongTensor_zero(csc);
  weight = THTensor_(newContiguous)(weight);

  #pragma omp parallel for private(i, h, hp0, hp1) schedule(static) if (nnz > 10000)
  for (i = 0; i < nnz; i++) {
    hp0 = (long)(THNN_(get2d)(input, i, 1)) - 1;
    hp1 = (i+1 == nnz) ?
            inDim :
            (long)(THNN_(get2d)(input, i+1, 1)) - 1;
    if (hp0 != hp1) for (h = hp0; h < hp1; h++) {
      THLongTensor_set1d(csc, h+1, i+1);
    }
  }

  // gradWeight += scale * gradOutput * input. Parallel over columns: each
  // iteration only writes its own gradWeight column.
  #pragma omp parallel for private(h, i, col) schedule(static) if (nnz > 10000)
  for (col = 0; col < inDim; col++) {
    long i_start = THLongTensor_get1d(csc, col);
    long i_end = THLongTensor_get1d(csc, col+1);
    for (i = i_start; i < i_end; i++) {
      real val = scale * THNN_(get2d)(input, i, 2);

      h = (long)(THNN_(get2d)(input, i, 0)) - 1;
      long offset = (long)(THNN_(get2d)(input, i, 1)) - 1;
      if (offset >= 0 && offset < inDim) {
        // gradWeight[:, offset] += val * gradOutput[h]
        THBlas_(axpy)(outDim,
            val,
            ROW_PTR2(gradOutput, h), gradOutput->stride[1],
            COL_PTR2(gradWeight, offset), gradWeight->stride[0]);
      } else {
        THError(
            "index out of bound. accGradParameters: %ld not between 1 and %ld",
            offset + 1,
            inDim);
      }
    }
  }

  // gradBias += scale * sum(gradOutput, 0)
  THTensor* buf = THTensor_(new)();
  THTensor_(sum)(buf, gradOutput, 0);
  THTensor_(cadd)(gradBias, gradBias, scale, buf);
  THTensor_(free)(buf);
  THLongTensor_free(csc);

  if (weightDecay != 0) {
    THTensor_(cadd)(gradWeight, gradWeight, weightDecay, weight);
  }
  THTensor_(free)(weight);
}
/*
 * Legacy backward (parameter gradients) for batchSize x nnz x 2 input:
 *   gradWeight += scale * gradOutput^T * x   (touched columns only)
 *   gradBias   += scale * sum(gradOutput, 0)
 *   gradWeight += weightDecay * weight       (when weightDecay != 0)
 * Resizes gradOutput to batchSize x outDim.
 * Raises via THError on a feature index outside [1, inDim].
 */
void THNN_(SparseLinear_legacyAccGradParameters)(
          THNNState *state,
          THTensor *input,
          THTensor *gradOutput,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *weight,
          THTensor *bias,
          accreal weightDecay_,
          accreal scale_)
{
  real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_);
  real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
  long h, i;
  long outDim = THTensor_(size)(weight, 0);
  long inDim = THTensor_(size)(weight, 1);

  THArgCheck(THNN_(checkLegacyInput)(input), 2,
             "input size must be batchsize x nnz x 2");
  THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4,
             "gradWeight size wrong");
  THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5,
             "gradBias size wrong");
  THArgCheck(THTensor_(isContiguous)(gradOutput), 1,
             "gradOutput must be contiguous");

  long batchSize = THTensor_(size)(input, 0);
  long nnz = THTensor_(size)(input, 1);
  THTensor_(resize2d)(gradOutput, batchSize, outDim);

  // gradWeight += scale * gradOutput * input
  // NOTE(review): the parallel loop iterates over nnz slots while the
  // gradWeight column written depends on the data; two iterations can
  // target the same column concurrently unless slot i always maps to a
  // distinct feature index across the batch -- verify before relying on
  // parallel execution here.
  #pragma omp parallel for private(h, i) schedule(static) if (\
    batchSize * nnz * outDim > 10000)
  for (i = 0; i < nnz; i++) {
    for (h = 0; h < batchSize; h++) {
      real val = scale * THNN_(get3d)(input, h, i, 1);
      if (val == 0) {
        continue;
      }

      long offset = (long)(THNN_(get3d)(input, h, i, 0)) - 1;
      if (offset >= 0 && offset < inDim) {
        // gradWeight[:, offset] += val * gradOutput[h]
        THBlas_(axpy)(outDim,
            val,
            ROW_PTR2(gradOutput, h), gradOutput->stride[1],
            COL_PTR2(gradWeight, offset), gradWeight->stride[0]);
      } else {
        THError(
            "index out of bound. accGradParameters: %ld not between 1 and %ld",
            offset + 1,
            inDim);
      }
    }
  }

  // gradBias += scale * gradOutput, row by row
  THTensor* gradOutput_row = THTensor_(new)();
  for (h = 0; h < batchSize; h++) {
    THTensor_(select)(gradOutput_row, gradOutput, 0, h);
    THTensor_(cadd)(gradBias, gradBias, scale, gradOutput_row);
  }
  THTensor_(free)(gradOutput_row);

  if (weightDecay != 0) {
    THTensor_(cadd)(gradWeight, gradWeight, weightDecay, weight);
  }
}
/*
 * Sparse SGD step for COO-format lastInput (nnz x 3):
 *   bias -= learningRate * gradBias
 *   weight[:, c] -= learningRate * gradWeight[:, c]
 * for every distinct feature column c referenced by a non-zero value.
 * Raises via THError on a feature index outside [1, inDim].
 */
void THNN_(SparseLinear_updateParameters)(
          THNNState *state,
          THTensor *weight,
          THTensor *bias,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *lastInput,
          accreal learningRate_)
{
  real learningRate = TH_CONVERT_ACCREAL_TO_REAL(learningRate_);
  long i;
  long outDim = weight->size[0];
  long inDim = weight->size[1];

  THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4,
             "gradWeight size wrong");
  THArgCheck(THNN_(checkSize1D)(bias, outDim), 3, "bias size wrong");
  THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong");
  THArgCheck(THNN_(checkInput)(lastInput), 6,
             "input must be in coo format, nnz x 3");

  long nnz = THTensor_(size)(lastInput, 0);

  // Collect the (0-based) column offsets of all non-zero input values.
  THTensor* offsets = THTensor_(newWithSize1d)(nnz);
  long cnt = 0;
  for (i = 0; i < nnz; i++) {
    real val = THNN_(get2d)(lastInput, i, 2);
    if (val == 0) {
      continue;
    }
    long offset = (long)(THNN_(get2d)(lastInput, i, 1)) - 1;
    if (offset >= 0 && offset < inDim) {
      THNN_(set1d)(offsets, cnt++, offset);
    } else {
      THError(
          "index out of bound. updateParameters: %ld not between 1 and %ld",
          offset + 1,
          inDim);
    }
  }
  if (cnt == 0) {
    // No column touched; release the scratch tensor before the early
    // return so it is not leaked.
    THTensor_(free)(offsets);
    return;
  }
  THTensor_(resize1d)(offsets, cnt);

  // Sort the offsets, then deduplicate them in place.
  THTensor* uniqueOffsets = THTensor_(new)();
  THLongTensor* ri = THLongTensor_new();
  THTensor_(sort)(uniqueOffsets, ri, offsets, 0, 0);
  THLongTensor_free(ri);
  THTensor_(free)(offsets);

  cnt = 1;
  real* uniqueOffsets_p = THTensor_(data)(uniqueOffsets);
  for (i = 1; i < THTensor_(size)(uniqueOffsets, 0); i++) {
    if (uniqueOffsets_p[i] != uniqueOffsets_p[i - 1]) {
      uniqueOffsets_p[cnt++] = uniqueOffsets_p[i];
    }
  }
  THTensor_(resize1d)(uniqueOffsets, cnt);

  // weight += -learningRate * gradWeight (touched columns only)
  THTensor_(cadd)(bias, bias, -learningRate, gradBias);
  #pragma omp parallel for private(i) schedule(static) if (cnt * outDim > 10000)
  for (i = 0; i < cnt; i++) {
    long offset = (long)uniqueOffsets_p[i];
    THBlas_(axpy)(outDim,
        -learningRate,
        COL_PTR2(gradWeight, offset), gradWeight->stride[0],
        COL_PTR2(weight, offset), weight->stride[0]);
  }
  THTensor_(free)(uniqueOffsets);
}
/*
 * Legacy sparse SGD step for batchSize x nnz x 2 lastInput:
 *   bias -= learningRate * gradBias
 *   weight[:, c] -= learningRate * gradWeight[:, c]
 * for every distinct feature column c referenced by a non-zero value.
 * Raises via THError on a feature index outside [1, inDim].
 */
void THNN_(SparseLinear_legacyUpdateParameters)(
          THNNState *state,
          THTensor *weight,
          THTensor *bias,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *lastInput,
          accreal learningRate_)
{
  real learningRate = TH_CONVERT_ACCREAL_TO_REAL(learningRate_);
  long h, i;
  long outDim = weight->size[0];
  long inDim = weight->size[1];

  THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4,
             "gradWeight size wrong");
  THArgCheck(THNN_(checkSize1D)(bias, outDim), 3, "bias size wrong");
  THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong");
  THArgCheck(THNN_(checkLegacyInput)(lastInput), 6,
             "input size must be batchsize x nnz x 2");

  long batchSize = THTensor_(size)(lastInput, 0);
  long nnz = THTensor_(size)(lastInput, 1);

  // Collect the (0-based) column offsets of all non-zero input values.
  THTensor* offsets = THTensor_(newWithSize1d)(batchSize * nnz);
  long cnt = 0;
  for (h = 0; h < batchSize; h++) {
    for (i = 0; i < nnz; i++) {
      real val = THNN_(get3d)(lastInput, h, i, 1);
      if (val == 0) {
        continue;
      }
      long offset = (long)(THNN_(get3d)(lastInput, h, i, 0)) - 1;
      if (offset >= 0 && offset < inDim) {
        THNN_(set1d)(offsets, cnt++, offset);
      } else {
        THError(
            "index out of bound. updateParameters: %ld not between 1 and %ld",
            offset + 1,
            inDim);
      }
    }
  }
  if (cnt == 0) {
    // Every value was zero: without this guard cnt was forced to 1 below
    // and an uninitialized offset was fed to axpy. Free the scratch tensor
    // and do nothing.
    THTensor_(free)(offsets);
    return;
  }
  THTensor_(resize1d)(offsets, cnt);

  // Sort the offsets, then deduplicate them in place.
  THTensor* uniqueOffsets = THTensor_(new)();
  THLongTensor* ri = THLongTensor_new();
  THTensor_(sort)(uniqueOffsets, ri, offsets, 0, 0);
  THLongTensor_free(ri);
  THTensor_(free)(offsets);

  cnt = 1;
  real* uniqueOffsets_p = THTensor_(data)(uniqueOffsets);
  for (i = 1; i < THTensor_(size)(uniqueOffsets, 0); i++) {
    if (uniqueOffsets_p[i] != uniqueOffsets_p[i - 1]) {
      uniqueOffsets_p[cnt++] = uniqueOffsets_p[i];
    }
  }
  THTensor_(resize1d)(uniqueOffsets, cnt);

  // weight += -learningRate * gradWeight (touched columns only)
  THTensor_(cadd)(bias, bias, -learningRate, gradBias);
  #pragma omp parallel for private(i) schedule(static) if (cnt * outDim > 10000)
  for (i = 0; i < cnt; i++) {
    long offset = (long)uniqueOffsets_p[i];
    THBlas_(axpy)(outDim,
        -learningRate,
        COL_PTR2(gradWeight, offset), gradWeight->stride[0],
        COL_PTR2(weight, offset), weight->stride[0]);
  }
  THTensor_(free)(uniqueOffsets);
}
/*
 * Zero gradBias and only the gradWeight columns referenced by a non-zero
 * value in COO-format lastInput (nnz x 3); untouched columns keep their
 * previous contents. Raises via THError on a feature index outside
 * [1, inDim].
 */
void THNN_(SparseLinear_zeroGradParameters)(
          THNNState *state,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *lastInput)
{
  long i, j;
  long outDim = gradWeight->size[0];
  long inDim = gradWeight->size[1];

  THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 3, "gradBias size wrong");
  THArgCheck(THNN_(checkInput)(lastInput), 4,
             "input must be in coo format, nnz x 3");

  THTensor_(zero)(gradBias);

  long nnz = THTensor_(size)(lastInput, 0);
  #pragma omp parallel for private(i, j) schedule(static) if ( \
      nnz * outDim > 10000)
  for (i = 0; i < nnz; i++) {
    if (THNN_(get2d)(lastInput, i, 2) == 0 ) {
      continue;
    }

    long offset = (long)(THNN_(get2d)(lastInput, i, 1)) - 1;
    if (offset >= 0 && offset < inDim) {
      real* pGradWeight = COL_PTR2(gradWeight, offset);
      if (gradWeight->stride[0] == 1) {
        // Column is contiguous: use the vectorized fill.
        THVector_(fill)(pGradWeight, 0, outDim);
      } else {
        long stride = gradWeight->stride[0];
        for (j = 0; j < outDim; ++j) {
          pGradWeight[j * stride] = 0;
        }
      }
    } else {
      THError(
          "index out of bound. zeroGradParameters: %ld not between 1 and %ld",
          offset + 1,
          inDim);
    }
  }
}
/*
 * Legacy variant: zero gradBias and only the gradWeight columns referenced
 * by a non-zero value in batchSize x nnz x 2 lastInput; untouched columns
 * keep their previous contents. Raises via THError on a feature index
 * outside [1, inDim].
 */
void THNN_(SparseLinear_legacyZeroGradParameters)(
          THNNState *state,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *lastInput)
{
  long h, i, j;
  long outDim = gradWeight->size[0];
  long inDim = gradWeight->size[1];

  THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 3, "gradBias size wrong");
  THArgCheck(THNN_(checkLegacyInput)(lastInput), 4,
             "input size must be batchsize x nnz x 2");

  THTensor_(zero)(gradBias);

  long batchSize = THTensor_(size)(lastInput, 0);
  long nnz = THTensor_(size)(lastInput, 1);
  #pragma omp parallel for private(h, i, j) schedule(static) if ( \
      batchSize > 1 && batchSize * nnz * outDim > 10000)
  for (h = 0; h < batchSize; h++) {
    for (i = 0; i < nnz; i++) {
      if (THNN_(get3d)(lastInput, h, i, 1) == 0 ) {
        continue;
      }

      long offset = (long)(THNN_(get3d)(lastInput, h, i, 0)) - 1;
      if (offset >= 0 && offset < inDim) {
        real* pGradWeight = COL_PTR2(gradWeight, offset);
        if (gradWeight->stride[0] == 1) {
          // Column is contiguous: use the vectorized fill.
          THVector_(fill)(pGradWeight, 0, outDim);
        } else {
          long stride = gradWeight->stride[0];
          for (j = 0; j < outDim; ++j) {
            pGradWeight[j * stride] = 0;
          }
        }
      } else {
        THError(
            "index out of bound. zeroGradParameters: %ld not between 1 and %ld",
            offset + 1,
            inDim);
      }
    }
  }
}
#undef ROW_PTR2
#undef COL_PTR2
#endif