blob: d9cec8cf3a791f723cfc1f4efa8040f08ee4c831 [file] [log] [blame]
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/SparseLinear.c"
#else
#ifdef _OPENMP
#include <omp.h>
#endif
#define ROW_PTR2(t, r) (THTensor_(data)(t) + (r) * (t)->stride[0])
#define COL_PTR2(t, c) (THTensor_(data)(t) + (c) * (t)->stride[1])
/* True iff t has the legacy sparse layout: batchSize x nnz x 2. */
static bool THNN_(checkLegacyInput)(THTensor* t)
{
  if (t->nDimension != 3) {
    return false;
  }
  return t->size[2] == 2;
}
/* True iff t is an nnz x 3 matrix (COO sparse format). */
static bool THNN_(checkInput)(THTensor* t)
{
  if (t->nDimension != 2) {
    return false;
  }
  return t->size[1] == 3;
}
/* True iff t is a 2-D tensor of exactly size0 x size1. */
static bool THNN_(checkSize2D)(THTensor* t, long size0, long size1)
{
  if (t->nDimension != 2) {
    return false;
  }
  return t->size[0] == size0 && t->size[1] == size1;
}
/* True iff t is a 1-D tensor of exactly size0 elements. */
static bool THNN_(checkSize1D)(THTensor* t, long size0)
{
  if (t->nDimension != 1) {
    return false;
  }
  return t->size[0] == size0;
}
/* Write value into 1-D tensor t at logical index x0 (stride-aware). */
static void THNN_(set1d)(THTensor *t, long x0, real value) {
  long idx = t->storageOffset + x0 * t->stride[0];
  THStorage_(set)(t->storage, idx, value);
}
/* Read element (x0, x1, x2) of a 3-D tensor t (stride-aware). */
static real THNN_(get3d)(const THTensor *t, long x0, long x1, long x2) {
  long idx = t->storageOffset
      + x0 * t->stride[0]
      + x1 * t->stride[1]
      + x2 * t->stride[2];
  return THStorage_(get)(t->storage, idx);
}
/* Read element (x0, x1) of a 2-D tensor t (stride-aware). */
static real THNN_(get2d)(const THTensor *t, long x0, long x1) {
  long idx = t->storageOffset
      + x0 * t->stride[0]
      + x1 * t->stride[1];
  return THStorage_(get)(t->storage, idx);
}
/*
 * Forward pass: output[h] = weight * x[h] + bias, where the sparse batch x
 * is given in COO format as an nnz x 3 tensor of
 * (1-based sample index, 1-based feature index, value) rows.
 * Assumes rows are sorted by sample index (the CSR build below relies on it
 * -- TODO confirm against callers). batchSize is taken from output's first
 * dimension; output must be preallocated and contiguous.
 * Raises via THError on a feature index outside [1, inDim].
 */
void THNN_(SparseLinear_updateOutput)(
          THNNState *state,
          THTensor *input,
          THTensor *output,
          THTensor *weight,
          THTensor *bias)
{
  long h, i, hp0, hp1;
  long outDim = THTensor_(size)(weight, 0);
  long inDim = THTensor_(size)(weight, 1);
  long batchSize = THTensor_(size)(output, 0);

  THArgCheck(THNN_(checkInput)(input), 2, "input must be in coo format, nnz x 3");
  THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous");
  THArgCheck(THNN_(checkSize1D)(bias, outDim), 5, "bias size wrong");

  long nnz = THTensor_(size)(input, 0);

  // csr[h] .. csr[h+1] delimits the nnz entries that belong to sample h.
  THLongTensor* csr = THLongTensor_newWithSize1d(batchSize+1);
  THLongTensor_zero(csr);
  weight = THTensor_(newContiguous)(weight);

  for (i = 0; i < nnz; i++) {
    hp0 = (long)(THNN_(get2d)(input, i, 0)) - 1;
    hp1 = (i+1 == nnz) ?
            batchSize :
            (long)(THNN_(get2d)(input, i+1, 0)) - 1;
    if (hp0 != hp1) for (h = hp0; h < hp1; h++) {
      THLongTensor_set1d(csr, h+1, i+1);
    }
  }

  // output = weight * input + bias
  THTensor_(zero)(output);
  #pragma omp parallel for private(h, i) schedule(static) if (nnz > 10000)
  for (h = 0; h < batchSize; h++) {
    long i_start = THLongTensor_get1d(csr, h);
    long i_end = THLongTensor_get1d(csr, h+1);
    for (i = i_start; i < i_end; i++) {
      real val = THNN_(get2d)(input, i, 2);
      if (val == 0) {
        continue;
      }

      long offset = (long)(THNN_(get2d)(input, i, 1)) - 1;
      if (offset >= 0 && offset < inDim) {
        // output[h] += val * weight[:, offset]
        THBlas_(axpy)(outDim,
            val,
            COL_PTR2(weight, offset), weight->stride[0],
            ROW_PTR2(output, h), output->stride[1]);
      } else {
        THError("index out of bound. updateOutput: %ld not between 1 and %ld",
            offset + 1, inDim);
      }
    }
  }

  // Add bias to every output row.
  THTensor* output_row = THTensor_(new)();
  for (h = 0; h < batchSize; h++) {
    THTensor_(select)(output_row, output, 0, h);
    THTensor_(cadd)(output_row, bias, 1.0, output_row);
  }
  THTensor_(free)(output_row);
  THLongTensor_free(csr);
  THTensor_(free)(weight);
}
/*
 * Legacy forward pass: input is batchSize x nnz x 2, where input[h][i] is
 * (1-based feature index, value). Resizes output to batchSize x outDim and
 * computes output[h] = weight * x[h] + bias.
 * Raises via THError on a feature index outside [1, inDim].
 */
void THNN_(SparseLinear_legacyUpdateOutput)(
          THNNState *state,
          THTensor *input,
          THTensor *output,
          THTensor *weight,
          THTensor *bias)
{
  long h, i;
  long outDim = THTensor_(size)(weight, 0);
  long inDim = THTensor_(size)(weight, 1);

  THArgCheck(THNN_(checkLegacyInput)(input), 2, "input size must be batchsize x nnz x 2");
  THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous");
  THArgCheck(THNN_(checkSize1D)(bias, outDim), 5, "bias size wrong");

  weight = THTensor_(newContiguous)(weight);

  long batchSize = THTensor_(size)(input, 0);
  long nnz = THTensor_(size)(input, 1);
  THTensor_(resize2d)(output, batchSize, outDim);

  // output = weight * input + bias; rows are independent, so parallelize
  // over the batch dimension.
  THTensor_(zero)(output);
  #pragma omp parallel for private(h, i) schedule(static) if ( \
      batchSize > 1 && batchSize * nnz * outDim > 10000)
  for (h = 0; h < batchSize; h++) {
    for (i = 0; i < nnz; i++) {
      real val = THNN_(get3d)(input, h, i, 1);
      if (val == 0) {
        continue;
      }

      long offset = (long)(THNN_(get3d)(input, h, i, 0)) - 1;
      if (offset >= 0 && offset < inDim) {
        // output[h] += val * weight[:, offset]
        THBlas_(axpy)(outDim,
            val,
            COL_PTR2(weight, offset), weight->stride[0],
            ROW_PTR2(output, h), output->stride[1]);
      } else {
        THError("index out of bound. updateOutput: %ld not between 1 and %ld",
            offset + 1, inDim);
      }
    }
  }

  // Add bias to every output row.
  THTensor* output_row = THTensor_(new)();
  for (h = 0; h < batchSize; h++) {
    THTensor_(select)(output_row, output, 0, h);
    THTensor_(cadd)(output_row, bias, 1.0, output_row);
  }
  THTensor_(free)(output_row);
  THTensor_(free)(weight);
}
/*
 * Backward (parameter gradients) for COO input (nnz x 3):
 *   gradWeight += scale * gradOutput^T * x   (touched columns only)
 *   gradBias   += scale * sum(gradOutput, 0)
 *   gradWeight += weightDecay * weight       (when weightDecay != 0)
 * Assumes input rows are sorted by feature index (the CSC build below
 * relies on it -- TODO confirm against callers).
 * Raises via THError on a feature index outside [1, inDim].
 */
void THNN_(SparseLinear_accGradParameters)(
          THNNState *state,
          THTensor *input,
          THTensor *gradOutput,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *weight,
          THTensor *bias,
          accreal weightDecay_,
          accreal scale_)
{
  real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_);
  real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
  long h, i, col, hp0, hp1;
  long outDim = THTensor_(size)(weight, 0);
  long inDim = THTensor_(size)(weight, 1);

  THArgCheck(THNN_(checkInput)(input), 2,
             "input must be in coo format, nnz x 3");
  THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4,
             "gradWeight size wrong");
  THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5,
             "gradBias size wrong");
  THArgCheck(THTensor_(isContiguous)(gradOutput), 1,
             "gradOutput must be contiguous");

  long nnz = THTensor_(size)(input, 0);

  // csc[c] .. csc[c+1] delimits the nnz entries whose feature index is c.
  THLongTensor* csc = THLongTensor_newWithSize1d(inDim+1);
  THLongTensor_zero(csc);
  weight = THTensor_(newContiguous)(weight);

  #pragma omp parallel for private(i, h, hp0, hp1) schedule(static) if (nnz > 10000)
  for (i = 0; i < nnz; i++) {
    hp0 = (long)(THNN_(get2d)(input, i, 1)) - 1;
    hp1 = (i+1 == nnz) ?
            inDim :
            (long)(THNN_(get2d)(input, i+1, 1)) - 1;
    if (hp0 != hp1) for (h = hp0; h < hp1; h++) {
      THLongTensor_set1d(csc, h+1, i+1);
    }
  }

  // gradWeight += scale * gradOutput * input. Parallel over columns: each
  // iteration only writes its own gradWeight column.
  #pragma omp parallel for private(h, i, col) schedule(static) if (nnz > 10000)
  for (col = 0; col < inDim; col++) {
    long i_start = THLongTensor_get1d(csc, col);
    long i_end = THLongTensor_get1d(csc, col+1);
    for (i = i_start; i < i_end; i++) {
      real val = scale * THNN_(get2d)(input, i, 2);

      h = (long)(THNN_(get2d)(input, i, 0)) - 1;
      long offset = (long)(THNN_(get2d)(input, i, 1)) - 1;
      if (offset >= 0 && offset < inDim) {
        // gradWeight[:, offset] += val * gradOutput[h]
        THBlas_(axpy)(outDim,
            val,
            ROW_PTR2(gradOutput, h), gradOutput->stride[1],
            COL_PTR2(gradWeight, offset), gradWeight->stride[0]);
      } else {
        THError(
            "index out of bound. accGradParameters: %ld not between 1 and %ld",
            offset + 1,
            inDim);
      }
    }
  }

  // gradBias += scale * sum(gradOutput, 0)
  THTensor* buf = THTensor_(new)();
  THTensor_(sum)(buf, gradOutput, 0);
  THTensor_(cadd)(gradBias, gradBias, scale, buf);
  THTensor_(free)(buf);
  THLongTensor_free(csc);

  if (weightDecay != 0) {
    THTensor_(cadd)(gradWeight, gradWeight, weightDecay, weight);
  }
  THTensor_(free)(weight);
}
/*
 * Legacy backward (parameter gradients) for batchSize x nnz x 2 input:
 *   gradWeight += scale * gradOutput^T * x   (touched columns only)
 *   gradBias   += scale * sum(gradOutput, 0)
 *   gradWeight += weightDecay * weight       (when weightDecay != 0)
 * Resizes gradOutput to batchSize x outDim.
 * Raises via THError on a feature index outside [1, inDim].
 */
void THNN_(SparseLinear_legacyAccGradParameters)(
          THNNState *state,
          THTensor *input,
          THTensor *gradOutput,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *weight,
          THTensor *bias,
          accreal weightDecay_,
          accreal scale_)
{
  real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_);
  real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
  long h, i;
  long outDim = THTensor_(size)(weight, 0);
  long inDim = THTensor_(size)(weight, 1);

  THArgCheck(THNN_(checkLegacyInput)(input), 2,
             "input size must be batchsize x nnz x 2");
  THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4,
             "gradWeight size wrong");
  THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5,
             "gradBias size wrong");
  THArgCheck(THTensor_(isContiguous)(gradOutput), 1,
             "gradOutput must be contiguous");

  long batchSize = THTensor_(size)(input, 0);
  long nnz = THTensor_(size)(input, 1);
  THTensor_(resize2d)(gradOutput, batchSize, outDim);

  // gradWeight += scale * gradOutput * input
  // NOTE(review): the parallel loop iterates over nnz slots while the
  // gradWeight column written depends on the data; two iterations can
  // target the same column concurrently unless slot i always maps to a
  // distinct feature index across the batch -- verify before relying on
  // parallel execution here.
  #pragma omp parallel for private(h, i) schedule(static) if (\
    batchSize * nnz * outDim > 10000)
  for (i = 0; i < nnz; i++) {
    for (h = 0; h < batchSize; h++) {
      real val = scale * THNN_(get3d)(input, h, i, 1);
      if (val == 0) {
        continue;
      }

      long offset = (long)(THNN_(get3d)(input, h, i, 0)) - 1;
      if (offset >= 0 && offset < inDim) {
        // gradWeight[:, offset] += val * gradOutput[h]
        THBlas_(axpy)(outDim,
            val,
            ROW_PTR2(gradOutput, h), gradOutput->stride[1],
            COL_PTR2(gradWeight, offset), gradWeight->stride[0]);
      } else {
        THError(
            "index out of bound. accGradParameters: %ld not between 1 and %ld",
            offset + 1,
            inDim);
      }
    }
  }

  // gradBias += scale * gradOutput, row by row
  THTensor* gradOutput_row = THTensor_(new)();
  for (h = 0; h < batchSize; h++) {
    THTensor_(select)(gradOutput_row, gradOutput, 0, h);
    THTensor_(cadd)(gradBias, gradBias, scale, gradOutput_row);
  }
  THTensor_(free)(gradOutput_row);

  if (weightDecay != 0) {
    THTensor_(cadd)(gradWeight, gradWeight, weightDecay, weight);
  }
}
/*
 * Sparse SGD step for COO-format lastInput (nnz x 3):
 *   bias -= learningRate * gradBias
 *   weight[:, c] -= learningRate * gradWeight[:, c]
 * for every distinct feature column c referenced by a non-zero value.
 * Raises via THError on a feature index outside [1, inDim].
 */
void THNN_(SparseLinear_updateParameters)(
          THNNState *state,
          THTensor *weight,
          THTensor *bias,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *lastInput,
          accreal learningRate_)
{
  real learningRate = TH_CONVERT_ACCREAL_TO_REAL(learningRate_);
  long i;
  long outDim = weight->size[0];
  long inDim = weight->size[1];

  THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4,
             "gradWeight size wrong");
  THArgCheck(THNN_(checkSize1D)(bias, outDim), 3, "bias size wrong");
  THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong");
  THArgCheck(THNN_(checkInput)(lastInput), 6,
             "input must be in coo format, nnz x 3");

  long nnz = THTensor_(size)(lastInput, 0);

  // Collect the (0-based) column offsets of all non-zero input values.
  THTensor* offsets = THTensor_(newWithSize1d)(nnz);
  long cnt = 0;
  for (i = 0; i < nnz; i++) {
    real val = THNN_(get2d)(lastInput, i, 2);
    if (val == 0) {
      continue;
    }
    long offset = (long)(THNN_(get2d)(lastInput, i, 1)) - 1;
    if (offset >= 0 && offset < inDim) {
      THNN_(set1d)(offsets, cnt++, offset);
    } else {
      THError(
          "index out of bound. updateParameters: %ld not between 1 and %ld",
          offset + 1,
          inDim);
    }
  }
  if (cnt == 0) {
    // No column touched; release the scratch tensor before the early
    // return so it is not leaked.
    THTensor_(free)(offsets);
    return;
  }
  THTensor_(resize1d)(offsets, cnt);

  // Sort the offsets, then deduplicate them in place.
  THTensor* uniqueOffsets = THTensor_(new)();
  THLongTensor* ri = THLongTensor_new();
  THTensor_(sort)(uniqueOffsets, ri, offsets, 0, 0);
  THLongTensor_free(ri);
  THTensor_(free)(offsets);

  cnt = 1;
  real* uniqueOffsets_p = THTensor_(data)(uniqueOffsets);
  for (i = 1; i < THTensor_(size)(uniqueOffsets, 0); i++) {
    if (uniqueOffsets_p[i] != uniqueOffsets_p[i - 1]) {
      uniqueOffsets_p[cnt++] = uniqueOffsets_p[i];
    }
  }
  THTensor_(resize1d)(uniqueOffsets, cnt);

  // weight += -learningRate * gradWeight (touched columns only)
  THTensor_(cadd)(bias, bias, -learningRate, gradBias);
  #pragma omp parallel for private(i) schedule(static) if (cnt * outDim > 10000)
  for (i = 0; i < cnt; i++) {
    long offset = (long)uniqueOffsets_p[i];
    THBlas_(axpy)(outDim,
        -learningRate,
        COL_PTR2(gradWeight, offset), gradWeight->stride[0],
        COL_PTR2(weight, offset), weight->stride[0]);
  }
  THTensor_(free)(uniqueOffsets);
}
/*
 * Legacy sparse SGD step for batchSize x nnz x 2 lastInput:
 *   bias -= learningRate * gradBias
 *   weight[:, c] -= learningRate * gradWeight[:, c]
 * for every distinct feature column c referenced by a non-zero value.
 * Raises via THError on a feature index outside [1, inDim].
 */
void THNN_(SparseLinear_legacyUpdateParameters)(
          THNNState *state,
          THTensor *weight,
          THTensor *bias,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *lastInput,
          accreal learningRate_)
{
  real learningRate = TH_CONVERT_ACCREAL_TO_REAL(learningRate_);
  long h, i;
  long outDim = weight->size[0];
  long inDim = weight->size[1];

  THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4,
             "gradWeight size wrong");
  THArgCheck(THNN_(checkSize1D)(bias, outDim), 3, "bias size wrong");
  THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong");
  THArgCheck(THNN_(checkLegacyInput)(lastInput), 6,
             "input size must be batchsize x nnz x 2");

  long batchSize = THTensor_(size)(lastInput, 0);
  long nnz = THTensor_(size)(lastInput, 1);

  // Collect the (0-based) column offsets of all non-zero input values.
  THTensor* offsets = THTensor_(newWithSize1d)(batchSize * nnz);
  long cnt = 0;
  for (h = 0; h < batchSize; h++) {
    for (i = 0; i < nnz; i++) {
      real val = THNN_(get3d)(lastInput, h, i, 1);
      if (val == 0) {
        continue;
      }
      long offset = (long)(THNN_(get3d)(lastInput, h, i, 0)) - 1;
      if (offset >= 0 && offset < inDim) {
        THNN_(set1d)(offsets, cnt++, offset);
      } else {
        THError(
            "index out of bound. updateParameters: %ld not between 1 and %ld",
            offset + 1,
            inDim);
      }
    }
  }
  if (cnt == 0) {
    // Every value was zero: without this guard cnt was forced to 1 below
    // and an uninitialized offset was fed to axpy. Free the scratch tensor
    // and do nothing.
    THTensor_(free)(offsets);
    return;
  }
  THTensor_(resize1d)(offsets, cnt);

  // Sort the offsets, then deduplicate them in place.
  THTensor* uniqueOffsets = THTensor_(new)();
  THLongTensor* ri = THLongTensor_new();
  THTensor_(sort)(uniqueOffsets, ri, offsets, 0, 0);
  THLongTensor_free(ri);
  THTensor_(free)(offsets);

  cnt = 1;
  real* uniqueOffsets_p = THTensor_(data)(uniqueOffsets);
  for (i = 1; i < THTensor_(size)(uniqueOffsets, 0); i++) {
    if (uniqueOffsets_p[i] != uniqueOffsets_p[i - 1]) {
      uniqueOffsets_p[cnt++] = uniqueOffsets_p[i];
    }
  }
  THTensor_(resize1d)(uniqueOffsets, cnt);

  // weight += -learningRate * gradWeight (touched columns only)
  THTensor_(cadd)(bias, bias, -learningRate, gradBias);
  #pragma omp parallel for private(i) schedule(static) if (cnt * outDim > 10000)
  for (i = 0; i < cnt; i++) {
    long offset = (long)uniqueOffsets_p[i];
    THBlas_(axpy)(outDim,
        -learningRate,
        COL_PTR2(gradWeight, offset), gradWeight->stride[0],
        COL_PTR2(weight, offset), weight->stride[0]);
  }
  THTensor_(free)(uniqueOffsets);
}
/*
 * Zero gradBias and only the gradWeight columns referenced by a non-zero
 * value in COO-format lastInput (nnz x 3); untouched columns keep their
 * previous contents. Raises via THError on a feature index outside
 * [1, inDim].
 */
void THNN_(SparseLinear_zeroGradParameters)(
          THNNState *state,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *lastInput)
{
  long i, j;
  long outDim = gradWeight->size[0];
  long inDim = gradWeight->size[1];

  THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 3, "gradBias size wrong");
  THArgCheck(THNN_(checkInput)(lastInput), 4,
             "input must be in coo format, nnz x 3");

  THTensor_(zero)(gradBias);

  long nnz = THTensor_(size)(lastInput, 0);
  #pragma omp parallel for private(i, j) schedule(static) if ( \
      nnz * outDim > 10000)
  for (i = 0; i < nnz; i++) {
    if (THNN_(get2d)(lastInput, i, 2) == 0 ) {
      continue;
    }

    long offset = (long)(THNN_(get2d)(lastInput, i, 1)) - 1;
    if (offset >= 0 && offset < inDim) {
      real* pGradWeight = COL_PTR2(gradWeight, offset);
      if (gradWeight->stride[0] == 1) {
        // Column is contiguous: use the vectorized fill.
        THVector_(fill)(pGradWeight, 0, outDim);
      } else {
        long stride = gradWeight->stride[0];
        for (j = 0; j < outDim; ++j) {
          pGradWeight[j * stride] = 0;
        }
      }
    } else {
      THError(
          "index out of bound. zeroGradParameters: %ld not between 1 and %ld",
          offset + 1,
          inDim);
    }
  }
}
/*
 * Legacy variant: zero gradBias and only the gradWeight columns referenced
 * by a non-zero value in batchSize x nnz x 2 lastInput; untouched columns
 * keep their previous contents. Raises via THError on a feature index
 * outside [1, inDim].
 */
void THNN_(SparseLinear_legacyZeroGradParameters)(
          THNNState *state,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *lastInput)
{
  long h, i, j;
  long outDim = gradWeight->size[0];
  long inDim = gradWeight->size[1];

  THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 3, "gradBias size wrong");
  THArgCheck(THNN_(checkLegacyInput)(lastInput), 4,
             "input size must be batchsize x nnz x 2");

  THTensor_(zero)(gradBias);

  long batchSize = THTensor_(size)(lastInput, 0);
  long nnz = THTensor_(size)(lastInput, 1);
  #pragma omp parallel for private(h, i, j) schedule(static) if ( \
      batchSize > 1 && batchSize * nnz * outDim > 10000)
  for (h = 0; h < batchSize; h++) {
    for (i = 0; i < nnz; i++) {
      if (THNN_(get3d)(lastInput, h, i, 1) == 0 ) {
        continue;
      }

      long offset = (long)(THNN_(get3d)(lastInput, h, i, 0)) - 1;
      if (offset >= 0 && offset < inDim) {
        real* pGradWeight = COL_PTR2(gradWeight, offset);
        if (gradWeight->stride[0] == 1) {
          // Column is contiguous: use the vectorized fill.
          THVector_(fill)(pGradWeight, 0, outDim);
        } else {
          long stride = gradWeight->stride[0];
          for (j = 0; j < outDim; ++j) {
            pGradWeight[j * stride] = 0;
          }
        }
      } else {
        THError(
            "index out of bound. zeroGradParameters: %ld not between 1 and %ld",
            offset + 1,
            inDim);
      }
    }
  }
}
#undef ROW_PTR2
#undef COL_PTR2
#endif