generic/THTensorLapack.c - platform/external/pytorch - Git at Google

 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/THTensorLapack.c"
 #else

 /*
 Check if self is transpose of a contiguous matrix
 */
 static int THTensor_(isTransposedContiguous)(THTensor *self)
 {
   return self->stride[0] == 1 && self->stride[1] == self->size[0];
 }
 /*
 If a matrix is a regular contiguous matrix, make sure it is transposed
 because this is what we return from Lapack calls.
 */
 static void THTensor_(checkTransposed)(THTensor *self)
 {
   if(THTensor_(isContiguous)(self))
     THTensor_(transpose)(self, NULL, 0, 1);
   return;
 }
 /*
 newContiguous followed by transpose
 Similar to (newContiguous), but checks if the transpose of the matrix
 is contiguous and also limited to 2D matrices.
 */
 static THTensor *THTensor_(newTransposedContiguous)(THTensor *self)
 {
   THTensor *tensor;
   if(THTensor_(isTransposedContiguous)(self))
   {
     THTensor_(retain)(self);
     tensor = self;
   }
   else
   {
     tensor = THTensor_(newContiguous)(self);
     THTensor_(transpose)(tensor, NULL, 0, 1);
   }

   return tensor;
 }

 /*
 Given the result tensor and src tensor, decide if the lapack call should use the
 provided result tensor or should allocate a new space to put the result in.

 The returned tensor have to be freed by the calling function.

 nrows is required, because some lapack calls, require output space smaller than
 input space, like underdetermined gels.
 */
 static THTensor *THTensor_(checkLapackClone)(THTensor *result, THTensor *src, int nrows)
 {
   /* check if user wants to reuse src and if it is correct shape/size */
   if (src == result && THTensor_(isTransposedContiguous)(src) && src->size[1] == nrows)
     THTensor_(retain)(result);
   else if(src == result || result == NULL) /* in this case, user wants reuse of src, but its structure is not OK */
     result = THTensor_(new)();
   else
     THTensor_(retain)(result);
   return result;
 }

 /*
 Same as cloneColumnMajor, but accepts nrows argument, because some lapack calls require
 the resulting tensor to be larger than src.
 */
 static THTensor *THTensor_(cloneColumnMajorNrows)(THTensor *self, THTensor *src, int nrows)
 {
   THTensor *result;
   THTensor *view;

   if (src == NULL)
     src = self;
   result = THTensor_(checkLapackClone)(self, src, nrows);
   if (src == result)
     return result;

   THTensor_(resize2d)(result, src->size[1], nrows);
   THTensor_(checkTransposed)(result);

   if (src->size[0] == nrows)
     THTensor_(copy)(result, src);
   else
   {
     view = THTensor_(newNarrow)(result, 0, 0, src->size[0]);
     THTensor_(copy)(view, src);
     THTensor_(free)(view);
   }
   return result;
 }

 /*
 Create a clone of src in self column major order for use with Lapack.
 If src == self, a new tensor is allocated, in any case, the return tensor should be
 freed by calling function.
 */
 static THTensor *THTensor_(cloneColumnMajor)(THTensor *self, THTensor *src)
 {
   return THTensor_(cloneColumnMajorNrows)(self, src, src->size[0]);
 }

 void THTensor_(gesv)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a)
 {
   int free_b = 0;
   if (a == NULL) a = ra_;
   if (b == NULL) b = rb_;
   THArgCheck(a->nDimension == 2, 2, "A should have 2 dimensions, but has %d",
       a->nDimension);
   THArgCheck(b->nDimension == 1 || b->nDimension == 2, 1, "B should have 1 or 2 "
       "dimensions, but has %d", b->nDimension);
   THArgCheck(a->size[0] == a->size[1], 2, "A should be square, but is %ldx%ld",
       a->size[0], a->size[1]);
   THArgCheck(a->size[0] == b->size[0], 2, "A,B size incompatible - A has %ld "
       "rows, B has %ld", a->size[0], b->size[0]);

   if (b->nDimension == 1) {
     b = THTensor_(newWithStorage2d)(b->storage, b->storageOffset, b->size[0],
             b->stride[0], 1, 0);
     free_b = 1;
   }

   int n, nrhs, lda, ldb, info;
   THIntTensor *ipiv;
   THTensor *ra__;  // working version of A matrix to be passed into lapack GELS
   THTensor *rb__;  // working version of B matrix to be passed into lapack GELS

   ra__ = THTensor_(cloneColumnMajor)(ra_, a);
   rb__ = THTensor_(cloneColumnMajor)(rb_, b);

   n    = (int)ra__->size[0];
   nrhs = (int)rb__->size[1];
   lda  = n;
   ldb  = n;

   ipiv = THIntTensor_newWithSize1d((long)n);
   THLapack_(gesv)(n, nrhs,
 		  THTensor_(data)(ra__), lda, THIntTensor_data(ipiv),
 		  THTensor_(data)(rb__), ldb, &info);

   THLapackCheckWithCleanup("Lapack Error in %s : U(%d,%d) is zero, singular U.",
                            THCleanup(
                                THTensor_(free)(ra__);
                                THTensor_(free)(rb__);
                                THIntTensor_free(ipiv);
                                if (free_b) THTensor_(free)(b);),
                            "gesv", info, info);

   THTensor_(freeCopyTo)(ra__, ra_);
   THTensor_(freeCopyTo)(rb__, rb_);
   THIntTensor_free(ipiv);
   if (free_b) THTensor_(free)(b);
 }

 void THTensor_(trtrs)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a,
                       const char *uplo, const char *trans, const char *diag)
 {
   int free_b = 0;
   if (a == NULL) a = ra_;
   if (b == NULL) b = rb_;
   THArgCheck(a->nDimension == 2, 2, "A should have 2 dimensions, but has %d",
       a->nDimension);
   THArgCheck(b->nDimension == 1 || b->nDimension == 2, 1, "B should have 1 or 2 "
       "dimensions, but has %d", b->nDimension);
   THArgCheck(a->size[0] == a->size[1], 2, "A should be square, but is %ldx%ld",
       a->size[0], a->size[1]);
   THArgCheck(a->size[0] == b->size[0], 2, "A,B size incompatible - A has %ld "
       "rows, B has %ld", a->size[0], b->size[0]);

   if (b->nDimension == 1) {
     b = THTensor_(newWithStorage2d)(b->storage, b->storageOffset, b->size[0],
             b->stride[0], 1, 0);
     free_b = 1;
   }

   int n, nrhs, lda, ldb, info;
   THTensor *ra__; // working version of A matrix to be passed into lapack TRTRS
   THTensor *rb__; // working version of B matrix to be passed into lapack TRTRS

   ra__ = THTensor_(cloneColumnMajor)(ra_, a);
   rb__ = THTensor_(cloneColumnMajor)(rb_, b);

   n    = (int)ra__->size[0];
   nrhs = (int)rb__->size[1];
   lda  = n;
   ldb  = n;

   THLapack_(trtrs)(uplo[0], trans[0], diag[0], n, nrhs,
                    THTensor_(data)(ra__), lda,
                    THTensor_(data)(rb__), ldb, &info);


   THLapackCheckWithCleanup("Lapack Error in %s : A(%d,%d) is zero, singular A",
                            THCleanup(
                               THTensor_(free)(ra__);
                               THTensor_(free)(rb__);
                               if (free_b) THTensor_(free)(b);),
                            "trtrs", info, info);

   THTensor_(freeCopyTo)(ra__, ra_);
   THTensor_(freeCopyTo)(rb__, rb_);
   if (free_b) THTensor_(free)(b);
 }

 void THTensor_(gels)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a)
 {
   int free_b = 0;
   // Note that a = NULL is interpreted as a = ra_, and b = NULL as b = rb_.
   if (a == NULL) a = ra_;
   if (b == NULL) b = rb_;
   THArgCheck(a->nDimension == 2, 2, "A should have 2 dimensions, but has %d",
       a->nDimension);
   THArgCheck(b->nDimension == 1 || b->nDimension == 2, 1, "B should have 1 or 2 "
       "dimensions, but has %d", b->nDimension);
   THArgCheck(a->size[0] == b->size[0], 2, "A,B size incompatible - A has %ld "
       "rows, B has %ld", a->size[0], b->size[0]);

   if (b->nDimension == 1) {
     b = THTensor_(newWithStorage2d)(b->storage, b->storageOffset, b->size[0],
             b->stride[0], 1, 0);
     free_b = 1;
   }

   int m, n, nrhs, lda, ldb, info, lwork;
   THTensor *work = NULL;
   real wkopt = 0;

   THTensor *ra__ = NULL;  // working version of A matrix to be passed into lapack GELS
   THTensor *rb__ = NULL;  // working version of B matrix to be passed into lapack GELS

   ra__ = THTensor_(cloneColumnMajor)(ra_, a);

   m = ra__->size[0];
   n = ra__->size[1];
   lda = m;
   ldb = (m > n) ? m : n;

   rb__ = THTensor_(cloneColumnMajorNrows)(rb_, b, ldb);

   nrhs = rb__->size[1];
   info = 0;


   /* get optimal workspace size */
   THLapack_(gels)('N', m, n, nrhs, THTensor_(data)(ra__), lda,
 		  THTensor_(data)(rb__), ldb,
 		  &wkopt, -1, &info);
   lwork = (int)wkopt;
   work = THTensor_(newWithSize1d)(lwork);
   THLapack_(gels)('N', m, n, nrhs, THTensor_(data)(ra__), lda,
 		  THTensor_(data)(rb__), ldb,
 		  THTensor_(data)(work), lwork, &info);

   THLapackCheckWithCleanup("Lapack Error in %s : The %d-th diagonal element of the triangular factor of A is zero",
                            THCleanup(THTensor_(free)(ra__);
                                      THTensor_(free)(rb__);
                                      THTensor_(free)(work);
                                      if (free_b) THTensor_(free)(b);),
                            "gels", info,"");

   /* rb__ is currently ldb by nrhs; resize it to n by nrhs */
   rb__->size[0] = n;
   if (rb__ != rb_)
     THTensor_(resize2d)(rb_, n, nrhs);

   THTensor_(freeCopyTo)(ra__, ra_);
   THTensor_(freeCopyTo)(rb__, rb_);
   THTensor_(free)(work);
   if (free_b) THTensor_(free)(b);
 }

 void THTensor_(geev)(THTensor *re_, THTensor *rv_, THTensor *a_, const char *jobvr)
 {
   int n, lda, lwork, info, ldvr;
   THTensor *work, *wi, *wr, *a;
   real wkopt;
   real *rv_data;
   long i;

   THTensor *re__ = NULL;
   THTensor *rv__ = NULL;

   THArgCheck(a_->nDimension == 2, 1, "A should be 2 dimensional");
   THArgCheck(a_->size[0] == a_->size[1], 1,"A should be square");

   /* we want to definitely clone a_ for geev*/
   a = THTensor_(cloneColumnMajor)(NULL, a_);

   n = a->size[0];
   lda = n;

   wi = THTensor_(newWithSize1d)(n);
   wr = THTensor_(newWithSize1d)(n);

   rv_data = NULL;
   ldvr = 1;
   if (*jobvr == 'V')
   {
     THTensor_(resize2d)(rv_,n,n);
     /* guard against someone passing a correct size, but wrong stride */
     rv__ = THTensor_(newTransposedContiguous)(rv_);
     rv_data = THTensor_(data)(rv__);
     ldvr = n;
   }
   THTensor_(resize2d)(re_,n,2);
   re__ = THTensor_(newContiguous)(re_);

   /* get optimal workspace size */
   THLapack_(geev)('N', jobvr[0], n, THTensor_(data)(a), lda, THTensor_(data)(wr), THTensor_(data)(wi),
       NULL, 1, rv_data, ldvr, &wkopt, -1, &info);

   lwork = (int)wkopt;
   work = THTensor_(newWithSize1d)(lwork);

   THLapack_(geev)('N', jobvr[0], n, THTensor_(data)(a), lda, THTensor_(data)(wr), THTensor_(data)(wi),
       NULL, 1, rv_data, ldvr, THTensor_(data)(work), lwork, &info);

   THLapackCheckWithCleanup(" Lapack Error in %s : %d off-diagonal elements of an didn't converge to zero",
                            THCleanup(THTensor_(free)(re__);
                                      THTensor_(free)(rv__);
                                      THTensor_(free)(a);
                                      THTensor_(free)(wi);
                                      THTensor_(free)(wr);
                                      THTensor_(free)(work);),
                            "geev", info,"");

   {
     real *re_data = THTensor_(data)(re__);
     real *wi_data = THTensor_(data)(wi);
     real *wr_data = THTensor_(data)(wr);
     for (i=0; i<n; i++)
     {
       re_data[2*i] = wr_data[i];
       re_data[2*i+1] = wi_data[i];
     }
   }

   if (*jobvr == 'V')
   {
     THTensor_(checkTransposed)(rv_);
     THTensor_(freeCopyTo)(rv__, rv_);
   }
   THTensor_(freeCopyTo)(re__, re_);
   THTensor_(free)(a);
   THTensor_(free)(wi);
   THTensor_(free)(wr);
   THTensor_(free)(work);
 }

 void THTensor_(syev)(THTensor *re_, THTensor *rv_, THTensor *a, const char *jobz, const char *uplo)
 {
   if (a == NULL) a = rv_;
   THArgCheck(a->nDimension == 2, 1, "A should be 2 dimensional");
   THArgCheck(a->size[0] == a->size[1], 1,"A should be square");

   int n, lda, lwork, info;
   THTensor *work;
   real wkopt;

   THTensor *rv__ = NULL;
   THTensor *re__ = NULL;

   rv__ = THTensor_(cloneColumnMajor)(rv_, a);

   n = rv__->size[0];
   lda = n;

   THTensor_(resize1d)(re_,n);
   re__ = THTensor_(newContiguous)(re_);

   /* get optimal workspace size */
   THLapack_(syev)(jobz[0], uplo[0], n, THTensor_(data)(rv__), lda,
 		  THTensor_(data)(re_), &wkopt, -1, &info);
   lwork = (int)wkopt;
   work = THTensor_(newWithSize1d)(lwork);
   THLapack_(syev)(jobz[0], uplo[0], n, THTensor_(data)(rv__), lda,
 		  THTensor_(data)(re_), THTensor_(data)(work), lwork, &info);

   THLapackCheckWithCleanup("Lapack Error %s : %d off-diagonal elements didn't converge to zero",
                            THCleanup(THTensor_(free)(rv__);
                                      THTensor_(free)(re__);
                                      THTensor_(free)(work);),
                            "syev", info,"");

   THTensor_(freeCopyTo)(rv__, rv_);
   THTensor_(freeCopyTo)(re__, re_);
   THTensor_(free)(work);
 }

 void THTensor_(gesvd)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *a, const char* jobu)
 {
   THTensor *ra_ = THTensor_(new)();
   THTensor_(gesvd2)(ru_, rs_, rv_,  ra_, a, jobu);
   THTensor_(free)(ra_);
 }

 void THTensor_(gesvd2)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *ra_, THTensor *a, const char* jobu)
 {
   if (a == NULL) a = ra_;
   THArgCheck(a->nDimension == 2, 1, "A should be 2 dimensional");

   int k,m, n, lda, ldu, ldvt, lwork, info;
   THTensor *work;
   THTensor *rvf_ = THTensor_(new)();
   real wkopt;

   THTensor *ra__ = NULL;
   THTensor *ru__ = NULL;
   THTensor *rs__ = NULL;
   THTensor *rv__ = NULL;

   ra__ = THTensor_(cloneColumnMajor)(ra_, a);

   m = ra__->size[0];
   n = ra__->size[1];
   k = (m < n ? m : n);

   lda = m;
   ldu = m;
   ldvt = n;

   THTensor_(resize1d)(rs_,k);
   THTensor_(resize2d)(rvf_,ldvt,n);
   if (*jobu == 'A')
     THTensor_(resize2d)(ru_,m,ldu);
   else
     THTensor_(resize2d)(ru_,k,ldu);

   THTensor_(checkTransposed)(ru_);

   /* guard against someone passing a correct size, but wrong stride */
   ru__ = THTensor_(newTransposedContiguous)(ru_);
   rs__ = THTensor_(newContiguous)(rs_);
   rv__ = THTensor_(newContiguous)(rvf_);

   THLapack_(gesvd)(jobu[0],jobu[0],
 		   m,n,THTensor_(data)(ra__),lda,
 		   THTensor_(data)(rs__),
 		   THTensor_(data)(ru__),
 		   ldu,
 		   THTensor_(data)(rv__), ldvt,
 		   &wkopt, -1, &info);
   lwork = (int)wkopt;
   work = THTensor_(newWithSize1d)(lwork);
   THLapack_(gesvd)(jobu[0],jobu[0],
 		   m,n,THTensor_(data)(ra__),lda,
 		   THTensor_(data)(rs__),
 		   THTensor_(data)(ru__),
 		   ldu,
 		   THTensor_(data)(rv__), ldvt,
 		   THTensor_(data)(work),lwork, &info);

   THLapackCheckWithCleanup(" Lapack Error %s : %d superdiagonals failed to converge.",
                            THCleanup(
                                THTensor_(free)(ru__);
                                THTensor_(free)(rs__);
                                THTensor_(free)(rv__);
                                THTensor_(free)(ra__);
                                THTensor_(free)(work);),
                            "gesvd", info,"");

   if (*jobu == 'S')
     THTensor_(narrow)(rv__,NULL,1,0,k);

   THTensor_(freeCopyTo)(ru__, ru_);
   THTensor_(freeCopyTo)(rs__, rs_);
   THTensor_(freeCopyTo)(rv__, rvf_);
   THTensor_(freeCopyTo)(ra__, ra_);
   THTensor_(free)(work);

   if (*jobu == 'S') {
     THTensor_(narrow)(rvf_,NULL,1,0,k);
   }
   THTensor_(resizeAs)(rv_, rvf_);
   THTensor_(copy)(rv_, rvf_);
   THTensor_(free)(rvf_);
 }

 void THTensor_(getri)(THTensor *ra_, THTensor *a)
 {
   if (a == NULL) a = ra_;
   THArgCheck(a->nDimension == 2, 1, "A should be 2 dimensional");
   THArgCheck(a->size[0] == a->size[1], 1, "A should be square");

   int m, n, lda, info, lwork;
   real wkopt;
   THIntTensor *ipiv;
   THTensor *work;
   THTensor *ra__ = NULL;

   ra__ = THTensor_(cloneColumnMajor)(ra_, a);

   m = ra__->size[0];
   n = ra__->size[1];
   lda = m;
   ipiv = THIntTensor_newWithSize1d((long)m);

   /* Run LU */
   THLapack_(getrf)(n, n, THTensor_(data)(ra__), lda, THIntTensor_data(ipiv), &info);
   THLapackCheckWithCleanup("Lapack Error %s : U(%d,%d) is 0, U is singular",
                            THCleanup(
                                THTensor_(free)(ra__);
                                THIntTensor_free(ipiv);),
                            "getrf", info, info);

   /* Run inverse */
   THLapack_(getri)(n, THTensor_(data)(ra__), lda, THIntTensor_data(ipiv), &wkopt, -1, &info);
   lwork = (int)wkopt;
   work = THTensor_(newWithSize1d)(lwork);
   THLapack_(getri)(n, THTensor_(data)(ra__), lda, THIntTensor_data(ipiv), THTensor_(data)(work), lwork, &info);
   THLapackCheckWithCleanup("Lapack Error %s : U(%d,%d) is 0, U is singular",
                            THCleanup(
                                THTensor_(free)(ra__);
                                THTensor_(free)(work);
                                THIntTensor_free(ipiv);),
                            "getri", info, info);

   THTensor_(freeCopyTo)(ra__, ra_);
   THTensor_(free)(work);
   THIntTensor_free(ipiv);
 }

 void THTensor_(clearUpLoTriangle)(THTensor *a, const char *uplo)
 {
   THArgCheck(a->nDimension == 2, 1, "A should be 2 dimensional");
   THArgCheck(a->size[0] == a->size[1], 1, "A should be square");

   int n = a->size[0];

   /* Build full matrix */
   real *p = THTensor_(data)(a);
   long i, j;

   /* Upper Triangular Case */
   if (uplo[0] == 'U')
   {
     /* Clear lower triangle (excluding diagonals) */
     for (i=0; i<n; i++) {
      for (j=i+1; j<n; j++) {
         p[n*i + j] = 0;
       }
     }
   }
   /* Lower Triangular Case */
   else if (uplo[0] == 'L')
   {
     /* Clear upper triangle (excluding diagonals) */
     for (i=0; i<n; i++) {
       for (j=0; j<i; j++) {
         p[n*i + j] = 0;
       }
     }
   }
 }

 void THTensor_(copyUpLoTriangle)(THTensor *a, const char *uplo)
 {
   THArgCheck(a->nDimension == 2, 1, "A should be 2 dimensional");
   THArgCheck(a->size[0] == a->size[1], 1, "A should be square");

   int n = a->size[0];

   /* Build full matrix */
   real *p = THTensor_(data)(a);
   long i, j;

   /* Upper Triangular Case */
   if (uplo[0] == 'U')
   {
     /* Clear lower triangle (excluding diagonals) */
     for (i=0; i<n; i++) {
      for (j=i+1; j<n; j++) {
         p[n*i + j] = p[n*j+i];
       }
     }
   }
   /* Lower Triangular Case */
   else if (uplo[0] == 'L')
   {
     /* Clear upper triangle (excluding diagonals) */
     for (i=0; i<n; i++) {
       for (j=0; j<i; j++) {
         p[n*i + j] = p[n*j+i];
       }
     }
   }
 }

 void THTensor_(potrf)(THTensor *ra_, THTensor *a, const char *uplo)
 {
   if (a == NULL) a = ra_;
   THArgCheck(a->nDimension == 2, 1, "A should be 2 dimensional");
   THArgCheck(a->size[0] == a->size[1], 1, "A should be square");

   int n, lda, info;
   THTensor *ra__ = NULL;

   ra__ = THTensor_(cloneColumnMajor)(ra_, a);

   n = ra__->size[0];
   lda = n;

   /* Run Factorization */
   THLapack_(potrf)(uplo[0], n, THTensor_(data)(ra__), lda, &info);
   THLapackCheckWithCleanup("Lapack Error in %s : the leading minor of order %d is not positive definite",
                            THCleanup(THTensor_(free)(ra__);),
                            "potrf", info, "");

   THTensor_(clearUpLoTriangle)(ra__, uplo);
   THTensor_(freeCopyTo)(ra__, ra_);
 }

 void THTensor_(potrs)(THTensor *rb_, THTensor *b, THTensor *a, const char *uplo)
 {
   int free_b = 0;
   if (b == NULL) b = rb_;

   THArgCheck(a->nDimension == 2, 2, "A should have 2 dimensions, but has %d",
       a->nDimension);
   THArgCheck(b->nDimension == 1 || b->nDimension == 2, 1, "B should have 1 or 2 "
       "dimensions, but has %d", b->nDimension);
   THArgCheck(a->size[0] == a->size[1], 2, "A should be square, but is %ldx%ld",
       a->size[0], a->size[1]);
   THArgCheck(a->size[0] == b->size[0], 2, "A,B size incompatible - A has %ld "
       "rows, B has %ld", a->size[0], b->size[0]);

   if (b->nDimension == 1) {
     b = THTensor_(newWithStorage2d)(b->storage, b->storageOffset, b->size[0],
             b->stride[0], 1, 0);
     free_b = 1;
   }

   int n, nrhs, lda, ldb, info;
   THTensor *ra__; // working version of A matrix to be passed into lapack TRTRS
   THTensor *rb__; // working version of B matrix to be passed into lapack TRTRS

   ra__ = THTensor_(cloneColumnMajor)(NULL, a);
   rb__ = THTensor_(cloneColumnMajor)(rb_, b);

   n    = (int)ra__->size[0];
   nrhs = (int)rb__->size[1];
   lda  = n;
   ldb  = n;

   THLapack_(potrs)(uplo[0], n, nrhs, THTensor_(data)(ra__),
                    lda, THTensor_(data)(rb__), ldb, &info);


   THLapackCheckWithCleanup("Lapack Error in %s : A(%d,%d) is zero, singular A",
                            THCleanup(
                                THTensor_(free)(ra__);
                                THTensor_(free)(rb__);
                                if (free_b) THTensor_(free)(b);),
                            "potrs", info, info);

   if (free_b) THTensor_(free)(b);
   THTensor_(free)(ra__);
   THTensor_(freeCopyTo)(rb__, rb_);
 }

 void THTensor_(potri)(THTensor *ra_, THTensor *a, const char *uplo)
 {
   if (a == NULL) a = ra_;
   THArgCheck(a->nDimension == 2, 1, "A should be 2 dimensional");
   THArgCheck(a->size[0] == a->size[1], 1, "A should be square");

   int n, lda, info;
   THTensor *ra__ = NULL;

   ra__ = THTensor_(cloneColumnMajor)(ra_, a);

   n = ra__->size[0];
   lda = n;

   /* Run inverse */
   THLapack_(potri)(uplo[0], n, THTensor_(data)(ra__), lda, &info);
   THLapackCheckWithCleanup("Lapack Error %s : A(%d,%d) is 0, A cannot be factorized",
                            THCleanup(THTensor_(free)(ra__);),
                            "potri", info, info);

   THTensor_(copyUpLoTriangle)(ra__, uplo);
   THTensor_(freeCopyTo)(ra__, ra_);
 }

 /*
  Computes the Cholesky factorization with complete pivoting of a real symmetric
  positive semidefinite matrix.

  Args:
  * `ra_`    - result Tensor in which to store the factor U or L from the
               Cholesky factorization.
  * `rpiv_`  - result IntTensor containing sparse permutation matrix P, encoded
               as P[rpiv_[k], k] = 1.
  * `a`      - input Tensor; the input matrix to factorize.
  * `uplo`   - string; specifies whether the upper or lower triangular part of
               the symmetric matrix A is stored. "U"/"L" for upper/lower
               triangular.
  * `tol`    - double; user defined tolerance, or < 0 for automatic choice.
               The algorithm terminates when the pivot <= tol.
  */
 void THTensor_(pstrf)(THTensor *ra_, THIntTensor *rpiv_, THTensor *a, const char *uplo, real tol) {
   THArgCheck(a->nDimension == 2, 1, "A should be 2 dimensional");
   THArgCheck(a->size[0] == a->size[1], 1, "A should be square");

   int n = a->size[0];

   THTensor *ra__ = THTensor_(cloneColumnMajor)(ra_, a);
   THIntTensor_resize1d(rpiv_, n);

   // Allocate working tensor
   THTensor *work = THTensor_(newWithSize1d)(2 * n);

   // Run Cholesky factorization
   int lda = n;
   int rank, info;

   THLapack_(pstrf)(uplo[0], n, THTensor_(data)(ra__), lda,
                    THIntTensor_data(rpiv_), &rank, tol,
                    THTensor_(data)(work), &info);

   THLapackCheckWithCleanup("Lapack Error %s : matrix is rank deficient or not positive semidefinite",
                            THCleanup(
                                THTensor_(free)(ra__);
                                THTensor_(free)(work);),
                            "pstrf", info,"");

   THTensor_(clearUpLoTriangle)(ra__, uplo);

   THTensor_(freeCopyTo)(ra__, ra_);
   THTensor_(free)(work);
 }

 /*
   Perform a QR decomposition of a matrix.

   In LAPACK, two parts of the QR decomposition are implemented as two separate
   functions: geqrf and orgqr. For flexibility and efficiency, these are wrapped
   directly, below - but to make the common usage convenient, we also provide
   this function, which calls them both and returns the results in a more
   intuitive form.

   Args:
   * `rq_` - result Tensor in which to store the Q part of the decomposition.
   * `rr_` - result Tensor in which to store the R part of the decomposition.
   * `a`   - input Tensor; the matrix to decompose.

 */
 void THTensor_(qr)(THTensor *rq_, THTensor *rr_, THTensor *a)
 {
   int m = a->size[0];
   int n = a->size[1];
   int k = (m < n ? m : n);
   THTensor *ra_ = THTensor_(new)();
   THTensor *rtau_ = THTensor_(new)();
   THTensor *rr__ = THTensor_(new)();
   THTensor_(geqrf)(ra_, rtau_, a);
   THTensor_(resize2d)(rr__, k, ra_->size[1]);
   THTensor_(narrow)(rr__, ra_, 0, 0, k);
   THTensor_(triu)(rr_, rr__, 0);
   THTensor_(resize2d)(rq_, ra_->size[0], k);
   THTensor_(orgqr)(rq_, ra_, rtau_);
   THTensor_(narrow)(rq_, rq_, 1, 0, k);
   THTensor_(free)(ra_);
   THTensor_(free)(rtau_);
   THTensor_(free)(rr__);
 }

 /*
   The geqrf function does the main work of QR-decomposing a matrix.
   However, rather than producing a Q matrix directly, it produces a sequence of
   elementary reflectors which may later be composed to construct Q - for example
   with the orgqr function, below.

   Args:
   * `ra_`   - Result matrix which will contain:
               i)  The elements of R, on and above the diagonal.
               ii) Directions of the reflectors implicitly defining Q.
   * `rtau_` - Result tensor which will contain the magnitudes of the reflectors
               implicitly defining Q.
   * `a`     - Input matrix, to decompose. If NULL, `ra_` is used as input.

   For further details, please see the LAPACK documentation.

 */
 void THTensor_(geqrf)(THTensor *ra_, THTensor *rtau_, THTensor *a)
 {
   if (a == NULL) ra_ = a;
   THArgCheck(a->nDimension == 2, 1, "A should be 2 dimensional");

   THTensor *ra__ = NULL;

   /* Prepare the input for LAPACK, making a copy if necessary. */
   ra__ = THTensor_(cloneColumnMajor)(ra_, a);

   int m = ra__->size[0];
   int n = ra__->size[1];
   int k = (m < n ? m : n);
   int lda = m;
   THTensor_(resize1d)(rtau_, k);

   /* Dry-run to query the suggested size of the workspace. */
   int info = 0;
   real wkopt = 0;
   THLapack_(geqrf)(m, n, THTensor_(data)(ra__), lda,
                    THTensor_(data)(rtau_),
                    &wkopt, -1, &info);

   /* Allocate the workspace and call LAPACK to do the real work. */
   int lwork = (int)wkopt;
   THTensor *work = THTensor_(newWithSize1d)(lwork);
   THLapack_(geqrf)(m, n, THTensor_(data)(ra__), lda,
                    THTensor_(data)(rtau_),
                    THTensor_(data)(work), lwork, &info);

   THLapackCheckWithCleanup("Lapack Error %s : unknown Lapack error. info = %i",
                            THCleanup(
                                THTensor_(free)(ra__);
                                THTensor_(free)(work);),
                            "geqrf", info,"");

   THTensor_(freeCopyTo)(ra__, ra_);
   THTensor_(free)(work);
 }

 /*
   The orgqr function allows reconstruction of a matrix Q with orthogonal
   columns, from a sequence of elementary reflectors, such as is produced by the
   geqrf function.

   Args:
   * `ra_` - result Tensor, which will contain the matrix Q.
   * `a`   - input Tensor, which should be a matrix with the directions of the
             elementary reflectors below the diagonal. If NULL, `ra_` is used as
             input.
   * `tau` - input Tensor, containing the magnitudes of the elementary
             reflectors.

   For further details, please see the LAPACK documentation.

 */
 void THTensor_(orgqr)(THTensor *ra_, THTensor *a, THTensor *tau)
 {
   if (a == NULL) a = ra_;
   THArgCheck(a->nDimension == 2, 1, "A should be 2 dimensional");

   THTensor *ra__ = NULL;
   ra__ = THTensor_(cloneColumnMajor)(ra_, a);

   int m = ra__->size[0];
   int n = ra__->size[1];
   int k = tau->size[0];
   int lda = m;

   /* Dry-run to query the suggested size of the workspace. */
   int info = 0;
   real wkopt = 0;
   THLapack_(orgqr)(m, k, k, THTensor_(data)(ra__), lda,
                    THTensor_(data)(tau),
                    &wkopt, -1, &info);

   /* Allocate the workspace and call LAPACK to do the real work. */
   int lwork = (int)wkopt;
   THTensor *work = THTensor_(newWithSize1d)(lwork);
   THLapack_(orgqr)(m, k, k, THTensor_(data)(ra__), lda,
                    THTensor_(data)(tau),
                    THTensor_(data)(work), lwork, &info);

   THLapackCheckWithCleanup(" Lapack Error %s : unknown Lapack error. info = %i",
                            THCleanup(
                                THTensor_(free)(ra__);
                                THTensor_(free)(work);),
                            "orgqr", info,"");
   THTensor_(freeCopyTo)(ra__, ra_);
   THTensor_(free)(work);
 }

 /*
   The ormqr function multiplies Q with another matrix from a sequence of
   elementary reflectors, such as is produced by the geqrf function.

   Args:
   * `ra_`   - result Tensor, which will contain the matrix Q' c.
   * `a`     - input Tensor, which should be a matrix with the directions of the
               elementary reflectors below the diagonal. If NULL, `ra_` is used as
               input.
   * `tau`   - input Tensor, containing the magnitudes of the elementary
               reflectors.
   * `c`     - input Tensor, containing the matrix to be multiplied.
   * `side`  - char, determining whether c is left- or right-multiplied with Q.
   * `trans` - char, determining whether to transpose Q before multiplying.

   For further details, please see the LAPACK documentation.

 */
 void THTensor_(ormqr)(THTensor *ra_, THTensor *a, THTensor *tau, THTensor *c, const char *side, const char *trans)
 {
   if (a == NULL) a = ra_;
   THArgCheck(a->nDimension == 2, 1, "A should be 2 dimensional");

   THTensor *ra__ = NULL;
   ra__ = THTensor_(cloneColumnMajor)(ra_, c);

   int m = c->size[0];
   int n = c->size[1];
   int k = tau->size[0];
   int lda;
   if (*side == 'L')
   {
     lda = m;
   }
   else
   {
     lda = n;
   }
   int ldc = m;

   /* Dry-run to query the suggested size of the workspace. */
   int info = 0;
   real wkopt = 0;
   THLapack_(ormqr)(side[0], trans[0], m, n, k, THTensor_(data)(a), lda,
                    THTensor_(data)(tau), THTensor_(data)(ra__), ldc,
                    &wkopt, -1, &info);

   /* Allocate the workspace and call LAPACK to do the real work. */
   int lwork = (int)wkopt;
   THTensor *work = THTensor_(newWithSize1d)(lwork);
   THLapack_(ormqr)(side[0], trans[0], m, n, k, THTensor_(data)(a), lda,
                    THTensor_(data)(tau), THTensor_(data)(ra__), ldc,
                    THTensor_(data)(work), lwork, &info);

   THLapackCheckWithCleanup(" Lapack Error %s : unknown Lapack error. info = %i",
                            THCleanup(
                                THTensor_(free)(ra__);
                                THTensor_(free)(work);),
                            "ormqr", info,"");
   THTensor_(freeCopyTo)(ra__, ra_);
   THTensor_(free)(work);
 }

 void THTensor_(btrifact)(THTensor *ra_, THIntTensor *rpivots_, THIntTensor *rinfo_, THTensor *a)
 {
   THArgCheck(THTensor_(nDimension)(a) == 3, 1, "expected 3D tensor, got %dD", THTensor_(nDimension)(a));

   if (ra_ != a) {
     THTensor_(resizeAs)(ra_, a);
     THTensor_(copy)(ra_, a);
   }

   int m = a->size[1];
   int n = a->size[2];
   if (m != n) {
     THError("btrifact is only implemented for square matrices");
   }
   long num_batches = THTensor_(size)(a, 0);
   THTensor *ra__;
   int lda;

   if (ra_->stride[1] == 1) {
     // column ordered, what BLAS wants
     lda = ra_->stride[2];
     ra__ = ra_;
   } else {
     // not column ordered, need to make it such (requires copy)
     THTensor *transp_r_ = THTensor_(newTranspose)(ra_, 1, 2);
     ra__ = THTensor_(newClone)(transp_r_);
     THTensor_(free)(transp_r_);
     THTensor_(transpose)(ra__, NULL, 1, 2);
     lda = ra__->stride[2];
   }

   THTensor *ai = THTensor_(new)();
   THTensor *rai = THTensor_(new)();
   THIntTensor *rpivoti = THIntTensor_new();

   int info = 0;
   int *info_ptr = &info;
   if (rinfo_) {
     THIntTensor_resize1d(rinfo_, num_batches);
     info_ptr = THIntTensor_data(rinfo_);
   }

   THIntTensor_resize2d(rpivots_, num_batches, n);

   long batch = 0;
   for (; batch < num_batches; ++batch) {
     THTensor_(select)(ai, a, 0, batch);
     THTensor_(select)(rai, ra__, 0, batch);
     THIntTensor_select(rpivoti, rpivots_, 0, batch);

     THLapack_(getrf)(n, n, THTensor_(data)(rai), lda,
                      THIntTensor_data(rpivoti), info_ptr);
     if (rinfo_) {
       info_ptr++;
     } else if (info != 0) {
       break;
     }
   }

   THTensor_(free)(ai);
   THTensor_(free)(rai);
   THIntTensor_free(rpivoti);

   if (ra__ != ra_) {
     THTensor_(freeCopyTo)(ra__, ra_);
   }

   if (!rinfo_ && info != 0) {
     THError("failed to factorize batch element %ld (info == %d)", batch, info);
   }
 }

 void THTensor_(btrisolve)(THTensor *rb_, THTensor *b, THTensor *atf, THIntTensor *pivots)
 {
   THArgCheck(THTensor_(nDimension)(atf) == 3, 1, "expected 3D tensor, got %dD", THTensor_(nDimension)(atf));

   int lda;
   THTensor *atf_;

   // correct ordering of A_a
   if (atf->stride[1] == 1) {
     // column ordered, what BLAS wants
     lda = atf->stride[2];
     atf_ = atf;
   } else {
     // not column ordered, need to make it such (requires copy)
     // it would be nice if we could use the op(A) flags to automatically
     // transpose A if needed, but this leads to unpredictable behavior if the
     // user clones A_tf later with a different ordering
     THTensor *transp_r_ = THTensor_(newTranspose)(atf, 1, 2);
     atf_ = THTensor_(newClone)(transp_r_);
     THTensor_(free)(transp_r_);
     THTensor_(transpose)(atf_, NULL, 1, 2);
     lda = atf_->stride[2];
   }

   if (rb_ != b) {
     THTensor_(resizeAs)(rb_, b);
     THTensor_(copy)(rb_, b);
   }

   long num_batches = atf->size[0];
   long n = atf->size[1];

   THTensor *ai = THTensor_(new)();
   THTensor *rbi = THTensor_(new)();
   THIntTensor *pivoti = THIntTensor_new();

   if (!THIntTensor_isContiguous(pivots)) {
       THError("Error: rpivots_ is not contiguous.");
   }

   for (long batch = 0; batch < num_batches; ++batch) {
     THTensor_(select)(ai, atf_, 0, batch);
     THTensor_(select)(rbi, rb_, 0, batch);
     THIntTensor_select(pivoti, pivots, 0, batch);

 #if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_FLOAT)
     int info;
     THLapack_(getrs)('N', n, 1, THTensor_(data)(ai), lda,
                      THIntTensor_data(pivoti), THTensor_(data)(rbi),
                      n, &info);
     if (info != 0) {
       THError("Error: Nonzero info.");
     }
 #else
     THError("Unimplemented");
 #endif
   }

   THTensor_(free)(ai);
   THTensor_(free)(rbi);
   THIntTensor_free(pivoti);

   if (atf_ != atf) {
     THTensor_(free)(atf_);
   }
 }


 #endif