#include <ATen/ATen.h>
#include <ATen/CPUApplyUtils.h>
#include <ATen/Dispatch.h>
#include <ATen/NativeFunctions.h>
#include <ATen/ExpandUtils.h>
#include <ATen/native/BatchLinearAlgebra.h>
#include <ATen/native/LinearAlgebraUtils.h>
#include <ATen/native/Resize.h>
#include <ATen/native/cpu/zmath.h>
#include <ATen/Parallel.h>
#include <c10/util/irange.h>
#include <vector>
// First the required LAPACK implementations are registered here.
// A comment above each registered LAPACK routine indicates which batched
// linear algebra function uses that routine.
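// Note on the declarations below: the reference LAPACK interface uses the Fortran calling
// convention, so every argument (including scalar sizes and leading dimensions) is passed
// by pointer, matrices are expected in column-major order, and the symbol names carry a
// trailing underscore.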
#if AT_BUILD_WITH_LAPACK()
// gesv
extern "C" void zgesv_(int *n, int *nrhs, std::complex<double> *a, int *lda, int *ipiv, std::complex<double> *b, int *ldb, int *info);
extern "C" void cgesv_(int *n, int *nrhs, std::complex<float> *a, int *lda, int *ipiv, std::complex<float> *b, int *ldb, int *info);
extern "C" void dgesv_(int *n, int *nrhs, double *a, int *lda, int *ipiv, double *b, int *ldb, int *info);
extern "C" void sgesv_(int *n, int *nrhs, float *a, int *lda, int *ipiv, float *b, int *ldb, int *info);
// getrf
extern "C" void zgetrf_(int *m, int *n, std::complex<double> *a, int *lda, int *ipiv, int *info);
extern "C" void cgetrf_(int *m, int *n, std::complex<float> *a, int *lda, int *ipiv, int *info);
extern "C" void dgetrf_(int *m, int *n, double *a, int *lda, int *ipiv, int *info);
extern "C" void sgetrf_(int *m, int *n, float *a, int *lda, int *ipiv, int *info);
// getri
extern "C" void zgetri_(int *n, std::complex<double> *a, int *lda, int *ipiv, std::complex<double> *work, int *lwork, int *info);
extern "C" void cgetri_(int *n, std::complex<float> *a, int *lda, int *ipiv, std::complex<float> *work, int *lwork, int *info);
extern "C" void dgetri_(int *n, double *a, int *lda, int *ipiv, double *work, int *lwork, int *info);
extern "C" void sgetri_(int *n, float *a, int *lda, int *ipiv, float *work, int *lwork, int *info);
// potrs
extern "C" void zpotrs_(char *uplo, int *n, int *nrhs, std::complex<double> *a, int *lda, std::complex<double> *b, int *ldb, int *info);
extern "C" void cpotrs_(char *uplo, int *n, int *nrhs, std::complex<float> *a, int *lda, std::complex<float> *b, int *ldb, int *info);
extern "C" void dpotrs_(char *uplo, int *n, int *nrhs, double *a, int *lda, double *b, int *ldb, int *info);
extern "C" void spotrs_(char *uplo, int *n, int *nrhs, float *a, int *lda, float *b, int *ldb, int *info);
// potrf
extern "C" void zpotrf_(char *uplo, int *n, std::complex<double> *a, int *lda, int *info);
extern "C" void cpotrf_(char *uplo, int *n, std::complex<float> *a, int *lda, int *info);
extern "C" void dpotrf_(char *uplo, int *n, double *a, int *lda, int *info);
extern "C" void spotrf_(char *uplo, int *n, float *a, int *lda, int *info);
// potri
extern "C" void zpotri_(char *uplo, int *n, std::complex<double> *a, int *lda, int *info);
extern "C" void cpotri_(char *uplo, int *n, std::complex<float> *a, int *lda, int *info);
extern "C" void dpotri_(char *uplo, int *n, double *a, int *lda, int *info);
extern "C" void spotri_(char *uplo, int *n, float *a, int *lda, int *info);
// trtrs
extern "C" void ztrtrs_(char *uplo, char *trans, char *diag, int *n, int *nrhs, std::complex<double> *a, int *lda, std::complex<double> *b, int *ldb, int *info);
extern "C" void ctrtrs_(char *uplo, char *trans, char *diag, int *n, int *nrhs, std::complex<float> *a, int *lda, std::complex<float> *b, int *ldb, int *info);
extern "C" void dtrtrs_(char *uplo, char *trans, char *diag, int *n, int *nrhs, double *a, int *lda, double *b, int *ldb, int *info);
extern "C" void strtrs_(char *uplo, char *trans, char *diag, int *n, int *nrhs, float *a, int *lda, float *b, int *ldb, int *info);
// geqrf
extern "C" void zgeqrf_(int *m, int *n, std::complex<double> *a, int *lda, std::complex<double> *tau, std::complex<double> *work, int *lwork, int *info);
extern "C" void cgeqrf_(int *m, int *n, std::complex<float> *a, int *lda, std::complex<float> *tau, std::complex<float> *work, int *lwork, int *info);
extern "C" void dgeqrf_(int *m, int *n, double *a, int *lda, double *tau, double *work, int *lwork, int *info);
extern "C" void sgeqrf_(int *m, int *n, float *a, int *lda, float *tau, float *work, int *lwork, int *info);
// orgqr
extern "C" void zungqr_(int *m, int *n, int *k, std::complex<double> *a, int *lda, std::complex<double> *tau, std::complex<double> *work, int *lwork, int *info);
extern "C" void cungqr_(int *m, int *n, int *k, std::complex<float> *a, int *lda, std::complex<float> *tau, std::complex<float> *work, int *lwork, int *info);
extern "C" void dorgqr_(int *m, int *n, int *k, double *a, int *lda, double *tau, double *work, int *lwork, int *info);
extern "C" void sorgqr_(int *m, int *n, int *k, float *a, int *lda, float *tau, float *work, int *lwork, int *info);
// ormqr
extern "C" void zunmqr_(char *side, char *trans, int *m, int *n, int *k, std::complex<double> *a, int *lda, std::complex<double> *tau, std::complex<double> *c, int *ldc, std::complex<double> *work, int *lwork, int *info);
extern "C" void cunmqr_(char *side, char *trans, int *m, int *n, int *k, std::complex<float> *a, int *lda, std::complex<float> *tau, std::complex<float> *c, int *ldc, std::complex<float> *work, int *lwork, int *info);
extern "C" void dormqr_(char *side, char *trans, int *m, int *n, int *k, double *a, int *lda, double *tau, double *c, int *ldc, double *work, int *lwork, int *info);
extern "C" void sormqr_(char *side, char *trans, int *m, int *n, int *k, float *a, int *lda, float *tau, float *c, int *ldc, float *work, int *lwork, int *info);
// syev
extern "C" void zheev_(char *jobz, char *uplo, int *n, std::complex<double> *a, int *lda, double *w, std::complex<double> *work, int *lwork, double *rwork, int *info);
extern "C" void cheev_(char *jobz, char *uplo, int *n, std::complex<float> *a, int *lda, float *w, std::complex<float> *work, int *lwork, float *rwork, int *info);
extern "C" void dsyev_(char *jobz, char *uplo, int *n, double *a, int *lda, double *w, double *work, int *lwork, int *info);
extern "C" void ssyev_(char *jobz, char *uplo, int *n, float *a, int *lda, float *w, float *work, int *lwork, int *info);
// syevd
extern "C" void zheevd_(char *jobz, char *uplo, int *n, std::complex<double> *a, int *lda, double *w, std::complex<double> *work, int *lwork, double *rwork, int *lrwork, int *iwork, int *liwork, int *info);
extern "C" void cheevd_(char *jobz, char *uplo, int *n, std::complex<float> *a, int *lda, float *w, std::complex<float> *work, int *lwork, float *rwork, int *lrwork, int *iwork, int *liwork, int *info);
extern "C" void dsyevd_(char *jobz, char *uplo, int *n, double *a, int *lda, double *w, double *work, int *lwork, int *iwork, int *liwork, int *info);
extern "C" void ssyevd_(char *jobz, char *uplo, int *n, float *a, int *lda, float *w, float *work, int *lwork, int *iwork, int *liwork, int *info);
// geev
extern "C" void dgeev_(char *jobvl, char *jobvr, int *n, double *a, int *lda, double *wr, double *wi, double* vl, int *ldvl, double *vr, int *ldvr, double *work, int *lwork, int *info);
extern "C" void sgeev_(char *jobvl, char *jobvr, int *n, float *a, int *lda, float *wr, float *wi, float* vl, int *ldvl, float *vr, int *ldvr, float *work, int *lwork, int *info);
extern "C" void cgeev_(char *jobvl, char *jobvr, int *n,
std::complex<float> *a, int *lda,
std::complex<float> *w,
std::complex<float> *vl, int *ldvl,
std::complex<float> *vr, int *ldvr,
std::complex<float> *work, int *lwork,
float *rwork,
int *info);
extern "C" void zgeev_(char *jobvl, char *jobvr, int *n,
std::complex<double> *a, int *lda,
std::complex<double> *w,
std::complex<double> *vl, int *ldvl,
std::complex<double> *vr, int *ldvr,
std::complex<double> *work, int *lwork,
double *rwork,
int *info);
// gesdd
extern "C" void zgesdd_(char *jobz, int *m, int *n, std::complex<double> *a, int *lda,
double *s, std::complex<double> *u, int *ldu, std::complex<double> *vt, int *ldvt, std::complex<double> *work, int *lwork, double *rwork, int *iwork, int *info);
extern "C" void cgesdd_(char *jobz, int *m, int *n, std::complex<float> *a, int *lda,
float *s, std::complex<float> *u, int *ldu, std::complex<float> *vt, int *ldvt, std::complex<float> *work, int *lwork, float *rwork, int *iwork, int *info);
extern "C" void dgesdd_(char *jobz, int *m, int *n, double *a, int *lda,
double *s, double *u, int *ldu, double *vt, int *ldvt, double *work, int *lwork, int *iwork, int *info);
extern "C" void sgesdd_(char *jobz, int *m, int *n, float *a, int *lda,
float *s, float *u, int *ldu, float *vt, int *ldvt, float *work, int *lwork, int *iwork, int *info);
// getrs
extern "C" void zgetrs_(char *trans, int *n, int *nrhs, std::complex<double> *a, int *lda, int *ipiv, std::complex<double> *b, int *ldb, int *info);
extern "C" void cgetrs_(char *trans, int *n, int *nrhs, std::complex<float> *a, int *lda, int *ipiv, std::complex<float> *b, int *ldb, int *info);
extern "C" void dgetrs_(char *trans, int *n, int *nrhs, double *a, int *lda, int *ipiv, double *b, int *ldb, int *info);
extern "C" void sgetrs_(char *trans, int *n, int *nrhs, float *a, int *lda, int *ipiv, float *b, int *ldb, int *info);
// gels
extern "C" void zgels_(char *trans, int *m, int *n, int *nrhs,
std::complex<double> *a, int *lda, std::complex<double> *b, int *ldb,
std::complex<double> *work, int *lwork, int *info);
extern "C" void cgels_(char *trans, int *m, int *n, int *nrhs,
std::complex<float> *a, int *lda, std::complex<float> *b, int *ldb,
std::complex<float> *work, int *lwork, int *info);
extern "C" void dgels_(char *trans, int *m, int *n, int *nrhs,
double *a, int *lda, double *b, int *ldb,
double *work, int *lwork, int *info);
extern "C" void sgels_(char *trans, int *m, int *n, int *nrhs,
float *a, int *lda, float *b, int *ldb,
float *work, int *lwork, int *info);
// gelsd
extern "C" void zgelsd_(int *m, int *n, int *nrhs,
std::complex<double> *a, int *lda, std::complex<double> *b, int *ldb,
double *s, double *rcond, int *rank,
std::complex<double> *work, int *lwork, double *rwork, int *iwork, int *info);
extern "C" void cgelsd_(int *m, int *n, int *nrhs,
std::complex<float> *a, int *lda, std::complex<float> *b, int *ldb,
float *s, float *rcond, int *rank,
std::complex<float> *work, int *lwork, float *rwork, int *iwork, int *info);
extern "C" void dgelsd_(int *m, int *n, int *nrhs,
double *a, int *lda, double *b, int *ldb,
double *s, double *rcond, int *rank,
double *work, int *lwork, int *iwork, int *info);
extern "C" void sgelsd_(int *m, int *n, int *nrhs,
float *a, int *lda, float *b, int *ldb,
float *s, float *rcond, int *rank,
float *work, int *lwork, int *iwork, int *info);
// gelsy
extern "C" void zgelsy_(int *m, int *n, int *nrhs,
std::complex<double> *a, int *lda, std::complex<double> *b, int *ldb,
int *jpvt, double *rcond, int *rank,
std::complex<double> *work, int *lwork,
double *rwork, int *info);
extern "C" void cgelsy_(int *m, int *n, int *nrhs,
std::complex<float> * a, int *lda, std::complex<float> *b, int *ldb,
int *jpvt, float *rcond, int *rank,
std::complex<float> *work, int *lwork,
float *rwork, int *info);
extern "C" void dgelsy_(int *m, int *n, int *nrhs,
double *a, int *lda, double *b, int *ldb,
int *jpvt, double *rcond, int *rank,
double *work, int *lwork, int *info);
extern "C" void sgelsy_(int *m, int *n, int *nrhs,
float *a, int *lda, float *b, int *ldb,
int *jpvt, float *rcond, int *rank,
float *work, int *lwork, int *info);
// gelss
extern "C" void zgelss_(int *m, int *n, int *nrhs,
std::complex<double> *a, int *lda, std::complex<double> *b, int *ldb,
double *s, double *rcond, int *rank,
std::complex<double> *work, int *lwork,
double *rwork, int *info);
extern "C" void cgelss_(int *m, int *n, int *nrhs,
std::complex<float> *a, int *lda, std::complex<float> *b, int *ldb,
float *s, float *rcond, int *rank,
std::complex<float> *work, int *lwork,
float *rwork, int *info);
extern "C" void dgelss_(int *m, int *n, int *nrhs,
double *a, int *lda, double *b, int *ldb,
double *s, double *rcond, int *rank,
double *work, int *lwork, int *info);
extern "C" void sgelss_(int *m, int *n, int *nrhs,
float *a, int *lda, float *b, int *ldb,
float *s, float *rcond, int *rank,
float *work, int *lwork, int *info);
#endif
namespace at {
namespace native {
#if AT_BUILD_WITH_LAPACK()
// Define the per-batch functions to be used in the main implementation of the batched
// linear algebra operations
template<class scalar_t>
void lapackSolve(int n, int nrhs, scalar_t *a, int lda, int *ipiv, scalar_t *b, int ldb, int *info);
template<class scalar_t>
void lapackGetri(int n, scalar_t *a, int lda, int *ipiv, scalar_t *work, int lwork, int *info);
template<class scalar_t>
void lapackCholeskySolve(char uplo, int n, int nrhs, scalar_t *a, int lda, scalar_t *b, int ldb, int *info);
template<class scalar_t, class value_t=scalar_t>
void lapackSymeig(char jobz, char uplo, int n, scalar_t *a, int lda, value_t *w, scalar_t *work, int lwork, value_t *rwork, int *info);
template<class scalar_t, class value_t=scalar_t>
void lapackSvd(char jobz, int m, int n, scalar_t *a, int lda,
value_t *s, scalar_t *u, int ldu, scalar_t *vt, int ldvt, scalar_t *work, int lwork, value_t *rwork, int *iwork, int *info);
template<> void lapackSolve<c10::complex<double>>(int n, int nrhs, c10::complex<double> *a, int lda, int *ipiv, c10::complex<double> *b, int ldb, int *info) {
zgesv_(&n, &nrhs, reinterpret_cast<std::complex<double>*>(a), &lda, ipiv, reinterpret_cast<std::complex<double>*>(b), &ldb, info);
}
template<> void lapackSolve<c10::complex<float>>(int n, int nrhs, c10::complex<float> *a, int lda, int *ipiv, c10::complex<float> *b, int ldb, int *info) {
cgesv_(&n, &nrhs, reinterpret_cast<std::complex<float>*>(a), &lda, ipiv, reinterpret_cast<std::complex<float>*>(b), &ldb, info);
}
template<> void lapackSolve<double>(int n, int nrhs, double *a, int lda, int *ipiv, double *b, int ldb, int *info) {
dgesv_(&n, &nrhs, a, &lda, ipiv, b, &ldb, info);
}
template<> void lapackSolve<float>(int n, int nrhs, float *a, int lda, int *ipiv, float *b, int ldb, int *info) {
sgesv_(&n, &nrhs, a, &lda, ipiv, b, &ldb, info);
}
template<> void lapackGetri<c10::complex<double>>(int n, c10::complex<double> *a, int lda, int *ipiv, c10::complex<double> *work, int lwork, int *info) {
zgetri_(&n, reinterpret_cast<std::complex<double>*>(a), &lda, ipiv, reinterpret_cast<std::complex<double>*>(work), &lwork, info);
}
template<> void lapackGetri<c10::complex<float>>(int n, c10::complex<float> *a, int lda, int *ipiv, c10::complex<float> *work, int lwork, int *info) {
cgetri_(&n, reinterpret_cast<std::complex<float>*>(a), &lda, ipiv, reinterpret_cast<std::complex<float>*>(work), &lwork, info);
}
template<> void lapackGetri<double>(int n, double *a, int lda, int *ipiv, double *work, int lwork, int *info) {
dgetri_(&n, a, &lda, ipiv, work, &lwork, info);
}
template<> void lapackGetri<float>(int n, float *a, int lda, int *ipiv, float *work, int lwork, int *info) {
sgetri_(&n, a, &lda, ipiv, work, &lwork, info);
}
template<> void lapackLu<c10::complex<double>>(int m, int n, c10::complex<double> *a, int lda, int *ipiv, int *info) {
zgetrf_(&m, &n, reinterpret_cast<std::complex<double>*>(a), &lda, ipiv, info);
}
template<> void lapackLu<c10::complex<float>>(int m, int n, c10::complex<float> *a, int lda, int *ipiv, int *info) {
cgetrf_(&m, &n, reinterpret_cast<std::complex<float>*>(a), &lda, ipiv, info);
}
template<> void lapackLu<double>(int m, int n, double *a, int lda, int *ipiv, int *info) {
dgetrf_(&m, &n, a, &lda, ipiv, info);
}
template<> void lapackLu<float>(int m, int n, float *a, int lda, int *ipiv, int *info) {
sgetrf_(&m, &n, a, &lda, ipiv, info);
}
template<> void lapackCholeskySolve<c10::complex<double>>(char uplo, int n, int nrhs, c10::complex<double> *a, int lda, c10::complex<double> *b, int ldb, int *info) {
zpotrs_(&uplo, &n, &nrhs, reinterpret_cast<std::complex<double>*>(a), &lda, reinterpret_cast<std::complex<double>*>(b), &ldb, info);
}
template<> void lapackCholeskySolve<c10::complex<float>>(char uplo, int n, int nrhs, c10::complex<float> *a, int lda, c10::complex<float> *b, int ldb, int *info) {
cpotrs_(&uplo, &n, &nrhs, reinterpret_cast<std::complex<float>*>(a), &lda, reinterpret_cast<std::complex<float>*>(b), &ldb, info);
}
template<> void lapackCholeskySolve<double>(char uplo, int n, int nrhs, double *a, int lda, double *b, int ldb, int *info) {
dpotrs_(&uplo, &n, &nrhs, a, &lda, b, &ldb, info);
}
template<> void lapackCholeskySolve<float>(char uplo, int n, int nrhs, float *a, int lda, float *b, int ldb, int *info) {
spotrs_(&uplo, &n, &nrhs, a, &lda, b, &ldb, info);
}
template<> void lapackCholesky<c10::complex<double>>(char uplo, int n, c10::complex<double> *a, int lda, int *info) {
zpotrf_(&uplo, &n, reinterpret_cast<std::complex<double>*>(a), &lda, info);
}
template<> void lapackCholesky<c10::complex<float>>(char uplo, int n, c10::complex<float> *a, int lda, int *info) {
cpotrf_(&uplo, &n, reinterpret_cast<std::complex<float>*>(a), &lda, info);
}
template<> void lapackCholesky<double>(char uplo, int n, double *a, int lda, int *info) {
dpotrf_(&uplo, &n, a, &lda, info);
}
template<> void lapackCholesky<float>(char uplo, int n, float *a, int lda, int *info) {
spotrf_(&uplo, &n, a, &lda, info);
}
template<> void lapackCholeskyInverse<c10::complex<double>>(char uplo, int n, c10::complex<double> *a, int lda, int *info) {
zpotri_(&uplo, &n, reinterpret_cast<std::complex<double>*>(a), &lda, info);
}
template<> void lapackCholeskyInverse<c10::complex<float>>(char uplo, int n, c10::complex<float> *a, int lda, int *info) {
cpotri_(&uplo, &n, reinterpret_cast<std::complex<float>*>(a), &lda, info);
}
template<> void lapackCholeskyInverse<double>(char uplo, int n, double *a, int lda, int *info) {
dpotri_(&uplo, &n, a, &lda, info);
}
template<> void lapackCholeskyInverse<float>(char uplo, int n, float *a, int lda, int *info) {
spotri_(&uplo, &n, a, &lda, info);
}
template<> void lapackTriangularSolve<c10::complex<double>>(char uplo, char trans, char diag, int n, int nrhs, c10::complex<double> *a, int lda, c10::complex<double> *b, int ldb, int *info) {
ztrtrs_(&uplo, &trans, &diag, &n, &nrhs, reinterpret_cast<std::complex<double>*>(a), &lda, reinterpret_cast<std::complex<double>*>(b), &ldb, info);
}
template<> void lapackTriangularSolve<c10::complex<float>>(char uplo, char trans, char diag, int n, int nrhs, c10::complex<float> *a, int lda, c10::complex<float> *b, int ldb, int *info) {
ctrtrs_(&uplo, &trans, &diag, &n, &nrhs, reinterpret_cast<std::complex<float>*>(a), &lda, reinterpret_cast<std::complex<float>*>(b), &ldb, info);
}
template<> void lapackTriangularSolve<double>(char uplo, char trans, char diag, int n, int nrhs, double *a, int lda, double *b, int ldb, int *info) {
dtrtrs_(&uplo, &trans, &diag, &n, &nrhs, a, &lda, b, &ldb, info);
}
template<> void lapackTriangularSolve<float>(char uplo, char trans, char diag, int n, int nrhs, float *a, int lda, float *b, int ldb, int *info) {
strtrs_(&uplo, &trans, &diag, &n, &nrhs, a, &lda, b, &ldb, info);
}
template<> void lapackGeqrf<c10::complex<double>>(int m, int n, c10::complex<double> *a, int lda, c10::complex<double> *tau, c10::complex<double> *work, int lwork, int *info) {
zgeqrf_(&m, &n, reinterpret_cast<std::complex<double>*>(a), &lda, reinterpret_cast<std::complex<double>*>(tau), reinterpret_cast<std::complex<double>*>(work), &lwork, info);
}
template<> void lapackGeqrf<c10::complex<float>>(int m, int n, c10::complex<float> *a, int lda, c10::complex<float> *tau, c10::complex<float> *work, int lwork, int *info) {
cgeqrf_(&m, &n, reinterpret_cast<std::complex<float>*>(a), &lda, reinterpret_cast<std::complex<float>*>(tau), reinterpret_cast<std::complex<float>*>(work), &lwork, info);
}
template<> void lapackGeqrf<double>(int m, int n, double *a, int lda, double *tau, double *work, int lwork, int *info) {
dgeqrf_(&m, &n, a, &lda, tau, work, &lwork, info);
}
template<> void lapackGeqrf<float>(int m, int n, float *a, int lda, float *tau, float *work, int lwork, int *info) {
sgeqrf_(&m, &n, a, &lda, tau, work, &lwork, info);
}
template<> void lapackOrgqr<c10::complex<double>>(int m, int n, int k, c10::complex<double> *a, int lda, c10::complex<double> *tau, c10::complex<double> *work, int lwork, int *info) {
zungqr_(&m, &n, &k, reinterpret_cast<std::complex<double>*>(a), &lda, reinterpret_cast<std::complex<double>*>(tau), reinterpret_cast<std::complex<double>*>(work), &lwork, info);
}
template<> void lapackOrgqr<c10::complex<float>>(int m, int n, int k, c10::complex<float> *a, int lda, c10::complex<float> *tau, c10::complex<float> *work, int lwork, int *info) {
cungqr_(&m, &n, &k, reinterpret_cast<std::complex<float>*>(a), &lda, reinterpret_cast<std::complex<float>*>(tau), reinterpret_cast<std::complex<float>*>(work), &lwork, info);
}
template<> void lapackOrgqr<double>(int m, int n, int k, double *a, int lda, double *tau, double *work, int lwork, int *info) {
dorgqr_(&m, &n, &k, a, &lda, tau, work, &lwork, info);
}
template<> void lapackOrgqr<float>(int m, int n, int k, float *a, int lda, float *tau, float *work, int lwork, int *info) {
sorgqr_(&m, &n, &k, a, &lda, tau, work, &lwork, info);
}
template<> void lapackOrmqr<c10::complex<double>>(char side, char trans, int m, int n, int k, c10::complex<double> *a, int lda, c10::complex<double> *tau, c10::complex<double> *c, int ldc, c10::complex<double> *work, int lwork, int *info) {
zunmqr_(&side, &trans, &m, &n, &k, reinterpret_cast<std::complex<double>*>(a), &lda, reinterpret_cast<std::complex<double>*>(tau), reinterpret_cast<std::complex<double>*>(c), &ldc, reinterpret_cast<std::complex<double>*>(work), &lwork, info);
}
template<> void lapackOrmqr<c10::complex<float>>(char side, char trans, int m, int n, int k, c10::complex<float> *a, int lda, c10::complex<float> *tau, c10::complex<float> *c, int ldc, c10::complex<float> *work, int lwork, int *info) {
cunmqr_(&side, &trans, &m, &n, &k, reinterpret_cast<std::complex<float>*>(a), &lda, reinterpret_cast<std::complex<float>*>(tau), reinterpret_cast<std::complex<float>*>(c), &ldc, reinterpret_cast<std::complex<float>*>(work), &lwork, info);
}
template<> void lapackOrmqr<double>(char side, char trans, int m, int n, int k, double *a, int lda, double *tau, double *c, int ldc, double *work, int lwork, int *info) {
dormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, &lwork, info);
}
template<> void lapackOrmqr<float>(char side, char trans, int m, int n, int k, float *a, int lda, float *tau, float *c, int ldc, float *work, int lwork, int *info) {
sormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, &lwork, info);
}
template<> void lapackSymeig<c10::complex<double>, double>(char jobz, char uplo, int n, c10::complex<double> *a, int lda, double *w, c10::complex<double> *work, int lwork, double *rwork, int *info) {
zheev_(&jobz, &uplo, &n, reinterpret_cast<std::complex<double>*>(a), &lda, w, reinterpret_cast<std::complex<double>*>(work), &lwork, rwork, info);
}
template<> void lapackSymeig<c10::complex<float>, float>(char jobz, char uplo, int n, c10::complex<float> *a, int lda, float *w, c10::complex<float> *work, int lwork, float *rwork, int *info) {
cheev_(&jobz, &uplo, &n, reinterpret_cast<std::complex<float>*>(a), &lda, w, reinterpret_cast<std::complex<float>*>(work), &lwork, rwork, info);
}
template<> void lapackSymeig<double>(char jobz, char uplo, int n, double *a, int lda, double *w, double *work, int lwork, double* rwork, int *info) {
(void)rwork; // unused
dsyev_(&jobz, &uplo, &n, a, &lda, w, work, &lwork, info);
}
template<> void lapackSymeig<float>(char jobz, char uplo, int n, float *a, int lda, float *w, float *work, int lwork, float* rwork, int *info) {
(void)rwork; // unused
ssyev_(&jobz, &uplo, &n, a, &lda, w, work, &lwork, info);
}
template<> void lapackSyevd<c10::complex<double>, double>(char jobz, char uplo, int n, c10::complex<double> *a, int lda, double *w, c10::complex<double> *work, int lwork, double *rwork, int lrwork, int *iwork, int liwork, int *info) {
zheevd_(&jobz, &uplo, &n, reinterpret_cast<std::complex<double>*>(a), &lda, w, reinterpret_cast<std::complex<double>*>(work), &lwork, rwork, &lrwork, iwork, &liwork, info);
}
template<> void lapackSyevd<c10::complex<float>, float>(char jobz, char uplo, int n, c10::complex<float> *a, int lda, float *w, c10::complex<float> *work, int lwork, float *rwork, int lrwork, int *iwork, int liwork, int *info) {
cheevd_(&jobz, &uplo, &n, reinterpret_cast<std::complex<float>*>(a), &lda, w, reinterpret_cast<std::complex<float>*>(work), &lwork, rwork, &lrwork, iwork, &liwork, info);
}
template<> void lapackSyevd<double>(char jobz, char uplo, int n, double *a, int lda, double *w, double *work, int lwork, double *rwork, int lrwork, int *iwork, int liwork, int *info) {
(void)rwork; // unused
(void)lrwork; // unused
dsyevd_(&jobz, &uplo, &n, a, &lda, w, work, &lwork, iwork, &liwork, info);
}
template<> void lapackSyevd<float>(char jobz, char uplo, int n, float *a, int lda, float *w, float *work, int lwork, float *rwork, int lrwork, int *iwork, int liwork, int *info) {
(void)rwork; // unused
(void)lrwork; // unused
ssyevd_(&jobz, &uplo, &n, a, &lda, w, work, &lwork, iwork, &liwork, info);
}
template<> void lapackEig<double>(char jobvl, char jobvr, int n, double *a, int lda, double *w, double* vl, int ldvl, double *vr, int ldvr, double *work, int lwork, double *rwork, int *info) {
// lapack [sd]geev wants two separate output arrays: wr and wi for the real
// and imaginary parts
double *wr = w;
double *wi = w + n;
(void)rwork; // unused
dgeev_(&jobvl, &jobvr, &n, a, &lda, wr, wi, vl, &ldvl, vr, &ldvr, work, &lwork, info);
}
template<> void lapackEig<float>(char jobvl, char jobvr, int n, float *a, int lda, float *w, float* vl, int ldvl, float *vr, int ldvr, float *work, int lwork, float *rwork, int *info) {
// lapack [sd]geev wants two separate output arrays: wr and wi for the real
// and imaginary parts
float *wr = w;
float *wi = w + n;
(void)rwork; // unused
sgeev_(&jobvl, &jobvr, &n, a, &lda, wr, wi, vl, &ldvl, vr, &ldvr, work, &lwork, info);
}
template<> void lapackEig<c10::complex<double>, double>(char jobvl, char jobvr, int n, c10::complex<double> *a, int lda, c10::complex<double> *w, c10::complex<double> *vl, int ldvl, c10::complex<double> *vr, int ldvr, c10::complex<double> *work, int lwork, double *rwork, int *info) {
zgeev_(&jobvl, &jobvr, &n,
reinterpret_cast<std::complex<double>*>(a), &lda,
reinterpret_cast<std::complex<double>*>(w),
reinterpret_cast<std::complex<double>*>(vl), &ldvl,
reinterpret_cast<std::complex<double>*>(vr), &ldvr,
reinterpret_cast<std::complex<double>*>(work), &lwork,
rwork, info);
}
template<> void lapackEig<c10::complex<float>, float>(char jobvl, char jobvr, int n, c10::complex<float> *a, int lda, c10::complex<float> *w, c10::complex<float> *vl, int ldvl, c10::complex<float> *vr, int ldvr, c10::complex<float> *work, int lwork, float *rwork, int *info) {
cgeev_(&jobvl, &jobvr, &n,
reinterpret_cast<std::complex<float>*>(a), &lda,
reinterpret_cast<std::complex<float>*>(w),
reinterpret_cast<std::complex<float>*>(vl), &ldvl,
reinterpret_cast<std::complex<float>*>(vr), &ldvr,
reinterpret_cast<std::complex<float>*>(work), &lwork,
rwork, info);
}
template<> void lapackSvd<c10::complex<double>, double>(char jobz, int m, int n, c10::complex<double> *a, int lda,
double *s, c10::complex<double> *u, int ldu, c10::complex<double> *vt, int ldvt, c10::complex<double> *work, int lwork, double *rwork, int *iwork, int *info) {
zgesdd_(&jobz, &m, &n, reinterpret_cast<std::complex<double>*>(a), &lda, s, reinterpret_cast<std::complex<double>*>(u), &ldu,
reinterpret_cast<std::complex<double>*>(vt), &ldvt, reinterpret_cast<std::complex<double>*>(work), &lwork, rwork, iwork, info);
}
template<> void lapackSvd<c10::complex<float>, float>(char jobz, int m, int n, c10::complex<float> *a, int lda,
float *s, c10::complex<float> *u, int ldu, c10::complex<float> *vt, int ldvt, c10::complex<float> *work, int lwork, float *rwork, int *iwork, int *info) {
cgesdd_(&jobz, &m, &n, reinterpret_cast<std::complex<float>*>(a), &lda, s, reinterpret_cast<std::complex<float>*>(u), &ldu,
reinterpret_cast<std::complex<float>*>(vt), &ldvt, reinterpret_cast<std::complex<float>*>(work), &lwork, rwork, iwork, info);
}
template<> void lapackSvd<double>(char jobz, int m, int n, double *a, int lda,
double *s, double *u, int ldu, double *vt, int ldvt, double *work, int lwork, double *rwork, int *iwork, int *info) {
dgesdd_(&jobz, &m, &n, a, &lda, s, u, &ldu, vt, &ldvt, work, &lwork, iwork, info);
}
template<> void lapackSvd<float>(char jobz, int m, int n, float *a, int lda,
float *s, float *u, int ldu, float *vt, int ldvt, float *work, int lwork, float *rwork, int *iwork, int *info) {
sgesdd_(&jobz, &m, &n, a, &lda, s, u, &ldu, vt, &ldvt, work, &lwork, iwork, info);
}
template<> void lapackLuSolve<c10::complex<double>>(char trans, int n, int nrhs, c10::complex<double> *a, int lda, int *ipiv, c10::complex<double> *b, int ldb, int *info) {
zgetrs_(&trans, &n, &nrhs, reinterpret_cast<std::complex<double>*>(a), &lda, ipiv, reinterpret_cast<std::complex<double>*>(b), &ldb, info);
}
template<> void lapackLuSolve<c10::complex<float>>(char trans, int n, int nrhs, c10::complex<float> *a, int lda, int *ipiv, c10::complex<float> *b, int ldb, int *info) {
cgetrs_(&trans, &n, &nrhs, reinterpret_cast<std::complex<float>*>(a), &lda, ipiv, reinterpret_cast<std::complex<float>*>(b), &ldb, info);
}
template<> void lapackLuSolve<double>(char trans, int n, int nrhs, double *a, int lda, int *ipiv, double *b, int ldb, int *info) {
dgetrs_(&trans, &n, &nrhs, a, &lda, ipiv, b, &ldb, info);
}
template<> void lapackLuSolve<float>(char trans, int n, int nrhs, float *a, int lda, int *ipiv, float *b, int ldb, int *info) {
sgetrs_(&trans, &n, &nrhs, a, &lda, ipiv, b, &ldb, info);
}
template<> void lapackGels<c10::complex<double>>(
char trans, int m, int n, int nrhs,
c10::complex<double> *a, int lda, c10::complex<double> *b, int ldb,
c10::complex<double> *work, int lwork, int *info) {
zgels_(&trans, &m, &n, &nrhs,
reinterpret_cast<std::complex<double>*>(a), &lda,
reinterpret_cast<std::complex<double>*>(b), &ldb,
reinterpret_cast<std::complex<double>*>(work), &lwork, info);
}
template<> void lapackGels<c10::complex<float>>(
char trans, int m, int n, int nrhs,
c10::complex<float> *a, int lda, c10::complex<float> *b, int ldb,
c10::complex<float> *work, int lwork, int *info) {
cgels_(&trans, &m, &n, &nrhs,
reinterpret_cast<std::complex<float>*>(a), &lda,
reinterpret_cast<std::complex<float>*>(b), &ldb,
reinterpret_cast<std::complex<float>*>(work), &lwork, info);
}
template<> void lapackGels<double>(
char trans, int m, int n, int nrhs,
double *a, int lda, double *b, int ldb,
double *work, int lwork, int *info) {
dgels_(&trans, &m, &n, &nrhs,
a, &lda, b, &ldb, work, &lwork, info);
}
template<> void lapackGels<float>(
char trans, int m, int n, int nrhs,
float *a, int lda, float *b, int ldb,
float *work, int lwork, int *info) {
sgels_(&trans, &m, &n, &nrhs,
a, &lda, b, &ldb, work, &lwork, info);
}
template<> void lapackGelsd<c10::complex<double>, double>(
int m, int n, int nrhs,
c10::complex<double> *a, int lda, c10::complex<double> *b, int ldb,
double *s, double rcond, int *rank,
c10::complex<double> *work, int lwork,
double *rwork, int *iwork, int *info) {
zgelsd_(&m, &n, &nrhs,
reinterpret_cast<std::complex<double>*>(a), &lda,
reinterpret_cast<std::complex<double>*>(b), &ldb,
s, &rcond, rank,
reinterpret_cast<std::complex<double>*>(work), &lwork,
rwork, iwork, info);
}
template<> void lapackGelsd<c10::complex<float>, float>(
int m, int n, int nrhs,
c10::complex<float> *a, int lda, c10::complex<float> *b, int ldb,
float *s, float rcond, int *rank,
c10::complex<float> *work, int lwork,
float *rwork, int *iwork, int *info) {
cgelsd_(&m, &n, &nrhs,
reinterpret_cast<std::complex<float>*>(a), &lda,
reinterpret_cast<std::complex<float>*>(b), &ldb,
s, &rcond, rank,
reinterpret_cast<std::complex<float>*>(work), &lwork,
rwork, iwork, info);
}
template<> void lapackGelsd<double>(
int m, int n, int nrhs,
double *a, int lda, double *b, int ldb,
double *s, double rcond, int *rank,
double *work, int lwork,
double *rwork, int *iwork, int *info) {
dgelsd_(&m, &n, &nrhs,
a, &lda, b, &ldb,
s, &rcond, rank,
work, &lwork, iwork, info);
}
template<> void lapackGelsd<float>(
int m, int n, int nrhs,
float *a, int lda, float *b, int ldb,
float *s, float rcond, int *rank,
float *work, int lwork,
float *rwork, int *iwork, int *info) {
sgelsd_(&m, &n, &nrhs,
a, &lda, b, &ldb,
s, &rcond, rank,
work, &lwork, iwork, info);
}
template<> void lapackGelsy<c10::complex<double>, double>(
int m, int n, int nrhs,
c10::complex<double> *a, int lda, c10::complex<double> *b, int ldb,
int *jpvt, double rcond, int *rank,
c10::complex<double> *work, int lwork, double *rwork, int *info) {
zgelsy_(&m, &n, &nrhs,
reinterpret_cast<std::complex<double>*>(a), &lda,
reinterpret_cast<std::complex<double>*>(b), &ldb,
jpvt, &rcond, rank,
reinterpret_cast<std::complex<double>*>(work), &lwork,
rwork, info);
}
template<> void lapackGelsy<c10::complex<float>, float>(
int m, int n, int nrhs,
c10::complex<float> *a, int lda, c10::complex<float> *b, int ldb,
int *jpvt, float rcond, int *rank,
c10::complex<float> *work, int lwork, float *rwork, int *info) {
cgelsy_(&m, &n, &nrhs,
reinterpret_cast<std::complex<float>*>(a), &lda,
reinterpret_cast<std::complex<float>*>(b), &ldb,
jpvt, &rcond, rank,
reinterpret_cast<std::complex<float>*>(work), &lwork,
rwork, info);
}
template<> void lapackGelsy<double>(
int m, int n, int nrhs,
double *a, int lda, double *b, int ldb,
int *jpvt, double rcond, int *rank,
double *work, int lwork, double *rwork, int *info) {
dgelsy_(&m, &n, &nrhs,
a, &lda, b, &ldb,
jpvt, &rcond, rank,
work, &lwork, info);
}
template<> void lapackGelsy<float>(
int m, int n, int nrhs,
float *a, int lda, float *b, int ldb,
int *jpvt, float rcond, int *rank,
float *work, int lwork, float *rwork, int *info) {
sgelsy_(&m, &n, &nrhs,
a, &lda, b, &ldb,
jpvt, &rcond, rank,
work, &lwork, info);
}
template<> void lapackGelss<c10::complex<double>, double>(
int m, int n, int nrhs,
c10::complex<double> *a, int lda, c10::complex<double> *b, int ldb,
double *s, double rcond, int *rank,
c10::complex<double> *work, int lwork,
double *rwork, int *info
) {
zgelss_(&m, &n, &nrhs,
reinterpret_cast<std::complex<double>*>(a), &lda,
reinterpret_cast<std::complex<double>*>(b), &ldb,
s, &rcond, rank,
reinterpret_cast<std::complex<double>*>(work), &lwork,
rwork, info);
}
template<> void lapackGelss<c10::complex<float>, float>(
int m, int n, int nrhs,
c10::complex<float> *a, int lda, c10::complex<float> *b, int ldb,
float *s, float rcond, int *rank,
c10::complex<float> *work, int lwork,
float *rwork, int *info
) {
cgelss_(&m, &n, &nrhs,
reinterpret_cast<std::complex<float>*>(a), &lda,
reinterpret_cast<std::complex<float>*>(b), &ldb,
s, &rcond, rank,
reinterpret_cast<std::complex<float>*>(work), &lwork,
rwork, info);
}
template<> void lapackGelss<double>(
int m, int n, int nrhs,
double *a, int lda, double *b, int ldb,
double *s, double rcond, int *rank,
double *work, int lwork,
double *rwork, int *info) {
dgelss_(&m, &n, &nrhs,
a, &lda, b, &ldb,
s, &rcond, rank,
work, &lwork, info);
}
template<> void lapackGelss<float>(
int m, int n, int nrhs,
float *a, int lda, float *b, int ldb,
float *s, float rcond, int *rank,
float *work, int lwork,
float *rwork, int *info) {
sgelss_(&m, &n, &nrhs,
a, &lda, b, &ldb,
s, &rcond, rank,
work, &lwork, info);
}
#endif
// Below are the definitions of the per-batch functions that are dispatched
// in the main helper functions for the linear algebra operations
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ solve ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/*
Computes the solution to a system of linear equations
A X = B,
where A is an n-by-n matrix and X and B are n-by-nrhs matrices.
Note that B is required to be a matrix; the usual vector case is obtained with nrhs = 1.
The description above is for non-batched input; batched input is also supported.
This is an in-place routine: the contents of both A and b are overwritten.
'infos' is an int Tensor containing error codes for each matrix in the batched input.
For more information see LAPACK's documentation for GESV routine.
*/
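// A minimal usage sketch of the public entry point that eventually reaches this helper
// (names are the public ATen API; shapes are illustrative):
//   at::Tensor A = at::randn({3, 3}, at::kDouble);
//   at::Tensor B = at::randn({3, 2}, at::kDouble);
//   at::Tensor X, LU;
//   std::tie(X, LU) = at::solve(B, A);  // solves A X = B and also returns the LU factorization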
template<typename scalar_t>
static void apply_solve(Tensor& b, Tensor& A, Tensor& infos) {
#if !AT_BUILD_WITH_LAPACK()
AT_ERROR("solve: LAPACK library not found in compilation");
#else
auto A_data = A.data_ptr<scalar_t>();
auto b_data = b.data_ptr<scalar_t>();
auto A_mat_stride = matrixStride(A);
auto b_mat_stride = matrixStride(b);
auto batch_size = batchCount(A);
auto n = A.size(-2);
auto nrhs = b.size(-1);
auto lda = std::max<int64_t>(1, n);
auto ipiv = at::empty({lda}, b.options().dtype(kInt));
auto ipiv_data = ipiv.data_ptr<int>();
auto infos_data = infos.data_ptr<int>();
for (const auto i : c10::irange(batch_size)) {
scalar_t* A_working_ptr = &A_data[i * A_mat_stride];
scalar_t* b_working_ptr = &b_data[i * b_mat_stride];
int* info_working_ptr = &infos_data[i];
lapackSolve<scalar_t>(n, nrhs, A_working_ptr, lda, ipiv_data, b_working_ptr, lda, info_working_ptr);
}
#endif
}
std::tuple<Tensor, Tensor> _solve_helper_cpu(const Tensor& self, const Tensor& A) {
auto self_working_copy = cloneBatchedColumnMajor(self);
auto A_working_copy = cloneBatchedColumnMajor(A);
// infos might not get filled for empty inputs, therefore at::zeros is used instead of at::empty
auto infos = at::zeros({std::max<int64_t>(1, batchCount(self))}, self.options().dtype(kInt));
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self.scalar_type(), "solve_cpu", [&]{
apply_solve<scalar_t>(self_working_copy, A_working_copy, infos);
});
if (self.dim() > 2) {
batchCheckErrors(infos, "solve_cpu");
} else {
singleCheckErrors(infos.item().toInt(), "solve_cpu");
}
return std::tuple<Tensor, Tensor>(self_working_copy, A_working_copy);
}
// Supports arbitrary batch dimensions for self and A
std::tuple<Tensor,Tensor> solve(const Tensor& self, const Tensor& A) {
TORCH_WARN_ONCE(
"torch.solve is deprecated in favor of torch.linalg.solve",
"and will be removed in a future PyTorch release.\n",
"torch.linalg.solve has its arguments reversed and does not return the LU factorization.\n",
"To get the LU factorization see torch.lu, which can be used with torch.lu_solve or torch.lu_unpack.\n",
"X = torch.solve(B, A).solution\n",
"should be replaced with\n",
"X = torch.linalg.solve(A, B)"
);
TORCH_CHECK(self.dim() >= 2,
"B should have at least 2 dimensions, but has ", self.dim(), " dimensions instead");
TORCH_CHECK(A.dim() >= 2,
"A should have at least 2 dimensions, but has ", A.dim(), " dimensions instead");
Tensor self_broadcasted, A_broadcasted;
std::tie(self_broadcasted, A_broadcasted) = _linalg_broadcast_batch_dims(self, A, "solve");
return at::_solve_helper(self_broadcasted, A_broadcasted);
}
std::tuple<Tensor&,Tensor&> solve_out(const Tensor& self, const Tensor& A, Tensor& solution, Tensor& lu) {
TORCH_WARN_ONCE(
"torch.solve is deprecated in favor of torch.linalg.solve",
"and will be removed in a future PyTorch release.\n",
"torch.linalg.solve has its arguments reversed and does not return the LU factorization.\n",
"To get the LU factorization see torch.lu, which can be used with torch.lu_solve or torch.lu_unpack.\n",
"X = torch.solve(B, A).solution\n",
"should be replaced with\n",
"X = torch.linalg.solve(A, B)"
);
checkSameDevice("solve", solution, self, "solution");
checkSameDevice("solve", lu, self, "lu");
checkLinalgCompatibleDtype("solve", solution, self, "solution");
checkLinalgCompatibleDtype("solve", lu, self, "lu");
Tensor solution_tmp, lu_tmp;
std::tie(solution_tmp, lu_tmp) = at::_solve_helper(self, A);
at::native::resize_output(solution, solution_tmp.sizes());
at::native::resize_output(lu, lu_tmp.sizes());
solution.copy_(solution_tmp);
lu.copy_(lu_tmp);
return std::tuple<Tensor&, Tensor&>(solution, lu);
}
// Solves a system of linear equations matmul(input, x) = other in-place
// LAPACK/MAGMA error codes are saved in the 'infos' tensor; they are not checked here
static Tensor& linalg_solve_out_info(Tensor& result, Tensor& infos, const Tensor& input, const Tensor& other) {
checkSameDevice("linalg_solve", result, input);
checkSameDevice("linalg_solve", other, input, "other");
checkLinalgCompatibleDtype("linalg_solve", result, input);
TORCH_CHECK(input.scalar_type() == other.scalar_type(),
"input dtype ", input.scalar_type(), " does not match other dtype ", other.scalar_type());
TORCH_CHECK(input.dim() >= 2,
"input should have at least 2 dimensions, but has ", input.dim(), " dimensions instead");
TORCH_CHECK(other.dim() >= 1,
"other should have at least 1 dimension, but has ", other.dim(), " dimensions instead");
// Two types of 'other' tensors are supported:
// - 1-dimensional (1D) tensor or batch of 1D tensors (vector case)
// - 2-dimensional (2D) tensor or batch of 2D tensors (matrix case)
// original torch.solve supported only the matrix case, while NumPy works for both cases
// for the batched input we need to be able to distinguish them
bool vector_case = linalg_solve_is_vector_rhs(input, other);
bool is_batched_column_major = false;
if (vector_case) {
is_batched_column_major = result.is_contiguous();
} else if (!vector_case && result.dim() >= 2) {
is_batched_column_major = result.transpose(-2, -1).is_contiguous();
}
// if 'other' is a batch of 2D tensors, then 'input' can be non-batched and will be broadcasted
auto expected_shape = IntArrayRef(input.sizes().data(), input.dim() - 1); // input.shape[:-1]
if (!vector_case && other.dim() > 2) {
expected_shape = other.sizes();
}
bool result_equal_expected_shape = result.sizes().equals(expected_shape);
bool result_input_same_type = (result.scalar_type() == input.scalar_type());
// if result is not empty and not in batched column major format
bool copy_needed = (result.numel() != 0 && !is_batched_column_major);
copy_needed |= !result_input_same_type; // or result does not have the same dtype as input
copy_needed |= (result.numel() != 0 && !result_equal_expected_shape); // or result does not have the expected shape
// we have to allocate a temporary tensor
if (copy_needed) {
Tensor result_tmp = at::empty({0}, input.options());
result_tmp = linalg_solve_out_info(result_tmp, infos, input, other);
at::native::resize_output(result, result_tmp.sizes());
result.copy_(result_tmp);
return result;
}
// else use result's storage directly
// we need to unsqueeze 'other' because 2-dimensional tensors are expected in the implementation
Tensor other_ = vector_case ? other.unsqueeze(-1) : other;
// _linalg_broadcast_batch_dims also includes linearSolveCheckInputs
// it checks for squareness of 'input' and 'shape' compatibility of 'other' and 'input'
Tensor other_broadcasted, input_broadcasted;
std::tie(other_broadcasted, input_broadcasted) = _linalg_broadcast_batch_dims(other_, input, "linalg_solve");
auto squeezed_other_broadcasted = at::squeeze(other_broadcasted, -1);
auto squeezed_result_shape = squeezed_other_broadcasted.sizes();
// if result has no elements we can modify it
if (result.numel() == 0) {
if (vector_case) {
result.resize_(squeezed_result_shape);
} else {
at::native::resize_as_(result, other_broadcasted.transpose(-2, -1), MemoryFormat::Contiguous);
result.transpose_(-2, -1);
}
}
auto expected_result_shape = vector_case ? squeezed_result_shape : other_broadcasted.sizes();
TORCH_INTERNAL_ASSERT(result.sizes().equals(expected_result_shape));
TORCH_INTERNAL_ASSERT(result.scalar_type() == input.scalar_type());
TORCH_INTERNAL_ASSERT(result.device() == input.device());
// result tensor must be in batched column major order (Fortran contiguous) for 2D inputs
// or C contiguous for 1D input
if (vector_case) {
TORCH_INTERNAL_ASSERT(result.is_contiguous());
} else {
TORCH_INTERNAL_ASSERT(result.transpose(-2, -1).is_contiguous());
}
// for 1-dimensional 'other', we need to unsqueeze the result before passing to "apply_solve"
if (vector_case) {
result = result.unsqueeze_(-1);
}
// lu_stub+lu_solve_stub perform calculations in-place and 'result' must be a copy of 'other_broadcasted'
result.copy_(other_broadcasted);
auto input_working_copy = cloneBatchedColumnMajor(input_broadcasted);
TORCH_INTERNAL_ASSERT(infos.scalar_type() == kInt);
TORCH_INTERNAL_ASSERT(infos.device() == input.device());
infos.resize_({std::max<int64_t>(1, batchCount(input_broadcasted))});
// if input is empty, infos might not get filled; make sure infos doesn't contain garbage in that case
if (input.numel() == 0) {
infos.fill_(0);
}
// compute the LU factorization of 'input_working_copy'
auto pivots_shape = IntArrayRef(input_broadcasted.sizes().data(), input_broadcasted.dim() - 2).vec(); // input_broadcasted.shape[:-2]
pivots_shape.push_back(std::min(input.size(-2), input.size(-1)));
Tensor pivots = at::empty(pivots_shape, input.options().dtype(kInt));
lu_stub(input.device().type(), input_working_copy, pivots, infos, /*compute_pivots=*/true);
// solve the linear system using the LU factorization
lu_solve_stub(input.device().type(), result, input_working_copy, pivots);
// for 1-dimensional 'other', we need to squeeze the result after "apply_solve"
if (vector_case) {
result = result.squeeze_(-1);
}
return result;
}
// Solves a system of linear equations matmul(input, x) = other in-place
Tensor& linalg_solve_out(const Tensor& input, const Tensor& other, Tensor& result) {
auto infos = at::empty({0}, input.options().dtype(kInt));
result = linalg_solve_out_info(result, infos, input, other);
// Now check LAPACK/MAGMA error codes
// batchCheckErrors(Tensor, char*) calls 'infos = infos.to(kCPU)'
bool vector_case = linalg_solve_is_vector_rhs(input, other);
if (vector_case ? result.dim() > 1 : result.dim() > 2) {
batchCheckErrors(infos, "linalg_solve");
} else {
singleCheckErrors(infos.item().toInt(), "linalg_solve");
}
return result;
}
// Solves a system of linear equations matmul(input, x) = other
Tensor linalg_solve(const Tensor& input, const Tensor& other) {
Tensor result = at::empty({0}, input.options());
result = at::linalg_solve_out(result, input, other);
return result;
}
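// Usage sketch (Python-side equivalent of the function above, for reference):
//   X = torch.linalg.solve(A, B)   # solves A @ X = B, where B may be a vector or a matrix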
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ inverse ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/*
Computes the inverse of an n-by-n matrix 'self'.
This is an in-place routine: it overwrites the content of 'self'.
'infos_lu' and 'infos_getri' are int Tensors containing error codes for each matrix in the batched input.
'infos_lu' is for holding lapackLU errors, and 'infos_getri' is for holding lapackGetri errors.
For more information see LAPACK's documentation for GETRI and GETRF routines.
*/
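// A per-matrix sketch of the two LAPACK calls used below (assuming a column-major n-by-n buffer 'a'):
//   lapackLu<scalar_t>(n, n, a, lda, ipiv, &info_lu);             // getrf: P*L*U factorization, pivots stored in ipiv
//   lapackGetri<scalar_t>(n, a, lda, ipiv, work, lwork, &info);   // getri: inverse from the LU factors, in-place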
template <typename scalar_t>
static void apply_inverse(Tensor& self, Tensor& infos_lu, Tensor& infos_getri) {
#if !AT_BUILD_WITH_LAPACK()
AT_ERROR("inverse: LAPACK library not found in compilation");
#else
using value_t = typename c10::scalar_value_type<scalar_t>::type;
auto self_data = self.data_ptr<scalar_t>();
auto self_matrix_stride = matrixStride(self);
auto batch_size = batchCount(self);
auto n = self.size(-2);
auto lda = std::max<int64_t>(1, n);
auto ipiv = at::empty({lda}, self.options().dtype(kInt));
auto ipiv_data = ipiv.data_ptr<int>();
auto infos_lu_data = infos_lu.data_ptr<int>();
auto infos_getri_data = infos_getri.data_ptr<int>();
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int info;
// Run once, first to get the optimum work size
// Since we deal with batches of matrices with the same dimensions, doing this outside
// the loop saves (batch_size - 1) workspace queries which would provide the same result
// and (batch_size - 1) calls to allocate and deallocate workspace using at::empty()
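// A call with lwork == -1 is a LAPACK workspace query: the routine writes the optimal
// workspace size into the first element of the work argument (here 'wkopt') and does no other work.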
int lwork = -1;
scalar_t wkopt;
lapackGetri<scalar_t>(n, self_data, lda, ipiv_data, &wkopt, lwork, &info);
lwork = std::max<int>(1, real_impl<scalar_t, value_t>(wkopt));
Tensor work = at::empty({lwork}, self.options());
auto work_data = work.data_ptr<scalar_t>();
for (const auto i : c10::irange(batch_size)) {
scalar_t* self_working_ptr = &self_data[i * self_matrix_stride];
int* info_lu_working_ptr = &infos_lu_data[i];
lapackLu<scalar_t>(n, n, self_working_ptr, lda, ipiv_data, info_lu_working_ptr);
// now compute the actual inverse
int* info_getri_working_ptr = &infos_getri_data[i];
lapackGetri<scalar_t>(n, self_working_ptr, lda, ipiv_data, work_data, lwork, info_getri_working_ptr);
}
#endif
}
Tensor _inverse_helper_cpu(const Tensor& self) {
auto infos_lu = at::empty({std::max<int64_t>(1, batchCount(self))}, self.options().dtype(kInt));
auto infos_getri = at::empty({std::max<int64_t>(1, batchCount(self))}, self.options().dtype(kInt));
auto self_working_copy = cloneBatchedColumnMajor(self);
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self.scalar_type(), "inverse_cpu", [&]{
apply_inverse<scalar_t>(self_working_copy, infos_lu, infos_getri);
});
if (self.dim() > 2) {
batchCheckErrors(infos_lu, "inverse_cpu");
batchCheckErrors(infos_getri, "inverse_cpu");
} else {
singleCheckErrors(infos_lu.item().toInt(), "inverse_cpu");
singleCheckErrors(infos_getri.item().toInt(), "inverse_cpu");
}
return self_working_copy;
}
Tensor inverse(const Tensor &self) {
if (self.numel() == 0) {
return at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
}
squareCheckInputs(self);
return at::_inverse_helper(self);
}
Tensor& inverse_out(const Tensor &self, Tensor &result) {
checkSameDevice("inverse", result, self);
checkLinalgCompatibleDtype("inverse", result, self);
Tensor result_tmp = at::inverse(self);
at::native::resize_output(result, result_tmp.sizes());
result.copy_(result_tmp);
return result;
}
// This is a type dispatching helper function for 'apply_inverse'
Tensor& _linalg_inv_out_helper_cpu(Tensor &result, Tensor& infos_lu, Tensor& infos_getri) {
// This function calculates the inverse matrix in-place
// result should be in column major order and contain matrices to invert
// the content of result is overwritten by 'apply_inverse'
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(result.scalar_type(), "linalg_inv_out_cpu", [&]{
apply_inverse<scalar_t>(result, infos_lu, infos_getri);
});
return result;
}
// Computes the inverse matrix of 'input'; it is saved to 'result' in-place
// LAPACK/MAGMA/cuSOLVER error codes are saved in the 'infos' tensors; they are not checked here
static Tensor& linalg_inv_out_info(Tensor& result, Tensor& infos_lu, Tensor& infos_getri, const Tensor& input) {
squareCheckInputs(input);
checkSameDevice("linalg_inv", result, input);
checkLinalgCompatibleDtype("linalg_inv", result, input);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(infos_lu.scalar_type() == kInt);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(infos_getri.scalar_type() == kInt);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(infos_lu.device() == input.device());
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(infos_getri.device() == input.device());
bool result_input_same_type = (result.scalar_type() == input.scalar_type());
bool result_equal_expected_shape = result.sizes().equals(input.sizes());
bool is_batched_column_major = false;
if (result.dim() >= 2) {
is_batched_column_major = result.transpose(-2, -1).is_contiguous();
}
// if result is not empty and not in batched column major format
bool copy_needed = (result.numel() != 0 && !is_batched_column_major);
copy_needed |= !result_input_same_type; // or result does not have the same dtype as input
copy_needed |= (result.numel() != 0 && !result_equal_expected_shape); // or result does not have the expected shape
// we have to allocate a temporary tensor
// similar conditions for infos_lu and infos_getri tensors
auto expected_info_shape = IntArrayRef(input.sizes().cbegin(), input.sizes().cend() - 2); // input.shape[:-2]
copy_needed |= (infos_lu.numel() != 0 && !infos_lu.is_contiguous());
copy_needed |= (infos_lu.numel() != 0 && !(infos_lu.sizes().equals(expected_info_shape)));
copy_needed |= (infos_getri.numel() != 0 && !infos_getri.is_contiguous());
copy_needed |= (infos_getri.numel() != 0 && !(infos_getri.sizes().equals(expected_info_shape)));
if (copy_needed) {
Tensor result_tmp = at::empty(input.sizes(), input.options());
result_tmp.transpose_(-2, -1);
Tensor infos_lu_tmp = at::zeros({expected_info_shape}, input.options().dtype(kInt));
Tensor infos_getri_tmp = at::zeros({expected_info_shape}, input.options().dtype(kInt));
result_tmp = linalg_inv_out_info(result_tmp, infos_lu_tmp, infos_getri_tmp, input);
at::native::resize_output(result, result_tmp.sizes());
result.copy_(result_tmp);
at::native::resize_output(infos_lu, infos_lu_tmp.sizes());
infos_lu.copy_(infos_lu_tmp);
at::native::resize_output(infos_getri, infos_getri_tmp.sizes());
infos_getri.copy_(infos_getri_tmp);
return result;
}
// else use result's storage directly
// if result has no elements we can modify it
if (result.numel() == 0) {
at::native::resize_as_(result, input.transpose(-2, -1), MemoryFormat::Contiguous);
result.transpose_(-2, -1);
}
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(result.sizes().equals(input.sizes()));
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(result.scalar_type() == input.scalar_type());
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(result.device() == input.device());
// result tensor must be in batched column major order (Fortran contiguous)
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(result.transpose(-2, -1).is_contiguous());
// if info has no elements we can modify it
if (infos_lu.numel() == 0) {
infos_lu.resize_(expected_info_shape);
infos_lu.fill_(0);
}
if (infos_getri.numel() == 0) {
infos_getri.resize_(expected_info_shape);
infos_getri.fill_(0);
}
// info tensors must be contiguous
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(infos_lu.is_contiguous());
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(infos_lu.sizes().equals(expected_info_shape));
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(infos_getri.is_contiguous());
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(infos_getri.sizes().equals(expected_info_shape));
// _linalg_inv_out_helper_ (apply_inverse) performs calculations in-place and result must be a copy of input
result.copy_(input);
// TODO: Replace this helper with DECLARE/DEFINE_DISPATCH
result = at::_linalg_inv_out_helper_(result, infos_lu, infos_getri);
return result;
}
// Computes the inverse matrix of 'input'; it is saved to 'result' in-place
Tensor& linalg_inv_out(const Tensor &input, Tensor &result) {
auto info_shape = IntArrayRef(input.sizes().cbegin(), input.sizes().cend() - 2); // input.shape[:-2]
auto infos_lu = at::zeros({info_shape}, input.options().dtype(kInt));
auto infos_getri = at::zeros({info_shape}, input.options().dtype(kInt));
result = linalg_inv_out_info(result, infos_lu, infos_getri, input);
// Now check LAPACK/MAGMA/cuSOLVER error codes
if (result.dim() > 2) {
batchCheckErrors(infos_lu, "linalg_inv_lu");
batchCheckErrors(infos_getri, "linalg_inv_getri");
} else {
singleCheckErrors(infos_lu.item().toInt(), "linalg_inv_lu");
singleCheckErrors(infos_getri.item().toInt(), "linalg_inv_getri");
}
return result;
}
// Computes the inverse matrix of 'input'
Tensor linalg_inv(const Tensor &input) {
Tensor result, info;
std::tie(result, info) = at::linalg_inv_ex(input, /*check_errors=*/false);
// we pass check_errors=false above and do the check here
// so that the name of the function is correct in the error message
if (input.dim() > 2) {
batchCheckErrors(info, "torch.linalg.inv");
} else {
singleCheckErrors(info.item<int64_t>(), "torch.linalg.inv");
}
return result;
}
std::tuple<Tensor&, Tensor&> linalg_inv_ex_out(const Tensor& input, bool check_errors, Tensor& inverse, Tensor& info) {
squareCheckInputs(input);
ScalarType info_output_type = ScalarType::Int;
TORCH_CHECK(
info.scalar_type() == info_output_type,
"torch.linalg.inv_ex: ",
"Expected info to have ", info_output_type, " dtype, but got info with dtype ", info.scalar_type());
// the provided `info` tensor is used to save information about the LU decomposition of `input`;
// in addition, the current implementation requires a separate tensor
// to save information about the inversion step performed after the LU decomposition
auto expected_info_shape = IntArrayRef(input.sizes().cbegin(), input.sizes().cend() - 2); // input.shape[:-2]
auto info_inversion = at::zeros({expected_info_shape}, input.options().dtype(kInt));
linalg_inv_out_info(inverse, info, info_inversion, input);
if (check_errors) {
if (input.dim() > 2) {
batchCheckErrors(info, "torch.linalg.inv_ex");
} else {
singleCheckErrors(info.item().toInt(), "torch.linalg.inv_ex");
}
}
return std::tuple<Tensor&, Tensor&>(inverse, info);
}
std::tuple<Tensor, Tensor> linalg_inv_ex(const Tensor& input, bool check_errors) {
squareCheckInputs(input);
Tensor inverse = at::empty(input.sizes(), input.options(), MemoryFormat::Contiguous);
inverse.transpose_(-2, -1); // make `inverse` tensor with batched column major format
auto info_shape = IntArrayRef(input.sizes().cbegin(), input.sizes().cend() - 2); // input.shape[:-2]
Tensor info = at::zeros({info_shape}, input.options().dtype(kInt));
std::tie(inverse, info) = at::native::linalg_inv_ex_out(input, check_errors, inverse, info);
return std::make_tuple(inverse, info);
}
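// Illustrative usage sketch (comment only, not part of the implementation).
// 'a' is a hypothetical invertible square tensor created here for illustration;
// the _ex variant defers error checking so the caller can inspect 'info' itself:
//   Tensor a = at::eye(3, at::kDouble) * 2.0;
//   Tensor inverse, info;
//   std::tie(inverse, info) = at::linalg_inv_ex(a, /*check_errors=*/false);
//   // info == 0 on success; a positive value indicates a singular input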
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ cholesky_solve ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
template<typename scalar_t>
static void apply_cholesky_solve(Tensor& b, Tensor& A, bool upper, std::vector<int64_t>& infos) {
#if !AT_BUILD_WITH_LAPACK()
AT_ERROR("cholesky_solve: LAPACK library not found in compilation");
#else
char uplo = upper ? 'U' : 'L';
auto A_data = A.data_ptr<scalar_t>();
auto b_data = b.data_ptr<scalar_t>();
auto A_mat_stride = matrixStride(A);
auto b_mat_stride = matrixStride(b);
auto batch_size = batchCount(A);
auto n = A.size(-2);
auto nrhs = b.size(-1);
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int info;
for (const auto i : c10::irange(batch_size)) {
scalar_t* A_working_ptr = &A_data[i * A_mat_stride];
scalar_t* b_working_ptr = &b_data[i * b_mat_stride];
lapackCholeskySolve<scalar_t>(uplo, n, nrhs, A_working_ptr, n, b_working_ptr, n, &info);
infos[i] = info;
if (info != 0) {
return;
}
}
#endif
}
Tensor _cholesky_solve_helper_cpu(const Tensor& self, const Tensor& A, bool upper) {
auto self_working_copy = cloneBatchedColumnMajor(self);
auto A_working_copy = cloneBatchedColumnMajor(A);
std::vector<int64_t> infos(batchCount(self), 0);
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self.scalar_type(), "cholesky_solve_cpu", [&]{
apply_cholesky_solve<scalar_t>(self_working_copy, A_working_copy, upper, infos);
});
if (self.dim() > 2) {
batchCheckErrors(infos, "cholesky_solve_cpu");
} else {
singleCheckErrors(infos[0], "cholesky_solve_cpu");
}
return self_working_copy;
}
// Supports arbitrary batch dimensions for self and A
Tensor cholesky_solve(const Tensor& self, const Tensor& A, bool upper) {
TORCH_CHECK(self.dim() >= 2,
"b should have at least 2 dimensions, but has ", self.dim(), " dimensions instead");
TORCH_CHECK(A.dim() >= 2,
"u should have at least 2 dimensions, but has ", A.dim(), " dimensions instead");
Tensor self_broadcasted, A_broadcasted;
std::tie(self_broadcasted, A_broadcasted) = _linalg_broadcast_batch_dims(self, A, "cholesky_solve");
return at::_cholesky_solve_helper(self_broadcasted, A_broadcasted, upper);
}
Tensor& cholesky_solve_out(const Tensor& self, const Tensor& A, bool upper, Tensor& result) {
checkSameDevice("cholesky_solve", result, self);
checkLinalgCompatibleDtype("cholesky_solve", result, self);
Tensor result_tmp = at::cholesky_solve(self, A, upper);
at::native::resize_output(result, result_tmp.sizes());
result.copy_(result_tmp);
return result;
}
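// Illustrative usage sketch (comment only). 'A' is a hypothetical symmetric positive
// definite matrix and 'b' a right-hand side; cholesky_solve consumes the Cholesky
// factor, not the original matrix:
//   Tensor A = at::eye(3, at::kDouble) * 4.0;
//   Tensor b = at::rand({3, 2}, at::kDouble);
//   Tensor u = at::linalg_cholesky(A);                    // lower triangular factor
//   Tensor x = at::cholesky_solve(b, u, /*upper=*/false); // solves A @ x = b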
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ cholesky ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
DEFINE_DISPATCH(cholesky_stub);
Tensor cholesky(const Tensor &self, bool upper) {
TORCH_WARN_ONCE(
"torch.cholesky is deprecated in favor of torch.linalg.cholesky and will be ",
"removed in a future PyTorch release.\n",
"L = torch.cholesky(A)\n",
"should be replaced with\n",
"L = torch.linalg.cholesky(A)\n",
"and\n"
"U = torch.cholesky(A, upper=True)\n",
"should be replaced with\n",
"U = torch.linalg.cholesky(A).transpose(-2, -1).conj().\n"
"This transform will produce equivalent results for all valid (symmetric positive definite) inputs."
);
if (self.numel() == 0) {
return at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
}
squareCheckInputs(self);
auto raw_cholesky_output = cloneBatchedColumnMajor(self);
auto info_shape = IntArrayRef(
self.sizes().cbegin(), self.sizes().cend() - 2); // self.shape[:-2]
auto info = at::empty({info_shape}, self.options().dtype(kInt));
// fill the raw_cholesky_output with the result
cholesky_stub(self.device().type(), raw_cholesky_output, info, upper);
if (self.dim() > 2) {
batchCheckErrors(info, "cholesky");
} else {
singleCheckErrors(info.item<int64_t>(), "cholesky");
}
if (upper) {
return raw_cholesky_output.triu_();
} else {
return raw_cholesky_output.tril_();
}
}
Tensor& cholesky_out(const Tensor &self, bool upper, Tensor &result) {
TORCH_WARN_ONCE(
"torch.cholesky is deprecated in favor of torch.linalg.cholesky and will be ",
"removed in a future PyTorch release.\n",
"L = torch.cholesky(A)\n",
"should be replaced with\n",
"L = torch.linalg.cholesky(A)\n",
"and\n"
"U = torch.cholesky(A, upper=True)\n",
"should be replaced with\n",
"U = torch.linalg.cholesky(A).transpose(-2, -1).conj().\n"
"This transform will produce equivalent results for all valid (symmetric positive definite) inputs."
);
checkSameDevice("cholesky", result, self);
checkLinalgCompatibleDtype("cholesky", result, self);
Tensor result_tmp = at::cholesky(self, upper);
at::native::resize_output(result, result_tmp.sizes());
result.copy_(result_tmp);
return result;
}
void linalg_cholesky_out_info(const Tensor& input, const Tensor& result, const Tensor& info) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.dim() >= 2);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.size(-1) == input.size(-2));
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(result.scalar_type() == input.scalar_type());
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(result.device() == input.device());
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(info.scalar_type() == at::kInt);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(info.device() == input.device());
// if result has no elements we can modify it
if (result.numel() == 0) {
at::native::resize_as_(result, input.transpose(-2, -1), MemoryFormat::Contiguous);
result.transpose_(-2, -1);
}
// result tensor must be in batched column major order (Fortran contiguous)
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(result.transpose(-2, -1).is_contiguous());
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(result.sizes().equals(input.sizes()));
// cholesky_stub (apply_cholesky) performs calculations in-place and result must be a copy of input
result.copy_(input);
// if info has no elements we can modify it
auto expected_info_shape = IntArrayRef(input.sizes().cbegin(), input.sizes().cend() - 2); // input.shape[:-2]
if (info.numel() == 0) {
info.resize_(expected_info_shape);
}
// info must be contiguous
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(info.is_contiguous());
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(info.sizes().equals(expected_info_shape));
info.fill_(0);
cholesky_stub(result.device().type(), result, info, /*upper=*/false);
result.tril_();
}
std::tuple<Tensor&, Tensor&> linalg_cholesky_ex_out(const Tensor& input, bool check_errors, Tensor& L, Tensor& info) {
squareCheckInputs(input);
checkSameDevice("torch.linalg.cholesky_ex", L, input, "L");
checkLinalgCompatibleDtype("torch.linalg.cholesky_ex", L, input, "L");
checkSameDevice("torch.linalg.cholesky_ex", info, input, "info");
// Do not allow type promotion for the `info` tensor, it must be of Int dtype
// Int is used because current interface to LAPACK and its CUDA implementation use "int" type.
// https://github.com/pytorch/pytorch/pull/56724#discussion_r618916774
ScalarType info_output_type = ScalarType::Int;
TORCH_CHECK(
info.scalar_type() == info_output_type,
"torch.linalg.cholesky_ex: ",
"Expected info to have ", info_output_type, " dtype, but got info with dtype ", info.scalar_type());
bool L_input_same_type = (L.scalar_type() == input.scalar_type());
bool L_equal_expected_shape = L.sizes().equals(input.sizes());
bool is_L_batched_column_major = false;
if (L.dim() >= 2) {
is_L_batched_column_major = L.transpose(-2, -1).is_contiguous();
}
// if L is not empty and not in batched column major format
bool copy_needed = (L.numel() != 0 && !is_L_batched_column_major);
copy_needed |= (L.numel() != 0 && !L_equal_expected_shape); // or L does not have the expected shape
copy_needed |= !L_input_same_type; // or L does not have the same dtype as input
// we have to allocate a temporary tensor
// similar conditions for info tensor
auto expected_info_shape = IntArrayRef(input.sizes().cbegin(), input.sizes().cend() - 2); // input.shape[:-2]
copy_needed |= (info.numel() != 0 && !info.is_contiguous());
copy_needed |= (info.numel() != 0 && !(info.sizes().equals(expected_info_shape))); // or info does not have the expected shape
if (copy_needed) {
Tensor L_tmp = at::empty({0}, input.options());
Tensor info_tmp = at::empty({0}, input.options().dtype(kInt));
linalg_cholesky_out_info(input, L_tmp, info_tmp);
at::native::resize_output(L, L_tmp.sizes());
L.copy_(L_tmp);
at::native::resize_output(info, info_tmp.sizes());
info.copy_(info_tmp);
} else {
// use "out" tensors' memory directly
linalg_cholesky_out_info(input, L, info);
}
if (check_errors) {
if (input.dim() > 2) {
batchCheckErrors(info, "torch.linalg.cholesky_ex");
} else {
singleCheckErrors(info.item<int64_t>(), "torch.linalg.cholesky_ex");
}
}
return std::tuple<Tensor&, Tensor&>(L, info);
}
std::tuple<Tensor, Tensor> linalg_cholesky_ex(const Tensor& input, bool check_errors) {
Tensor L = at::empty({0}, input.options());
Tensor info = at::empty({0}, input.options().dtype(kInt));
std::tie(L, info) = at::native::linalg_cholesky_ex_out(input, check_errors, L, info);
return std::make_tuple(L, info);
}
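// Illustrative usage sketch (comment only); 'a' is a hypothetical SPD input.
// The _ex variant lets the caller skip the throwing check and read 'info' directly:
//   Tensor a = at::eye(2, at::kDouble) * 3.0;
//   Tensor L, info;
//   std::tie(L, info) = at::linalg_cholesky_ex(a, /*check_errors=*/false);
//   // info == 0 on success; a positive value k means the leading minor of order k is not positive definite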
Tensor linalg_cholesky(const Tensor &self) {
Tensor result, info;
std::tie(result, info) = at::linalg_cholesky_ex(self, /*check_errors=*/false);
// we pass check_errors=false above and do the check here
// so that the name of the function is correct in the error message
if (self.dim() > 2) {
batchCheckErrors(info, "torch.linalg.cholesky");
} else {
singleCheckErrors(info.item<int64_t>(), "torch.linalg.cholesky");
}
return result;
}
Tensor& linalg_cholesky_out(const Tensor &self, Tensor &result) {
// linalg_cholesky_ex_outf includes these checks, but we do it here
// so that the name of the function is correct in the error message
checkSameDevice("torch.linalg.cholesky", result, self);
checkLinalgCompatibleDtype("torch.linalg.cholesky", result, self);
Tensor info = at::empty({0}, self.options().dtype(kInt));
std::tie(result, info) = at::linalg_cholesky_ex_outf(self, /*check_errors=*/false, result, info);
// we pass check_errors=false above and do the check here
// so that the name of the function is correct in the error message
if (self.dim() > 2) {
batchCheckErrors(info, "torch.linalg.cholesky");
} else {
singleCheckErrors(info.item<int64_t>(), "torch.linalg.cholesky");
}
return result;
}
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ cholesky_inverse ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
DEFINE_DISPATCH(cholesky_inverse_stub);
Tensor& cholesky_inverse_out_info(Tensor& result, Tensor& infos, const Tensor& input, bool upper) {
TORCH_INTERNAL_ASSERT(input.dim() >= 2);
TORCH_INTERNAL_ASSERT(input.size(-1) == input.size(-2));
TORCH_INTERNAL_ASSERT(result.scalar_type() == input.scalar_type());
TORCH_INTERNAL_ASSERT(result.device() == input.device());
TORCH_INTERNAL_ASSERT(infos.scalar_type() == at::kInt);
TORCH_INTERNAL_ASSERT(infos.device() == at::kCPU);
TORCH_INTERNAL_ASSERT(infos.numel() == std::max<int64_t>(1, batchCount(input)));
// if result has no elements we can modify it
if (result.numel() == 0) {
at::native::resize_as_(result, input.transpose(-2, -1), MemoryFormat::Contiguous);
result.transpose_(-2, -1);
}
// result tensor must be in batched column major order (Fortran contiguous)
TORCH_INTERNAL_ASSERT(result.transpose(-2, -1).is_contiguous());
TORCH_INTERNAL_ASSERT(result.sizes().equals(input.sizes()));
// cholesky_inverse_stub (apply_cholesky_inverse) performs calculations in-place and result must be a copy of input
result.copy_(input);
// infos must be contiguous
TORCH_INTERNAL_ASSERT(infos.is_contiguous());
infos.fill_(0);
result = cholesky_inverse_stub(result.device().type(), result, infos, upper);
return result;
}
Tensor& cholesky_inverse_out(const Tensor &input, bool upper, Tensor &result) {
squareCheckInputs(input);
checkSameDevice("cholesky_inverse", result, input);
checkLinalgCompatibleDtype("cholesky_inverse", result, input);
// MAGMA requires 'infos' to reside in CPU memory, therefore we create 'infos' only on CPU for now.
auto infos = at::zeros({std::max<int64_t>(1, batchCount(input))}, input.options().dtype(kInt).device(kCPU));
bool result_input_same_type = (result.scalar_type() == input.scalar_type());
bool result_equal_expected_shape = result.sizes().equals(input.sizes());
bool is_batched_column_major = false;
if (result.dim() >= 2) {
is_batched_column_major = result.transpose(-2, -1).is_contiguous();
}
// if result is not empty and not in batched column major format
bool copy_needed = (result.numel() != 0 && !is_batched_column_major);
copy_needed |= !result_input_same_type; // or result does not have the same dtype as input
copy_needed |= (result.numel() != 0 && !result_equal_expected_shape); // or result does not have the expected shape
// we have to allocate a temporary tensor
if (copy_needed) {
Tensor result_tmp = at::empty({0}, input.options());
result_tmp = cholesky_inverse_out_info(result_tmp, infos, input, upper);
at::native::resize_output(result, result_tmp.sizes());
result.copy_(result_tmp);
} else {
// use result's memory directly
result = cholesky_inverse_out_info(result, infos, input, upper);
}
// Now check LAPACK/MAGMA error codes
if (result.dim() > 2) {
batchCheckErrors(infos, "cholesky_inverse");
} else {
singleCheckErrors(infos.item().toInt(), "cholesky_inverse");
}
return result;
}
Tensor cholesky_inverse(const Tensor &input, bool upper) {
Tensor result = at::empty({0}, input.options());
result = at::cholesky_inverse_out(result, input, upper);
return result;
}
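// Illustrative usage sketch (comment only); 'A' is a hypothetical SPD matrix.
// cholesky_inverse expects the Cholesky factor and returns the inverse of the original matrix:
//   Tensor A = at::eye(3, at::kDouble) * 2.0;
//   Tensor u = at::linalg_cholesky(A);                        // lower triangular factor
//   Tensor A_inv = at::cholesky_inverse(u, /*upper=*/false);  // inverse of A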
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ lu ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
DEFINE_DISPATCH(lu_stub);
std::tuple<Tensor, Tensor, Tensor> _lu_with_info(const Tensor& self, bool compute_pivots, bool check_errors) {
TORCH_CHECK(self.dim() >= 2,
"expected tensor with 2 or more dimensions, got size: ", self.sizes(),
" instead");
auto m = self.size(-2);
auto n = self.size(-1);
auto req_size = self.sizes().vec();
req_size.pop_back();
req_size.back() = std::min(m, n);
auto pivots_tensor = at::empty(req_size, self.options().dtype(kInt));
req_size.pop_back();
auto infos_tensor = at::zeros(req_size, self.options().dtype(kInt));
// lu_stub (apply_lu) requires batched column major (Fortran-contiguous) tensors
// 'lu' tensor is modified in-place and must be a copy of 'self'
Tensor lu = cloneBatchedColumnMajor(self);
lu_stub(self.device().type(), lu, pivots_tensor, infos_tensor, compute_pivots);
if (check_errors) {
if (self.dim() > 2) {
batchCheckErrors(infos_tensor, "lu", /*allow_singular=*/true);
} else {
singleCheckErrors(infos_tensor.item<int64_t>(), "lu", /*allow_singular=*/true);
}
}
return std::make_tuple(lu, pivots_tensor, infos_tensor);
}
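// Illustrative usage sketch (comment only); 'a' is a hypothetical batch of square matrices.
// The returned 'lu' packs L and U into one tensor and 'pivots' holds 1-based pivot indices:
//   Tensor a = at::rand({4, 3, 3}, at::kDouble);
//   Tensor lu, pivots, infos;
//   std::tie(lu, pivots, infos) = at::_lu_with_info(a, /*compute_pivots=*/true, /*check_errors=*/false);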
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ triangular_solve ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
DEFINE_DISPATCH(triangular_solve_stub);
/*
Solves the matrix equation 'input' @ 'result' = 'other' for the 'result'.
The result of the computation is saved in-place in 'result' tensor,
'clone_input' will be a copy of 'input',
'infos' is used to store error codes for later error checking,
'upper' controls the portion of input matrix to consider in computations,
'transpose' if true then 'input.transpose(-2, -1)' @ 'result' = 'other' is solved,
'unitriangular' if true then the diagonal elements of 'input' are assumed to be 1
and the actual diagonal values are not used.
*/
static std::tuple<Tensor&, Tensor&> triangular_solve_out_info(
Tensor& result,
Tensor& clone_input,
Tensor& infos,
const Tensor& input,
const Tensor& other,
bool upper, bool transpose, bool unitriangular) {
// These internal asserts make explicit the assumptions in the implementation
// Error checks with the actual error messages are done at a higher level in
// the hierarchy of calls
TORCH_INTERNAL_ASSERT(input.dim() >= 2);
TORCH_INTERNAL_ASSERT(input.size(-2) == input.size(-1));
TORCH_INTERNAL_ASSERT(input.device() == other.device());
TORCH_INTERNAL_ASSERT(input.device() == result.device());
TORCH_INTERNAL_ASSERT(input.device() == clone_input.device());
TORCH_INTERNAL_ASSERT(input.device() == infos.device());
TORCH_INTERNAL_ASSERT(input.scalar_type() == other.scalar_type());
TORCH_INTERNAL_ASSERT(input.scalar_type() == result.scalar_type());
TORCH_INTERNAL_ASSERT(input.scalar_type() == clone_input.scalar_type());
TORCH_INTERNAL_ASSERT(infos.scalar_type() == at::kInt);
TORCH_INTERNAL_ASSERT(infos.numel() == std::max<int64_t>(1, batchCount(input)));
TORCH_INTERNAL_ASSERT(infos.is_contiguous());
// if 'result' has no elements we can modify it
if (result.numel() == 0) {
result.resize_(other.transpose(-2, -1).sizes(), MemoryFormat::Contiguous);
result.transpose_(-2, -1); // make 'result' to have Fortran contiguous memory layout
}
// if 'clone_input' has no elements we can modify it
if (clone_input.numel() == 0) {
clone_input.resize_(input.transpose(-2, -1).sizes(), MemoryFormat::Contiguous);
clone_input.transpose_(-2, -1); // make 'clone_input' to have Fortran contiguous memory layout
}
// 'result' and 'clone_input' must be in batched column major order (Fortran contiguous)
TORCH_INTERNAL_ASSERT(result.transpose(-2, -1).is_contiguous());
TORCH_INTERNAL_ASSERT(clone_input.transpose(-2, -1).is_contiguous());
// triangular_solve_stub performs calculations in-place
// 'result' must be a copy of 'other'
// 'clone_input' must be a copy of 'input'
TORCH_INTERNAL_ASSERT(result.sizes().equals(other.sizes()));
TORCH_INTERNAL_ASSERT(clone_input.sizes().equals(input.sizes()));
result.copy_(other);
clone_input.copy_(input);
triangular_solve_stub(input.device().type(), clone_input, result, infos, upper, transpose, /*conjugate_transpose=*/false, unitriangular);
return std::tuple<Tensor&, Tensor&>(result, clone_input);
}
// Supports arbitrary batch dimensions for self and A
std::tuple<Tensor, Tensor> triangular_solve(const Tensor& self, const Tensor& A,
bool upper, bool transpose, bool unitriangular) {
TORCH_CHECK(self.dim() >= 2,
"torch.triangular_solve: Expected b to have at least 2 dimensions, but it has ", self.dim(), " dimensions instead");
TORCH_CHECK(A.dim() >= 2,
"torch.triangular_solve: Expected A to have at least 2 dimensions, but it has ", A.dim(), " dimensions instead");
Tensor self_broadcasted, A_broadcasted;
std::tie(self_broadcasted, A_broadcasted) = _linalg_broadcast_batch_dims(self, A, "triangular_solve");
Tensor result = at::empty({0}, self.options());
Tensor clone_A = at::empty({0}, self.options());
Tensor infos = at::zeros({std::max<int64_t>(1, batchCount(self_broadcasted))}, self.options().dtype(kInt));
triangular_solve_out_info(result, clone_A, infos, A_broadcasted, self_broadcasted, upper, transpose, unitriangular);
if (self_broadcasted.dim() > 2) {
batchCheckErrors(infos, "triangular_solve");
} else {
singleCheckErrors(infos.item().toInt(), "triangular_solve");
}
return std::tuple<Tensor, Tensor>(result, clone_A);
}
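// Illustrative usage sketch (comment only); 'A' is a hypothetical upper triangular matrix.
// triangular_solve returns the solution together with a copy of the coefficient matrix:
//   Tensor A = at::eye(3, at::kDouble) + at::triu(at::rand({3, 3}, at::kDouble));
//   Tensor b = at::rand({3, 2}, at::kDouble);
//   Tensor x, cloned_A;
//   std::tie(x, cloned_A) = at::triangular_solve(b, A, /*upper=*/true);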
std::tuple<Tensor&, Tensor&> triangular_solve_out(const Tensor& self, const Tensor& A, bool upper, bool transpose, bool unitriangular, Tensor& result, Tensor& clone_A) {
checkSameDevice("triangular_solve", result, self);
checkLinalgCompatibleDtype("triangular_solve", result, self);
checkSameDevice("triangular_solve", clone_A, self, "clone_A");
checkLinalgCompatibleDtype("triangular_solve", clone_A, self, "clone_A");
Tensor result_tmp, clone_A_tmp;
std::tie(result_tmp, clone_A_tmp) = at::native::triangular_solve(self, A, upper, transpose, unitriangular);
at::native::resize_output(result, result_tmp.sizes());
at::native::resize_output(clone_A, clone_A_tmp.sizes());
result.copy_(result_tmp);
clone_A.copy_(clone_A_tmp);
return std::tuple<Tensor&, Tensor&>(result, clone_A);
}
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ qr ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
DEFINE_DISPATCH(geqrf_stub);
static void geqrf_out_helper(const Tensor& input, const Tensor& QR, const Tensor& tau) {
TORCH_INTERNAL_ASSERT(input.dim() >= 2);
TORCH_INTERNAL_ASSERT(input.scalar_type() == QR.scalar_type());
TORCH_INTERNAL_ASSERT(input.device() == QR.device());
TORCH_INTERNAL_ASSERT(input.scalar_type() == tau.scalar_type());
TORCH_INTERNAL_ASSERT(input.device() == tau.device());
// if 'QR' has no elements we can modify it
if (QR.numel() == 0) {
QR.resize_as_(input.transpose(-2, -1), MemoryFormat::Contiguous);
QR.transpose_(-2, -1); // make Fortran-contiguous
}
auto expected_batch_tau_shape = IntArrayRef(input.sizes().data(), input.dim() - 2).vec(); // input.shape[:-2]
expected_batch_tau_shape.push_back(std::min(input.size(-2), input.size(-1)));
if (tau.numel() == 0) {
tau.resize_(expected_batch_tau_shape);
}
// QR tensor must be in batched column major order (Fortran contiguous)
TORCH_INTERNAL_ASSERT(QR.transpose(-2, -1).is_contiguous());
TORCH_INTERNAL_ASSERT(QR.sizes().equals(input.sizes()));
// tau tensor must be contiguous
TORCH_INTERNAL_ASSERT(tau.is_contiguous());
TORCH_INTERNAL_ASSERT(tau.sizes().equals(expected_batch_tau_shape));
// geqrf_stub (apply_geqrf) performs calculations in-place and 'QR' must be a copy of input
QR.copy_(input);
geqrf_stub(input.device().type(), QR, tau);
}
std::tuple<Tensor&, Tensor&> geqrf_out(const Tensor& input, Tensor& QR, Tensor& tau) {
TORCH_CHECK(input.dim() >= 2, "torch.geqrf: input must have at least 2 dimensions.");
checkSameDevice("torch.geqrf", QR, input, "a"); // 'a' is used in documentation and native_functions.yml
checkSameDevice("torch.geqrf", tau, input, "tau");
checkLinalgCompatibleDtype("torch.geqrf", QR, input, "a");
checkLinalgCompatibleDtype("torch.geqrf", tau, input, "tau");
bool QR_input_same_type = (QR.scalar_type() == input.scalar_type());
bool tau_input_same_type = (tau.scalar_type() == input.scalar_type());
bool QR_equal_expected_shape = QR.sizes().equals(input.sizes());
auto expected_batch_tau_shape = IntArrayRef(input.sizes().data(), input.dim() - 2).vec(); // input.shape[:-2]
expected_batch_tau_shape.push_back(std::min(input.size(-2), input.size(-1)));
bool tau_equal_expected_shape = tau.sizes().equals(expected_batch_tau_shape);
bool is_batched_column_major = false;
if (QR.dim() >= 2) {
is_batched_column_major = QR.transpose(-2, -1).is_contiguous();
}
// if 'QR' is not empty and not in batched column major format
bool copy_needed = (QR.numel() != 0 && !is_batched_column_major);
copy_needed |= (QR.numel() != 0 && !QR_equal_expected_shape); // or 'QR' does not have the expected shape
copy_needed |= !QR_input_same_type; // or 'QR' does not have the same dtype as input
// we have to allocate a temporary tensor
copy_needed |= (tau.numel() != 0 && !tau.is_contiguous());
copy_needed |= (tau.numel() != 0 && !tau_equal_expected_shape); // or 'tau' does not have the expected shape
copy_needed |= !tau_input_same_type; // or 'tau' does not have the same dtype as input
if (copy_needed) {
Tensor QR_tmp = at::empty({0}, input.options());
Tensor tau_tmp = at::empty({0}, input.options());
geqrf_out_helper(input, QR_tmp, tau_tmp);
at::native::resize_output(QR, QR_tmp.sizes());
QR.copy_(QR_tmp);
at::native::resize_output(tau, tau_tmp.sizes());
tau.copy_(tau_tmp);
} else {
// use "out" tensors' storage directly
geqrf_out_helper(input, QR, tau);
}
return std::tuple<Tensor&, Tensor&>(QR, tau);
}
std::tuple<Tensor, Tensor> geqrf(const Tensor& input) {
Tensor QR = at::empty({0}, input.options());
Tensor tau = at::empty({0}, input.options());
std::tie(QR, tau) = at::geqrf_outf(input, QR, tau);
return std::make_tuple(QR, tau);
}
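// Illustrative usage sketch (comment only); 'a' is a hypothetical tall matrix.
// geqrf returns the packed QR factorization plus the Householder scalars 'tau':
//   Tensor a = at::rand({5, 3}, at::kDouble);
//   Tensor QR, tau;
//   std::tie(QR, tau) = at::geqrf(a);
//   // R is the upper triangle of QR; the reflectors defining Q are stored below the diagonal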
/*
Computes the QR decomposition using GEQRF and ORGQR operations.
This is an in-place function and Q, R tensors must have correct shape and be Fortran contiguous.
Args:
* `input` - [in] Input tensor for QR decomposition
* `Q` - [out] Tensor containing the Q matrices of QR decomposition
* `R` - [out] Tensor containing the R matrices of QR decomposition
* `compute_q` - controls whether the Q tensor is computed
* `reduced_mode` - controls the size of Q and R tensors
For further details, please see the LAPACK documentation for GEQRF and ORGQR.
*/
void linalg_qr_out_helper(const Tensor& input, const Tensor& Q, const Tensor& R, bool compute_q, bool reduced_mode) {
TORCH_INTERNAL_ASSERT(input.dim() >= 2);
TORCH_INTERNAL_ASSERT(input.scalar_type() == Q.scalar_type());
TORCH_INTERNAL_ASSERT(input.device() == Q.device());
TORCH_INTERNAL_ASSERT(input.scalar_type() == R.scalar_type());
TORCH_INTERNAL_ASSERT(input.device() == R.device());
auto m = input.size(-2);
auto n = input.size(-1);
auto mn = std::min(m, n);
// Q must have the expected shape: reduced_mode ? (..., m, min(m, n)) : (..., m, m)
if (compute_q) {
auto expected_Q_shape = input.sizes().vec();
expected_Q_shape.back() = reduced_mode ? mn : m;
TORCH_INTERNAL_ASSERT(Q.sizes().equals(expected_Q_shape));
// Q tensor must be in batched column major order (Fortran contiguous)
TORCH_INTERNAL_ASSERT(Q.transpose(-2, -1).is_contiguous());
}
// R must have the expected shape: (reduced_mode || !compute_q) ? (..., min(m,n), n) : (..., m, n)
auto expected_R_shape = input.sizes().vec();
expected_R_shape.end()[-2] = (reduced_mode || !compute_q) ? mn : m;
TORCH_INTERNAL_ASSERT(R.sizes().equals(expected_R_shape));
// R tensor must be in batched column major order (Fortran contiguous)
TORCH_INTERNAL_ASSERT(R.transpose(-2, -1).is_contiguous());
auto tau_shape = input.sizes().vec();
tau_shape.pop_back();
tau_shape.back() = mn;
Tensor tau = at::empty(tau_shape, input.options());
// geqrf requires m x n workspace input that is modified in-place
// if m > n and reduced==true we use Q tensor for storing the result of geqrf operation
// otherwise R tensor is used
Tensor QR;
if (m <= n) {
QR = R;
} else { // m > n
if (compute_q) {
QR = reduced_mode ? Q : R;
} else {
// if m > n and compute_q==false we need to allocate an additional temporary tensor
QR = at::empty(input.transpose(-2, -1).sizes(), input.options());
QR.transpose_(-2, -1);
}
}
// geqrf_stub (apply_geqrf) performs calculations in-place and 'QR' must be a copy of input
QR.copy_(input);
geqrf_stub(input.device().type(), QR, tau);
// this is for mode='r'
if (!compute_q) {
// if m > n we used a temporary tensor to store the result of geqrf
if (m > n) {
R.copy_(QR.slice(-2, 0, mn));
}
R.triu_();
return;
}
// if the Q tensor was used for geqrf, copy the result for R from Q;
// otherwise copy the geqrf result stored in R into the leading columns of Q
if (m > n && reduced_mode) {
R.copy_(Q.slice(-2, 0, n));
} else {
Q.slice(-1, 0, n).copy_(R.slice(-1, 0, m));
}
R.triu_();
// Next perform ORGQR for Q using the result from GEQRF
orgqr_stub(input.device().type(), const_cast<Tensor&>(Q), tau);
}
std::tuple<Tensor, Tensor> _linalg_qr_helper_default(const Tensor& input, c10::string_view mode) {
bool compute_q, reduced_mode;
std::tie(compute_q, reduced_mode) = _parse_qr_mode(mode);
auto m = input.size(-2);
auto n = input.size(-1);
auto mn = std::min(m, n);
// Allocate Q, R tensors with correct shape and memory layout
Tensor Q;
if (compute_q) {
auto Qt_shape = input.sizes().vec();
Qt_shape.end()[-2] = reduced_mode ? mn : m;
Qt_shape.end()[-1] = m;
Q = at::empty(Qt_shape, input.options());
Q.transpose_(-2, -1); // make 'Q' with Fortran contiguous memory layout
} else {
Q = at::empty({0}, input.options());
}
auto Rt_shape = input.sizes().vec();
Rt_shape.end()[-2] = n;
Rt_shape.end()[-1] = (reduced_mode || !compute_q) ? mn : m;
Tensor R = at::empty(Rt_shape, input.options());
R.transpose_(-2, -1); // make 'R' with Fortran contiguous memory layout
// Now fill Q, R tensors with the result
linalg_qr_out_helper(input, Q, R, compute_q, reduced_mode);
return std::make_tuple(Q, R);
}
std::tuple<Tensor,Tensor> linalg_qr(const Tensor& self, c10::string_view mode) {
TORCH_CHECK(self.dim() >= 2,
"qr input should have at least 2 dimensions, but has ", self.dim(), " dimensions instead");
return at::_linalg_qr_helper(self, mode);
}
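// Illustrative usage sketch (comment only); 'a' is a hypothetical 5x3 input.
// mode selects the shapes: "reduced" gives Q (5x3), R (3x3); "complete" gives Q (5x5), R (5x3);
// "r" skips computing Q entirely:
//   Tensor a = at::rand({5, 3}, at::kDouble);
//   Tensor Q, R;
//   std::tie(Q, R) = at::linalg_qr(a, "reduced");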
std::tuple<Tensor&,Tensor&> linalg_qr_out(const Tensor& self, c10::string_view mode, Tensor& Q, Tensor& R) {
TORCH_CHECK(self.dim() >= 2,
"torch.linalg.qr: input should have at least 2 dimensions, but has ", self.dim(), " dimensions instead");
checkSameDevice("torch.linalg.qr", Q, self, "Q");
checkSameDevice("torch.linalg.qr", R, self, "R");
checkLinalgCompatibleDtype("torch.linalg.qr", Q, self, "Q");
checkLinalgCompatibleDtype("torch.linalg.qr", R, self, "R");
Tensor Q_tmp, R_tmp;
std::tie(Q_tmp, R_tmp) = at::_linalg_qr_helper(self, mode);
at::native::resize_output(Q, Q_tmp.sizes());
Q.copy_(Q_tmp);
at::native::resize_output(R, R_tmp.sizes());
R.copy_(R_tmp);
return std::tuple<Tensor&, Tensor&>(Q, R);
}
std::tuple<Tensor,Tensor> qr(const Tensor& self, bool some) {
TORCH_WARN_ONCE(
"torch.qr is deprecated in favor of torch.linalg.qr and will be removed in a future PyTorch release.\n",
"The boolean parameter 'some' has been replaced with a string parameter 'mode'.\n",
"Q, R = torch.qr(A, some)\n",
"should be replaced with\n",
"Q, R = torch.linalg.qr(A, 'reduced' if some else 'complete')"
);
const char* mode = some ? "reduced" : "complete";
return at::linalg_qr(self, mode);
}
std::tuple<Tensor&,Tensor&> qr_out(const Tensor& self, bool some, Tensor& Q, Tensor& R) {
TORCH_WARN_ONCE(
"torch.qr is deprecated in favor of torch.linalg.qr and will be removed in a future PyTorch release.\n",
"The boolean parameter 'some' has been replaced with a string parameter 'mode'.\n",
"Q, R = torch.qr(A, some)\n",
"should be replaced with\n",
"Q, R = torch.linalg.qr(A, 'reduced' if some else 'complete')"
);
const char* mode = some ? "reduced" : "complete";
return at::linalg_qr_out(Q, R, self, mode);
}
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ orgqr ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
DEFINE_DISPATCH(orgqr_stub);
/*
The householder_product (orgqr) function allows reconstruction of an orthogonal (or unitary) matrix Q
from a sequence of elementary reflectors, such as those produced by the geqrf function.
Args:
* `input` - Tensor with the directions of the elementary reflectors below the diagonal.
* `tau` - Tensor containing the magnitudes of the elementary reflectors.
* `result` - result Tensor, which will contain the orthogonal (or unitary) matrix Q.
For further details, please see the LAPACK/MAGMA documentation.
*/
Tensor& householder_product_out_helper(const Tensor& input, const Tensor& tau, Tensor& result) {
TORCH_INTERNAL_ASSERT(input.dim() >= 2);
TORCH_INTERNAL_ASSERT(input.size(-2) >= input.size(-1));
TORCH_INTERNAL_ASSERT(input.size(-1) >= tau.size(-1));
TORCH_INTERNAL_ASSERT(input.scalar_type() == tau.scalar_type());
TORCH_INTERNAL_ASSERT(input.device() == tau.device());
TORCH_INTERNAL_ASSERT(result.scalar_type() == input.scalar_type());
TORCH_INTERNAL_ASSERT(result.device() == input.device());
// if result has no elements we can modify it
if (result.numel() == 0) {
at::native::resize_as_(result, input.transpose(-2, -1), MemoryFormat::Contiguous);
result.transpose_(-2, -1);
}
// result tensor must be in batched column major order (Fortran contiguous)
TORCH_INTERNAL_ASSERT(result.transpose(-2, -1).is_contiguous());
TORCH_INTERNAL_ASSERT(result.sizes().equals(input.sizes()));
// tau tensor must be contiguous
Tensor tau_ = tau;
if (!tau.is_contiguous()) {
tau_ = at::empty(tau.sizes(), tau.options(), MemoryFormat::Contiguous);
tau_.copy_(tau);
}
// orgqr_stub (apply_orgqr) performs calculations in-place and result must be a copy of input
result.copy_(input);
result = orgqr_stub(result.device().type(), result, tau_);
return result;
}
Tensor& linalg_householder_product_out(const Tensor& input, const Tensor& tau, Tensor& result) {
TORCH_CHECK(input.dim() >= 2, "torch.linalg.householder_product: input must have at least 2 dimensions.");
TORCH_CHECK(
input.size(-2) >= input.size(-1),
"torch.linalg.householder_product: input.shape[-2] must be greater than or equal to input.shape[-1]");
TORCH_CHECK(
input.size(-1) >= tau.size(-1),
"torch.linalg.householder_product: input.shape[-1] must be greater than or equal to tau.shape[-1]");
TORCH_CHECK(
input.dim() - tau.dim() == 1,
"torch.linalg.householder_product: Expected tau to have one dimension less than input, but got tau.ndim equal to ",
tau.dim(),
" and input.ndim is equal to ",
input.dim());
if (input.dim() > 2) {
auto expected_batch_tau_shape = IntArrayRef(input.sizes().data(), input.dim() - 2); // input.shape[:-2]
auto actual_batch_tau_shape = IntArrayRef(tau.sizes().data(), tau.dim() - 1); // tau.shape[:-1]
TORCH_CHECK(
actual_batch_tau_shape.equals(expected_batch_tau_shape),
"torch.linalg.householder_product: Expected batch dimensions of tau to be equal to input.shape[:-2], but got ",
actual_batch_tau_shape);
}
TORCH_CHECK(
tau.scalar_type() == input.scalar_type(),
"torch.linalg.householder_product: tau dtype ",
tau.scalar_type(),
" does not match input dtype ",
input.scalar_type());
TORCH_CHECK(
input.device() == tau.device(),
"torch.linalg.householder_product: Expected input and tau to be on the same device, but found input on ",
input.device(),
" and tau on ",
tau.device(),
" instead.");
checkSameDevice("torch.linalg.householder_product", result, input);
checkLinalgCompatibleDtype("torch.linalg.householder_product", result, input);
// TODO: uncomment the following when passing incorrectly sized 'result' is not allowed
// if (result.numel() != 0) {
// // Resize messes up the strides, so let's not use at::native::resize_output
// TORCH_CHECK(result.sizes().equals(input.sizes()),
// "result shape ", result.sizes(), " does not match input shape ", input.sizes());
// }
bool result_input_same_type = (result.scalar_type() == input.scalar_type());
bool result_equal_expected_shape = result.sizes().equals(input.sizes());
bool is_batched_column_major = false;
if (result.dim() >= 2) {
is_batched_column_major = result.transpose(-2, -1).is_contiguous();
}
// if result is not empty and not in batched column major format
bool copy_needed = (result.numel() != 0 && !is_batched_column_major);
copy_needed |= !result_input_same_type; // or result does not have the same dtype as input
copy_needed |= (result.numel() != 0 && !result_equal_expected_shape); // or result does not have the expected shape
// we have to allocate a temporary tensor
if (copy_needed) {
Tensor result_tmp = at::empty({0}, input.options());
result_tmp = householder_product_out_helper(input, tau, result_tmp);
at::native::resize_output(result, result_tmp.sizes());
result.copy_(result_tmp);
} else {
// use result's storage directly
result = householder_product_out_helper(input, tau, result);
}
return result;
}
Tensor linalg_householder_product(const Tensor& input, const Tensor& tau) {
Tensor result = at::empty({0}, input.options());
result = at::linalg_householder_product_outf(input, tau, result);
return result;
}
// torch.orgqr is an alias of torch.linalg.householder_product
// torch.linalg.householder_product is the preferred new function
Tensor& orgqr_out(const Tensor& input, const Tensor& tau, Tensor& result) {
return at::linalg_householder_product_outf(input, tau, result);
}
Tensor orgqr(const Tensor& input, const Tensor& tau) {
return at::linalg_householder_product(input, tau);
}
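// Illustrative usage sketch (comment only); reconstructs Q from the output of geqrf.
// 'a' is a hypothetical tall matrix; the result is the reduced Q with orthonormal columns:
//   Tensor a = at::rand({5, 3}, at::kDouble);
//   Tensor QR, tau;
//   std::tie(QR, tau) = at::geqrf(a);
//   Tensor Q = at::linalg_householder_product(QR, tau);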
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
DEFINE_DISPATCH(ormqr_stub);
void ormqr_out_helper(const Tensor& input, const Tensor& tau, const Tensor& other, const Tensor& result, bool left, bool transpose) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.dim() >= 2);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(other.dim() >= 2);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(other.size(left ? -2 : -1) >= tau.size(-1));
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(other.size(left ? -2 : -1) == input.size(-2));
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.scalar_type() == tau.scalar_type());
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.device() == tau.device());
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.scalar_type() == other.scalar_type());
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.device() == other.device());
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(result.scalar_type() == input.scalar_type());
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(result.device() == input.device());
// if 'result' has no elements we can modify it
if (result.numel() == 0) {
at::native::resize_as_(result, other.transpose(-2, -1), MemoryFormat::Contiguous);
result.transpose_(-2, -1);
}
// 'result' tensor must be in batched column major order (Fortran contiguous)
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(result.transpose(-2, -1).is_contiguous());
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(result.sizes().equals(other.sizes()));
// 'tau' tensor must be contiguous
Tensor tau_ = tau;
if (!tau.is_contiguous()) {
tau_ = at::empty(tau.sizes(), tau.options(), MemoryFormat::Contiguous);
tau_.copy_(tau);
}
// 'input' tensor must be Fortran contiguous
Tensor input_ = input;
if (!input.transpose(-2, -1).is_contiguous()) {
input_ = at::empty(input.transpose(-2, -1).sizes(), input.options(), MemoryFormat::Contiguous);
input_.transpose_(-2, -1);
input_.copy_(input);
}
// ormqr_stub (apply_ormqr) performs calculations in-place and 'result' must be a copy of 'other'
result.copy_(other);
ormqr_stub(result.device().type(), input_, tau_, result, left, transpose);
}
Tensor& ormqr_out(const Tensor& input, const Tensor& tau, const Tensor& other, bool left, bool transpose, Tensor& result) {
TORCH_CHECK(input.dim() >= 2, "torch.ormqr: input must have at least 2 dimensions.");
TORCH_CHECK(other.dim() >= 2, "torch.ormqr: other must have at least 2 dimensions.");
int64_t left_size_condition = left ? -2 : -1;
TORCH_CHECK(
other.size(left_size_condition) >= tau.size(-1),
"torch.ormqr: other.shape[",
left_size_condition,
"] must be greater than or equal to tau.shape[-1]");
TORCH_CHECK(
other.size(left_size_condition) == input.size(-2),
"torch.ormqr: other.shape[",
left_size_condition,
"] must be equal to input.shape[-2]");
TORCH_CHECK(
input.dim() - tau.dim() == 1,
"torch.ormqr: ",
"Expected tau to have one dimension less than input, but got tau.ndim equal to ",
tau.dim(),
" and input.ndim is equal to ",
input.dim());
TORCH_CHECK(
input.dim() == other.dim(),
"torch.ormqr: ",
"Expected other to have the same number of dimensions as input, but got other.ndim equal to ",
other.dim(),
" and input.ndim is equal to ",
input.dim());
if (input.dim() > 2) {
auto expected_batch_shape = IntArrayRef(input.sizes().data(), input.dim() - 2); // input.shape[:-2]
auto actual_batch_tau_shape = IntArrayRef(tau.sizes().data(), tau.dim() - 1); // tau.shape[:-1]
TORCH_CHECK(
actual_batch_tau_shape.equals(expected_batch_shape),
"torch.ormqr: Expected batch dimensions of tau to be equal to input.shape[:-2], but got ",
actual_batch_tau_shape);
auto actual_batch_other_shape = IntArrayRef(other.sizes().data(), other.dim() - 2); // other.shape[:-2]
TORCH_CHECK(
actual_batch_other_shape.equals(expected_batch_shape),
"torch.ormqr: Expected batch dimensions of other to be equal to input.shape[:-2], but got ",
actual_batch_other_shape);
}
TORCH_CHECK(
tau.scalar_type() == input.scalar_type(),
"torch.ormqr: Expected input and tau to have the same dtype, but input has dtype ", input.scalar_type(),
" and tau has dtype ", tau.scalar_type());
TORCH_CHECK(
other.scalar_type() == input.scalar_type(),
"torch.ormqr: Expected input and other to have the same dtype, but input has dtype ", input.scalar_type(),
" and other has dtype ", other.scalar_type());
TORCH_CHECK(
result.scalar_type() == input.scalar_type(),
"torch.ormqr: Expected input and result to have the same dtype, but input has dtype ", input.scalar_type(),
" and result has dtype ", result.scalar_type());
checkSameDevice("torch.ormqr", tau, input, "tau");
checkSameDevice("torch.ormqr", other, input, "other");
checkSameDevice("torch.ormqr", result, input);
bool result_equal_expected_shape = result.sizes().equals(other.sizes());
bool is_batched_column_major = false;
if (result.dim() >= 2) {
is_batched_column_major = result.transpose(-2, -1).is_contiguous();
}
// if result is not empty and not in batched column major format
bool copy_needed = (result.numel() != 0 && !is_batched_column_major);
copy_needed |= (result.numel() != 0 && !result_equal_expected_shape); // or result does not have the expected shape
// we have to allocate a temporary tensor
if (copy_needed) {
Tensor result_tmp = at::empty({0}, input.options());
ormqr_out_helper(input, tau, other, result_tmp, left, transpose);
at::native::resize_output(result, result_tmp.sizes());
result.copy_(result_tmp);
} else {
// use result's storage directly
ormqr_out_helper(input, tau, other, result, left, transpose);
}
return result;
}
Tensor ormqr(const Tensor& input, const Tensor& tau, const Tensor& other, bool left, bool transpose) {
Tensor result = at::empty({0}, input.options());
result = at::native::ormqr_out(input, tau, other, left, transpose, result);
return result;
}
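// Illustrative usage sketch (comment only); applies Q^T from a geqrf factorization to
// another matrix without forming Q explicitly. 'a' and 'b' are hypothetical inputs:
//   Tensor a = at::rand({5, 3}, at::kDouble);
//   Tensor b = at::rand({5, 2}, at::kDouble);
//   Tensor QR, tau;
//   std::tie(QR, tau) = at::geqrf(a);
//   Tensor QTb = at::ormqr(QR, tau, b, /*left=*/true, /*transpose=*/true); // Q^T @ b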
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ linalg_eigh ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
DEFINE_DISPATCH(linalg_eigh_stub);
/*
Computes eigenvalues and eigenvectors of the tensor 'input'.
Args:
* 'input' - input Tensor for eigendecomposition
* 'values' - Tensor to store computed eigenvalues
* 'vectors' - Tensor to store computed eigenvectors
* 'infos' - Tensor to store LAPACK/MAGMA/cuSOLVER error codes
* 'compute_eigenvectors' - controls whether eigenvectors should be computed
* 'uplo_str' - controls the portion of input matrix to consider in computations, allowed values are "u", "U", "l", "L"
"u", "U" - upper triangular portion of the input matrix is used in computations; "l", "L" - lower.
*/
void linalg_eigh_out_info(
const Tensor& input,
const Tensor& values,
const Tensor& vectors,
const Tensor& infos,
bool compute_eigenvectors,
const c10::string_view uplo_str) {
// These internal asserts make explicit the assumptions in the implementation
// Error checks with the actual error messages are done at a higher level in
// the hierarchy of calls
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.dim() >= 2);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.size(-2) == input.size(-1));
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.device() == vectors.device());
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.device() == values.device());
// eigenvalues are always real-valued
// NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores)
ScalarType real_dtype = toValueType(input.scalar_type());
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(values.scalar_type() == real_dtype);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.scalar_type() == vectors.scalar_type());
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(infos.scalar_type() == at::kInt);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(infos.device() == input.device());
// infos can have a shape equal to input.shape[:-2] or (batchCount(input), ); both work with the current implementation.
// infos.shape == input.shape[:-2] might be useful in the future for more easily checking the error code of a specific matrix
// in a batched input, once there is a user-exposed way to get the infos tensor.
// A 1-dimensional tensor of shape (batchCount(input), ) is currently used everywhere in the internal implementation.
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(infos.numel() == std::max<int64_t>(1, batchCount(input)));
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(infos.is_contiguous());
// if 'vectors' has no elements we can modify it
if (vectors.numel() == 0) {
vectors.resize_(input.sizes(), MemoryFormat::Contiguous);
vectors.transpose_(-2, -1); // make 'vectors' to have Fortran contiguous memory layout
}
// if 'values' has no elements we can modify it
auto values_shape = IntArrayRef(input.sizes().data(), input.dim()-1); // input.shape[:-1]
if (values.numel() == 0) {
values.resize_(values_shape, MemoryFormat::Contiguous);
}
// 'vectors' must be in batched column major order (Fortran contiguous)
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(vectors.transpose(-2, -1).is_contiguous());
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(vectors.sizes().equals(input.sizes()));
// 'values' must be contiguous
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(values.is_contiguous());
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(values.sizes().equals(values_shape));
// linalg_eigh_stub performs calculations in-place and 'vectors' must be a copy of 'input'
vectors.copy_(input);
// NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
char uplo = std::toupper(uplo_str[0]);
bool upper = (uplo == 'U');
linalg_eigh_stub(input.device().type(), values, vectors, infos, upper, compute_eigenvectors);
}
std::tuple<Tensor, Tensor> linalg_eigh(const Tensor& input, c10::string_view uplo) {
squareCheckInputs(input);
checkUplo(uplo);
ScalarType real_dtype = toValueType(input.scalar_type());
Tensor values = at::empty({0}, input.options().dtype(real_dtype));
Tensor vectors = at::empty({0}, input.options());
Tensor infos = at::zeros({std::max<int64_t>(1, batchCount(input))}, input.options().dtype(kInt));
linalg_eigh_out_info(input, values, vectors, infos, true, uplo);
if (input.dim() > 2) {
batchCheckErrors(infos, "torch.linalg.eigh");
} else {
singleCheckErrors(infos.item().toInt(), "torch.linalg.eigh");
}
return std::tuple<Tensor, Tensor>(values, vectors);
}
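// Illustrative usage sketch (comment only); 'a' is a hypothetical symmetric matrix.
// UPLO selects which triangle is read ("L" here); eigenvalues are returned in ascending order:
//   Tensor a = at::rand({3, 3}, at::kDouble);
//   a = a + a.transpose(-2, -1); // symmetrize
//   Tensor w, v;
//   std::tie(w, v) = at::linalg_eigh(a, "L");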
// TODO: it's possible to make the _out variant to be a primal function and implement linalg_eigh on top of _out
// TODO: implement _out variant avoiding copy and using already allocated storage directly
std::tuple<Tensor&, Tensor&> linalg_eigh_out(const Tensor& input, c10::string_view uplo, Tensor& eigvals, Tensor& eigvecs) {
checkLinalgCompatibleDtype("torch.linalg.eigh", eigvecs, input, "eigenvectors");
// eigenvalues are always real-valued here
ScalarType real_dtype = toValueType(input.scalar_type());
checkLinalgCompatibleDtype("torch.linalg.eigh", eigvals.scalar_type(), real_dtype, "eigenvalues");
Tensor eigvals_tmp, eigvecs_tmp;
std::tie(eigvals_tmp, eigvecs_tmp) = at::linalg_eigh(input, uplo);
at::native::resize_output(eigvals, eigvals_tmp.sizes());
eigvals.copy_(eigvals_tmp);
at::native::resize_output(eigvecs, eigvecs_tmp.sizes());
eigvecs.copy_(eigvecs_tmp);
return std::tuple<Tensor&, Tensor&>(eigvals, eigvecs);
}
Tensor linalg_eigvalsh(const Tensor& input, c10::string_view uplo) {
// if input requires grad we must compute the eigenvectors to make this function differentiable
// the eigenvectors are not exposed to the user
if (at::GradMode::is_enabled() && input.requires_grad()) {
Tensor values;
std::tie(values, std::ignore) = at::linalg_eigh(input, uplo);
return values;
}
ScalarType real_dtype = toValueType(input.scalar_type());
Tensor values = at::empty({0}, input.options().dtype(real_dtype));
values = at::linalg_eigvalsh_outf(input, uplo, values);
return values;
}
Tensor& linalg_eigvalsh_out(const Tensor& input, c10::string_view uplo, Tensor& result) {
ScalarType real_dtype = toValueType(input.scalar_type());
checkLinalgCompatibleDtype("torch.linalg.eigvalsh", result.scalar_type(), real_dtype);
squareCheckInputs(input);
checkUplo(uplo);
auto expected_result_shape = IntArrayRef(input.sizes().data(), input.dim()-1); // input.shape[:-1]
bool result_equal_expected_shape = result.sizes().equals(expected_result_shape);
bool expected_result_type = (result.scalar_type() == real_dtype);
bool copy_needed = !expected_result_type;
copy_needed |= (result.numel() != 0 && !result_equal_expected_shape);
copy_needed |= (result.numel() != 0 && !result.is_contiguous());
Tensor vectors = at::empty({0}, input.options());
Tensor infos = at::zeros({std::max<int64_t>(1, batchCount(input))}, input.options().dtype(kInt));
if (copy_needed) { // we have to allocate a temporary tensor
Tensor result_tmp = at::empty({expected_result_shape}, input.options().dtype(real_dtype));
linalg_eigh_out_info(input, result_tmp, vectors, infos, /*compute_eigenvectors=*/false, uplo);
at::native::resize_output(result, result_tmp.sizes());
result.copy_(result_tmp);
} else {
// else use the provided output storage directly
linalg_eigh_out_info(input, result, vectors, infos, /*compute_eigenvectors=*/false, uplo);
}
if (input.dim() > 2) {
batchCheckErrors(infos, "torch.linalg.eigvalsh");
} else {
singleCheckErrors(infos.item().toInt(), "torch.linalg.eigvalsh");
}
return result;
}
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ symeig ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
template <typename scalar_t>
static void apply_symeig(Tensor& self, Tensor& eigvals, bool eigenvectors, bool upper, std::vector<int64_t>& infos) {
#if !AT_BUILD_WITH_LAPACK()
AT_ERROR("symeig: LAPACK library not found in compilation");
#else
using value_t = typename c10::scalar_value_type<scalar_t>::type;
auto self_data = self.data_ptr<scalar_t>();
auto eigvals_data = eigvals.data_ptr<value_t>();
auto self_matrix_stride = matrixStride(self);
auto eigvals_stride = eigvals.size(-1);
auto batch_size = batchCount(self);
auto n = self.size(-1);
char uplo = upper ? 'U' : 'L';
char jobz = eigenvectors ? 'V' : 'N';
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int info;
// Run once, first to get the optimum work size.
// Since we deal with batches of matrices with the same dimensions, doing this outside
// the loop saves (batch_size - 1) workspace queries which would provide the same result
// and (batch_size - 1) calls to allocate and deallocate workspace using at::empty()
int lwork = -1;
scalar_t wkopt;
Tensor rwork;
value_t* rwork_data = nullptr;
if (isComplexType(at::typeMetaToScalarType(self.dtype()))) {
int64_t lrwork = std::max(int64_t(1), 3 * n - 2);
ScalarType dtype = toValueType(typeMetaToScalarType(self.dtype()));
rwork = at::empty({lrwork}, self.options().dtype(dtype));
rwork_data = rwork.data_ptr<value_t>();
}
lapackSymeig<scalar_t, value_t>(jobz, uplo, n, self_data, n, eigvals_data, &wkopt, lwork, rwork_data, &info);
lwork = std::max<int>(1, real_impl<scalar_t, value_t>(wkopt));
Tensor work = at::empty({lwork}, self.options());
for (const auto i : c10::irange(batch_size)) {
scalar_t* self_working_ptr = &self_data[i * self_matrix_stride];
value_t* eigvals_working_ptr = &eigvals_data[i * eigvals_stride];
// now compute the eigenvalues and the eigenvectors (optionally)
lapackSymeig<scalar_t, value_t>(jobz, uplo, n, self_working_ptr, n, eigvals_working_ptr, work.data_ptr<scalar_t>(), lwork, rwork_data, &info);
infos[i] = info;
if (info != 0) {
return;
}
}
#endif
}
std::tuple<Tensor, Tensor> _symeig_helper_cpu(const Tensor& self, bool eigenvectors, bool upper) {
std::vector<int64_t> infos(batchCount(self), 0);
auto self_sizes = self.sizes().vec();
self_sizes.pop_back();
ScalarType dtype = toValueType(typeMetaToScalarType(self.dtype()));
auto eigvals = at::empty(self_sizes, self.options().dtype(dtype));
if (self.numel() == 0) {
return std::tuple<Tensor, Tensor>(eigvals, at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT));
}
auto self_working_copy = cloneBatchedColumnMajor(self);
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self.scalar_type(), "symeig_cpu", [&]{
apply_symeig<scalar_t>(self_working_copy, eigvals, eigenvectors, upper, infos);
});
if (self.dim() > 2) {
batchCheckErrors(infos, "symeig_cpu");
} else {
singleCheckErrors(infos[0], "symeig_cpu");
}
if (eigenvectors) {
return std::tuple<Tensor, Tensor>(eigvals, self_working_copy);
} else {
return std::tuple<Tensor, Tensor>(eigvals, at::empty({0}, self.options()));
}
}
std::tuple<Tensor, Tensor> symeig(const Tensor& self, bool eigenvectors, bool upper) {
TORCH_WARN_ONCE(
"torch.symeig is deprecated in favor of torch.linalg.eigh and will be removed in a future ",
"PyTorch release.\n",
"The default behavior has changed from using the upper triangular portion of the matrix by default ",
"to using the lower triangular portion.\n",
"L, _ = torch.symeig(A, upper=upper)\n",
"should be replaced with\n",
"L = torch.linalg.eigvalsh(A, UPLO='U' if upper else 'L')\n",
"and\n",
"L, V = torch.symeig(A, eigenvectors=True)\n"
"should be replaced with\n",
"L, V = torch.linalg.eigh(A, UPLO='U' if upper else 'L')"
);
squareCheckInputs(self);
return at::_symeig_helper(self, eigenvectors, upper);
}
std::tuple<Tensor&, Tensor&> symeig_out(const Tensor& self, bool eigenvectors, bool upper, Tensor& vals, Tensor& vecs) {
TORCH_WARN_ONCE(
"torch.symeig is deprecated in favor of torch.linalg.eigh and will be removed in a future ",
"PyTorch release.\n",
"The default behavior has changed from using the upper triangular portion of the matrix by default ",
"to using the lower triangular portion.\n",
"L, _ = torch.symeig(A, upper=upper)\n",
"should be replaced with\n",
"L = torch.linalg.eigvalsh(A, UPLO='U' if upper else 'L')\n",
"and\n",
"L, V = torch.symeig(A, eigenvectors=True)\n"
"should be replaced with\n",
"L, V = torch.linalg.eigh(A, UPLO='U' if upper else 'L')"
);
checkSameDevice("symeig", vals, self, "eigenvalues");
checkSameDevice("symeig", vecs, self, "eigenvectors");
checkLinalgCompatibleDtype("symeig", vecs, self, "eigenvectors");
// eigenvalues are always real-valued here
ScalarType real_dtype = toValueType(self.scalar_type());
checkLinalgCompatibleDtype("symeig", vals.scalar_type(), real_dtype, "eigenvalues");
Tensor vals_tmp, vecs_tmp;
std::tie(vals_tmp, vecs_tmp) = at::symeig(self, eigenvectors, upper);
at::native::resize_output(vals, vals_tmp.sizes());
at::native::resize_output(vecs, vecs_tmp.sizes());
vals.copy_(vals_tmp);
vecs.copy_(vecs_tmp);
return std::tuple<Tensor&, Tensor&>(vals, vecs);
}
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ linalg_eig ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// This function returns complex-valued eigenvectors obtained from LAPACK GEEV's real-valued output
// It is also used for the MAGMA path because MAGMA's intermediate results live on CPU
template <typename scalar_t>
static void linalg_eig_make_complex_eigenvectors_impl(Tensor& result, const Tensor& complex_values, const Tensor& real_vectors) {
// From GEEV documentation:
// Complex conjugate pairs of eigenvalues appear consecutively with the eigenvalue having the positive imaginary part first
// If the j-th eigenvalue is real, then v(j) = VR(:,j), the j-th column of VR.
// If the j-th and (j+1)-st eigenvalues form a complex conjugate pair, then v(j) = VR(:,j) + i*VR(:,j+1) and v(j+1) = VR(:,j) - i*VR(:,j+1).
auto batch_size = batchCount(real_vectors);
auto n = real_vectors.size(-1);
auto matrix_stride = matrixStride(real_vectors);
auto result_data = result.data_ptr<c10::complex<scalar_t>>();
auto real_vectors_data = real_vectors.data_ptr<scalar_t>();
auto values_data = complex_values.data_ptr<c10::complex<scalar_t>>();
for (auto b = decltype(batch_size){0}; b < batch_size; b++) {
scalar_t* vecs = &real_vectors_data[b * matrix_stride];
c10::complex<scalar_t>* res = &result_data[b * matrix_stride];
c10::complex<scalar_t>* vals = &values_data[b * n];
for (auto j = decltype(n){0}; j < n; j++) {
if (vals[j].imag() == 0.0) { // eigenvalue is real, then v(j) = VR(:,j)
for (auto i = decltype(n){0}; i < n; i++) {
res[j * n + i] = c10::complex<scalar_t>(vecs[j * n + i], 0);
}
} else {
for (auto i = decltype(n){0}; i < n; i++) {
res[j * n + i] = c10::complex<scalar_t>(vecs[j * n + i], vecs[(j+1) * n + i]); // v(j) = VR(:,j) + i*VR(:,j+1)
res[(j+1) * n + i] = c10::complex<scalar_t>(vecs[j * n + i], -vecs[(j+1) * n + i]); // v(j+1) = VR(:,j) - i*VR(:,j+1)
}
j++;
}
}
}
}
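// Worked example of the pairing above (comment only): for a real 2x2 matrix with a complex
// conjugate pair of eigenvalues a +/- b*i, GEEV stores a single real matrix VR in which
// column 0 holds Re(v) and column 1 holds Im(v). The loop reconstructs
//   v(0) = VR(:,0) + i*VR(:,1)   and   v(1) = VR(:,0) - i*VR(:,1),
// and skips column 1 on the next iteration because both eigenvectors of the pair were already filled.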
static Tensor& linalg_eig_make_complex_eigenvectors(Tensor& complex_vectors, const Tensor& complex_values, const Tensor& real_vectors) {
// These asserts make explicit the requirements on tensors for 'linalg_eig_make_complex_eigenvectors_impl'
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(complex_vectors.device() == at::kCPU);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(complex_values.device() == at::kCPU);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(real_vectors.device() == at::kCPU);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(complex_vectors.is_complex());
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(complex_values.is_complex());
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(real_vectors.is_floating_point());
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(complex_vectors.transpose(-2, -1).is_contiguous());
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(complex_values.is_contiguous());
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(real_vectors.transpose(-2, -1).is_contiguous());
AT_DISPATCH_FLOATING_TYPES(real_vectors.scalar_type(), "linalg_eig_make_complex_vector", [&]{
linalg_eig_make_complex_eigenvectors_impl<scalar_t>(complex_vectors, complex_values, real_vectors);
});
return complex_vectors;
}
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
DEFINE_DISPATCH(linalg_eig_stub);
std::tuple<Tensor&, Tensor&> linalg_eig_out_info(const Tensor& input, Tensor& values, Tensor& vectors, Tensor& infos, bool compute_eigenvectors) {
// MAGMA doesn't have GPU interface for GEEV routine, it requires inputs to be on CPU
// therefore we create all intermediate tensors on CPU
auto options = input.options().device(at::kCPU);
// These internal asserts make explicit the assumptions in the implementation
// Error checks with the actual error messages are done at a higher level in the hierarchy of calls
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.dim() >= 2);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.size(-2) == input.size(-1));
// for real-valued 'input', eigenvalues can be real-valued or complex-valued
TORCH_INTERNAL_ASSERT_DEBUG_ONLY((toComplexType(input.scalar_type()) == values.scalar_type()) || (input.scalar_type() == values.scalar_type()));
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(values.device() == at::kCPU);
// for real-valued 'input', eigenvectors can be real-valued or complex-valued
if (compute_eigenvectors) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY((toComplexType(input.scalar_type()) == vectors.scalar_type()) || (input.scalar_type() == vectors.scalar_type()));
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(vectors.device() == at::kCPU);
}
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(infos.scalar_type() == at::kInt);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(infos.device() == at::kCPU);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(infos.numel() == std::max<int64_t>(1, batchCount(input)));
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(infos.is_contiguous());
// if 'vectors' has no elements we can modify it
if (vectors.numel() == 0 && compute_eigenvectors) {
vectors.resize_(input.sizes(), MemoryFormat::Contiguous);
vectors.transpose_(-2, -1); // make 'vectors' have a Fortran-contiguous memory layout
}
// if 'values' has no elements we can modify it
auto values_shape = IntArrayRef(input.sizes().data(), input.dim()-1); // input.shape[:-1]
if (values.numel() == 0) {
values.resize_(values_shape, MemoryFormat::Contiguous);
}
// 'vectors' must be in batched column major order (Fortran contiguous)
if (compute_eigenvectors) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(vectors.transpose(-2, -1).is_contiguous());
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(vectors.sizes().equals(input.sizes()));
}
// 'values' must be contiguous
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(values.is_contiguous());
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(values.sizes().equals(values_shape));
// if 'input' is complex then use 'values' directly else create a temporary to hold the real and imaginary parts
// and then use at::complex_out
Tensor real_imag_values = values;
// if 'input' is complex then use 'vectors' directly else maybe create a temporary to hold real vectors
// and then use linalg_eig_make_complex_eigenvectors
Tensor maybe_complex_vectors = vectors;
if (!input.is_complex()) {
// first n elements to hold the real portion of the output and the last n elements to hold the imaginary portion
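// e.g. for a single n-by-n input the last dimension of 'real_imag_values' has size 2*n and is laid out as
// [Re(lambda_1), ..., Re(lambda_n), Im(lambda_1), ..., Im(lambda_n)], matching the slices taken below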
auto real_imag_shape = IntArrayRef(input.sizes().data(), input.dim()-2).vec(); // input.shape[:-2]
real_imag_shape.push_back(input.size(-1) * 2);
real_imag_values = at::empty(real_imag_shape, options, MemoryFormat::Contiguous);
// linalg_eig_stub expects a real-valued tensor to store the eigenvectors
// the output of linalg_eig_stub needs to be post-processed later to produce complex-valued eigenvectors
// we do this post-processing only if 'vectors' is complex-valued
// otherwise storage of 'vectors' is used directly
if (vectors.is_complex() && compute_eigenvectors) {
maybe_complex_vectors = at::empty(input.sizes(), options, MemoryFormat::Contiguous);
maybe_complex_vectors.transpose_(-2, -1); // make 'maybe_complex_vectors' have a Fortran-contiguous memory layout
}
}
// MAGMA uses a hybrid CPU-GPU algorithm that performs well only for large matrices
// See: https://github.com/pytorch/pytorch/pull/52491#issuecomment-795685687
// Here we call the CPU path for matrices of size up to 2048x2048;
// in general this should be significantly faster than calling MAGMA
if (input.size(-1) <= 2048) {
linalg_eig_stub(at::kCPU, real_imag_values, maybe_complex_vectors, infos, input.to(kCPU), compute_eigenvectors);
} else {
linalg_eig_stub(input.device().type(), real_imag_values, maybe_complex_vectors, infos, input, compute_eigenvectors);
}
// if input is not complex we need to do some post-processing
if (!input.is_complex()) {
// extract real and imaginary parts of the output
auto real_values = real_imag_values.slice(/*dim=*/-1, /*start=*/0, /*end*/input.size(-1));
auto imag_values = real_imag_values.slice(/*dim=*/-1, /*start=*/input.size(-1));
// if the imaginary part is zero we don't need to do anything
bool is_zero_imag = at::all(imag_values == 0.0).item().toBool();
if (is_zero_imag) {
values.copy_(real_values);
if (compute_eigenvectors) {
vectors.copy_(maybe_complex_vectors); // does nothing for !vectors.is_complex() because vectors.is_same(maybe_complex_vectors) == true
}
return std::tuple<Tensor&, Tensor&>(values, vectors);
}
if (values.is_complex()) {
values = at::complex_out(values, real_values, imag_values);
} else {
TORCH_CHECK(false, "torch.linalg.eig: imaginary part of eigenvalues is non-zero, can't safely cast eigenvalues to non-complex dtype.")
}
if (compute_eigenvectors) {
if (vectors.is_complex()) {
vectors = linalg_eig_make_complex_eigenvectors(vectors, values, maybe_complex_vectors);
} else {
TORCH_CHECK(false, "torch.linalg.eig: imaginary part of eigenvectors is non-zero, can't safely cast eigenvectors to non-complex dtype.")
}
}
}
return std::tuple<Tensor&, Tensor&>(values, vectors);
}
std::tuple<Tensor&, Tensor&> linalg_eig_out(const Tensor& input, Tensor& values, Tensor& vectors) {
squareCheckInputs(input);
// unlike NumPy, for real-valued inputs the output is always complex-valued
checkLinalgCompatibleDtype("torch.linalg.eig", values.scalar_type(), toComplexType(input.scalar_type()), "eigenvalues");
checkLinalgCompatibleDtype("torch.linalg.eig", vectors.scalar_type(), toComplexType(input.scalar_type()), "eigenvectors");
checkSameDevice("torch.linalg.eig", values, input, "eigenvalues");
checkSameDevice("torch.linalg.eig", vectors, input, "eigenvectors");
// MAGMA doesn't have a GPU interface for the GEEV routine; it requires inputs to be on the CPU
auto options = input.options().device(at::kCPU);
auto infos = at::zeros({std::max<int64_t>(1, batchCount(input))}, options.dtype(kInt));
// if result is not empty and not in batched column major format we have to allocate a temporary tensor
bool is_batched_column_major = false;
if (vectors.dim() >= 2) {
is_batched_column_major = vectors.transpose(-2, -1).is_contiguous();
}
bool values_expected_type = (values.scalar_type() == toComplexType(input.scalar_type()));
bool vectors_expected_type = (vectors.scalar_type() == toComplexType(input.scalar_type()));
auto expected_values_shape = IntArrayRef(input.sizes().data(), input.dim()-1); // input.shape[:-1]
bool values_equal_expected_shape = values.sizes().equals(expected_values_shape);
bool vectors_equal_expected_shape = vectors.sizes().equals(input.sizes());
// if result is not empty and not in batched column major format
bool values_tmp_needed = (values.numel() != 0 && !values.is_contiguous());
bool vectors_tmp_needed = (vectors.numel() != 0 && !is_batched_column_major);
// or result does not have the expected shape
values_tmp_needed |= (values.numel() != 0 && !values_equal_expected_shape);
vectors_tmp_needed |= (vectors.numel() != 0 && !vectors_equal_expected_shape);
// or result does not have the expected dtype
values_tmp_needed |= !values_expected_type;
vectors_tmp_needed |= !vectors_expected_type;
// we will allocate a temporary tensor and do the copy,
// because MAGMA's GEEV takes CPU inputs and returns CPU outputs;
// "out" tensors that are on a GPU device can't be used directly
values_tmp_needed |= values.is_cuda();
vectors_tmp_needed |= vectors.is_cuda();
// determine the appropriate scalar_type for the temporary tensors
ScalarType values_type = input.scalar_type();
ScalarType vectors_type = input.scalar_type();
if (!input.is_complex()) {
// for real-valued input we can have either real- or complex-valued output
ScalarType input_complex_dtype = toComplexType(input.scalar_type());
values_type = values.is_complex() ? input_complex_dtype : values_type;
vectors_type = vectors.is_complex() ? input_complex_dtype : vectors_type;
}
if (values_tmp_needed && vectors_tmp_needed) {
Tensor values_tmp = at::empty({0}, options.dtype(values_type));
Tensor vectors_tmp = at::empty({0}, options.dtype(vectors_type));
std::tie(values_tmp, vectors_tmp) = linalg_eig_out_info(input, values_tmp, vectors_tmp, infos, true);
at::native::resize_output(values, values_tmp.sizes());
values.copy_(values_tmp);
at::native::resize_output(vectors, vectors_tmp.sizes());
vectors.copy_(vectors_tmp);
} else if (!values_tmp_needed && vectors_tmp_needed) {
// use 'values' storage directly
Tensor vectors_tmp = at::empty({0}, options.dtype(vectors_type));
std::tie(values, vectors_tmp) = linalg_eig_out_info(input, values, vectors_tmp, infos, true);
at::native::resize_output(vectors, vectors_tmp.sizes());
vectors.copy_(vectors_tmp);
} else if (values_tmp_needed && !vectors_tmp_needed) {
// use 'vectors' storage directly
Tensor values_tmp = at::empty({0}, options.dtype(values_type));
std::tie(values_tmp, vectors) = linalg_eig_out_info(input, values_tmp, vectors, infos, true);
at::native::resize_output(values, values_tmp.sizes());
values.copy_(values_tmp);
} else {
// use 'values' and 'vectors' storage directly
std::tie(values, vectors) = linalg_eig_out_info(input, values, vectors, infos, true);
}
// Now check LAPACK/MAGMA error codes
if (input.dim() > 2) {
batchCheckErrors(infos, "torch.linalg.eig");
} else {
singleCheckErrors(infos.item().toInt(), "torch.linalg.eig");
}
return std::tuple<Tensor&, Tensor&>(values, vectors);
}
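// Usage sketch (illustrative only; assumes a square, diagonalizable floating-point or complex tensor 'A'):
//   Tensor eigenvalues, eigenvectors;
//   std::tie(eigenvalues, eigenvectors) = at::linalg_eig(A);
//   // A (cast to the complex result dtype) is approximately
//   // eigenvectors.matmul(at::diag_embed(eigenvalues)).matmul(at::linalg_inv(eigenvectors))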
std::tuple<Tensor, Tensor> linalg_eig(const Tensor& input) {
ScalarType complex_dtype = toComplexType(input.scalar_type());
Tensor values = at::empty({0}, input.options().dtype(complex_dtype));
Tensor vectors = at::empty({0}, input.options().dtype(complex_dtype));
at::linalg_eig_outf(input, values, vectors);
return std::tuple<Tensor, Tensor>(values, vectors);
}
Tensor& linalg_eigvals_out(const Tensor& input, Tensor& values) {
squareCheckInputs(input);
// unlike NumPy, for real-valued inputs the output is always complex-valued
checkLinalgCompatibleDtype("torch.linalg.eigvals", values.scalar_type(), toComplexType(input.scalar_type()), "eigenvalues");
checkSameDevice("torch.linalg.eigvals", values, input, "eigenvalues");
// MAGMA doesn't have a GPU interface for the GEEV routine; it requires inputs to be on the CPU
auto options = input.options().device(at::kCPU);
auto infos = at::zeros({std::max<int64_t>(1, batchCount(input))}, options.dtype(kInt));
bool values_expected_type = (values.scalar_type() == toComplexType(input.scalar_type()));
auto expected_values_shape = IntArrayRef(input.sizes().data(), input.dim()-1); // input.shape[:-1]
bool values_equal_expected_shape = values.sizes().equals(expected_values_shape);
// if result is not empty and not in batched column major format
bool values_tmp_needed = (values.numel() != 0 && !values.is_contiguous());
// or result does not have the expected shape
values_tmp_needed |= (values.numel() != 0 && !values_equal_expected_shape);
// or result does not have the expected dtype
values_tmp_needed |= !values_expected_type;
// we will allocate a temporary tensor and do the copy,
// because MAGMA's GEEV takes CPU inputs and returns CPU outputs;
// a 'values' tensor that is on a GPU device can't be used directly
values_tmp_needed |= values.is_cuda();
// determine the appropriate scalar_type for the temporary tensors
ScalarType values_type = input.scalar_type();
if (!input.is_complex()) {
// for real-valued input we can have either real- or complex-valued output
ScalarType input_complex_dtype = toComplexType(input.scalar_type());
values_type = values.is_complex() ? input_complex_dtype : values_type;
}
Tensor vectors;
if (values_tmp_needed) {
Tensor values_tmp = at::empty({0}, options.dtype(values_type));
std::tie(values_tmp, std::ignore) = linalg_eig_out_info(input, values_tmp, vectors, infos, /*compute_eigenvectors=*/false);
at::native::resize_output(values, values_tmp.sizes());
values.copy_(values_tmp);
} else { // use 'values' storage directly
std::tie(values, std::ignore) = linalg_eig_out_info(input, values, vectors, infos, /*compute_eigenvectors=*/false);
}
// Now check LAPACK/MAGMA error codes
if (input.dim() > 2) {
batchCheckErrors(infos, "torch.linalg.eigvals");
} else {
singleCheckErrors(infos.item().toInt(), "torch.linalg.eigvals");
}
return values;
}
Tensor linalg_eigvals(const Tensor& input) {
// if input requires grad we must compute the eigenvectors to make this function differentiable
// the eigenvectors are not exposed to the user
if (at::GradMode::is_enabled() && input.requires_grad()) {
return std::get<0>(at::linalg_eig(input));
}
ScalarType complex_dtype = toComplexType(input.scalar_type());
Tensor values = at::empty({0}, input.options().dtype(complex_dtype));
at::linalg_eigvals_outf(input, values);
return values;
}
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ eig ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
DEFINE_DISPATCH(eig_stub);
std::tuple<Tensor&, Tensor&> eig_out(const Tensor& self, bool eigenvectors, Tensor& e, Tensor& v) {
TORCH_WARN_ONCE(
"torch.eig is deprecated in favor of torch.linalg.eig and will be removed in a future ",
"PyTorch release.\n",
"torch.linalg.eig returns complex tensors of dtype cfloat or cdouble rather than real tensors ",
"mimicking complex tensors.\n",
"L, _ = torch.eig(A)\n",
"should be replaced with\n",
"L_complex = torch.linalg.eigvals(A)\n",
"and\n",
"L, V = torch.eig(A, eigenvectors=True)\n",
"should be replaced with\n",
"L_complex, V_complex = torch.linalg.eig(A)"
);
TORCH_CHECK(self.dim() == 2, "input should be 2 dimensional");
TORCH_CHECK(self.size(0) == self.size(1), "input should be square");
TORCH_CHECK(self.isfinite().all().item<bool>(), "input should not contain infs or NaNs");
checkSameDevice("torch.eig", e, self, "eigenvalues");
checkLinalgCompatibleDtype("torch.eig", e, self, "eigenvalues");
if (eigenvectors) {
checkSameDevice("torch.eig", v, self, "eigenvectors");
checkLinalgCompatibleDtype("torch.eig", v, self, "eigenvectors");
}
int64_t n = self.size(-1);
if (isComplexType(at::typeMetaToScalarType(self.dtype()))) {
at::native::resize_output(e, {n});
} else {
at::native::resize_output(e, {n, 2});
}
if (eigenvectors) {
at::native::resize_output(v, self.sizes());
}
// optimization: if self is empty, we can immediately return the empty
// tensors, instead of getting empty tensors from the eig_stub call below
if (self.numel() == 0) {
return std::tuple<Tensor&, Tensor&>(e, v);
}
Tensor vals_, vecs_;
std::tie(vals_, vecs_) = eig_stub(self.device().type(), self, eigenvectors);
e.copy_(vals_);
if (eigenvectors) {
v.copy_(vecs_);
}
return std::tuple<Tensor&, Tensor&>(e, v);
}
std::tuple<Tensor,Tensor> eig(const Tensor& self, bool eigenvectors) {
Tensor e = at::empty({0}, self.options());
Tensor v = at::empty({0}, self.options());
at::eig_out(e, v, self, eigenvectors);
return std::tuple<Tensor, Tensor>(e, v);
}
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ svd ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
template <typename scalar_t>
static void apply_svd(Tensor& self, Tensor& U, Tensor& S, Tensor& VT,
char jobz, std::vector<int64_t>& infos) {
#if !AT_BUILD_WITH_LAPACK()
AT_ERROR("svd: LAPACK library not found in compilation");
#else
using value_t = typename c10::scalar_value_type<scalar_t>::type;
auto self_data = self.data_ptr<scalar_t>();
auto U_data = U.data_ptr<scalar_t>();
auto S_data = S.data_ptr<value_t>();
auto VT_data = VT.data_ptr<scalar_t>();
auto self_stride = matrixStride(self);
auto U_stride = matrixStride(U);
auto S_stride = S.size(-1);
auto VT_stride = matrixStride(VT);
auto batchsize = batchCount(self);
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int info;
auto m = self.size(-2);
auto n = self.size(-1);
auto lda = std::max<int64_t>(1, m);
auto ldvt = std::max<int64_t>(1, n);
auto mn = std::min(m, n);
Tensor iwork = at::empty({8 * mn}, at::kInt);
auto iwork_data = iwork.data_ptr<int>();
Tensor rwork;
value_t* rwork_data = nullptr;
if (isComplexType(at::typeMetaToScalarType(self.dtype()))) {
auto lrwork = computeLRWorkDim(jobz, m, n);
// rwork is an array of floats or doubles depending on the type
rwork = at::empty({std::max(int64_t(1), lrwork)}, at::typeMetaToScalarType(S.dtype()));
rwork_data = rwork.data_ptr<value_t>();
}
// Run once, first to get the optimum work size.
// Since we deal with batches of matrices with the same dimensions, doing this outside
// the loop saves (batch_size - 1) workspace queries which would provide the same result
// and (batch_size - 1) calls to allocate and deallocate workspace using at::empty()
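// (LAPACK workspace-query convention: calling the routine with lwork == -1 performs no computation
// and returns the optimal workspace size in the first element of the work array, read back here via 'wkopt')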
int lwork = -1;
scalar_t wkopt;
lapackSvd<scalar_t, value_t>(jobz, m, n, self_data, lda, S_data, U_data, lda, VT_data, ldvt, &wkopt, lwork, rwork_data, iwork_data, &info);
lwork = std::max<int>(1, real_impl<scalar_t, value_t>(wkopt));
Tensor work = at::empty({lwork}, self.options());
auto work_data = work.data_ptr<scalar_t>();
for (const auto i : c10::irange(batchsize)) {
scalar_t* self_working_ptr = &self_data[i * self_stride];
value_t* S_working_ptr = &S_data[i * S_stride];
scalar_t* U_working_ptr = &U_data[i * U_stride];
scalar_t* VT_working_ptr = &VT_data[i * VT_stride];
// Compute S, U (optionally) and VT (optionally)
lapackSvd<scalar_t, value_t>(jobz, m, n, self_working_ptr, lda,
S_working_ptr, U_working_ptr, lda, VT_working_ptr, ldvt, work_data, lwork, rwork_data, iwork_data, &info);
infos[i] = info;
if (info != 0) {
return;
}
}
#endif
}
std::tuple<Tensor, Tensor, Tensor> _svd_helper_cpu(const Tensor& self, bool some, bool compute_uv) {
std::vector<int64_t> infos(batchCount(self), 0);
int64_t m = self.size(-2), n = self.size(-1);
int64_t k = std::min(m, n);
char jobz = compute_uv ? (some ? 'S' : 'A') : 'N';
Tensor U_working_copy, S_working_copy, VT_working_copy;
std::tie(U_working_copy, S_working_copy, VT_working_copy) = _create_U_S_VT(self, some, compute_uv);
auto self_working_copy = cloneBatchedColumnMajor(self);
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self.scalar_type(), "svd_cpu", [&]{
apply_svd<scalar_t>(self_working_copy, U_working_copy, S_working_copy, VT_working_copy, jobz, infos);
});
if (self.dim() > 2) {
batchCheckErrors(infos, "svd_cpu");
} else {
singleCheckErrors(infos[0], "svd_cpu");
}
if (!compute_uv) {
VT_working_copy.zero_();
U_working_copy.zero_();
}
if (some) {
VT_working_copy = VT_working_copy.narrow(-2, 0, k);
}
// so far we have computed VT, but torch.svd returns V instead. Adjust accordingly.
// Note that the 'apply_svd' routine returns VT = V^T (for real inputs) or VT = V^H (for complex inputs), not V.
VT_working_copy = VT_working_copy.conj();
VT_working_copy.transpose_(-2, -1);
return std::make_tuple(U_working_copy, S_working_copy, VT_working_copy);
}
std::tuple<Tensor, Tensor, Tensor> svd(const Tensor& self, bool some, bool compute_uv) {
// TODO: uncomment the following when svd is deprecated not only in docs
// torch/xla is blocking the transition from at::svd to at::linalg_svd in at::linalg_pinv code
// see https://github.com/pytorch/xla/issues/2755
// TORCH_WARN_ONCE(
// "torch.svd is deprecated in favor of torch.linalg.svd and will be ",
// "removed in a future PyTorch release.\n",
// "U, S, V = torch.svd(A, some=some, compute_uv=True) (default)\n",
// "should be replaced with\n",
// "U, S, Vh = torch.linalg.svd(A, full_matrices=not some)\n",
// "V = Vh.transpose(-2, -1).conj()\n",
// "and\n",
// "_, S, _ = torch.svd(A, some=some, compute_uv=False)\n",
// "should be replaced with\n",
// "S = torch.linalg.svdvals(A)");
TORCH_CHECK(self.dim() >= 2,
"svd input should have at least 2 dimensions, but has ", self.dim(), " dimensions instead");
return at::_svd_helper(self, some, compute_uv);
}
std::tuple<Tensor&, Tensor&, Tensor&> svd_out(const Tensor& self, bool some, bool compute_uv, Tensor& U, Tensor& S, Tensor& V) {
checkSameDevice("svd", U, self, "U");
checkSameDevice("svd", S, self, "S");
checkSameDevice("svd", V, self, "V");
checkLinalgCompatibleDtype("svd", U, self, "U");
checkLinalgCompatibleDtype("svd", V, self, "V");
// singular values are always real-valued here
ScalarType real_dtype = toValueType(self.scalar_type());
checkLinalgCompatibleDtype("svd", S.scalar_type(), real_dtype, "S");
Tensor U_tmp, S_tmp, V_tmp;
std::tie(U_tmp, S_tmp, V_tmp) = at::native::svd(self, some, compute_uv);
at::native::resize_output(U, U_tmp.sizes());
at::native::resize_output(S, S_tmp.sizes());
at::native::resize_output(V, V_tmp.sizes());
U.copy_(U_tmp);
S.copy_(S_tmp);
V.copy_(V_tmp);
return std::tuple<Tensor&, Tensor&, Tensor&>(U, S, V);
}
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ linalg_svd ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/* torch.linalg.svd, implemented in terms of torch.svd. There are two main
differences:
  1. the 2nd parameter is bool some=True, which is effectively the opposite
of full_matrices=True
2. svd returns V, while linalg.svd returns Vh = V^T (for real inputs) or Vh = V^H (for complex inputs).
To accommodate the difference, we transpose() and conj() V upon return
*/
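// Sketch of the convention (illustrative only, not an additional code path): for
//   U, S, Vh = torch.linalg.svd(A, full_matrices=False)
// the input can be reconstructed as U @ diag(S) @ Vh, whereas torch.svd's V satisfies
// Vh == V.transpose(-2, -1).conj()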
std::tuple<Tensor, Tensor, Tensor> linalg_svd(const Tensor& self, bool full_matrices) {
TORCH_CHECK(self.dim() >= 2,
"svd input should have at least 2 dimensions, but has ", self.dim(), " dimensions instead");
bool some = !full_matrices;
Tensor U, S, V;
std::tie(U, S, V) = at::_svd_helper(self, some, /*compute_uv=*/true);
Tensor Vh = V.conj().transpose(-2, -1);
return std::make_tuple(U, S, Vh);
}
static void svd_resize_and_copy(const char *name, const Tensor& src, Tensor &dst) {
TORCH_CHECK(src.device() == dst.device(), "svd output tensor ", name, " is on the wrong device: expected ", src.device(), " got ", dst.device());
at::native::resize_output(dst, src.sizes());
dst.copy_(src);
}
std::tuple<Tensor&, Tensor&, Tensor&> linalg_svd_out(const Tensor& self, bool full_matrices, Tensor& U, Tensor& S, Tensor& Vh) {
checkSameDevice("svd", U, self, "U");
checkSameDevice("svd", S, self, "S");
checkSameDevice("svd", Vh, self, "Vh");
checkLinalgCompatibleDtype("linalg_svd", U, self, "U");
checkLinalgCompatibleDtype("linalg_svd", Vh, self, "Vh");
// singular values are always real-valued here
ScalarType real_dtype = toValueType(self.scalar_type());
checkLinalgCompatibleDtype("linalg_svd", S.scalar_type(), real_dtype, "S");
Tensor U_tmp, S_tmp, Vh_tmp;
std::tie(U_tmp, S_tmp, Vh_tmp) = at::native::linalg_svd(self, full_matrices);
svd_resize_and_copy("U", U_tmp, U);
svd_resize_and_copy("S", S_tmp, S);
svd_resize_and_copy("V", Vh_tmp, Vh);
return std::tuple<Tensor&, Tensor&, Tensor&>(U, S, Vh);
}
Tensor linalg_svdvals(const Tensor& input) {
TORCH_CHECK(
input.dim() >= 2,
"torch.linalg.svdvals: input should have at least 2 dimensions, but has ",
input.dim(),
" dimensions instead");
Tensor singular_values;
// if input requires grad we must compute the singular vectors to make this function differentiable
// the singular vectors are not exposed to the user
const bool input_requires_grad = (at::GradMode::is_enabled() && input.requires_grad());
std::tie(std::ignore, singular_values, std::ignore) =
at::_svd_helper(input, /*some=*/input_requires_grad, /*compute_uv=*/input_requires_grad);
return singular_values;
}
Tensor& linalg_svdvals_out(const Tensor& input, Tensor& result) {
checkSameDevice("torch.linalg.svdvals", result, input);
// singular values are always real-valued
ScalarType real_dtype = toValueType(input.scalar_type());
checkLinalgCompatibleDtype(
"torch.linalg.svdvals", result.scalar_type(), real_dtype);
Tensor singular_values_tmp;
std::tie(std::ignore, singular_values_tmp, std::ignore) =
// NOLINTNEXTLINE(bugprone-argument-comment)
at::_svd_helper(input, /*full_matrices=*/false, /*compute_uv=*/false);
at::native::resize_output(result, singular_values_tmp.sizes());
result.copy_(singular_values_tmp);
return result;
}
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ lstsq ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
DEFINE_DISPATCH(lstsq_stub);
/*
  Solves a least squares problem, i.e. minimizes the squared Frobenius norm ||B - A X||.
Input args:
* 'input' - Tensor containing batches of m-by-n matrix A.
  * 'other' - Tensor containing batches of the m-by-nrhs matrix B.
* 'cond' - relative tolerance for determining rank of A.
* 'driver' - the name of the LAPACK driver that is used to compute the solution.
Output args (modified in-place):
* 'solution' - Tensor to store the solution matrix X.
* 'residuals' - Tensor to store values of the residual sum of squares for each column of the solution.
* 'rank' - Tensor to store the rank of A.
* 'singular_values' - Tensor to store the singular values of A.
* 'infos' - Tensor to store error codes of linear algebra math library.
For further details, please see the LAPACK documentation for GELS/GELSY/GELSS/GELSD routines.
*/
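// Shape sketch for the matrix case (derived from the checks and narrows below): 'input' is (*, m, n) and
// 'other' is (*, m, nrhs); 'solution' is allocated with max(m, n) rows so that LAPACK can write
// residual-related data below the solution when m > n, and it is narrowed back to (*, n, nrhs) at the end.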
static void linalg_lstsq_out_info(
Tensor& solution,
Tensor& residuals,
Tensor& rank,
Tensor& singular_values,
Tensor& infos,
const Tensor& input,
const Tensor& other,
double rcond,
std::string& driver) {
// These internal asserts make explicit the assumptions in the implementation
// Error checks with the actual error messages are done at a higher level of
// the hierarchy of calls
TORCH_INTERNAL_ASSERT(input.dim() >= 2);
TORCH_INTERNAL_ASSERT(other.dim() >= 1);
auto dim_diff = input.dim() - other.dim();
TORCH_INTERNAL_ASSERT(0 <= dim_diff && dim_diff <= 1);
TORCH_INTERNAL_ASSERT(input.scalar_type() == other.scalar_type());
TORCH_INTERNAL_ASSERT(input.device() == other.device());
TORCH_INTERNAL_ASSERT(solution.scalar_type() == input.scalar_type());
TORCH_INTERNAL_ASSERT(solution.device() == input.device());
TORCH_INTERNAL_ASSERT(residuals.device() == input.device());
TORCH_INTERNAL_ASSERT(rank.scalar_type() == at::kLong);
TORCH_INTERNAL_ASSERT(rank.device() == input.device());
auto real_dtype = toValueType(input.scalar_type());
TORCH_INTERNAL_ASSERT(singular_values.scalar_type() == real_dtype);
TORCH_INTERNAL_ASSERT(singular_values.device() == input.device());
TORCH_INTERNAL_ASSERT(infos.scalar_type() == at::kInt);
TORCH_INTERNAL_ASSERT(infos.device() == input.device());
TORCH_INTERNAL_ASSERT(infos.numel() == std::max<int64_t>(1, batchCount(input)));
TORCH_INTERNAL_ASSERT(infos.is_contiguous());
bool vector_case = linalg_solve_is_vector_rhs(input, other);
// we need to unsqueeze 'other' because 2-dimensional tensors are expected in the implementation
Tensor other_2d = vector_case ? other.unsqueeze(-1) : other;
TORCH_INTERNAL_ASSERT(input.size(-2) == other_2d.size(-2));
std::vector<int64_t> expected_solution_shape = broadcast_batch_size(input, other_2d, input.dim() - 2);
// the actual shape of the solution returned is (*, n,) or (*, n, nrhs)
// but LAPACK requires extra dimensions to store raw residuals
// so the expected shape is (*, max(m, n),) or (*, max(m, n), nrhs)
auto m = input.size(-2);
auto n = input.size(-1);
auto nrhs = other.size(-1);
expected_solution_shape.push_back(std::max(m, n));
if (!vector_case) {
expected_solution_shape.push_back(nrhs);
}
// if 'solution' has no elements we can modify it
if (solution.numel() == 0) {
if (vector_case) {
solution.resize_(expected_solution_shape, MemoryFormat::Contiguous);
} else {
auto shape_transposed = expected_solution_shape;
std::swap(shape_transposed.end()[-1], shape_transposed.end()[-2]);
solution.resize_(shape_transposed, MemoryFormat::Contiguous);
solution.transpose_(-2, -1);
}
}
// if 'solution' is non-empty it must have the expected shape
TORCH_INTERNAL_ASSERT(solution.sizes().equals(expected_solution_shape));
// 'solution' must be in batched column major order (Fortran contiguous) for 2D inputs
// or C contiguous for 1D input
if (vector_case) {
TORCH_INTERNAL_ASSERT(solution.is_contiguous());
} else {
TORCH_INTERNAL_ASSERT(solution.transpose(-2, -1).is_contiguous());
}
// for 1-dimensional 'other', we need to unsqueeze the 'solution' before passing to "apply_lstsq"
if (vector_case) {
solution = solution.unsqueeze_(-1);
}
// lstsq_stub (apply_lstsq) performs calculations in-place and 'solution' must be a copy of other_2d
solution.narrow(-2, 0, other_2d.size(-2)).copy_(other_2d);
// if 'rank' is empty we might resize it
auto input_batch_shape = IntArrayRef(input.sizes().cbegin(), input.sizes().cend() - 2);
if (rank.numel() == 0 && driver != "gels") { // gels driver doesn't set 'rank'
rank.resize_(input_batch_shape, MemoryFormat::Contiguous);
}
// if 'rank' is non-empty it must have the expected shape and be contiguous
if (driver != "gels") {
TORCH_INTERNAL_ASSERT(rank.sizes().equals(input_batch_shape));
TORCH_INTERNAL_ASSERT(rank.is_contiguous());
}
// if 'singular_values' is empty we might resize it
auto singular_values_shape = input_batch_shape.vec();
singular_values_shape.push_back(std::min(m, n));
if (singular_values.numel() == 0 && (driver == "gelsd" || driver == "gelss")) {
singular_values.resize_(singular_values_shape, MemoryFormat::Contiguous);
}
// if 'singular_values' is non-empty it must have the expected shape and be contiguous
if (driver == "gelsd" || driver == "gelss") {
TORCH_INTERNAL_ASSERT(singular_values.sizes().equals(singular_values_shape));
TORCH_INTERNAL_ASSERT(singular_values.is_contiguous());
}
// 'input' is modified in-place so we need a column-major copy
auto input_working_copy = copyBatchedColumnMajor(input);
// now the actual call that computes the result in-place (apply_lstsq)
lstsq_stub(input.device().type(), input_working_copy, solution, rank, singular_values, infos, rcond, driver);
// residuals are available only if m > n and a driver other than "gelsy" is used
if (m > n && driver != "gelsy") {
// if the driver is gelss or gelsd then the residuals are available only if rank == n
bool compute_residuals = true;
if (driver == "gelss" || driver == "gelsd") {
if (input.dim() == 2) {
compute_residuals = (rank.item().toInt() == n);
} else {
// it is not clear what to do if some matrices have rank < n in case of batched input
// For now let's compute the residuals only if all matrices have rank equal to n
// This behaviour may be changed in the future
// See https://github.com/pytorch/pytorch/issues/56483
compute_residuals = at::all(rank == n).item().toBool();
}
}
if (compute_residuals) {
// LAPACK stores the residual data for postprocessing in rows n:m (the last m - n rows)
auto raw_residuals = solution.narrow(/*dim=*/-2, /*start=*/n, /*length*/m - n);
if (raw_residuals.is_complex()) {
raw_residuals.mul_(raw_residuals.conj());
raw_residuals = at::real(raw_residuals);
} else {
raw_residuals.pow_(2);
}
at::sum_out(residuals, raw_residuals, /*dim=*/-2, /*keepdim=*/false, /*dtype*/real_dtype);
}
}
solution = solution.narrow(/*dim=*/-2, /*start=*/0, /*length*/n);
if (m == 0) {
solution.zero_();
}
// for 1-dimensional 'other', we need to squeeze the solution after "apply_lstsq"
if (vector_case) {
solution = solution.squeeze_(-1);
}
}
static std::string get_default_lstsq_driver(c10::optional<c10::string_view> driver, const Tensor& input) {
// if `driver` is empty, we set driver_str to "gels" if working with CUDA tensors,
// otherwise to "gelsy" driver.
std::string driver_str;
// check whether the user provided name is a valid driver name
if (driver.has_value()) {
driver_str = std::string(driver.value());
// convert `driver_str` to lower case inplace.
std::transform(driver_str.begin(), driver_str.end(), driver_str.begin(),
[](unsigned char c) { return std::tolower(c); });
static std::unordered_set<c10::string_view> allowed_drivers = {
"gels", "gelsy", "gelsd", "gelss"
};
if (input.device() == at::kCPU) {
TORCH_CHECK(
allowed_drivers.find(driver_str) != allowed_drivers.end(),
"torch.linalg.lstsq: parameter `driver` should be one of "
"(gels, gelsy, gelsd, gelss)"
);
} else { // else if (input.is_cuda())
TORCH_CHECK(
driver_str == "gels",
"torch.linalg.lstsq: `driver` other than `gels` is not supported on CUDA"
);
}
} else {
// if driver name is not provided, set to default 'gelsy' if on CPU,
// or to `gels` if on CUDA.
driver_str = input.is_cuda() ? "gels" : "gelsy";
}
return driver_str;
}
std::tuple<Tensor&, Tensor&, Tensor&, Tensor&> linalg_lstsq_out(
const Tensor& input,
const Tensor& other,
c10::optional<double> rcond,
c10::optional<c10::string_view> driver,
Tensor& solution,
Tensor& residuals,
Tensor& rank,
Tensor& singular_values) {
TORCH_CHECK(input.dim() >= 2, "torch.linalg.lstsq: input must have at least 2 dimensions.");
TORCH_CHECK(other.dim() >= 1, "torch.linalg.lstsq: other must have at least 1 dimension.");
TORCH_CHECK(
input.scalar_type() == other.scalar_type(),
"torch.linalg.lstsq: Expected input and other to have the same dtype, but got input's dtype ",
input.scalar_type(),
" and other's dtype ",
other.scalar_type());
auto dim_diff = input.dim() - other.dim();
TORCH_CHECK(
0 <= dim_diff && dim_diff <= 1,
"torch.linalg.lstsq: input.dim() must be greater or equal to other.dim() and (input.dim() - other.dim()) <= 1");
Tensor other_2d = dim_diff ? other.unsqueeze(-1) : other;
TORCH_CHECK(
input.size(-2) == other_2d.size(-2),
dim_diff ? "torch.linalg.lstsq: input.size(-2) should match other.size(-1)"
: "torch.linalg.lstsq: input.size(-2) should match other.size(-2)");
checkSameDevice("torch.linalg.lstsq", other, input, "other");
checkSameDevice("torch.linalg.lstsq", solution, input, "solution");
checkSameDevice("torch.linalg.lstsq", residuals, input, "residuals");
checkSameDevice("torch.linalg.lstsq", rank, input, "rank");
checkSameDevice("torch.linalg.lstsq", singular_values, input, "singular_values");
// 'solution' is expected to have same dtype as input
checkLinalgCompatibleDtype("torch.linalg.lstsq", solution, input, "solution");
// 'residuals' is expected to have real float dtype
ScalarType real_dtype = c10::toValueType(input.scalar_type());
checkLinalgCompatibleDtype("torch.linalg.lstsq", residuals.scalar_type(), real_dtype, "solution");
// 'rank' is expected to have integer dtype
// actual LAPACK calls use int32_t type for rank, but we promote it to int64_t
// to be consistent with torch.linalg.matrix_rank output dtype
ScalarType rank_expected_type = ScalarType::Long;
checkLinalgCompatibleDtype("torch.linalg.lstsq", rank.scalar_type(), rank_expected_type, "rank");
// 'singular_values' is expected to have real float dtype
checkLinalgCompatibleDtype("torch.linalg.lstsq", singular_values.scalar_type(), real_dtype, "singular_values");
std::string driver_name = get_default_lstsq_driver(driver, input);
// set default rcond value
double rcond_value = rcond.has_value()
? rcond.value()
: _get_epsilon(c10::toValueType(input.scalar_type())) * std::max<int64_t>(input.size(-2), input.size(-1));
auto infos = at::zeros({std::max<int64_t>(1, batchCount(input))}, input.options().dtype(kInt));
// now check whether the provided output tensors can be used directly
// Two types of 'other' tensors are supported:
// - 1-dimensional (1D) tensor or batch of 1D tensors (vector case)
// - 2-dimensional (2D) tensor or batch of 2D tensors (matrix case)
// original torch.lstsq supported only the matrix case, while NumPy works for both cases
// for the batched input we need to be able to distinguish them
// auto expected_batched_rhs_shape = IntArrayRef(input.sizes().data(), input.dim() - 1); // input.shape[:-1]
// bool vector_case = other.dim() == 1 || (input.dim() - 1 == other.dim() && other.sizes().equals(expected_batched_rhs_shape));
bool vector_case = linalg_solve_is_vector_rhs(input, other);
// provided output tensor can be used directly if:
// 1. the shape matches the expected shape
// 2. the dtype matches the expected dtype
// 3. the tensor is contiguous
// Checks for the 'solution' tensor
std::vector<int64_t> expected_solution_shape = broadcast_batch_size(input, other_2d, input.dim() - 2);
// the actual shape of the solution returned is (*, n,) or (*, n, nrhs)
// but LAPACK requires extra dimensions so the expected shape is (*, max(m, n),) or (*, max(m, n), nrhs)
expected_solution_shape.push_back(std::max(input.size(-1), input.size(-2)));
if (!vector_case && other.dim() > 2) {
expected_solution_shape.push_back(other.size(-1));
}
bool solution_equal_expected_shape = solution.sizes().equals(expected_solution_shape);
bool solution_input_same_type = (solution.scalar_type() == input.scalar_type());
bool is_solution_batched_column_major = false;
if (vector_case) {
is_solution_batched_column_major = solution.is_contiguous();
} else if (!vector_case && solution.dim() >= 2) {
is_solution_batched_column_major = solution.transpose(-2, -1).is_contiguous();
}
// 'residuals' is not checked here because at::sum_out(residuals, ...) does that
auto input_batch_shape = IntArrayRef(input.sizes().cbegin(), input.sizes().cend() - 2);
// Checks for the 'rank' tensor
// rank is a scalar value for each matrix in the batch so
// rank's expected shape is equal to input.shape[0:input.ndim-2]
bool rank_equal_expected_shape = true;
bool rank_equal_expected_type = true;
bool rank_is_contiguous = true;
if (driver_name != "gels") { // gels driver doesn't set 'rank'
rank_equal_expected_shape = rank.sizes().equals(input_batch_shape);
rank_equal_expected_type = (rank.scalar_type() == at::kLong);
rank_is_contiguous = rank.is_contiguous();
}
// Checks for the 'singular_values' tensor
// singular values are computed only with "gelsd" and "gelss" drivers currently
bool singular_values_equal_expected_shape = true;
bool singular_values_equal_expected_type = true;
bool singular_values_is_contiguous = true;
if (driver_name == "gelsd" || driver_name == "gelss") {
auto singular_values_shape = input_batch_shape.vec();
singular_values_shape.push_back(std::min(input.size(-1), input.size(-2)));
singular_values_equal_expected_shape = singular_values.sizes().equals(singular_values_shape);
singular_values_equal_expected_type = (singular_values.scalar_type() == real_dtype);
singular_values_is_contiguous = singular_values.is_contiguous();
}
// if solution is not empty and not in batched column major format
bool copy_needed = (solution.numel() != 0 && !is_solution_batched_column_major);
copy_needed |= !solution_input_same_type; // or solution does not have the same dtype as input
copy_needed |= (solution.numel() != 0 && !solution_equal_expected_shape); // or solution does not have the expected shape
copy_needed |= !rank_equal_expected_type;
copy_needed |= (rank.numel() != 0 && !rank_equal_expected_shape);
copy_needed |= (rank.numel() != 0 && !rank_is_contiguous);
copy_needed |= !singular_values_equal_expected_type;
copy_needed |= (singular_values.numel() != 0 && !singular_values_equal_expected_shape);
copy_needed |= (singular_values.numel() != 0 && !singular_values_is_contiguous);
if (copy_needed) { // we have to allocate temporary tensors
Tensor solution_tmp = at::empty({0}, input.options());
Tensor residuals_tmp = at::empty({0}, input.options().dtype(real_dtype));
Tensor rank_tmp = at::empty({0}, input.options().dtype(at::kLong));
Tensor singular_values_tmp = at::empty({0}, input.options().dtype(real_dtype));
linalg_lstsq_out_info(solution_tmp, residuals_tmp, rank_tmp, singular_values_tmp, infos, input, other, rcond_value, driver_name);
at::native::resize_output(solution, solution_tmp.sizes());
solution.copy_(solution_tmp);
at::native::resize_output(residuals, residuals_tmp.sizes());
residuals.copy_(residuals_tmp);
at::native::resize_output(rank, rank_tmp.sizes());
rank.copy_(rank_tmp);
at::native::resize_output(singular_values, singular_values_tmp.sizes());
singular_values.copy_(singular_values_tmp);
} else {
// else use the provided output storage directly
linalg_lstsq_out_info(solution, residuals, rank, singular_values, infos, input, other, rcond_value, driver_name);
}
if (infos.numel() > 1) {
batchCheckErrors(infos, "torch.linalg.lstsq");
} else {
singleCheckErrors(infos.item<int64_t>(), "torch.linalg.lstsq");
}
return std::tuple<Tensor&, Tensor&, Tensor&, Tensor&>(solution, residuals, rank, singular_values);
}
std::tuple<Tensor, Tensor, Tensor, Tensor> linalg_lstsq(
const Tensor& input, const Tensor& other,
c10::optional<double> rcond,
c10::optional<c10::string_view> driver) {
Tensor solution = at::empty({0}, input.options());
Tensor residuals = at::empty({0}, input.options().dtype(toValueType(input.scalar_type())));
Tensor rank = at::empty({0}, input.options().dtype(at::kLong));
Tensor singular_values = at::empty({0}, input.options().dtype(toValueType(input.scalar_type())));
std::tie(solution, residuals, rank, singular_values) =
at::linalg_lstsq_outf(input, other, rcond, driver, solution, residuals, rank, singular_values);
return std::make_tuple(solution, residuals, rank, singular_values);
}
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ lu_solve ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
DEFINE_DISPATCH(lu_solve_stub);
// Supports arbitrary batch dimensions for self and LU_data (implicitly LU_pivots also)
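// (LU_data and LU_pivots are the outputs of an LU factorization of the coefficient matrix,
// e.g. as returned by torch.lu on the Python side.)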
Tensor lu_solve(const Tensor& self, const Tensor& LU_data, const Tensor& LU_pivots) {
TORCH_CHECK(self.dim() >= 2,
"b should have at least 2 dimensions, but has ", self.dim(), " dimensions instead");
TORCH_CHECK(LU_data.dim() >= 2,
"LU_data should have at least 2 dimensions, but has ", LU_data.dim(), " dimensions instead");
TORCH_CHECK(LU_pivots.size(-1) == LU_data.size(-1),
"Number of pivots per batch should be same as the dimension of the matrix");
TORCH_CHECK(LU_pivots.dtype() == at::kInt,
"LU_pivots should be a Tensor of scalar type Int");
TORCH_CHECK(LU_pivots.device() == LU_data.device(),
"Expected LU_pivots and LU_data to be on the same device, "
"but found LU_pivots on ", LU_pivots.device(), " and LU_data on ",
LU_data.device(), " instead");
// We check whether the batch dimensions of LU_pivots match the batch dimensions of LU_data
// e.g.: LU_pivots.sizes() = 4 x 3 x 2, LU_data.sizes() = 4 x 3 x 2 x 2 is a pair of correct inputs
// e.g.: LU_pivots.sizes() = 4 x 3 x 2, LU_data.sizes() = 12 x 2 x 2 is a pair of incorrect inputs
IntArrayRef pivots_sizes(LU_pivots.sizes().data(), LU_pivots.dim() - 1);
IntArrayRef lu_sizes(LU_data.sizes().data(), LU_data.dim() - 2);
TORCH_CHECK(pivots_sizes == lu_sizes,
"batch dimensions of LU_pivots doesn't match batch dimensions of LU_data");
Tensor self_broadcasted, LU_data_broadcasted;
std::tie(self_broadcasted, LU_data_broadcasted) = _linalg_broadcast_batch_dims(self, LU_data, "lu_solve");
// Now, we need to broadcast pivots too for the batch dimensions to match
IntArrayRef new_pivots_sizes(LU_data_broadcasted.sizes().data(), LU_data_broadcasted.dim() - 1);
Tensor LU_pivots_broadcasted = LU_pivots.expand(new_pivots_sizes);
// lu_solve_stub (apply_lu_solve) requires batched column major (Fortran-contiguous) tensors
// 'result' tensor is modified in-place and must be a copy of 'self_broadcasted'
Tensor result = cloneBatchedColumnMajor(self_broadcasted);
// if LU_data is Fortran-contiguous no need to make a copy
bool is_LU_data_batched_column_major = LU_data_broadcasted.transpose(-2, -1).is_contiguous();
Tensor LU_data_working_copy = is_LU_data_batched_column_major ? LU_data_broadcasted : cloneBatchedColumnMajor(LU_data_broadcasted);
Tensor LU_pivots_working_copy = LU_pivots_broadcasted.is_contiguous() ? LU_pivots_broadcasted : LU_pivots_broadcasted.contiguous();
lu_solve_stub(self.device().type(), result, LU_data_working_copy, LU_pivots_working_copy);
return result;
}
Tensor& lu_solve_out(const Tensor& self, const Tensor& LU_data, const Tensor& LU_pivots, Tensor& result) {
checkSameDevice("lu_solve", result, self);
checkLinalgCompatibleDtype("lu_solve", result, self);
Tensor result_tmp = at::lu_solve(self, LU_data, LU_pivots);
at::native::resize_output(result, result_tmp.sizes());
result.copy_(result_tmp);
return result;
}
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ legacy_lstsq ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// This wraps LAPACK's gels routine, which uses a QR or LQ factorization to
// solve any linear system, minimizing ||A.X - B||.
// A and B must be Fortran-contiguous matrices.
// On exit, A is overwritten with the QR/LQ factorization of input A
// B is overwritten with the solution vectors
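// (LAPACK ?gels semantics assumed here: for m >= n it computes the least-squares solution of the
// overdetermined system; for m < n it computes the minimum-norm solution of the underdetermined system.
// When m > n, rows n+1..m of B hold data from which the residual sums of squares can be derived.)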
template <typename scalar_t>
static void apply_lstsq(const Tensor& B, const Tensor& A) {
#if !AT_BUILD_WITH_LAPACK()
TORCH_INTERNAL_ASSERT(false, "lstsq: LAPACK library not found in compilation");
#else
int m, n, nrhs, lda, ldb, info, lwork;
scalar_t wkopt = 0.0;
lwork = -1; // work length
m = A.size(0);
n = A.size(1);
nrhs = B.size(1);
info = 0;
lda = m;
ldb = (m > n) ? m : n;
auto B_data = B.data_ptr<scalar_t>();
auto A_data = A.data_ptr<scalar_t>();
// get info how much space is needed
lapackGels<scalar_t>('N', m, n, nrhs, A_data, lda, B_data, ldb, &wkopt, lwork, &info);
lwork = static_cast<int>(wkopt);
Tensor work_tensor = at::empty({lwork}, A.scalar_type());
auto work = work_tensor.data_ptr<scalar_t>();
lapackGels<scalar_t>('N', m, n, nrhs, A_data, lda, B_data, ldb, work, lwork, &info);
TORCH_CHECK(
info >= 0,
"Lapack Error in gels : Illegal argument ", -info);
TORCH_CHECK(
info == 0,
"Lapack Error in gels: The ", info, "-th diagonal element of the ",
"triangular factor of A is zero");
#endif
}
std::tuple<Tensor, Tensor> legacy_lstsq(const Tensor& B, const Tensor& A) {
TORCH_WARN_ONCE(
"torch.lstsq is deprecated in favor of torch.linalg.lstsq and will be removed in a future PyTorch release.\n",
"torch.linalg.lstsq has reversed arguments and does not return the QR decomposition in "
"the returned tuple (although it returns other information about the problem).\n",
"To get the qr decomposition consider using torch.linalg.qr.\n",
"The returned solution in torch.lstsq stored the residuals of the solution in the ",
"last m - n columns of the returned value whenever m > n. In torch.linalg.lstsq, the ",
"residuals in the field 'residuals' of the returned named tuple.\n",
"The unpacking of the solution, as in\n",
"X, _ = torch.lstsq(B, A).solution[:A.size(1)]\n",
"should be replaced with\n",
"X = torch.linalg.lstsq(A, B).solution");
TORCH_CHECK(A.scalar_type() == B.scalar_type(), "Expected A and B dtypes to match but found ",
A.scalar_type(), " and ", B.scalar_type());
TORCH_CHECK(A.dim() == 2, "Expected A to have 2 dimensions, but got ", A.dim());
TORCH_CHECK(A.numel() != 0, "A should not be empty");
TORCH_CHECK(B.dim() == 1 || B.dim() == 2, "Expected B to have 1 or 2 "
"dimensions, but got ", B.dim());
TORCH_CHECK(B.numel() != 0, "B should not be empty");
TORCH_CHECK(A.size(0) == B.size(0), "Expected A and B to have same size "
"at dim 0, but A has ", A.size(0), " rows and B has ", B.size(0), " rows");
const auto a_sizes = A.sizes();
const auto ldb = std::max(a_sizes[0], a_sizes[1]);
auto A_working = cloneBatchedColumnMajor(A);
auto B_working = copyBatchedColumnMajor(B.dim() == 1 ? B.unsqueeze(1) : B, ldb);
AT_DISPATCH_FLOATING_TYPES(B.scalar_type(), "lstsq_cpu", [&] {
apply_lstsq<scalar_t>(B_working, A_working);
});
return std::tuple<Tensor, Tensor>(B_working, A_working);
}
std::tuple<Tensor&,Tensor&> legacy_lstsq_out(
const Tensor& B, const Tensor& A, Tensor& B_out, Tensor& A_out) {
const auto dtype = A.scalar_type();
TORCH_CHECK(B.scalar_type() == dtype, "Expected A and B dtypes to match but found ",
A.scalar_type(), " and ", B.scalar_type());
TORCH_CHECK(A_out.scalar_type() == dtype, "Expected A_out to have scalar type ", dtype,
" but found ", A_out.scalar_type());
TORCH_CHECK(B_out.scalar_type() == dtype, "Expected B_out to have scalar type ", dtype,
" but found ", B_out.scalar_type());
Tensor A_tmp, B_tmp;
std::tie(B_tmp, A_tmp) = native::legacy_lstsq(B, A);
resize_output(A_out, A_tmp.sizes());
A_out.copy_(A_tmp);
resize_output(B_out, B_tmp.sizes());
B_out.copy_(B_tmp);
return std::tuple<Tensor&, Tensor&>(B_out, A_out);
}
}} // namespace at::native