Revert D26723384: [pytorch][PR] Implements `torch.linalg.lstsq`

Test Plan: revert-hammer

Differential Revision:
D26723384 (https://github.com/pytorch/pytorch/commit/3ac90132351b3dab51d6ecede52db62e101257e1)

Original commit changeset: c9866a95f140

fbshipit-source-id: 3e5263d71facdc91ca09d7dcbbbe3ba818ee2821
diff --git a/aten/src/ATen/native/BatchLinearAlgebra.cpp b/aten/src/ATen/native/BatchLinearAlgebra.cpp
index 5b0ab41..8df4322 100644
--- a/aten/src/ATen/native/BatchLinearAlgebra.cpp
+++ b/aten/src/ATen/native/BatchLinearAlgebra.cpp
@@ -122,78 +122,6 @@
 extern "C" void cgetrs_(char *trans, int *n, int *nrhs, std::complex<float> *a, int *lda, int *ipiv, std::complex<float> *b, int *ldb, int *info);
 extern "C" void dgetrs_(char *trans, int *n, int *nrhs, double *a, int *lda, int *ipiv, double *b, int *ldb, int *info);
 extern "C" void sgetrs_(char *trans, int *n, int *nrhs, float *a, int *lda, int *ipiv, float *b, int *ldb, int *info);
-
-// gels
-extern "C" void zgels_(char *trans, int *m, int *n, int *nrhs,
-    std::complex<double> *a, int *lda, std::complex<double> *b, int *ldb,
-    std::complex<double> *work, int *lwork, int *info);
-extern "C" void cgels_(char *trans, int *m, int *n, int *nrhs,
-    std::complex<float> *a, int *lda, std::complex<float> *b, int *ldb,
-    std::complex<float> *work, int *lwork, int *info);
-extern "C" void dgels_(char *trans, int *m, int *n, int *nrhs,
-    double *a, int *lda, double *b, int *ldb,
-    double *work, int *lwork, int *info);
-extern "C" void sgels_(char *trans, int *m, int *n, int *nrhs,
-    float *a, int *lda, float *b, int *ldb,
-    float *work, int *lwork, int *info);
-
-// gelsd
-extern "C" void zgelsd_(int *m, int *n, int *nrhs,
-    std::complex<double> *a, int *lda, std::complex<double> *b, int *ldb,
-    double *s, double *rcond, int *rank,
-    std::complex<double> *work, int *lwork, double *rwork, int *iwork, int *info);
-extern "C" void cgelsd_(int *m, int *n, int *nrhs,
-    std::complex<float> *a, int *lda, std::complex<float> *b, int *ldb,
-    float *s, float *rcond, int *rank,
-    std::complex<float> *work, int *lwork, float *rwork, int *iwork, int *info);
-extern "C" void dgelsd_(int *m, int *n, int *nrhs,
-    double *a, int *lda, double *b, int *ldb,
-    double *s, double *rcond, int *rank,
-    double *work, int *lwork, int *iwork, int *info);
-extern "C" void sgelsd_(int *m, int *n, int *nrhs,
-    float *a, int *lda, float *b, int *ldb,
-    float *s, float *rcond, int *rank,
-    float *work, int *lwork, int *iwork, int *info);
-
-// gelsy
-extern "C" void zgelsy_(int *m, int *n, int *nrhs,
-    std::complex<double> *a, int *lda, std::complex<double> *b, int *ldb,
-    int *jpvt, double *rcond, int *rank,
-    std::complex<double> *work, int *lwork,
-    double *rwork, int *info);
-extern "C" void cgelsy_(int *m, int *n, int *nrhs,
-    std::complex<float> * a, int *lda, std::complex<float> *b, int *ldb,
-    int *jpvt, float *rcond, int *rank,
-    std::complex<float> *work, int *lwork,
-    float *rwork, int *info);
-extern "C" void dgelsy_(int *m, int *n, int *nrhs,
-    double *a, int *lda, double *b, int *ldb,
-    int *jpvt, double *rcond, int *rank,
-    double *work, int *lwork, int *info);
-extern "C" void sgelsy_(int *m, int *n, int *nrhs,
-    float *a, int *lda, float *b, int *ldb,
-    int *jpvt, float *rcond, int *rank,
-    float *work, int *lwork, int *info);
-
-// gelss
-extern "C" void zgelss_(int *m, int *n, int *nrhs,
-    std::complex<double> *a, int *lda, std::complex<double> *b, int *ldb,
-    double *s, double *rcond, int *rank,
-    std::complex<double> *work, int *lwork,
-    double *rwork, int *info);
-extern "C" void cgelss_(int *m, int *n, int *nrhs,
-    std::complex<float> *a, int *lda, std::complex<float> *b, int *ldb,
-    float *s, float *rcond, int *rank,
-    std::complex<float> *work, int *lwork,
-    float *rwork, int *info);
-extern "C" void dgelss_(int *m, int *n, int *nrhs,
-    double *a, int *lda, double *b, int *ldb,
-    double *s, double *rcond, int *rank,
-    double *work, int *lwork, int *info);
-extern "C" void sgelss_(int *m, int *n, int *nrhs,
-    float *a, int *lda, float *b, int *ldb,
-    float *s, float *rcond, int *rank,
-    float *work, int *lwork, int *info);
 #endif
 
 namespace at {
@@ -236,126 +164,6 @@
 template<class scalar_t>
 void lapackLuSolve(char trans, int n, int nrhs, scalar_t *a, int lda, int *ipiv, scalar_t *b, int ldb, int *info);
 
-template<class scalar_t>
-void lapackGels(char trans, int m, int n, int nrhs,
-    scalar_t *a, int lda, scalar_t *b, int ldb,
-    scalar_t *work, int lwork, int *info);
-
-template<class scalar_t, class value_t = scalar_t>
-void lapackGelsd(int m, int n, int nrhs,
-    scalar_t *a, int lda, scalar_t *b, int ldb,
-    value_t *s, value_t rcond, int *rank,
-    scalar_t* work, int lwork,
-    value_t *rwork, int* iwork, int *info);
-
-template<class scalar_t, class value_t = scalar_t>
-void lapackGelsy(int m, int n, int nrhs,
-    scalar_t *a, int lda, scalar_t *b, int ldb,
-    int *jpvt, value_t rcond, int *rank,
-    scalar_t *work, int lwork, value_t* rwork, int *info);
-
-template<class scalar_t, class value_t = scalar_t>
-void lapackGelss(int m, int n, int nrhs,
-    scalar_t *a, int lda, scalar_t *b, int ldb,
-    value_t *s, value_t rcond, int *rank,
-    scalar_t *work, int lwork,
-    value_t *rwork, int *info);
-
-enum class LapackLstsqDriverType : int64_t { Gels, Gelsd, Gelsy, Gelss};
-
-template<LapackLstsqDriverType, class scalar_t, class value_t = scalar_t>
-struct lapackLstsq_impl;
-
-template<class scalar_t, class value_t>
-struct lapackLstsq_impl<LapackLstsqDriverType::Gels, scalar_t, value_t> {
-  static void call(
-      char trans, int m, int n, int nrhs,
-      scalar_t *a, int lda, scalar_t *b, int ldb,
-      scalar_t *work, int lwork, int *info, // Gels flavor
-      int *jpvt, value_t rcond, int *rank, value_t* rwork, // Gelsy flavor
-      value_t *s, // Gelss flavor
-      int *iwork // Gelsd flavor
-      ) {
-    lapackGels<scalar_t>(
-        trans, m, n, nrhs,
-        a, lda, b, ldb,
-        work, lwork, info);
-  }
-};
-
-template<class scalar_t, class value_t>
-struct lapackLstsq_impl<LapackLstsqDriverType::Gelsy, scalar_t, value_t> {
-  static void call(
-      char trans, int m, int n, int nrhs,
-      scalar_t *a, int lda, scalar_t *b, int ldb,
-      scalar_t *work, int lwork, int *info, // Gels flavor
-      int *jpvt, value_t rcond, int *rank, value_t* rwork, // Gelsy flavor
-      value_t *s, // Gelss flavor
-      int *iwork // Gelsd flavor
-      ) {
-    lapackGelsy<scalar_t, value_t>(
-        m, n, nrhs,
-        a, lda, b, ldb,
-        jpvt, rcond, rank,
-        work, lwork, rwork, info);
-  }
-};
-
-template<class scalar_t, class value_t>
-struct lapackLstsq_impl<LapackLstsqDriverType::Gelsd, scalar_t, value_t> {
-  static void call(
-      char trans, int m, int n, int nrhs,
-      scalar_t *a, int lda, scalar_t *b, int ldb,
-      scalar_t *work, int lwork, int *info, // Gels flavor
-      int *jpvt, value_t rcond, int *rank, value_t* rwork, // Gelsy flavor
-      value_t *s, // Gelss flavor
-      int *iwork // Gelsd flavor
-      ) {
-    lapackGelsd<scalar_t, value_t>(
-        m, n, nrhs,
-        a, lda, b, ldb,
-        s, rcond, rank,
-        work, lwork,
-        rwork, iwork, info);
-  }
-};
-
-template<class scalar_t, class value_t>
-struct lapackLstsq_impl<LapackLstsqDriverType::Gelss, scalar_t, value_t> {
-  static void call(
-      char trans, int m, int n, int nrhs,
-      scalar_t *a, int lda, scalar_t *b, int ldb,
-      scalar_t *work, int lwork, int *info, // Gels flavor
-      int *jpvt, value_t rcond, int *rank, value_t* rwork, // Gelsy flavor
-      value_t *s, // Gelss flavor
-      int *iwork // Gelsd flavor
-      ) {
-    lapackGelss<scalar_t, value_t>(
-        m, n, nrhs,
-        a, lda, b, ldb,
-        s, rcond, rank,
-        work, lwork,
-        rwork, info);
-  }
-};
-
-template<LapackLstsqDriverType driver_type, class scalar_t, class value_t = scalar_t>
-void lapackLstsq(
-    char trans, int m, int n, int nrhs,
-    scalar_t *a, int lda, scalar_t *b, int ldb,
-    scalar_t *work, int lwork, int *info, // Gels flavor
-    int *jpvt, value_t rcond, int *rank, value_t* rwork, // Gelsy flavor
-    value_t *s, // Gelss flavor
-    int *iwork // Gelsd flavor
-    ) {
-  lapackLstsq_impl<driver_type, scalar_t, value_t>::call(
-      trans, m, n, nrhs,
-      a, lda, b, ldb,
-      work, lwork, info,
-      jpvt, rcond, rank, rwork,
-      s,
-      iwork);
-}
 
 template<> void lapackSolve<c10::complex<double>>(int n, int nrhs, c10::complex<double> *a, int lda, int *ipiv, c10::complex<double> *b, int ldb, int *info) {
   zgesv_(&n, &nrhs, reinterpret_cast<std::complex<double>*>(a), &lda, ipiv, reinterpret_cast<std::complex<double>*>(b), &ldb, info);
@@ -614,196 +422,6 @@
 template<> void lapackLuSolve<float>(char trans, int n, int nrhs, float *a, int lda, int *ipiv, float *b, int ldb, int *info) {
   sgetrs_(&trans, &n, &nrhs, a, &lda, ipiv, b, &ldb, info);
 }
-
-template<> void lapackGels<c10::complex<double>>(
-    char trans, int m, int n, int nrhs,
-    c10::complex<double> *a, int lda, c10::complex<double> *b, int ldb,
-    c10::complex<double> *work, int lwork, int *info) {
-  zgels_(&trans, &m, &n, &nrhs,
-      reinterpret_cast<std::complex<double>*>(a), &lda,
-      reinterpret_cast<std::complex<double>*>(b), &ldb,
-      reinterpret_cast<std::complex<double>*>(work), &lwork, info);
-}
-
-template<> void lapackGels<c10::complex<float>>(
-    char trans, int m, int n, int nrhs,
-    c10::complex<float> *a, int lda, c10::complex<float> *b, int ldb,
-    c10::complex<float> *work, int lwork, int *info) {
-  cgels_(&trans, &m, &n, &nrhs,
-      reinterpret_cast<std::complex<float>*>(a), &lda,
-      reinterpret_cast<std::complex<float>*>(b), &ldb,
-      reinterpret_cast<std::complex<float>*>(work), &lwork, info);
-}
-
-template<> void lapackGels<double>(
-    char trans, int m, int n, int nrhs,
-    double *a, int lda, double *b, int ldb,
-    double *work, int lwork, int *info) {
-  dgels_(&trans, &m, &n, &nrhs,
-      a, &lda, b, &ldb, work, &lwork, info);
-}
-
-template<> void lapackGels<float>(
-    char trans, int m, int n, int nrhs,
-    float *a, int lda, float *b, int ldb,
-    float *work, int lwork, int *info) {
-  sgels_(&trans, &m, &n, &nrhs,
-      a, &lda, b, &ldb, work, &lwork, info);
-}
-
-template<> void lapackGelsd<c10::complex<double>, double>(
-    int m, int n, int nrhs,
-    c10::complex<double> *a, int lda, c10::complex<double> *b, int ldb,
-    double *s, double rcond, int *rank,
-    c10::complex<double> *work, int lwork,
-    double *rwork, int *iwork, int *info) {
-  zgelsd_(&m, &n, &nrhs,
-      reinterpret_cast<std::complex<double>*>(a), &lda,
-      reinterpret_cast<std::complex<double>*>(b), &ldb,
-      s, &rcond, rank,
-      reinterpret_cast<std::complex<double>*>(work), &lwork,
-      rwork, iwork, info);
-}
-
-template<> void lapackGelsd<c10::complex<float>, float>(
-    int m, int n, int nrhs,
-    c10::complex<float> *a, int lda, c10::complex<float> *b, int ldb,
-    float *s, float rcond, int *rank,
-    c10::complex<float> *work, int lwork,
-    float *rwork, int *iwork, int *info) {
-  cgelsd_(&m, &n, &nrhs,
-      reinterpret_cast<std::complex<float>*>(a), &lda,
-      reinterpret_cast<std::complex<float>*>(b), &ldb,
-      s, &rcond, rank,
-      reinterpret_cast<std::complex<float>*>(work), &lwork,
-      rwork, iwork, info);
-}
-
-template<> void lapackGelsd<double>(
-    int m, int n, int nrhs,
-    double *a, int lda, double *b, int ldb,
-    double *s, double rcond, int *rank,
-    double *work, int lwork,
-    double *rwork, int *iwork, int *info) {
-  dgelsd_(&m, &n, &nrhs,
-      a, &lda, b, &ldb,
-      s, &rcond, rank,
-      work, &lwork, iwork, info);
-}
-
-template<> void lapackGelsd<float>(
-    int m, int n, int nrhs,
-    float *a, int lda, float *b, int ldb,
-    float *s, float rcond, int *rank,
-    float *work, int lwork,
-    float *rwork, int *iwork, int *info) {
-  sgelsd_(&m, &n, &nrhs,
-      a, &lda, b, &ldb,
-      s, &rcond, rank,
-      work, &lwork, iwork, info);
-}
-
-template<> void lapackGelsy<c10::complex<double>, double>(
-    int m, int n, int nrhs,
-    c10::complex<double> *a, int lda, c10::complex<double> *b, int ldb,
-    int *jpvt, double rcond, int *rank,
-    c10::complex<double> *work, int lwork, double *rwork, int *info) {
-  zgelsy_(&m, &n, &nrhs,
-      reinterpret_cast<std::complex<double>*>(a), &lda,
-      reinterpret_cast<std::complex<double>*>(b), &ldb,
-      jpvt, &rcond, rank,
-      reinterpret_cast<std::complex<double>*>(work), &lwork,
-      rwork, info);
-}
-
-template<> void lapackGelsy<c10::complex<float>, float>(
-    int m, int n, int nrhs,
-    c10::complex<float> *a, int lda, c10::complex<float> *b, int ldb,
-    int *jpvt, float rcond, int *rank,
-    c10::complex<float> *work, int lwork, float *rwork, int *info) {
-  cgelsy_(&m, &n, &nrhs,
-      reinterpret_cast<std::complex<float>*>(a), &lda,
-      reinterpret_cast<std::complex<float>*>(b), &ldb,
-      jpvt, &rcond, rank,
-      reinterpret_cast<std::complex<float>*>(work), &lwork,
-      rwork, info);
-}
-
-template<> void lapackGelsy<double>(
-    int m, int n, int nrhs,
-    double *a, int lda, double *b, int ldb,
-    int *jpvt, double rcond, int *rank,
-    double *work, int lwork, double *rwork, int *info) {
-  dgelsy_(&m, &n, &nrhs,
-      a, &lda, b, &ldb,
-      jpvt, &rcond, rank,
-      work, &lwork, info);
-}
-
-template<> void lapackGelsy<float>(
-    int m, int n, int nrhs,
-    float *a, int lda, float *b, int ldb,
-    int *jpvt, float rcond, int *rank,
-    float *work, int lwork, float *rwork, int *info) {
-  sgelsy_(&m, &n, &nrhs,
-      a, &lda, b, &ldb,
-      jpvt, &rcond, rank,
-      work, &lwork, info);
-}
-
-template<> void lapackGelss<c10::complex<double>, double>(
-    int m, int n, int nrhs,
-    c10::complex<double> *a, int lda, c10::complex<double> *b, int ldb,
-    double *s, double rcond, int *rank,
-    c10::complex<double> *work, int lwork,
-    double *rwork, int *info
-    ) {
-  zgelss_(&m, &n, &nrhs,
-      reinterpret_cast<std::complex<double>*>(a), &lda,
-      reinterpret_cast<std::complex<double>*>(b), &ldb,
-      s, &rcond, rank,
-      reinterpret_cast<std::complex<double>*>(work), &lwork,
-      rwork, info);
-}
-
-template<> void lapackGelss<c10::complex<float>, float>(
-    int m, int n, int nrhs,
-    c10::complex<float> *a, int lda, c10::complex<float> *b, int ldb,
-    float *s, float rcond, int *rank,
-    c10::complex<float> *work, int lwork,
-    float *rwork, int *info
-    ) {
-  cgelss_(&m, &n, &nrhs,
-      reinterpret_cast<std::complex<float>*>(a), &lda,
-      reinterpret_cast<std::complex<float>*>(b), &ldb,
-      s, &rcond, rank,
-      reinterpret_cast<std::complex<float>*>(work), &lwork,
-      rwork, info);
-}
-
-template<> void lapackGelss<double>(
-    int m, int n, int nrhs,
-    double *a, int lda, double *b, int ldb,
-    double *s, double rcond, int *rank,
-    double *work, int lwork,
-    double *rwork, int *info) {
-  dgelss_(&m, &n, &nrhs,
-      a, &lda, b, &ldb,
-      s, &rcond, rank,
-      work, &lwork, info);
-}
-
-template<> void lapackGelss<float>(
-    int m, int n, int nrhs,
-    float *a, int lda, float *b, int ldb,
-    float *s, float rcond, int *rank,
-    float *work, int lwork,
-    float *rwork, int *info) {
-  sgelss_(&m, &n, &nrhs,
-      a, &lda, b, &ldb,
-      s, &rcond, rank,
-      work, &lwork, info);
-}
 #endif
 
 // Below of the definitions of the functions operating on a batch that are going to be dispatched
@@ -2306,474 +1924,6 @@
   return std::tuple<Tensor&, Tensor&, Tensor&>(U, S, VT);
 }
 
-// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ lstsq ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-#ifdef USE_LAPACK
-template<class scalar_t, class value_t, class func_t>
-struct LapackLstsqHelper {
-  using self_type = LapackLstsqHelper;
-
-  // we use `driver_type` to decide how to initialize
-  // relevant to specific drivers parameters
-  LapackLstsqDriverType driver_type;
-  func_t driver;
-
-  bool is_complex;
-  at::ScalarType scalar_type;
-  IntArrayRef batch_shape;
-  // the strides below store the offsets to different lstsq problems in a batch
-  int64_t a_stride;
-  int64_t b_stride;
-  int64_t s_stride;
-
-  // variables below correspond to LAPACK inputs.
-  // for more information check the LAPACK documentation on
-  // `?gels`, `?gelsy`, `?gelsd`, `?gelss`
-  char trans;
-  int m;
-  int n;
-  int nrhs;
-  scalar_t* a_working_ptr = nullptr;
-  int lda;
-  scalar_t* b_working_ptr = nullptr;
-  int ldb;
-  Tensor work;
-  scalar_t work_opt; // used to decide the opt `work` size with lwork=-1
-  scalar_t* work_ptr = &work_opt;
-  int lwork = -1; // default value to decide the opt size for workspace arrays
-  int info = 0;
-  Tensor jpvt;
-  int* jpvt_ptr = nullptr;
-  value_t rcond;
-  Tensor rank, rank_1d;
-  int rank_opt;
-  int64_t* rank_working_ptr = nullptr;
-  Tensor rwork;
-  value_t rwork_opt; // used to decide the opt `rwork` size with lwork=-1
-  value_t* rwork_ptr = &rwork_opt;
-  Tensor s, s_2d;
-  value_t* s_working_ptr = nullptr;
-  Tensor iwork;
-  int iwork_opt; // used to decide the opt `iwork` size with lwork=-1
-  int* iwork_ptr = &iwork_opt;
-
-  LapackLstsqHelper(LapackLstsqDriverType driver_type, func_t driver)
-    : driver_type{driver_type}, driver{driver}
-  {}
-
-  self_type& set_trans(char trans) { this->trans = trans; return *this; }
-  self_type& set_m(int m) { this->m = m; return *this; }
-  self_type& set_n(int n) { this->n = n; return *this; }
-  self_type& set_nrhs(int nrhs) { this->nrhs = nrhs; return *this; }
-  self_type& set_a(const Tensor& a) {
-    this->a_working_ptr = a.data_ptr<scalar_t>();
-    this->scalar_type = a.scalar_type();
-    this->is_complex = a.is_complex();
-    // `a` is persistent, should be safe to store its properties in references.
-    this->batch_shape = IntArrayRef(a.sizes().data(), a.dim() - 2);
-    this->a_stride = matrixStride(a);
-    return *this;
-  }
-  self_type& set_lda(int lda) { this->lda = lda; return *this; }
-  self_type& set_b(const Tensor& b) {
-    this->b_working_ptr = b.data_ptr<scalar_t>();
-    this->b_stride = matrixStride(b);
-    return *this;
-  }
-  self_type& set_ldb(int ldb) { this->ldb = ldb; return *this; }
-  self_type& set_work() {
-    lwork = static_cast<int>(real_impl<scalar_t, value_t>(work_opt));
-    work = at::empty({lwork}, scalar_type);
-    work_ptr = work.data_ptr<scalar_t>();
-    return *this;
-  }
-  self_type& set_jpvt() {
-    // handle `jpvt` workspace array (relevant for `?gelsy` which uses
-    // a QR factorization with column pivoting).
-    if (LapackLstsqDriverType::Gelsy == driver_type) {
-      jpvt = at::empty({std::max<int64_t>(1, n)}, at::kInt);
-      jpvt_ptr = jpvt.data_ptr<int>();
-    }
-    return *this;
-  }
-  self_type& set_rcond(double cond) { this->rcond = static_cast<value_t>(cond); return *this; }
-  self_type& set_rank() {
-    // only `?gels` is not rank-revealing
-    if (LapackLstsqDriverType::Gels != driver_type) {
-      if (!batch_shape.size()) {
-        rank = at::empty({}, at::kLong);
-      }
-      else {
-        rank = at::empty(batch_shape, at::kLong);
-      }
-      rank_1d = rank.view({-1});
-      rank_working_ptr = rank.data_ptr<int64_t>();
-    }
-    return *this;
-  }
-  self_type& set_rwork() {
-    // `rwork` only makes sense for complex flavors and
-    // `?gelsy`, `?gelsd` and `?gelss` drivers
-    if (!this->is_complex || LapackLstsqDriverType::Gels == driver_type) {
-      return *this;
-    }
-
-    int64_t rwork_len;
-    switch (this->driver_type) {
-      case LapackLstsqDriverType::Gelsy:
-        rwork_len = std::max<int64_t>(1, 2 * n);
-        break;
-      case LapackLstsqDriverType::Gelss:
-        rwork_len = std::max<int64_t>(1, 5 * std::min(m, n));
-        break;
-      // case LapackLstsqDriverType::Gelsd:
-      default:
-        rwork_len = static_cast<int64_t>(rwork_opt);
-    }
-    rwork = at::empty({rwork_len}, c10::toValueType(scalar_type));
-    rwork_ptr = rwork.data_ptr<value_t>();
-    return *this;
-  }
-  self_type& set_s() {
-    // `?gelsd` and `?gelss` are SVD-based
-    // so we can extract singular values from them.
-    if (LapackLstsqDriverType::Gelsd == driver_type
-      || LapackLstsqDriverType::Gelss == driver_type) {
-      auto s_shape = batch_shape.vec();
-      s_shape.push_back(std::min(m, n));
-      s = at::empty(s_shape, c10::toValueType(scalar_type));
-      s_working_ptr = s.data_ptr<value_t>();
-      s_stride = s.size(-1);
-      s_2d = s.view({-1, std::min(m, n)});
-    }
-    return *this;
-  }
-  self_type& set_iwork() {
-    // handle `iwork` workspace array (relevant only for `?gelsd`)
-    if (LapackLstsqDriverType::Gelsd == driver_type) {
-      iwork = at::empty({iwork_opt}, at::kInt);
-      iwork_ptr = iwork.data_ptr<int>();
-    }
-    return *this;
-  }
-
-  self_type& call_driver() {
-    driver(trans, m, n, nrhs,
-      a_working_ptr, lda,
-      b_working_ptr, ldb,
-      work_ptr, lwork,
-      &info,
-      jpvt_ptr,
-      rcond,
-      &rank_opt,
-      rwork_ptr,
-      s_working_ptr,
-      iwork_ptr);
-    // we want the output `rank` Tensor to be of type int64_t,
-    // however LAPACK accepts int. That is why we use an integer
-    // variable that then gets promoted and written into `rank`.
-    // We use this approach over a tensor cast for better performance.
-    if (rank_working_ptr) {
-      *rank_working_ptr = static_cast<int64_t>(rank_opt);
-    }
-    return *this;
-  }
-
-  self_type& next() {
-    // advance to the next problem in a batch.
-    // Should only be used if a.shape[:-2] == b.shape[:-2]
-    a_working_ptr += a_stride;
-    b_working_ptr += b_stride;
-    rank_working_ptr = rank_working_ptr ? rank_working_ptr + 1 : nullptr;
-    s_working_ptr = s_working_ptr ? s_working_ptr + s_stride : nullptr;
-    return *this;
-  }
-
-  self_type& next(scalar_t* a_working_ptr, scalar_t* b_working_ptr,
-    int64_t a_linear_batch_idx) {
-    // advance to the next problem in a batch.
-    // Designed to be used with the `batch_iterator_with_broadcasting` method.
-    this->a_working_ptr = a_working_ptr;
-    this->b_working_ptr = b_working_ptr;
-    rank_working_ptr = rank_working_ptr
-      ? rank_1d.select(0, a_linear_batch_idx).template data_ptr<int64_t>()
-      : nullptr;
-    s_working_ptr = s_working_ptr
-      ? s_2d.select(0, a_linear_batch_idx).template data_ptr<value_t>()
-      : nullptr;
-    return *this;
-  }
-};
-
-// we use `enum class LapackLstsqDriverType` as keys in an unordered_map.
-// Clang5 and Gcc5 do not support std::hash for enum classes, hence
-// we provide our own hash function.
-struct LapackLstsqDriverTypeHash {
-  std::size_t operator()(const LapackLstsqDriverType& driver_type) const {
-    return static_cast<std::size_t>(driver_type);
-  }
-};
-#endif
-
-std::tuple<Tensor, Tensor, Tensor> _lstsq_helper_cpu(
-    const Tensor& a, const Tensor& b, double cond, c10::optional<std::string> driver_name) {
-#ifndef USE_LAPACK
-  AT_ERROR("torch.linalg.lstsq: LAPACK library not found in compilation");
-#else
-
-  static auto driver_string_to_type = std::unordered_map<std::string, LapackLstsqDriverType>({
-    {"gels", LapackLstsqDriverType::Gels},
-    {"gelsy", LapackLstsqDriverType::Gelsy},
-    {"gelsd", LapackLstsqDriverType::Gelsd},
-    {"gelss", LapackLstsqDriverType::Gelss}
-  });
-  // driver_name is never nullopt for CPU
-  auto driver_str = driver_name.value();
-  auto driver_type = driver_string_to_type[driver_str];
-
-  Tensor rank;
-  Tensor singular_values;
-
-  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(a.scalar_type(), "torch.linalg.lstsq_cpu", [&] {
-    using value_t = typename c10::scalar_value_type<scalar_t>::type;
-
-    auto driver = lapackLstsq<LapackLstsqDriverType::Gelsd, scalar_t, value_t>;
-    static auto driver_type_to_func
-      = std::unordered_map<LapackLstsqDriverType, decltype(driver), LapackLstsqDriverTypeHash>({
-      {LapackLstsqDriverType::Gels, lapackLstsq<LapackLstsqDriverType::Gels, scalar_t, value_t>},
-      {LapackLstsqDriverType::Gelsy, lapackLstsq<LapackLstsqDriverType::Gelsy, scalar_t, value_t>},
-      {LapackLstsqDriverType::Gelsd, lapackLstsq<LapackLstsqDriverType::Gelsd, scalar_t, value_t>},
-      {LapackLstsqDriverType::Gelss, lapackLstsq<LapackLstsqDriverType::Gelss, scalar_t, value_t>}
-    });
-    driver = driver_type_to_func[driver_type];
-
-    auto m = a.size(-2);
-    auto n = a.size(-1);
-    auto nrhs = b.size(-1);
-    auto driver_helper = LapackLstsqHelper<scalar_t, value_t, decltype(driver)>(driver_type, driver)
-      .set_trans('N')
-      .set_m(m)
-      .set_n(n)
-      .set_nrhs(nrhs)
-      .set_a(a)
-      .set_lda(std::max<int64_t>(1, m))
-      .set_b(b)
-      .set_ldb(std::max<int64_t>(1, std::max(m, n)))
-      .set_jpvt()
-      .set_rcond(cond)
-      .set_rank()
-      .set_s()
-      .call_driver() // initial call to deduce optimal sizes for workspace arrays
-      .set_work()
-      .set_rwork()
-      .set_iwork();
-
-    // If batch dims for `a` and `b` are equivalent, i.e.
-    // a.shape[:-2] == b.shape[:-2], the call to `batch_iterator_with_broadcasting`
-    // is equivalent to:
-    // for (int64_t i = 0; i < batchCount(a); ++i) {
-    //   driver_helper.call_driver().next();
-    //   if (driver_helper.info) {
-    //     break;
-    //   }
-    // }
-    // which does correspond to a batch-wise iteration for methods that do not
-    // broadcast with size expansion over batch dimensions.
-    batch_iterator_with_broadcasting<scalar_t>(a, b,
-      [&](scalar_t* a_working_ptr, scalar_t* b_working_ptr,
-        int64_t a_linear_batch_idx) {
-        driver_helper.next(a_working_ptr, b_working_ptr, a_linear_batch_idx)
-          .call_driver();
-        singleCheckErrors(driver_helper.info, "torch.linalg.lstsq_cpu");
-      }
-    );
-
-    rank = driver_helper.rank;
-    singular_values = driver_helper.s;
-  });
-
-  return std::make_tuple(b, rank, singular_values);
-#endif
-}
-
-std::tuple<Tensor, Tensor, Tensor, Tensor> linalg_lstsq(
-    const Tensor& self, const Tensor& b,
-    c10::optional<double> cond,
-    c10::optional<std::string> driver) {
-  TORCH_CHECK(
-    self.device().type() == b.device().type(),
-    "torch.linalg.lstsq: input tensors should be on the same device"
-  );
-  TORCH_CHECK(
-    self.scalar_type() == b.scalar_type(),
-    "torch.linalg.lstsq: input tensors should be of the same dtype"
-  );
-  TORCH_CHECK(
-    self.dim() >= 2,
-    "torch.linalg.lstsq: input `self` Tensor should be at least 2D"
-  );
-  TORCH_CHECK(
-    b.dim() >= 1,
-    "torch.linalg.lstsq: input `b` Tensor should be at least 1D"
-  );
-  auto dim_diff = self.dim() - b.dim();
-  TORCH_CHECK(
-    0 <= dim_diff && dim_diff <= 1,
-    "torch.linalg.lstsq: self.dim() must be greater or equal to b.dim() and "
-    "(self.dim() - b.dim()) <= 1"
-  );
-  Tensor b_2d = dim_diff ? b.unsqueeze(-1) : b;
-  TORCH_CHECK(
-    self.size(-2) == b_2d.size(-2),
-    dim_diff ? "torch.linalg.lstsq: self.size(-2) should match b.size(-1)" :
-      "torch.linalg.lstsq: self.size(-2) should match b.size(-2)"
-  );
-
-  // if `driver` is empty, we use `driver_opt` to be set to
-  // c10::nullopt if working with CUDA tensors,
-  // otherwise to "gelsy" driver.
-  // CUDA tensors are treated specially because MAGMA
-  // has only 'gels' driver supported.
-  c10::optional<std::string> driver_opt = driver;
-  // check whether the user provided name is a valid driver name
-  if (driver.has_value()) {
-    auto driver_str = driver.value();
-    // convert `driver_str` to lower case inplace.
-    std::transform(driver_str.begin(), driver_str.end(), driver_str.begin(),
-      [](unsigned char c) { return std::tolower(c); });
-    static std::unordered_set<std::string> allowed_drivers = {
-      "gels", "gelsy", "gelsd", "gelss"
-    };
-    if (at::kCPU == self.device().type()) {
-      TORCH_CHECK(
-        allowed_drivers.find(driver_str) != allowed_drivers.end(),
-        "torch.linalg.lstsq: parameter `driver` should be one of "
-        "(gels, gelsy, gelsd, gelss)"
-      );
-    }
-    //else if (at::kCUDA == self.device().type()) {
-    else {
-      TORCH_CHECK(
-        driver_str == "gels",
-        "torch.linalg.lstsq: `driver` other than `gels` is not supported on CUDA"
-      );
-    }
-  }
-  // if driver name is not provided, set to default 'gelsy' if on CPU,
-  // or to `gels` if on CUDA.
-  else {
-    driver_opt = (at::kCPU == self.device().type())
-      ? c10::optional<std::string>("gelsy")
-      : c10::optional<std::string>("gels");
-  }
-
-  // CUDA has only `gels` driver now which ONLY works with overdetermined systems
-  if (at::kCUDA == self.device().type()) {
-    TORCH_CHECK(
-      self.size(-2) >= self.size(-1),
-      "torch.linalg.lstsq: only overdetermined systems (m >= n) are allowed on CUDA"
-    );
-  }
-
-  // LAPACK/MAGMA requries inputs to be in the column-major-order.
-  auto self_working_copy = copyBatchedColumnMajor(self);
-
-  // Tensor b must be of size (..., max(m, n), nrhs)
-  // and in the column-major order.
-  // We allow the batch dims of `self` to broadcast over the batch
-  // dims of `b` so that it is possible to solve multiple systems with
-  // with the same lhs (encoded by `self`) / rhs (encoded by `b`).
-  // `b_working_copy` is modified in-place and the combination of
-  // batch broadcasting plus LAPACK/MAGMA requirements impose the following
-  // restrictions on sizes/strides of `b`:
-  // 1. b.size = (broadcasted_batch_size(self, b), max(m, n), nrhs).
-  // 2. b.stride should correspond to an almost contiguous Tensor in the column-major-order,
-  //   i.e. b.stride = b.transpose(-2, -1).contiguous().transpose(-2, -1).strides()
-  auto m = self.size(-2);
-  auto n = self.size(-1);
-  auto b_working_copy = copyBatchedColumnMajor(b_2d,
-    /*nrows=*/std::max(m, n),
-    /*desired_batch_sizes=*/broadcast_batch_size(self, b_2d, self.dim() - 2));
-
-  double rcond = cond.has_value() && (cond.value() > 0)
-    ? cond.value()
-    : _get_epsilon(c10::toValueType(self.scalar_type()));
-
-  Tensor x, residuals, rank, singular_values;
-  // path if neither `self` nor `b` is empty
-  if (self.numel() && b.numel()) {
-    std::tie(x, rank, singular_values) =
-      at::_lstsq_helper(self_working_copy, b_working_copy, rcond, driver_opt);
-    if (m > n && driver_opt.value() != "gelsy") {
-      residuals = x.narrow(-2, n, std::max(m, n) - n).abs().pow_(2).sum(-2);
-    }
-    x = x.narrow(-2, 0, n);
-  }
-  // if either `self` or `b` is empty, return an empty tensor or,
-  // if non-zero sizes, return a tensor of zeros.
-  else {
-    x = b_working_copy.zero_().narrow(-2, 0, n);
-  }
-
-  auto return_empty_if_undefined = [&self](Tensor& t,
-      c10::optional<at::ScalarType> dtype = c10::nullopt,
-      c10::optional<std::vector<int64_t>> shape = c10::nullopt) {
-    if (t.defined()) {
-      return t;
-    }
-    else {
-      auto output_dtype = dtype.has_value() ? dtype.value() : self.scalar_type();
-      if (shape.has_value()) {
-        return at::empty(shape.value(), self.options().dtype(output_dtype));
-      }
-      else {
-        return at::empty({0}, self.options().dtype(output_dtype));
-      }
-    }
-  };
-
-  // Some output stays undefined for some values of driver.
-  // Instead of returning undefined tensors which get exposed as
-  // Nones in the Python interface, we return empty tensors.
-  // This way we follow the convention of output types in the
-  // torch.linalg namespace.
-  // NOTE: we run drivers only if both inputs are non-empty!
-  // Hence the code below explicitly handles each and every output
-  // if `self` is empty.
-
-  // Numpy and Scipy always return ranks for empty matrices,
-  // even for drivers which are not rank-revealing.
-  auto batch_sizes = IntArrayRef(self.sizes().data(), self.dim() - 2);
-  if (self.numel()) {
-    rank = return_empty_if_undefined(rank, at::kLong);
-  }
-  else {
-    rank = at::zeros(batch_sizes, self.options().dtype(at::kLong));
-  }
-
-  // undefined residuals could only be an empty Tensor of shape (0)
-  residuals = return_empty_if_undefined(residuals);
-
-  if (!self.numel() 
-    && (driver_opt.value() == "gelss" || driver_opt.value() == "gelsd")) {
-    // when `self` is empty, return singular_values of shape
-    // (*self.shape[:-2], 0) only if driver is in ('gelss', 'gelsd')
-    auto singular_values_empty_shape = batch_sizes.vec();
-    singular_values_empty_shape.push_back(0);
-    singular_values = return_empty_if_undefined(
-      singular_values,
-      at::toValueType(self.scalar_type()),
-      singular_values_empty_shape);
-  }
-  else {
-    // otherwise return an empty tensor of shape (0)
-    singular_values = return_empty_if_undefined(
-      singular_values,
-      at::toValueType(self.scalar_type()));
-  }
-
-  return std::make_tuple(x, residuals, rank, singular_values);
-}
-
 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ lu_solve ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 template<typename scalar_t>
diff --git a/aten/src/ATen/native/LinearAlgebraUtils.h b/aten/src/ATen/native/LinearAlgebraUtils.h
index 5572a8d..59625d6 100644
--- a/aten/src/ATen/native/LinearAlgebraUtils.h
+++ b/aten/src/ATen/native/LinearAlgebraUtils.h
@@ -4,7 +4,6 @@
 #include <ATen/ATen.h>
 #include <ATen/ExpandUtils.h>
 #include <ATen/TensorUtils.h>
-#include <ATen/native/TensorIterator.h>
 #include <limits>
 #include <sstream>
 #include <cstring>
@@ -33,32 +32,6 @@
 }
 
 /*
- * This method is designed to be a faster alternative to
- * `cloneBatchedColumnMajor` with some additional features,
- * namely:
- * 1. It uses `copy` instead of `clone` which could be much faster.
- * 2. `nrows` parameter used to create inputs with the number of rows larger
- *  than the original input, which is required for some LAPACK/MAGMA methods.
- * 3. `desired_batch_size` is used to create copies with the batch size
- *  which is either the original batch size of the input, or its larger
- *  broadcasted shape.
- */
-static inline Tensor copyBatchedColumnMajor(const Tensor& src, int64_t nrows = -1,
-    c10::optional<IntArrayRef> desired_batch_sizes = c10::nullopt) {
-  nrows = (nrows == -1) ? src.size(-2) : nrows;
-  auto copy_sizes = desired_batch_sizes.has_value()
-    ? desired_batch_sizes.value().vec()
-    : IntArrayRef(src.sizes().data(), src.dim() - 2).vec();
-  copy_sizes.insert(copy_sizes.end(), {nrows, src.size(-1)});
-  auto copy_strides = at::detail::defaultStrides(copy_sizes);
-  copy_strides[src.dim() - 2] = 1;
-  copy_strides[src.dim() - 1] = nrows;
-  auto copy = at::empty_strided(copy_sizes, copy_strides, src.options());
-  copy.narrow(-2, 0, src.size(-2)).copy_(src);
-  return copy;
-}
-
-/*
  * Given batches of matrices with arbitrary batch dim,
  * computes the number of batches.
  */
@@ -75,103 +48,6 @@
   return batched_matrices.size(-1) * batched_matrices.size(-2);
 }
 
-// This function is designed to be used with linear algebra methods that minimize
-// L(ax - b) = 0, where L is generally the identity map (`solve`, for example)
-// or the L2 norm (`lstsq`).
-// It is expected that `a` and `b` are contiguous tensors of column-major matrices
-// (so that a.view({-1, a.size(-2), a.size(-1)}) succeeds, same for `b`),
-// with the following additional properties:
-//
-// 1. a.dim() == b.dim()
-// 2. a.shape[:-2] broadcasts over b.shape[:-2]
-// 3. a.size(i) <= b.size(i) for i=0,..., a.dim() - 3 (only for batch dimensions)
-//
-// MAGMA/LAPACK modify tensor `a` in-place, and the main goal of this method
-// is to be memory efficient, which means that if there exists an index i such that
-// a.shape[i] < b.shape[i], 0 <= i <= a.dim() - 3,
-// then instead of materializing copies of `a` in the broadcasted shape, we keep
-// a buffer copy of `a` along with flags that check whether specific batch dimension
-// indices for `a` were already accessed. If they were, we copy the data from the buffer
-// into `a`. The number of copies does not exceed 
-// prod(max(a.shape[:-2], b.shape[:-2]) - a.shape[:-2] + 1)
-// and this value is attained by tensors with non-empty batch dimensions.
-//
-// func_t `f` is a callable that is being supplied with
-// scalar_t* a_working_ptr, scalar_t* b_working_ptr, int64_t a_linear_batch_idx.
-// a_working_ptr and b_working_ptr can directly be passed to LAPACK/MAGMA routines,
-// and a_linear_batch_idx is an index in the 3d representation which corresponds to
-// the memory a_working_ptr points to, in other words:
-// a_working_ptr == a.view({-1, a.size(-2), a.size(-1)}.select(0, a_linear_batch_idx).data_ptr<scalar_t>();
-// a_linear_batch_idx is useful to store metadata related to `a`, such as, for example,
-// its rank or singular values (see linalg_lstsq).
-template<typename scalar_t, typename func_t>
-void batch_iterator_with_broadcasting(const Tensor& a, const Tensor& b, const func_t& f) {
-  IntArrayRef a_batch_sizes(a.sizes().data(), a.dim() - 2);
-  IntArrayRef b_batch_sizes(b.sizes().data(), b.dim() - 2);
-
-  auto a_linear_batch_idx = at::arange(batchCount(a)).view(a_batch_sizes);
-  auto b_linear_batch_idx = at::arange(batchCount(b)).view(b_batch_sizes);
-
-  TensorIterator iter = TensorIteratorConfig()
-    .set_check_mem_overlap(false)
-    .check_all_same_dtype(false)
-    .resize_outputs(false)
-    .add_output(b_linear_batch_idx)
-    .add_input(a_linear_batch_idx)
-    .build();
-
-  auto m = a.size(-2);
-  auto n = a.size(-1);
-  auto a_3d = a.view({batchCount(a), m, n});
-  auto b_3d = b.view({batchCount(b), b.size(-2), b.size(-1)});
-
-  auto a_broadcasts_over_b = (a_batch_sizes != b_batch_sizes);
-  Tensor a_buffer, a_was_accessed, a_buffer_3d;
-  std::function<void(int64_t)> check_if_copy_needed_for_a
-    = [](int64_t a_curr_linear_batch_idx){};
-  if (a_broadcasts_over_b) {
-    a_buffer = at::empty_strided(a.sizes(), a.strides(), a.options())
-      .copy_(a);
-    a_was_accessed = at::zeros(batchCount(a), at::kBool);
-    a_buffer_3d = a_buffer.view({batchCount(a), m, n});
-    check_if_copy_needed_for_a = [&](int64_t a_curr_linear_batch_idx) {
-      auto* a_was_accessed_flag = a_was_accessed
-        .select(0, a_curr_linear_batch_idx)
-        .data_ptr<bool>();
-      if (!(*a_was_accessed_flag)) {
-        *a_was_accessed_flag = true;
-      }
-      else {
-        a_3d.select(0, a_curr_linear_batch_idx)
-          .copy_(a_buffer_3d.select(0, a_curr_linear_batch_idx));
-      }
-    };
-  }
-
-  auto loop = [&](char** data, const int64_t* strides, int64_t nelems) {
-    auto* b_batch_idx_ptr = data[0];
-    auto* a_batch_idx_ptr = data[1];
-
-    for (int64_t elem = 0; elem < nelems; ++elem) {
-      auto b_curr_linear_batch_idx = *reinterpret_cast<int64_t*>(b_batch_idx_ptr);
-      auto a_curr_linear_batch_idx = *reinterpret_cast<int64_t*>(a_batch_idx_ptr);
-
-      check_if_copy_needed_for_a(a_curr_linear_batch_idx);
-
-      auto* a_working_ptr = a_3d.select(0, a_curr_linear_batch_idx)
-        .data_ptr<scalar_t>();
-      auto* b_working_ptr = b_3d.select(0, b_curr_linear_batch_idx)
-        .data_ptr<scalar_t>();
-      f(a_working_ptr, b_working_ptr, a_curr_linear_batch_idx);
-
-      b_batch_idx_ptr += strides[0];
-      a_batch_idx_ptr += strides[1];
-    }
-  };
-  iter.serial_for_each(loop, {0, batchCount(b)});
-}
-
-
 // Returns the epsilon value for floating types except half
 static inline double _get_epsilon(const ScalarType& sc_type) {
   switch (sc_type) {
@@ -294,13 +170,6 @@
   return std::make_tuple(arg1_broadcasted, arg2_broadcasted);
 }
 
-static inline std::vector<int64_t> broadcast_batch_size(const Tensor& t1, const Tensor& t2, int64_t n_batch_dims) {
-  IntArrayRef t1_batch_sizes(t1.sizes().data(), n_batch_dims);
-  IntArrayRef t2_batch_sizes(t2.sizes().data(), n_batch_dims);
-  auto broadcasted_batch_sizes = infer_size(t1_batch_sizes, t2_batch_sizes);
-  return broadcasted_batch_sizes;
-}
-
 // Return a permutation with the given axes moved to the end.
 static inline Tensor _move_to_end(const Tensor& self, IntArrayRef axes) {
   const std::vector<int64_t> a = axes.vec();
diff --git a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu
index 66ee412..fd4b37c 100644
--- a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu
+++ b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu
@@ -160,12 +160,6 @@
     scalar_t** dB_array, magma_int_t lddb, magma_int_t& info,
     magma_int_t batchsize, const MAGMAQueue& magma_queue);
 
-template<class scalar_t>
-void magmaGels(
-    magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t nrhs,
-    scalar_t* dA, magma_int_t ldda, scalar_t* dB, magma_int_t lddb,
-    scalar_t* hwork, magma_int_t lwork, magma_int_t* info);
-
 template<>
 void magmaSolve<double>(
     magma_int_t n, magma_int_t nrhs, double* dA, magma_int_t ldda,
@@ -1269,56 +1263,6 @@
  info = magma_cgetrs_batched(MagmaNoTrans, n, nrhs, reinterpret_cast<magmaFloatComplex**>(dA_array), ldda, dipiv_array, reinterpret_cast<magmaFloatComplex**>(dB_array), lddb, batchsize, magma_queue.get_queue());
  AT_CUDA_CHECK(cudaGetLastError());
 }
-
-template<>
-void magmaGels<float>(
-    magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t nrhs,
-    float* dA, magma_int_t ldda, float* dB, magma_int_t lddb,
-    float* hwork, magma_int_t lwork, magma_int_t* info) {
-  MagmaStreamSyncGuard guard;
-  magma_sgels_gpu(trans, m, n, nrhs,
-      dA, ldda, dB, lddb,
-      hwork, lwork, info);
-  AT_CUDA_CHECK(cudaGetLastError());
-}
-
-template<>
-void magmaGels<double>(
-    magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t nrhs,
-    double* dA, magma_int_t ldda, double* dB, magma_int_t lddb,
-    double* hwork, magma_int_t lwork, magma_int_t* info) {
-  MagmaStreamSyncGuard guard;
-  magma_dgels_gpu(trans, m, n, nrhs,
-      dA, ldda, dB, lddb,
-      hwork, lwork, info);
-  AT_CUDA_CHECK(cudaGetLastError());
-}
-
-template<>
-void magmaGels<c10::complex<float>>(
-    magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t nrhs,
-    c10::complex<float>* dA, magma_int_t ldda, c10::complex<float>* dB, magma_int_t lddb,
-    c10::complex<float>* hwork, magma_int_t lwork, magma_int_t* info) {
-  MagmaStreamSyncGuard guard;
-  magma_cgels_gpu(trans, m, n, nrhs,
-      reinterpret_cast<magmaFloatComplex*>(dA), ldda,
-      reinterpret_cast<magmaFloatComplex*>(dB), lddb,
-      reinterpret_cast<magmaFloatComplex*>(hwork), lwork, info);
-  AT_CUDA_CHECK(cudaGetLastError());
-}
-
-template<>
-void magmaGels<c10::complex<double>>(
-    magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t nrhs,
-    c10::complex<double>* dA, magma_int_t ldda, c10::complex<double>* dB, magma_int_t lddb,
-    c10::complex<double>* hwork, magma_int_t lwork, magma_int_t* info) {
-  MagmaStreamSyncGuard guard;
-  magma_zgels_gpu(trans, m, n, nrhs,
-      reinterpret_cast<magmaDoubleComplex*>(dA), ldda,
-      reinterpret_cast<magmaDoubleComplex*>(dB), lddb,
-      reinterpret_cast<magmaDoubleComplex*>(hwork), lwork, info);
-  AT_CUDA_CHECK(cudaGetLastError());
-}
 #endif
 
 #define ALLOCATE_ARRAY(name, type, size) \
@@ -2555,44 +2499,6 @@
   TORCH_CHECK(info == 0, "MAGMA lu_solve : invalid argument: ", -info);
   return self_working_copy;
 }
-// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ lstsq ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-std::tuple<Tensor, Tensor, Tensor> _lstsq_helper_cuda(
-    const Tensor& a, const Tensor& b, double cond, c10::optional<std::string> driver_name) {
-#ifndef USE_MAGMA
-AT_ERROR("torch.linalg.lstsq: MAGMA library not found in "
-    "compilation. Please rebuild with MAGMA.");
-#else
-  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(a.scalar_type(), "torch.linalg.lstsq_cuda", [&] {
-    auto trans = MagmaNoTrans;
-    auto m = magma_int_cast(a.size(-2), "m");
-    auto n = magma_int_cast(a.size(-1), "n");
-    auto nrhs = magma_int_cast(b.size(-1), "nrhs");
-    auto ldda = std::max<magma_int_t>(1, m);
-    auto lddb = std::max<magma_int_t>(1, std::max(m, n));
-    auto nb = magmaGeqrfOptimalBlocksize<scalar_t>(m, n);
-    auto lwork = (m - n + nb) * (nrhs + nb) + nrhs * nb;
-    Tensor hwork = at::empty({static_cast<int64_t>(lwork)}, a.scalar_type());
-    auto* hwork_ptr = hwork.data_ptr<scalar_t>();
-    magma_int_t info;
-
-    batch_iterator_with_broadcasting<scalar_t>(a, b,
-      [&](scalar_t* a_working_ptr, scalar_t* b_working_ptr,
-        int64_t a_linear_batch_idx) {
-        magmaGels<scalar_t>(trans, m, n, nrhs,
-          a_working_ptr, ldda, b_working_ptr, lddb,
-          hwork_ptr, lwork, &info);
-        singleCheckErrors(static_cast<int64_t>(info), "torch.linalg.lstsq_cuda");
-      }
-    );
-  });
-
-  Tensor rank, singular_values;
-  return std::make_tuple(b, rank, singular_values);
-#endif
-}
-// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 }}  // namespace at::native
 
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index a4ec077..7ee6969 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -9015,18 +9015,6 @@
   dispatch:
     DefaultBackend: det
 
-- func: linalg_lstsq(Tensor self, Tensor b, float? cond=None, *, str? driver=None) -> (Tensor solution, Tensor residuals, Tensor rank, Tensor singular_values)
-  python_module: linalg
-  variants: function
-  dispatch:
-    DefaultBackend: linalg_lstsq
-
-- func: _lstsq_helper(Tensor a, Tensor b, float cond, str? driver_name) -> (Tensor, Tensor, Tensor)
-  variants: function
-  dispatch:
-    CPU: _lstsq_helper_cpu
-    CUDA: _lstsq_helper_cuda
-
 - func: linalg_slogdet(Tensor self) -> (Tensor sign, Tensor logabsdet)
   python_module: linalg
   variants: function
diff --git a/docs/source/linalg.rst b/docs/source/linalg.rst
index 697fd2e..668632d 100644
--- a/docs/source/linalg.rst
+++ b/docs/source/linalg.rst
@@ -31,4 +31,3 @@
 .. autofunction:: tensorsolve
 .. autofunction:: inv
 .. autofunction:: qr
-.. autofunction:: lstsq
diff --git a/test/test_linalg.py b/test/test_linalg.py
index 2543bd8..1671be3 100644
--- a/test/test_linalg.py
+++ b/test/test_linalg.py
@@ -119,250 +119,6 @@
 
     @skipCUDAIfNoMagma
     @skipCPUIfNoLapack
-    @dtypes(torch.float, torch.double, torch.cfloat, torch.cdouble)
-    def test_linalg_lstsq(self, device, dtype):
-        from torch.testing._internal.common_utils import random_well_conditioned_matrix
-        if self.device_type == 'cpu':
-            drivers = ('gels', 'gelsy', 'gelsd', 'gelss', None)
-        else:
-            drivers = ('gels', None)
-
-        def check_correctness(a, b, sol):
-            sol2 = a.pinverse() @ b
-            self.assertEqual(sol, sol2, atol=1e-5, rtol=1e-5)
-
-        def check_correctness_ref(a, b, res, ref):
-            def apply_if_not_empty(t, f):
-                if t.numel():
-                    return f(t)
-                else:
-                    return t
-
-            def select_if_not_empty(t, i):
-                selected = apply_if_not_empty(t, lambda x: x.select(0, i).numpy())
-                return selected
-
-            m = a.size(-2)
-            n = a.size(-1)
-            nrhs = b.size(-1)
-            batch_size = int(np.prod(a.shape[:-2]))
-            if batch_size == 0:
-                batch_size = 1
-            a_3d = a.view(batch_size, m, n)
-            b_3d = b.view(batch_size, m, nrhs)
-
-            solution_3d = res.solution.view(batch_size, n, nrhs)
-            residuals_2d = apply_if_not_empty(res.residuals, lambda t: t.view(-1, nrhs))
-            rank_1d = apply_if_not_empty(res.rank, lambda t: t.view(-1))
-            singular_values_2d = res.singular_values.view(batch_size, res.singular_values.shape[-1])
-
-            for i in range(batch_size):
-                sol, residuals, rank, singular_values = ref(
-                    a_3d.select(0, i).numpy(),
-                    b_3d.select(0, i).numpy()
-                )
-                # Singular values are None when lapack_driver='gelsy' in SciPy
-                if singular_values is None:
-                    singular_values = []
-                self.assertEqual(sol, solution_3d.select(0, i), atol=1e-5, rtol=1e-5)
-                self.assertEqual(residuals, select_if_not_empty(residuals_2d, i), atol=1e-5, rtol=1e-5)
-                self.assertEqual(rank, select_if_not_empty(rank_1d, i), atol=1e-5, rtol=1e-5)
-                self.assertEqual(singular_values, singular_values_2d.select(0, i), atol=1e-5, rtol=1e-5)
-
-        def check_correctness_scipy(a, b, res, driver, cond):
-            if TEST_SCIPY and driver not in (None, 'gels'):
-                import scipy.linalg
-
-                def scipy_ref(a, b):
-                    return scipy.linalg.lstsq(a, b, lapack_driver=driver, cond=cond)
-                check_correctness_ref(a, b, res, scipy_ref)
-
-        def check_correctness_numpy(a, b, res, driver, cond):
-            if driver in ('gelsd', 'gelss'):
-                import numpy.linalg
-
-                def numpy_ref(a, b):
-                    return numpy.linalg.lstsq(a, b, rcond=-1 if cond is None else cond)
-                check_correctness_ref(a, b, res, numpy_ref)
-
-        def check_ranks(a, ranks, cond=1e-7):
-            ranks2 = torch.matrix_rank(a, tol=cond)
-            self.assertEqual(ranks, ranks2)
-
-        def check_singular_values(a, sv):
-            sv2 = a.svd()[1]
-            self.assertEqual(sv, sv2)
-
-        ms = [2 ** i for i in range(5)]
-        m_ge_n_sizes = [(m, m // 2) for m in ms] + [(m, m) for m in ms]
-        # cases m < n are only supported on CPU
-        m_l_n_sizes = [(m // 2, m) for m in ms]
-        matrix_sizes = m_ge_n_sizes + (m_l_n_sizes if device == 'cpu' else [])
-        batches = [(), (2,), (2, 2), (2, 2, 2)]
-        # we generate matrices with singular values sampled from a normal distribution,
-        # that is why we use `cond=1.0`, the mean to cut roughly half of all
-        # the singular values and compare whether torch.linalg.lstsq agrees with
-        # SciPy and NumPy.
-        cond = (None, 1.0)
-
-        for batch, matrix_size, driver, cond in itertools.product(batches, matrix_sizes, drivers, cond):
-            shape = batch + matrix_size
-            a = random_well_conditioned_matrix(*shape, dtype=dtype, device=device)
-            b = torch.rand(*shape, dtype=dtype, device=device)
-
-            cond = 1e-7
-            m = a.size(-2)
-            n = a.size(-1)
-            res = torch.linalg.lstsq(a, b, cond=cond, driver=driver)
-            sol = res.solution.narrow(-2, 0, n)
-
-            check_correctness_scipy(a, b, res, driver, cond)
-            check_correctness_numpy(a, b, res, driver, cond)
-
-            check_correctness(a, b, sol)
-            if self.device_type == 'cpu' and driver != 'gels':
-                # rank-revealing drivers are only available for the CPU.
-                # `gels` is not rank-revealing and is only for full
-                # rank inputs.
-                check_ranks(a, res.rank, cond)
-            if self.device_type == 'cpu' and driver in ('gelsd', 'gelss'):
-                # SVD-based drivers are only available for the CPU.
-                # These are only `gelsd` and `gelss`.
-                check_singular_values(a, res.singular_values)
-
-    @skipCUDAIfNoMagma
-    @skipCPUIfNoLapack
-    @dtypes(torch.float, torch.double, torch.cfloat, torch.cdouble)
-    def test_linalg_lstsq_batch_broadcasting(self, device, dtype):
-        from torch.testing._internal.common_utils import random_well_conditioned_matrix
-
-        def check_correctness(a, b):
-            sol = torch.linalg.lstsq(a, b).solution
-            sol2 = a.pinverse() @ b
-            self.assertEqual(sol, sol2, rtol=1e-5, atol=1e-5)
-
-        ms = [2 ** i for i in range(5)]
-        batches = [(), (0,), (2,), (2, 2), (2, 2, 2)]
-        # the case when a single matrix is batch-broadcasted over the rhs
-        for m, batch in itertools.product(ms, batches):
-            a = random_well_conditioned_matrix(m, m, dtype=dtype, device=device).view(*([1] * len(batch)), m, m)
-            b = torch.rand(*(batch + (m, m)), dtype=dtype, device=device)
-            check_correctness(a, b)
-
-        # cases with broadcastable shapes
-        for m in ms:
-            a = random_well_conditioned_matrix(1, 3, 1, 3, m, m, dtype=dtype, device=device)
-            b = torch.rand(3, 1, 3, 1, m, m // 2, dtype=dtype, device=device)
-            check_correctness(a, b)
-
-            # rhs are vectors, not matrices in this test
-            b = torch.rand(3, 1, 3, 1, m, dtype=dtype, device=device)
-            # unsqueeze for b because `check_correctness` checks against
-            # a.pinverse() @ b, which requires b to be a matrix
-            check_correctness(a, b.unsqueeze(-1))
-
-            a = random_well_conditioned_matrix(3, 1, 3, 1, m, m, dtype=dtype, device=device)
-            b = torch.rand(1, 3, 1, 3, m, m // 2, dtype=dtype, device=device)
-            check_correctness(a, b)
-
-            # rhs are vectors, not matrices in this test
-            b = torch.rand(1, 3, 1, 3, m, dtype=dtype, device=device)
-            check_correctness(a, b.unsqueeze(-1))
-
-    @skipCPUIfNoLapack
-    @skipCUDAIfNoMagma
-    @dtypes(torch.float, torch.double, torch.cfloat, torch.cdouble)
-    def test_linalg_lstsq_input_checks(self, device, dtype):
-        # check empty inputs
-        # empty batches
-        a = torch.rand(0, 0, 3, 3, dtype=dtype, device=device)
-        b = torch.rand(0, 0, 3, 2, dtype=dtype, device=device)
-        self.assertEqual(
-            torch.linalg.lstsq(a, b)[0],
-            torch.zeros(0, 0, 3, 2, dtype=dtype, device=device)
-        )
-        # empty a and b
-        a = torch.rand(2, 2, 0, 0, dtype=dtype, device=device)
-        b = torch.rand(2, 2, 0, 0, dtype=dtype, device=device)
-        self.assertEqual(
-            torch.linalg.lstsq(a, b)[0],
-            torch.zeros(2, 2, 0, 0, dtype=dtype, device=device)
-        )
-        # empty a and b
-        a = torch.rand(2, 2, 3, 0, dtype=dtype, device=device)
-        b = torch.rand(2, 2, 3, 0, dtype=dtype, device=device)
-        self.assertEqual(
-            torch.linalg.lstsq(a, b)[0],
-            torch.zeros(2, 2, 0, 0, dtype=dtype, device=device)
-        )
-        # empty a but not b
-        a = torch.rand(2, 2, 3, 0, dtype=dtype, device=device)
-        b = torch.rand(2, 2, 3, 2, dtype=dtype, device=device)
-        self.assertEqual(
-            torch.linalg.lstsq(a, b)[0],
-            torch.zeros(2, 2, 0, 2, dtype=dtype, device=device)
-        )
-
-        # empty a and b
-        if torch.device(device).type == 'cpu':
-            # only CPU since CUDA does not support overdetermined systems
-            a = torch.rand(2, 2, 0, 3, dtype=dtype, device=device)
-            b = torch.rand(2, 2, 0, 3, dtype=dtype, device=device)
-            self.assertEqual(
-                torch.linalg.lstsq(a, b)[0],
-                torch.zeros(2, 2, 3, 3, dtype=dtype, device=device)
-            )
-
-        a = torch.rand(2, 3, dtype=dtype, device=device)
-        b = torch.rand(3, dtype=dtype, device=device)
-
-        with self.assertRaisesRegex(RuntimeError, 'input `self` Tensor should be at least 2D'):
-            torch.linalg.lstsq(b, b)
-
-        with self.assertRaisesRegex(RuntimeError, 'input `b` Tensor should be at least 1D'):
-            torch.linalg.lstsq(a, torch.tensor(1, dtype=dtype, device=device))
-
-        with self.assertRaisesRegex(RuntimeError, r'self.size\(-2\) should match b.size\(-1\)'):
-            torch.linalg.lstsq(a, b)
-
-        with self.assertRaisesRegex(RuntimeError, r'self.size\(-2\) should match b.size\(-2\)'):
-            torch.linalg.lstsq(a, b.unsqueeze(-1))
-
-        def complement_device(device):
-            if device == 'cpu' and torch.cuda.is_available():
-                return 'cuda'
-            else:
-                return 'cpu'
-
-        a = torch.rand(2, 2, 2, 2, dtype=dtype, device=device)
-        b = torch.rand(2, 2, 2, dtype=dtype, device=complement_device(device))
-        if a.device != b.device:
-            with self.assertRaisesRegex(RuntimeError, 'input tensors should be on the same device'):
-                torch.linalg.lstsq(a, b)
-
-        b = (torch.rand(2, 2, 2, dtype=dtype, device=device) * 100).long()
-        with self.assertRaisesRegex(RuntimeError, 'input tensors should be of the same dtype'):
-            torch.linalg.lstsq(a, b)
-
-        a = torch.rand(2, 2, 2, 2, dtype=dtype, device=device)
-        b = torch.rand(2, 2, 2, dtype=dtype, device=device)
-
-        if device != 'cpu':
-            with self.assertRaisesRegex(RuntimeError, '`driver` other than `gels` is not supported on CUDA'):
-                torch.linalg.lstsq(a, b, driver='fictitious_driver')
-        # if on cpu
-        else:
-            with self.assertRaisesRegex(RuntimeError, r'parameter `driver` should be one of \(gels, gelsy, gelsd, gelss\)'):
-                torch.linalg.lstsq(a, b, driver='fictitious_driver')
-
-        if device != 'cpu':
-            a = torch.rand(2, 3, dtype=dtype, device=device)
-            b = torch.rand(2, 1, dtype=dtype, device=device)
-            with self.assertRaisesRegex(RuntimeError, r'only overdetermined systems \(m >= n\) are allowed on CUDA'):
-                torch.linalg.lstsq(a, b)
-
-    @skipCUDAIfNoMagma
-    @skipCPUIfNoLapack
     @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128)
     def test_cholesky(self, device, dtype):
         from torch.testing._internal.common_utils import random_hermitian_pd_matrix
diff --git a/test/test_namedtuple_return_api.py b/test/test_namedtuple_return_api.py
index 034a5e7..89b9576 100644
--- a/test/test_namedtuple_return_api.py
+++ b/test/test_namedtuple_return_api.py
@@ -16,7 +16,6 @@
     'triangular_solve', 'cummax', 'cummin', 'linalg_eigh', "_unpack_dual", 'linalg_qr',
     '_svd_helper', 'linalg_svd', 'linalg_slogdet', 'fake_quantize_per_tensor_affine_cachemask',
     'fake_quantize_per_channel_affine_cachemask',
-    'linalg_lstsq'
 }
 
 
@@ -78,7 +77,6 @@
                input=(per_channel_scale, per_channel_zp, 1, 0, 255),
                names=('output', 'mask',), hasout=False),
             op(operators=['_unpack_dual'], input=(0,), names=('primal', 'tangent'), hasout=False),
-            op(operators=['linalg_lstsq'], input=(a,), names=('solution', 'residuals', 'rank', 'singular_values'), hasout=False),
         ]
 
         def get_func(f):
diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml
index ba364ed..7076abf 100644
--- a/tools/autograd/derivatives.yaml
+++ b/tools/autograd/derivatives.yaml
@@ -666,11 +666,6 @@
   self: not_implemented("lstsq")
   A: not_implemented("lstsq")
 
-- name: linalg_lstsq(Tensor self, Tensor b, float? cond=None, *, str? driver=None) -> (Tensor solution, Tensor residuals, Tensor rank, Tensor singular_values)
-  self: not_implemented("linalg_lstsq")
-  b: not_implemented("linalg_lstsq")
-  output_differentiability: [True, True]
-
 - name: lt_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
   self: zeros_like(self)
 
diff --git a/torch/csrc/api/include/torch/linalg.h b/torch/csrc/api/include/torch/linalg.h
index 594d3c6..26d9e66 100644
--- a/torch/csrc/api/include/torch/linalg.h
+++ b/torch/csrc/api/include/torch/linalg.h
@@ -44,10 +44,6 @@
   return torch::linalg_eigvalsh_out(result, self, uplo);
 }
 
-inline std::tuple<Tensor, Tensor, Tensor, Tensor> lstsq(const Tensor& self, const Tensor& b, c10::optional<double> cond, c10::optional<std::string> driver) {
-  return torch::linalg_lstsq(self, b, cond, driver);
-}
-
 inline Tensor norm(const Tensor& self, optional<Scalar> opt_ord, optional<IntArrayRef> opt_dim, bool keepdim, optional<ScalarType> opt_dtype) {
   return torch::linalg_norm(self, opt_ord, opt_dim, keepdim, opt_dtype);
 }
@@ -172,10 +168,6 @@
   return detail::eigvalsh_out(result, self, uplo);
 }
 
-inline std::tuple<Tensor, Tensor, Tensor, Tensor> lstsq(const Tensor& self, const Tensor& b, c10::optional<double> cond, c10::optional<std::string> driver) {
-  return detail::lstsq(self, b, cond, driver);
-}
-
 inline Tensor linalg_norm(const Tensor& self, optional<Scalar> opt_ord, optional<IntArrayRef> opt_dim, bool keepdim, optional<ScalarType> opt_dtype) {
   return detail::norm(self, opt_ord, opt_dim, keepdim, opt_dtype);
 }
diff --git a/torch/csrc/autograd/utils/wrap_outputs.h b/torch/csrc/autograd/utils/wrap_outputs.h
index e532109..538181a 100644
--- a/torch/csrc/autograd/utils/wrap_outputs.h
+++ b/torch/csrc/autograd/utils/wrap_outputs.h
@@ -109,16 +109,6 @@
   return r.release();
 }
 
-inline PyObject* wrap(PyTypeObject *type, std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor> tensors) {
-  auto r = THPObjectPtr{PyStructSequence_New(type)};
-  if (!r) throw python_error();
-  PyStructSequence_SET_ITEM(r.get(), 0, wrap(std::get<0>(tensors)));
-  PyStructSequence_SET_ITEM(r.get(), 1, wrap(std::get<1>(tensors)));
-  PyStructSequence_SET_ITEM(r.get(), 2, wrap(std::get<2>(tensors)));
-  PyStructSequence_SET_ITEM(r.get(), 3, wrap(std::get<3>(tensors)));
-  return r.release();
-}
-
 inline PyObject* wrap(std::tuple<at::Tensor, at::Tensor, at::Tensor, int64_t> tensors) {
   auto r = THPObjectPtr{PyTuple_New(4)};
   if (!r) throw python_error();
diff --git a/torch/linalg/__init__.py b/torch/linalg/__init__.py
index 0b44cc8..c7a0c07 100644
--- a/torch/linalg/__init__.py
+++ b/torch/linalg/__init__.py
@@ -367,99 +367,6 @@
             [-3.1113,  2.7381]], dtype=torch.float64)
 """)
 
-lstsq = _add_docstr(_linalg.linalg_lstsq, r"""
-torch.linalg.lstsq(input, b, cond=None, *, driver=None)
-    -> (Tensor solution, Tensor residuals, Tensor rank, Tensor singular_values)
-
-Computes the least squares solution to the system with a batch of matrices :math:`a` (represented by :attr:`input`)
-and a batch of vectors or matrices :math:`b` such that
-
-.. math::
-    x = \text{argmin}_x \|ax - b\|_F,
-
-where :math:`a` is of size :math:`(..., m, n)` and :math:`b` is of size :math:`(..., m, k)` or
-:math:`(..., m)`. The batch dimensions of :math:`a` and :math:`b` have to be broadcastable.
-
-The returned solution :math:`solution` is of shape :math:`(..., n, k)` if :math:`b` is of shape :math:`(..., m, k)`,
-and is of shape :math:`(..., n, 1)` if :math:`b` is of shape :math:`(..., m)`.
-The batch sizes of :math:`x` is the broadcasted shape of the batch dimensions of :math:`a` and :math:`b`.
-
-.. note::
-    The case when :math:`m < n` is not supported on CUDA.
-
-Args:
-    input (Tensor): the batch of left-hand side matrices :math:`a`
-        of shape :math:`(..., m, n)` with :math:`m > 0, n > 0`
-    b (Tensor): the batch of righ-hand side vectors or matrices :math:`b`
-        of shape :math:`(..., m)` or :math:`(..., m, k)` with :math:`m > 0, k > 0`
-    cond (float, optional): used to determine the effective rank of :math:`a`
-        for the rank-revealing drivers (see :attr:`driver`).
-        Singular values :math:`s[i] \le cond * s[0]` are treated as zero.
-        If :attr:`cond` is ``None`` or is smaller than zero,
-        the machine precision based on :attr:`input`'s dtype is used.
-        Default: ``None``
-    driver (str, optional): the name of the LAPACK/MAGMA driver that is used
-        to compute the solution.
-        For CPU inputs the valid values are
-        (``'gels'``, ``'gelsy'``, ``'gelsd``, ``'gelss'``, ``None``).
-        For CUDA inputs the valid values are (``'gels'``, ``None``).
-        If ``driver == None``, ``'gelsy'`` is used for CPU inputs and ``'gels'`` for GPU inputs.
-        Default: ``None``
-
-.. note::
-    Driver ``'gels'`` assumes only full-rank inputs, i.e. ``torch.matrix_rank(a) == min(m, n)``.
-    Drivers ``'gelsy'``, ``'gelsd'``, ``'gelss'`` are rank-revealing and hence handle rank-deficient inputs.
-    ``'gelsy'`` uses QR factorization with column pivoting, ``'gelsd'`` and ``'gelss'`` use SVD.
-    ``'gelsy'`` is the fastest among the rank-revealing algorithms that also handles rank-deficient inputs.
-
-.. warning::
-    The default value for :attr:`cond` is subject to a potential change.
-    It is therefore recommended to use some fixed value to avoid potential
-    issues upon the library update.
-
-
-Returns:
-    (Tensor, Tensor, Tensor, Tensor): a namedtuple (solution, residuals, rank, singular_values) containing:
-        - **solution** (*Tensor*): the least squares solution
-        - **residuals** (*Tensor*):  if :math:`m > n` then for full rank matrices in :attr:`input` the tensor encodes
-            the squared residuals of the solutions, that is :math:`||\text{input} @ x - b||_F^2`.
-            If :math:`m \le n`, an empty tensor is returned instead.
-        - **rank** (*Tensor*): the tensor of ranks of the matrix :attr:`input` with shape ``input.shape[:-2]``.
-            Only computed if :attr:`driver` is one of (``'gelsy'``, ``'gelsd'``, ``'gelss'``),
-            an empty tensor is returned otherwise.
-        - **singular_values** (*Tensor*): the tensor of singular values
-            of the matrix :attr:`input` with shape ``input.shape[:-2] + (min(m, n),)``.
-            Only computed if :attr:`driver` is one of (``'gelsd'``, ``'gelss'``),
-            an empty tensor is returend otherwise.
-
-Example::
-
-    >>> a = torch.tensor([[10, 2, 3], [3, 10, 5], [5, 6, 12]], dtype=torch.float)
-    >>> b = torch.tensor([[[2, 5, 1], [3, 2, 1], [5, 1, 9]],
-                          [[4, 2, 9], [2, 0, 3], [2, 5, 3]]], dtype=torch.float)
-    >>> x = torch.linalg.lstsq(a.view(1, 3, 3), b).solution
-    >>> x
-    tensor([[[ 0.0793,  0.5345, -0.1228],
-             [ 0.1125,  0.1458, -0.3517],
-             [ 0.3274, -0.2123,  0.9770]],
-
-            [[ 0.3939,  0.1023,  0.9361],
-             [ 0.1074, -0.2903,  0.1189],
-             [-0.0512,  0.5192, -0.1995]]])
-
-    >>> (x - a.pinverse() @ b).abs().max()
-    tensor(2.0862e-07)
-
-    >>> aa = a.clone().select(1, -1).zero_()
-    >>> xx, rank, _ = torch.linalg.lstsq(aa.view(1, 3, 3), b)
-    >>> rank
-    tensor([2])
-
-    >>> sv = torch.linalg.lstsq(a.view(1, 3, 3), b, driver='gelsd').singular_values
-    >>> (sv - a.svd()[1]).max().abs()
-    tensor(5.7220e-06)
-""")
-
 matrix_rank = _add_docstr(_linalg.linalg_matrix_rank, r"""
 matrix_rank(input, tol=None, hermitian=False, *, out=None) -> Tensor
 
diff --git a/torch/overrides.py b/torch/overrides.py
index 885ef15..46c0ed6 100644
--- a/torch/overrides.py
+++ b/torch/overrides.py
@@ -1028,7 +1028,6 @@
         Tensor.view: lambda self, shape: -1,
         Tensor.view_as: lambda self, other: -1,
         Tensor.zero_: lambda self: -1,
-        torch.linalg.lstsq: lambda self, b, cond=None, driver=None: -1
     }
 
     ret2 = {}
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 37e17ba..156fae0 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -1004,17 +1004,6 @@
             out.append(SampleInput(a, kwargs=dict(upper=True)))
         return out
 
-def sample_inputs_linalg_lstsq(op_info, device, dtype, requires_grad=False):
-    from torch.testing._internal.common_utils import random_well_conditioned_matrix
-    out = []
-    for batch in ((), (3,), (3, 3)):
-        shape = batch + (3, 3)
-        # NOTE: inputs are not marked with `requires_grad` since
-        # linalg_lstsq is not differentiable
-        a = random_well_conditioned_matrix(*shape, dtype=dtype, device=device)
-        b = make_tensor(shape, device, dtype, low=None, high=None)
-        out.append(SampleInput((a, b)))
-    return out
 
 def sample_inputs_linalg_pinv(op_info, device, dtype, requires_grad=False):
     """
@@ -2356,21 +2345,6 @@
            sample_inputs_func=sample_inputs_linalg_solve,
            check_batched_gradgrad=False,
            decorators=[skipCUDAIfNoMagma, skipCUDAIfRocm, skipCPUIfNoLapack]),
-    OpInfo('linalg.lstsq',
-           aten_name='linalg_lstsq',
-           op=torch.linalg.lstsq,
-           dtypes=floating_and_complex_types(),
-           test_inplace_grad=False,
-           supports_tensor_out=False,
-           sample_inputs_func=sample_inputs_linalg_lstsq,
-           check_batched_grad=False,
-           check_batched_gradgrad=False,
-           decorators=[skipCUDAIfNoMagma, skipCPUIfNoLapack],
-           skips=(
-               # skip because `linalg_lstsq` is not differentiable
-               SkipInfo('TestGradients', 'test_fn_grad'),
-               SkipInfo('TestCommon', 'test_variant_consistency_jit'),
-           )),
     OpInfo('linalg.pinv',
            aten_name='linalg_pinv',
            op=torch.linalg.pinv,
diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index 3286087..f1c217b 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -1598,27 +1598,6 @@
             s[i] = 1
     return u.mm(torch.diag(s).to(dtype)).mm(v.transpose(0, 1))
 
-def random_well_conditioned_matrix(*shape, dtype, device, mean=1.0, sigma=0.001):
-    """
-    Returns a random rectangular matrix (batch of matrices)
-    with singular values sampled from a Gaussian with
-    mean `mean` and standard deviation `sigma`.
-    The smaller the `sigma`, the better conditioned
-    the output matrix is.
-    """
-    primitive_dtype = {
-        torch.float: torch.float,
-        torch.double: torch.double,
-        torch.cfloat: torch.float,
-        torch.cdouble: torch.double
-    }
-    x = torch.rand(shape, dtype=dtype, device=device)
-    m = x.size(-2)
-    n = x.size(-1)
-    u, _, v = x.svd()
-    s = (torch.randn(*(shape[:-2] + (min(m, n),)), dtype=primitive_dtype[dtype], device=device) * sigma + mean) \
-        .sort(-1, descending=True).values.to(dtype)
-    return (u * s.unsqueeze(-2)) @ v.transpose(-2, -1).conj()
 
 def random_symmetric_matrix(l, *batches, **kwargs):
     dtype = kwargs.get('dtype', torch.double)