[RenderScript] Add docs for IntrinsicBLAS

   bug: 21902810

(cherry picked from commit ec3a4e64fd36b436e22b7760fb4e4c5db64673fa)

Change-Id: I3c0addaf24afb239ff87d153b6c6ea19b4f8fc72
diff --git a/v8/renderscript/java/src/android/support/v8/renderscript/ScriptIntrinsicBLAS.java b/v8/renderscript/java/src/android/support/v8/renderscript/ScriptIntrinsicBLAS.java
index 1b407b4..e708082 100644
--- a/v8/renderscript/java/src/android/support/v8/renderscript/ScriptIntrinsicBLAS.java
+++ b/v8/renderscript/java/src/android/support/v8/renderscript/ScriptIntrinsicBLAS.java
@@ -21,10 +21,14 @@
 import java.lang.annotation.RetentionPolicy;
 
 /**
- *
- * BLAS
- *
  * @hide
+ * ScriptIntrinsicBLAS class provides high performance RenderScript APIs to BLAS.
+ *
+ * The BLAS (Basic Linear Algebra Subprograms) are routines that provide standard
+ * building blocks for performing basic vector and matrix operations.
+ *
+ * For detailed description of BLAS, please refer to http://www.netlib.org/blas/
+ *
  **/
 public final class ScriptIntrinsicBLAS extends ScriptIntrinsic {
     private Allocation mLUT;
@@ -181,6 +185,10 @@
     private static final int RsBlas_bnnm = 1000;
 
     /**
+     * Create an intrinsic to access BLAS subroutines.
+     *
+     * @param rs The RenderScript context
+     * @return ScriptIntrinsicBLAS
      */
     public static ScriptIntrinsicBLAS create(RenderScript rs) {
         long id;
@@ -288,6 +296,22 @@
             throw new RSRuntimeException("Incorrect vector dimensions for GEMV");
         }
     }
+
+    /**
+     * SGEMV performs one of the matrix-vector operations
+     * y := alpha*A*x + beta*y   or   y := alpha*A**T*x + beta*y
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/db/d58/sgemv_8f.html
+     *
+     * @param TransA The type of transpose applied to matrix A.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F32}.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F32}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param beta The scalar beta.
+     * @param Y The input allocation contains vector y, supported elements type {@link Element#F32}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     */
     public void SGEMV(@Transpose int TransA, float alpha, Allocation A, Allocation X, int incX, float beta, Allocation Y, int incY) {
         validateGEMV(Element.F32(mRS), TransA, A, X, incX, Y, incY);
         int M = A.getType().getY();
@@ -304,6 +328,22 @@
         }
         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_sgemv, TransA, 0, 0, 0, 0, M, N, 0, alpha, aID, xID, beta, yID, incX, incY, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * DGEMV performs one of the matrix-vector operations
+     * y := alpha*A*x + beta*y   or   y := alpha*A**T*x + beta*y
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/dc/da8/dgemv_8f.html
+     *
+     * @param TransA The type of transpose applied to matrix A.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F64}.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F64}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param beta The scalar beta.
+     * @param Y The input allocation contains vector y, supported elements type {@link Element#F64}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     */
     public void DGEMV(@Transpose int TransA, double alpha, Allocation A, Allocation X, int incX, double beta, Allocation Y, int incY) {
         validateGEMV(Element.F64(mRS), TransA, A, X, incX, Y, incY);
         int M = A.getType().getY();
@@ -320,6 +360,22 @@
         }
         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dgemv, TransA, 0, 0, 0, 0, M, N, 0, alpha, aID, xID, beta, yID, incX, incY, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * CGEMV performs one of the matrix-vector operations
+     * y := alpha*A*x + beta*y   or   y := alpha*A**T*x + beta*y   or   y := alpha*A**H*x + beta*y
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d4/d8a/cgemv_8f.html
+     *
+     * @param TransA The type of transpose applied to matrix A.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param beta The scalar beta.
+     * @param Y The input allocation contains vector y, supported elements type {@link Element#F32_2}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     */
     public void CGEMV(@Transpose int TransA, Float2 alpha, Allocation A, Allocation X, int incX, Float2 beta, Allocation Y, int incY) {
         validateGEMV(Element.F32_2(mRS), TransA, A, X, incX, Y, incY);
         int M = A.getType().getY();
@@ -336,6 +392,22 @@
         }
         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_cgemv, TransA, 0, 0, 0, 0, M, N, 0, alpha.x, alpha.y, aID, xID, beta.x, beta.y, yID, incX, incY, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * ZGEMV performs one of the matrix-vector operations
+     * y := alpha*A*x + beta*y   or   y := alpha*A**T*x + beta*y   or   y := alpha*A**H*x + beta*y
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/db/d40/zgemv_8f.html
+     *
+     * @param TransA The type of transpose applied to matrix A.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param beta The scalar beta.
+     * @param Y The input allocation contains vector y, supported elements type {@link Element#F64_2}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     */
     public void ZGEMV(@Transpose int TransA, Double2 alpha, Allocation A, Allocation X, int incX, Double2 beta, Allocation Y, int incY) {
         validateGEMV(Element.F64_2(mRS), TransA, A, X, incX, Y, incY);
         int M = A.getType().getY();
@@ -353,6 +425,30 @@
         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_zgemv, TransA, 0, 0, 0, 0, M, N, 0, alpha.x, alpha.y, aID, xID, beta.x, beta.y, yID, incX, incY, 0, 0, mUseIncSupp);
     }
 
+    /**
+     * SGBMV performs one of the matrix-vector operations
+     * y := alpha*A*x + beta*y   or   y := alpha*A**T*x + beta*y
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d6/d46/sgbmv_8f.html
+     *
+     * Note: For a M*N matrix, the input Allocation should also be of size M*N (dimY = M, dimX = N),
+     *       but only the region M*(KL+KU+1) will be referenced. The following subroutine can is an
+     *       example showing how to convert the original matrix 'a' to row-based band matrix 'b'.
+     *           for i in range(0, m):
+     *              for j in range(max(0, i-kl), min(i+ku+1, n)):
+     *                  b[i, j-i+kl] = a[i, j]
+     *
+     * @param TransA The type of transpose applied to matrix A.
+     * @param KL The number of sub-diagonals of the matrix A.
+     * @param KU The number of super-diagonals of the matrix A.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains the band matrix A, supported elements type {@link Element#F32}.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F32}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param beta The scalar beta.
+     * @param Y The input allocation contains vector y, supported elements type {@link Element#F32}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     */
     public void SGBMV(@Transpose int TransA, int KL, int KU, float alpha, Allocation A, Allocation X, int incX, float beta, Allocation Y, int incY) {
         // GBMV has the same validation requirements as GEMV + KL and KU >= 0
         validateGEMV(Element.F32(mRS), TransA, A, X, incX, Y, incY);
@@ -373,6 +469,31 @@
         }
         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_sgbmv, TransA, 0, 0, 0, 0, M, N, 0, alpha, aID, xID, beta, yID, incX, incY, KL, KU, mUseIncSupp);
     }
+
+    /**
+     * DGBMV performs one of the matrix-vector operations
+     * y := alpha*A*x + beta*y   or   y := alpha*A**T*x + beta*y
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d2/d3f/dgbmv_8f.html
+     *
+     * Note: For a M*N matrix, the input Allocation should also be of size M*N (dimY = M, dimX = N),
+     *       but only the region M*(KL+KU+1) will be referenced. The following subroutine can is an
+     *       example showing how to convert the original matrix 'a' to row-based band matrix 'b'.
+     *           for i in range(0, m):
+     *              for j in range(max(0, i-kl), min(i+ku+1, n)):
+     *                  b[i, j-i+kl] = a[i, j]
+     *
+     * @param TransA The type of transpose applied to matrix A.
+     * @param KL The number of sub-diagonals of the matrix A.
+     * @param KU The number of super-diagonals of the matrix A.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains the band matrix A, supported elements type {@link Element#F64}.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F64}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param beta The scalar beta.
+     * @param Y The input allocation contains vector y, supported elements type {@link Element#F64}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     */
     public void DGBMV(@Transpose int TransA, int KL, int KU, double alpha, Allocation A, Allocation X, int incX, double beta, Allocation Y, int incY) {
         // GBMV has the same validation requirements as GEMV + KL and KU >= 0
         validateGEMV(Element.F64(mRS), TransA, A, X, incX, Y, incY);
@@ -393,6 +514,31 @@
         }
         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dgbmv, TransA, 0, 0, 0, 0, M, N, 0, alpha, aID, xID, beta, yID, incX, incY, KL, KU, mUseIncSupp);
     }
+
+    /**
+     * CGBMV performs one of the matrix-vector operations
+     * y := alpha*A*x + beta*y   or   y := alpha*A**T*x + beta*y   or   y := alpha*A**H*x + beta*y
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d0/d75/cgbmv_8f.html
+     *
+     * Note: For a M*N matrix, the input Allocation should also be of size M*N (dimY = M, dimX = N),
+     *       but only the region M*(KL+KU+1) will be referenced. The following subroutine can is an
+     *       example showing how to convert the original matrix 'a' to row-based band matrix 'b'.
+     *           for i in range(0, m):
+     *              for j in range(max(0, i-kl), min(i+ku+1, n)):
+     *                  b[i, j-i+kl] = a[i, j]
+     *
+     * @param TransA The type of transpose applied to matrix A.
+     * @param KL The number of sub-diagonals of the matrix A.
+     * @param KU The number of super-diagonals of the matrix A.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains the band matrix A, supported elements type {@link Element#F32_2}.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param beta The scalar beta.
+     * @param Y The input allocation contains vector y, supported elements type {@link Element#F32_2}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     */
     public void CGBMV(@Transpose int TransA, int KL, int KU, Float2 alpha, Allocation A, Allocation X, int incX, Float2 beta, Allocation Y, int incY) {
         // GBMV has the same validation requirements as GEMV + KL and KU >= 0
         validateGEMV(Element.F32_2(mRS), TransA, A, X, incX, Y, incY);
@@ -413,6 +559,31 @@
         }
         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_cgbmv, TransA, 0, 0, 0, 0, M, N, 0, alpha.x, alpha.y, aID, xID, beta.x, beta.y, yID, incX, incY, KL, KU, mUseIncSupp);
     }
+
+    /**
+     * ZGBMV performs one of the matrix-vector operations
+     * y := alpha*A*x + beta*y   or   y := alpha*A**T*x + beta*y   or   y := alpha*A**H*x + beta*y
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d9/d46/zgbmv_8f.html
+     *
+     * Note: For a M*N matrix, the input Allocation should also be of size M*N (dimY = M, dimX = N),
+     *       but only the region M*(KL+KU+1) will be referenced. The following subroutine can is an
+     *       example showing how to convert the original matrix 'a' to row-based band matrix 'b'.
+     *           for i in range(0, m):
+     *              for j in range(max(0, i-kl), min(i+ku+1, n)):
+     *                  b[i, j-i+kl] = a[i, j]
+     *
+     * @param TransA The type of transpose applied to matrix A.
+     * @param KL The number of sub-diagonals of the matrix A.
+     * @param KU The number of super-diagonals of the matrix A.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains the band matrix A, supported elements type {@link Element#F64_2}.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param beta The scalar beta.
+     * @param Y The input allocation contains vector y, supported elements type {@link Element#F64_2}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     */
     public void ZGBMV(@Transpose int TransA, int KL, int KU, Double2 alpha, Allocation A, Allocation X, int incX, Double2 beta, Allocation Y, int incY) {
         // GBMV has the same validation requirements as GEMV + KL and KU >= 0
         validateGEMV(Element.F64_2(mRS), TransA, A, X, incX, Y, incY);
@@ -491,6 +662,19 @@
         return N;
     }
 
+    /**
+     * STRMV performs one of the matrix-vector operations
+     * x := A*x   or   x := A**T*x
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/de/d45/strmv_8f.html
+     *
+     * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F32}.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F32}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     */
     public void STRMV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag, Allocation A, Allocation X, int incX) {
         validateTRMV(Element.F32(mRS), Uplo, TransA, Diag, A, X, incX);
         int N = A.getType().getY();
@@ -504,6 +688,20 @@
         }
         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_strmv, TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, aID, xID, 0, 0, incX, 0, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * DTRMV performs one of the matrix-vector operations
+     * x := A*x   or   x := A**T*x
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/dc/d7e/dtrmv_8f.html
+     *
+     * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F64}.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F64}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     */
     public void DTRMV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag, Allocation A, Allocation X, int incX) {
         validateTRMV(Element.F64(mRS), Uplo, TransA, Diag, A, X, incX);
         int N = A.getType().getY();
@@ -517,6 +715,20 @@
         }
         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dtrmv, TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, aID, xID, 0, 0, incX, 0, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * CTRMV performs one of the matrix-vector operations
+     * x := A*x   or   x := A**T*x   or   x := A**H*x
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/df/d78/ctrmv_8f.html
+     *
+     * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     */
     public void CTRMV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag, Allocation A, Allocation X, int incX) {
         validateTRMV(Element.F32_2(mRS), Uplo, TransA, Diag, A, X, incX);
         int N = A.getType().getY();
@@ -530,6 +742,20 @@
         }
         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_ctrmv, TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, 0, aID, xID, 0, 0, 0, incX, 0, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * ZTRMV performs one of the matrix-vector operations
+     * x := A*x   or   x := A**T*x   or   x := A**H*x
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d0/dd1/ztrmv_8f.html
+     *
+     * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     */
     public void ZTRMV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag, Allocation A, Allocation X, int incX) {
         validateTRMV(Element.F64_2(mRS), Uplo, TransA, Diag, A, X, incX);
         int N = A.getType().getY();
@@ -544,6 +770,27 @@
         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_ztrmv, TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, 0, aID, xID, 0, 0, 0, incX, 0, 0, 0, mUseIncSupp);
     }
 
+    /**
+     * STBMV performs one of the matrix-vector operations
+     * x := A*x   or   x := A**T*x
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d6/d7d/stbmv_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should also be of size N*N (dimY = N, dimX = N),
+     *       but only the region N*(K+1) will be referenced. The following subroutine can is an
+     *       example showing how to convert a UPPER trianglar matrix 'a' to row-based band matrix 'b'.
+     *           for i in range(0, n):
+     *              for j in range(i, min(i+k+1, n)):
+     *                  b[i, j-i] = a[i, j]
+     *
+     * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param K The number of off-diagonals of the matrix A
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F32}.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F32}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     */
     public void STBMV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  int K, Allocation A,  Allocation X,  int incX) {
         // TBMV has the same requirements as TRMV + K >= 0
         if (K < 0) {
@@ -561,6 +808,28 @@
         }
         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_stbmv, TransA, 0, 0, Uplo, Diag, 0, N, K, 0, aID, xID, 0, 0, incX, 0, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * DTBMV performs one of the matrix-vector operations
+     * x := A*x   or   x := A**T*x
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/df/d29/dtbmv_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should also be of size N*N (dimY = N, dimX = N),
+     *       but only the region N*(K+1) will be referenced. The following subroutine can is an
+     *       example showing how to convert a UPPER trianglar matrix 'a' to row-based band matrix 'b'.
+     *           for i in range(0, n):
+     *              for j in range(i, min(i+k+1, n)):
+     *                  b[i, j-i] = a[i, j]
+     *
+     * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param K The number of off-diagonals of the matrix A
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F64}.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F64}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     */
     public void DTBMV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  int K, Allocation A,  Allocation X,  int incX) {
         // TBMV has the same requirements as TRMV + K >= 0
         if (K < 0) {
@@ -578,6 +847,28 @@
         }
         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dtbmv, TransA, 0, 0, Uplo, Diag, 0, N, K, 0, aID, xID, 0, 0, incX, 0, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * CTBMV performs one of the matrix-vector operations
+     * x := A*x   or   x := A**T*x   or   x := A**H*x
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d3/dcd/ctbmv_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should also be of size N*N (dimY = N, dimX = N),
+     *       but only the region N*(K+1) will be referenced. The following subroutine can is an
+     *       example showing how to convert a UPPER trianglar matrix 'a' to row-based band matrix 'b'.
+     *           for i in range(0, n):
+     *              for j in range(i, min(i+k+1, n)):
+     *                  b[i, j-i] = a[i, j]
+     *
+     * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param K The number of off-diagonals of the matrix A
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     */
     public void CTBMV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  int K, Allocation A,  Allocation X,  int incX) {
         // TBMV has the same requirements as TRMV + K >= 0
         if (K < 0) {
@@ -595,6 +886,28 @@
         }
         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_ctbmv, TransA, 0, 0, Uplo, Diag, 0, N, K, 0, 0, aID, xID, 0, 0, 0, incX, 0, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * ZTBMV performs one of the matrix-vector operations
+     * x := A*x   or   x := A**T*x   or   x := A**H*x
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d3/d39/ztbmv_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should also be of size N*N (dimY = N, dimX = N),
+     *       but only the region N*(K+1) will be referenced. The following subroutine can is an
+     *       example showing how to convert a UPPER trianglar matrix 'a' to row-based band matrix 'b'.
+     *           for i in range(0, n):
+     *              for j in range(i, min(i+k+1, n)):
+     *                  b[i, j-i] = a[i, j]
+     *
+     * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param K The number of off-diagonals of the matrix A
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     */
     public void ZTBMV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  int K, Allocation A,  Allocation X,  int incX) {
         // TBMV has the same requirements as TRMV + K >= 0
         if (K < 0) {
@@ -612,6 +925,28 @@
         }
         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_ztbmv, TransA, 0, 0, Uplo, Diag, 0, N, K, 0, 0, aID, xID, 0, 0, 0, incX, 0, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * STPMV performs one of the matrix-vector operations
+     * x := A*x   or   x := A**T*x
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/db/db1/stpmv_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
+     *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
+     *       'a' to packed matrix 'b'.
+     *           k = 0
+     *           for i in range(0, n):
+     *              for j in range(i, n):
+     *                  b[k++] = a[i, j]
+     *
+     * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param Ap The input allocation contains packed matrix A, supported elements type {@link Element#F32}.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F32}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     */
     public void STPMV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  Allocation Ap,  Allocation X,  int incX) {
         int N = validateTPMV(Element.F32(mRS), Uplo, TransA, Diag, Ap, X, incX);
 
@@ -624,6 +959,28 @@
         }
         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_stpmv, TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, apID, xID, 0, 0, incX, 0, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * DTPMV performs one of the matrix-vector operations
+     * x := A*x   or   x := A**T*x
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/dc/dcd/dtpmv_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
+     *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
+     *       'a' to packed matrix 'b'.
+     *           k = 0
+     *           for i in range(0, n):
+     *              for j in range(i, n):
+     *                  b[k++] = a[i, j]
+     *
+     * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param Ap The input allocation contains packed matrix A, supported elements type {@link Element#F64}.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F64}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     */
     public void DTPMV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  Allocation Ap,  Allocation X,  int incX) {
         int N = validateTPMV(Element.F64(mRS), Uplo, TransA, Diag, Ap, X, incX);
 
@@ -636,6 +993,28 @@
         }
         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dtpmv, TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, apID, xID, 0, 0, incX, 0, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * CTPMV performs one of the matrix-vector operations
+     * x := A*x   or   x := A**T*x   or   x := A**H*x
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d4/dbb/ctpmv_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
+     *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
+     *       'a' to packed matrix 'b'.
+     *           k = 0
+     *           for i in range(0, n):
+     *              for j in range(i, n):
+     *                  b[k++] = a[i, j]
+     *
+     * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param Ap The input allocation contains packed matrix A, supported elements type {@link Element#F32_2}.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     */
     public void CTPMV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  Allocation Ap,  Allocation X,  int incX) {
         int N = validateTPMV(Element.F32_2(mRS), Uplo, TransA, Diag, Ap, X, incX);
 
@@ -648,6 +1027,28 @@
         }
         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_ctpmv, TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, 0, apID, xID, 0, 0, 0, incX, 0, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * ZTPMV performs one of the matrix-vector operations
+     * x := A*x   or   x := A**T*x   or   x := A**H*x
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d2/d9e/ztpmv_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
+     *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
+     *       'a' to packed matrix 'b'.
+     *           k = 0
+     *           for i in range(0, n):
+     *              for j in range(i, n):
+     *                  b[k++] = a[i, j]
+     *
+     * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param Ap The input allocation contains packed matrix A, supported elements type {@link Element#F64_2}.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     */
     public void ZTPMV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  Allocation Ap,  Allocation X,  int incX) {
         int N = validateTPMV(Element.F64_2(mRS), Uplo, TransA, Diag, Ap, X, incX);
 
@@ -660,6 +1061,20 @@
         }
         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_ztpmv, TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, 0, apID, xID, 0, 0, 0, incX, 0, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * STRSV solves one of the systems of equations
+     * A*x = b   or   A**T*x = b
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d0/d2a/strsv_8f.html
+     *
+     * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F32}.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F32}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     */
     public void STRSV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  Allocation A,  Allocation X,  int incX) {
         // TRSV is the same as TRMV
         validateTRMV(Element.F32(mRS), Uplo, TransA, Diag, A, X, incX);
@@ -675,6 +1090,20 @@
         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_strsv, TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, aID, xID, 0, 0, incX, 0, 0, 0, mUseIncSupp);
 
     }
+
+    /**
+     * DTRSV solves one of the systems of equations
+     * A*x = b   or   A**T*x = b
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d6/d96/dtrsv_8f.html
+     *
+     * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F64}.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F64}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     */
     public void DTRSV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  Allocation A,  Allocation X,  int incX) {
         // TRSV is the same as TRMV
         validateTRMV(Element.F64(mRS), Uplo, TransA, Diag, A, X, incX);
@@ -690,6 +1119,20 @@
         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dtrsv, TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, aID, xID, 0, 0, incX, 0, 0, 0, mUseIncSupp);
 
     }
+
+    /**
+     * CTRSV solves one of the systems of equations
+     * A*x = b   or   A**T*x = b   or   A**H*x = b
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d4/dc8/ctrsv_8f.html
+     *
+     * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     */
     public void CTRSV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  Allocation A,  Allocation X,  int incX) {
         // TRSV is the same as TRMV
         validateTRMV(Element.F32_2(mRS), Uplo, TransA, Diag, A, X, incX);
@@ -705,6 +1148,20 @@
         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_ctrsv, TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, 0, aID, xID, 0, 0, 0, incX, 0, 0, 0, mUseIncSupp);
 
     }
+
+    /**
+     * ZTRSV solves one of the systems of equations
+     * A*x = b   or   A**T*x = b   or   A**H*x = b
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d1/d2f/ztrsv_8f.html
+     *
+     * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     */
     public void ZTRSV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  Allocation A,  Allocation X,  int incX) {
         // TRSV is the same as TRMV
         validateTRMV(Element.F64_2(mRS), Uplo, TransA, Diag, A, X, incX);
@@ -720,6 +1177,28 @@
         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_ztrsv, TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, 0, aID, xID, 0, 0, 0, incX, 0, 0, 0, mUseIncSupp);
 
     }
+
+    /**
+     * STBSV solves one of the systems of equations
+     * A*x = b   or   A**T*x = b
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d0/d1f/stbsv_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should also be of size N*N (dimY = N, dimX = N),
+     *       but only the region N*(K+1) will be referenced. The following subroutine can is an
+     *       example showing how to convert a UPPER trianglar matrix 'a' to row-based band matrix 'b'.
+     *           for i in range(0, n):
+     *              for j in range(i, min(i+k+1, n)):
+     *                  b[i, j-i] = a[i, j]
+     *
+     * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param K The number of off-diagonals of the matrix A
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F32}.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F32}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     */
     public void STBSV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  int K, Allocation A,  Allocation X,  int incX) {
         // TBSV is the same as TRMV + K >= 0
         validateTRMV(Element.F32(mRS), Uplo, TransA, Diag, A, X, incX);
@@ -737,6 +1216,28 @@
         }
         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_stbsv, TransA, 0, 0, Uplo, Diag, 0, N, K, 0, aID, xID, 0, 0, incX, 0, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * DTBSV solves one of the systems of equations
+     * A*x = b   or   A**T*x = b
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d4/dcf/dtbsv_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should also be of size N*N (dimY = N, dimX = N),
+     *       but only the region N*(K+1) will be referenced. The following subroutine can is an
+     *       example showing how to convert a UPPER trianglar matrix 'a' to row-based band matrix 'b'.
+     *           for i in range(0, n):
+     *              for j in range(i, min(i+k+1, n)):
+     *                  b[i, j-i] = a[i, j]
+     *
+     * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param K The number of off-diagonals of the matrix A
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F64}.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F64}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     */
     public void DTBSV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  int K, Allocation A,  Allocation X,  int incX) {
         // TBSV is the same as TRMV + K >= 0
         validateTRMV(Element.F64(mRS), Uplo, TransA, Diag, A, X, incX);
@@ -754,6 +1255,28 @@
         }
         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dtbsv, TransA, 0, 0, Uplo, Diag, 0, N, K, 0, aID, xID, 0, 0, incX, 0, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * CTBSV solves one of the systems of equations
+     * A*x = b   or   A**T*x = b   or   A**H*x = b
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d9/d5f/ctbsv_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should also be of size N*N (dimY = N, dimX = N),
+     *       but only the region N*(K+1) will be referenced. The following subroutine can is an
+     *       example showing how to convert a UPPER trianglar matrix 'a' to row-based band matrix 'b'.
+     *           for i in range(0, n):
+     *              for j in range(i, min(i+k+1, n)):
+     *                  b[i, j-i] = a[i, j]
+     *
+     * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param K The number of off-diagonals of the matrix A
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     */
     public void CTBSV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  int K, Allocation A,  Allocation X,  int incX) {
         // TBSV is the same as TRMV + K >= 0
         validateTRMV(Element.F32_2(mRS), Uplo, TransA, Diag, A, X, incX);
@@ -771,6 +1294,28 @@
         }
         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_ctbsv, TransA, 0, 0, Uplo, Diag, 0, N, K, 0, 0, aID, xID, 0, 0, 0, incX, 0, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * ZTBSV solves one of the systems of equations
+     * A*x = b   or   A**T*x = b   or   A**H*x = b
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d4/d5a/ztbsv_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should also be of size N*N (dimY = N, dimX = N),
+     *       but only the region N*(K+1) will be referenced. The following subroutine can is an
+     *       example showing how to convert a UPPER trianglar matrix 'a' to row-based band matrix 'b'.
+     *           for i in range(0, n):
+     *              for j in range(i, min(i+k+1, n)):
+     *                  b[i, j-i] = a[i, j]
+     *
+     * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param K The number of off-diagonals of the matrix A
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     */
     public void ZTBSV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  int K, Allocation A,  Allocation X,  int incX) {
         // TBSV is the same as TRMV + K >= 0
         validateTRMV(Element.F64_2(mRS), Uplo, TransA, Diag, A, X, incX);
@@ -788,6 +1333,28 @@
         }
         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_ztbsv, TransA, 0, 0, Uplo, Diag, 0, N, K, 0, 0, aID, xID, 0, 0, 0, incX, 0, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * STPSV solves one of the systems of equations
+     * A*x = b   or   A**T*x = b
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d0/d7c/stpsv_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
+     *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
+     *       'a' to packed matrix 'b'.
+     *           k = 0
+     *           for i in range(0, n):
+     *              for j in range(i, n):
+     *                  b[k++] = a[i, j]
+     *
+     * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param Ap The input allocation contains packed matrix A, supported elements type {@link Element#F32}.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F32}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     */
     public void STPSV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  Allocation Ap,  Allocation X,  int incX) {
         // TPSV is same as TPMV
         int N = validateTPMV(Element.F32(mRS), Uplo, TransA, Diag, Ap, X, incX);
@@ -801,6 +1368,28 @@
         }
         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_stpsv, TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, apID, xID, 0, 0, incX, 0, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * DTPSV solves one of the systems of equations
+     * A*x = b   or   A**T*x = b
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d9/d84/dtpsv_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
+     *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
+     *       'a' to packed matrix 'b'.
+     *           k = 0
+     *           for i in range(0, n):
+     *              for j in range(i, n):
+     *                  b[k++] = a[i, j]
+     *
+     * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param Ap The input allocation contains packed matrix A, supported elements type {@link Element#F64}.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F64}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     */
     public void DTPSV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  Allocation Ap,  Allocation X,  int incX) {
         // TPSV is same as TPMV
         int N = validateTPMV(Element.F64(mRS), Uplo, TransA, Diag, Ap, X, incX);
@@ -814,6 +1403,28 @@
         }
         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dtpsv, TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, apID, xID, 0, 0, incX, 0, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * CTPSV solves one of the systems of equations
+     * A*x = b   or   A**T*x = b   or   A**H*x = b
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d8/d56/ctpsv_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
+     *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
+     *       'a' to packed matrix 'b'.
+     *           k = 0
+     *           for i in range(0, n):
+     *              for j in range(i, n):
+     *                  b[k++] = a[i, j]
+     *
+     * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param Ap The input allocation contains packed matrix A, supported elements type {@link Element#F32_2}.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     */
     public void CTPSV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  Allocation Ap,  Allocation X,  int incX) {
         // TPSV is same as TPMV
         int N = validateTPMV(Element.F32_2(mRS), Uplo, TransA, Diag, Ap, X, incX);
@@ -827,6 +1438,28 @@
         }
         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_ctpsv, TransA, 0, 0, Uplo, Diag, 0, N, 0, 0, 0, apID, xID, 0, 0, 0, incX, 0, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * ZTPSV solves one of the systems of equations
+     * A*x = b   or   A**T*x = b   or   A**H*x = b
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/da/d57/ztpsv_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
+     *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
+     *       'a' to packed matrix 'b'.
+     *           k = 0
+     *           for i in range(0, n):
+     *              for j in range(i, n):
+     *                  b[k++] = a[i, j]
+     *
+     * @param Uplo Specifies whether the matrix is an upper or lower triangular matrix.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param Ap The input allocation contains packed matrix A, supported elements type {@link Element#F64_2}.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     */
     public void ZTPSV(@Uplo int Uplo, @Transpose int TransA, @Diag int Diag,  Allocation Ap,  Allocation X,  int incX) {
         // TPSV is same as TPMV
         int N = validateTPMV(Element.F64_2(mRS), Uplo, TransA, Diag, Ap, X, incX);
@@ -1048,6 +1681,21 @@
         return N;
     }
 
+    /**
+     * SSYMV performs the matrix-vector operation
+     * y := alpha*A*x + beta*y
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d2/d94/ssymv_8f.html
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F32}.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F32}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param beta The scalar beta.
+     * @param Y The input allocation contains vector y, supported elements type {@link Element#F32}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     */
     public void SSYMV(@Uplo int Uplo, float alpha, Allocation A, Allocation X, int incX, float beta, Allocation Y, int incY) {
         int N = validateSYMV(Element.F32(mRS), Uplo, A, X, Y, incX, incY);
 
@@ -1062,6 +1710,30 @@
         }
         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_ssymv, 0, 0, 0, Uplo, 0, 0, N, 0, alpha, aID, xID, beta, yID, incX, incY, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * SSBMV performs the matrix-vector operation
+     * y := alpha*A*x + beta*y
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d3/da1/ssbmv_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should also be of size N*N (dimY = N, dimX = N),
+     *       but only the region N*(K+1) will be referenced. The following subroutine can is an
+     *       example showing how to convert a UPPER trianglar matrix 'a' to row-based band matrix 'b'.
+     *           for i in range(0, n):
+     *              for j in range(i, min(i+k+1, n)):
+     *                  b[i, j-i] = a[i, j]
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part of the band matrix A is being supplied.
+     * @param K The number of off-diagonals of the matrix A
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F32}.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F32}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param beta The scalar beta.
+     * @param Y The input allocation contains vector y, supported elements type {@link Element#F32}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     */
     public void SSBMV(@Uplo int Uplo, int K, float alpha, Allocation A, Allocation X, int incX, float beta, Allocation Y, int incY) {
         // SBMV is the same as SYMV + K >= 0
         if (K < 0) {
@@ -1080,6 +1752,30 @@
         }
         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_ssbmv, 0, 0, 0, Uplo, 0, 0, N, K, alpha, aID, xID, beta, yID, incX, incY, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * SSPMV performs the matrix-vector operation
+     * y := alpha*A*x + beta*y
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d8/d68/sspmv_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
+     *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
+     *       'a' to packed matrix 'b'.
+     *           k = 0
+     *           for i in range(0, n):
+     *              for j in range(i, n):
+     *                  b[k++] = a[i, j]
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part of the matrix A is supplied in packed form.
+     * @param alpha The scalar alpha.
+     * @param Ap The input allocation contains matrix A, supported elements type {@link Element#F32}.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F32}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param beta The scalar beta.
+     * @param Y The input allocation contains vector y, supported elements type {@link Element#F32}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     */
     public void SSPMV(@Uplo int Uplo, float alpha, Allocation Ap, Allocation X, int incX, float beta, Allocation Y, int incY) {
         int N = validateSPMV(Element.F32(mRS), Uplo, Ap, X, incX, Y, incY);
 
@@ -1094,6 +1790,20 @@
         }
         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_sspmv, 0, 0, 0, Uplo, 0, 0, N, 0, alpha, apID, xID, beta, yID, incX, incY, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * SGER performs the rank 1 operation
+     * A := alpha*x*y**T + A
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/db/d5c/sger_8f.html
+     *
+     * @param alpha The scalar alpha.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F32}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param Y The input allocation contains vector y, supported elements type {@link Element#F32}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F32}.
+     */
     public void SGER(float alpha, Allocation X, int incX, Allocation Y, int incY, Allocation A) {
         int M = A.getType().getY();
         int N = A.getType().getX();
@@ -1110,6 +1820,19 @@
         }
         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_sger, 0, 0, 0, 0, 0, M, N, 0, alpha, xID, yID, 0.f, aID, incX, incY, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * SSYR performs the rank 1 operation
+     * A := alpha*x*x**T + A
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d6/dac/ssyr_8f.html
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
+     * @param alpha The scalar alpha.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F32}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F32}.
+     */
     public void SSYR(@Uplo int Uplo, float alpha, Allocation X, int incX, Allocation A) {
         int N = validateSYR(Element.F32(mRS), Uplo, X, incX, A);
 
@@ -1122,6 +1845,27 @@
         }
         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_ssyr, 0, 0, 0, Uplo, 0, 0, N, 0, alpha, xID, aID, 0.f, 0, incX, 0, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * SSPR performs the rank 1 operation
+     * A := alpha*x*x**T + A
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d2/d9b/sspr_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
+     *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
+     *       'a' to packed matrix 'b'.
+     *           k = 0
+     *           for i in range(0, n):
+     *              for j in range(i, n):
+     *                  b[k++] = a[i, j]
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part is to be supplied in the packed form.
+     * @param alpha The scalar alpha.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F32}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param Ap The input allocation contains matrix A, supported elements type {@link Element#F32}.
+     */
     public void SSPR(@Uplo int Uplo, float alpha, Allocation X, int incX, Allocation Ap) {
         int N = validateSPR(Element.F32(mRS), Uplo, X, incX, Ap);
 
@@ -1134,6 +1878,21 @@
         }
         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_sspr, 0, 0, 0, Uplo, 0, 0, N, 0, alpha, xID, apID, 0.f, 0, incX, 0, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * SSYR2 performs the symmetric rank 2 operation
+     * A := alpha*x*y**T + alpha*y*x**T + A
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/db/d99/ssyr2_8f.html
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
+     * @param alpha The scalar alpha.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F32}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param Y The input allocation contains vector y, supported elements type {@link Element#F32}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F32}.
+     */
     public void SSYR2(@Uplo int Uplo, float alpha, Allocation X, int incX, Allocation Y, int incY, Allocation A) {
         int N = validateSYR2(Element.F32(mRS), Uplo, X, incX, Y, incY, A);
 
@@ -1148,6 +1907,29 @@
         }
         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_ssyr2, 0, 0, 0, Uplo, 0, 0, N, 0, alpha, xID, yID, 0, aID, incX, incY, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * SSPR2 performs the symmetric rank 2 operation
+     * A := alpha*x*y**T + alpha*y*x**T + A
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/db/d3e/sspr2_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
+     *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
+     *       'a' to packed matrix 'b'.
+     *           k = 0
+     *           for i in range(0, n):
+     *              for j in range(i, n):
+     *                  b[k++] = a[i, j]
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part is to be supplied in the packed form.
+     * @param alpha The scalar alpha.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F32}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param Y The input allocation contains vector y, supported elements type {@link Element#F32}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     * @param Ap The input allocation contains matrix A, supported elements type {@link Element#F32}.
+     */
     public void SSPR2(@Uplo int Uplo, float alpha, Allocation X, int incX, Allocation Y, int incY, Allocation Ap) {
         int N = validateSPR2(Element.F32(mRS), Uplo, X, incX, Y, incY, Ap);
 
@@ -1162,6 +1944,22 @@
         }
         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_sspr2, 0, 0, 0, Uplo, 0, 0, N, 0, alpha, xID, yID, 0, apID, incX, incY, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * DSYMV performs the matrix-vector operation
+     * y := alpha*A*x + beta*y
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d8/dbe/dsymv_8f.html
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F64}.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F64}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param beta The scalar beta.
+     * @param Y The input allocation contains vector y, supported elements type {@link Element#F64}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     */
     public void DSYMV(@Uplo int Uplo, double alpha, Allocation A, Allocation X, int incX, double beta, Allocation Y, int incY) {
         int N = validateSYMV(Element.F64(mRS), Uplo, A, X, Y, incX, incY);
 
@@ -1176,6 +1974,30 @@
         }
         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dsymv, 0, 0, 0, Uplo, 0, 0, N, 0, alpha, aID, xID, beta, yID, incX, incY, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * DSBMV performs the matrix-vector operation
+     * y := alpha*A*x + beta*y
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d8/d1e/dsbmv_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should also be of size N*N (dimY = N, dimX = N),
+     *       but only the region N*(K+1) will be referenced. The following subroutine can is an
+     *       example showing how to convert a UPPER trianglar matrix 'a' to row-based band matrix 'b'.
+     *           for i in range(0, n):
+     *              for j in range(i, min(i+k+1, n)):
+     *                  b[i, j-i] = a[i, j]
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part of the band matrix A is being supplied.
+     * @param K The number of off-diagonals of the matrix A
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F64}.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F64}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param beta The scalar beta.
+     * @param Y The input allocation contains vector y, supported elements type {@link Element#F64}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     */
     public void DSBMV(@Uplo int Uplo, int K, double alpha, Allocation A, Allocation X, int incX, double beta, Allocation Y, int incY) {
         // SBMV is the same as SYMV + K >= 0
         if (K < 0) {
@@ -1194,6 +2016,30 @@
         }
         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dsbmv, 0, 0, 0, Uplo, 0, 0, N, K, alpha, aID, xID, beta, yID, incX, incY, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * DSPMV performs the matrix-vector operation
+     * y := alpha*A*x + beta*y
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d4/d85/dspmv_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
+     *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
+     *       'a' to packed matrix 'b'.
+     *           k = 0
+     *           for i in range(0, n):
+     *              for j in range(i, n):
+     *                  b[k++] = a[i, j]
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part of the matrix A is supplied in packed form.
+     * @param alpha The scalar alpha.
+     * @param Ap The input allocation contains matrix A, supported elements type {@link Element#F64}.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F64}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param beta The scalar beta.
+     * @param Y The input allocation contains vector y, supported elements type {@link Element#F64}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     */
     public void DSPMV(@Uplo int Uplo, double alpha, Allocation Ap, Allocation X, int incX, double beta, Allocation Y, int incY) {
         int N = validateSPMV(Element.F64(mRS), Uplo, Ap, X, incX, Y, incY);
 
@@ -1208,6 +2054,20 @@
         }
         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dspmv, 0, 0, 0, Uplo, 0, 0, N, 0, alpha, apID, xID, beta, yID, incX, incY, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * DGER performs the rank 1 operation
+     * A := alpha*x*y**T + A
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/dc/da8/dger_8f.html
+     *
+     * @param alpha The scalar alpha.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F64}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param Y The input allocation contains vector y, supported elements type {@link Element#F64}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F64}.
+     */
     public void DGER(double alpha, Allocation X, int incX, Allocation Y, int incY, Allocation A) {
         int M = A.getType().getY();
         int N = A.getType().getX();
@@ -1224,6 +2084,19 @@
         }
         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dger, 0, 0, 0, 0, 0, M, N, 0, alpha, xID, yID, 0.f, aID, incX, incY, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * DSYR performs the rank 1 operation
+     * A := alpha*x*x**T + A
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d3/d60/dsyr_8f.html
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
+     * @param alpha The scalar alpha.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F64}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F64}.
+     */
     public void DSYR(@Uplo int Uplo, double alpha, Allocation X, int incX, Allocation A) {
         int N = validateSYR(Element.F64(mRS), Uplo, X, incX, A);
 
@@ -1236,6 +2109,27 @@
         }
         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dsyr, 0, 0, 0, Uplo, 0, 0, N, 0, alpha, xID, aID, 0.f, 0, incX, 0, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * DSPR performs the rank 1 operation
+     * A := alpha*x*x**T + A
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/dd/dba/dspr_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
+     *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
+     *       'a' to packed matrix 'b'.
+     *           k = 0
+     *           for i in range(0, n):
+     *              for j in range(i, n):
+     *                  b[k++] = a[i, j]
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part is to be supplied in the packed form.
+     * @param alpha The scalar alpha.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F64}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param Ap The input allocation contains matrix A, supported elements type {@link Element#F64}.
+     */
     public void DSPR(@Uplo int Uplo, double alpha, Allocation X, int incX, Allocation Ap) {
         int N = validateSPR(Element.F64(mRS), Uplo, X, incX, Ap);
 
@@ -1248,6 +2142,21 @@
         }
         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dspr, 0, 0, 0, Uplo, 0, 0, N, 0, alpha, xID, apID, 0.f, 0, incX, 0, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * DSYR2 performs the symmetric rank 2 operation
+     * A := alpha*x*y**T + alpha*y*x**T + A
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/de/d41/dsyr2_8f.html
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
+     * @param alpha The scalar alpha.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F64}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param Y The input allocation contains vector y, supported elements type {@link Element#F64}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F64}.
+     */
     public void DSYR2(@Uplo int Uplo, double alpha, Allocation X, int incX, Allocation Y, int incY, Allocation A) {
         int N = validateSYR2(Element.F64(mRS), Uplo, X, incX, Y, incY, A);
 
@@ -1262,6 +2171,29 @@
         }
         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dsyr2, 0, 0, 0, Uplo, 0, 0, N, 0, alpha, xID, yID, 0, aID, incX, incY, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * DSPR2 performs the symmetric rank 2 operation
+     * A := alpha*x*y**T + alpha*y*x**T + A
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/dd/d9e/dspr2_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
+     *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
+     *       'a' to packed matrix 'b'.
+     *           k = 0
+     *           for i in range(0, n):
+     *              for j in range(i, n):
+     *                  b[k++] = a[i, j]
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part is to be supplied in the packed form.
+     * @param alpha The scalar alpha.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F64}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param Y The input allocation contains vector y, supported elements type {@link Element#F64}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     * @param Ap The input allocation contains matrix A, supported elements type {@link Element#F64}.
+     */
     public void DSPR2(@Uplo int Uplo, double alpha, Allocation X, int incX, Allocation Y, int incY, Allocation Ap) {
         int N = validateSPR2(Element.F64(mRS), Uplo, X, incX, Y, incY, Ap);
 
@@ -1308,6 +2240,21 @@
 
     }
 
+    /**
+     * CHEMV performs the matrix-vector operation
+     * y := alpha*A*x + beta*y
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d7/d51/chemv_8f.html
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param beta The scalar beta.
+     * @param Y The input allocation contains vector y, supported elements type {@link Element#F32_2}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     */
     public void CHEMV(@Uplo int Uplo, Float2 alpha, Allocation A, Allocation X, int incX, Float2 beta, Allocation Y, int incY) {
         // HEMV is the same as SYR2 validation-wise
         int N = validateSYR2(Element.F32_2(mRS), Uplo, X, incX, Y, incY, A);
@@ -1323,6 +2270,30 @@
         }
         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_chemv, 0, 0, 0, Uplo, 0, 0, N, 0, alpha.x, alpha.y, aID, xID, beta.x, beta.y, yID, incX, incY, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * CHBMV performs the matrix-vector operation
+     * y := alpha*A*x + beta*y
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/db/dc2/chbmv_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should also be of size N*N (dimY = N, dimX = N),
+     *       but only the region N*(K+1) will be referenced. The following subroutine can is an
+     *       example showing how to convert a UPPER trianglar matrix 'a' to row-based band matrix 'b'.
+     *           for i in range(0, n):
+     *              for j in range(i, min(i+k+1, n)):
+     *                  b[i, j-i] = a[i, j]
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part of the band matrix A is being supplied.
+     * @param K The number of off-diagonals of the matrix A
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param beta The scalar beta.
+     * @param Y The input allocation contains vector y, supported elements type {@link Element#F32_2}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     */
     public void CHBMV(@Uplo int Uplo, int K, Float2 alpha, Allocation A, Allocation X, int incX, Float2 beta, Allocation Y, int incY) {
         // HBMV is the same as SYR2 validation-wise
         int N = validateSYR2(Element.F32_2(mRS), Uplo, X, incX, Y, incY, A);
@@ -1341,6 +2312,30 @@
         }
         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_chbmv, 0, 0, 0, Uplo, 0, 0, N, K, alpha.x, alpha.y, aID, xID, beta.x, beta.y, yID, incX, incY, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * CHPMV performs the matrix-vector operation
+     * y := alpha*A*x + beta*y
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d2/d06/chpmv_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
+     *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
+     *       'a' to packed matrix 'b'.
+     *           k = 0
+     *           for i in range(0, n):
+     *              for j in range(i, n):
+     *                  b[k++] = a[i, j]
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part of the matrix A is supplied in packed form.
+     * @param alpha The scalar alpha.
+     * @param Ap The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param beta The scalar beta.
+     * @param Y The input allocation contains vector y, supported elements type {@link Element#F32_2}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     */
     public void CHPMV(@Uplo int Uplo, Float2 alpha, Allocation Ap, Allocation X, int incX, Float2 beta, Allocation Y, int incY) {
         // HPMV is the same as SPR2
         int N = validateSPR2(Element.F32_2(mRS), Uplo, X, incX, Y, incY, Ap);
@@ -1356,6 +2351,20 @@
         }
         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_chpmv, 0, 0, 0, Uplo, 0, 0, N, 0, alpha.x, alpha.y, apID, xID, beta.x, beta.y, yID, incX, incY, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * CGERU performs the rank 1 operation
+     * A := alpha*x*y**T + A
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/db/d5f/cgeru_8f.html
+     *
+     * @param alpha The scalar alpha.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param Y The input allocation contains vector y, supported elements type {@link Element#F32_2}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
+     */
     public void CGERU(Float2 alpha, Allocation X, int incX, Allocation Y, int incY, Allocation A) {
         validateGERU(Element.F32_2(mRS), X, incX, Y, incY, A);
         int M = A.getType().getY();
@@ -1372,6 +2381,20 @@
         }
         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_cgeru, 0, 0, 0, 0, 0, M, N, 0, alpha.x, alpha.y, xID, yID, 0, 0, aID, incX, incY, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * CGERC performs the rank 1 operation
+     * A := alpha*x*y**H + A
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/dd/d84/cgerc_8f.html
+     *
+     * @param alpha The scalar alpha.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param Y The input allocation contains vector y, supported elements type {@link Element#F32_2}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
+     */
     public void CGERC(Float2 alpha, Allocation X, int incX, Allocation Y, int incY, Allocation A) {
         // same as GERU
         validateGERU(Element.F32_2(mRS), X, incX, Y, incY, A);
@@ -1389,6 +2412,19 @@
         }
         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_cgerc, 0, 0, 0, 0, 0, M, N, 0, alpha.x, alpha.y, xID, yID, 0, 0, aID, incX, incY, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * CHER performs the rank 1 operation
+     * A := alpha*x*x**H + A
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d3/d6d/cher_8f.html
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
+     * @param alpha The scalar alpha.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
+     */
     public void CHER(@Uplo int Uplo, float alpha, Allocation X, int incX, Allocation A) {
         // same as SYR
         int N = validateSYR(Element.F32_2(mRS), Uplo, X, incX, A);
@@ -1402,6 +2438,27 @@
         }
         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_cher, 0, 0, 0, Uplo, 0, 0, N, 0, alpha, 0, xID, 0, 0, 0, aID, incX, 0, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * CHPR performs the rank 1 operation
+     * A := alpha*x*x**H + A
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/db/dcd/chpr_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
+     *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
+     *       'a' to packed matrix 'b'.
+     *           k = 0
+     *           for i in range(0, n):
+     *              for j in range(i, n):
+     *                  b[k++] = a[i, j]
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part is to be supplied in the packed form.
+     * @param alpha The scalar alpha.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param Ap The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
+     */
     public void CHPR(@Uplo int Uplo, float alpha, Allocation X, int incX, Allocation Ap) {
         // equivalent to SPR for validation
         int N = validateSPR(Element.F32_2(mRS), Uplo, X, incX, Ap);
@@ -1415,6 +2472,21 @@
         }
         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_chpr, 0, 0, 0, Uplo, 0, 0, N, 0, alpha, 0, xID, 0, 0, 0, apID, incX, 0, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * CHER2 performs the symmetric rank 2 operation
+     * A := alpha*x*y**H + alpha*y*x**H + A
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/db/d87/cher2_8f.html
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
+     * @param alpha The scalar alpha.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param Y The input allocation contains vector y, supported elements type {@link Element#F32_2}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
+     */
     public void CHER2(@Uplo int Uplo, Float2 alpha, Allocation X, int incX, Allocation Y, int incY, Allocation A) {
         // same as SYR2
         int N = validateSYR2(Element.F32_2(mRS), Uplo, X, incX, Y, incY, A);
@@ -1430,6 +2502,29 @@
         }
         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_cher2, 0, 0, 0, Uplo, 0, 0, N, 0, alpha.x, alpha.y, xID, yID, 0, 0, aID, incX, incY, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * CHPR2 performs the symmetric rank 2 operation
+     * A := alpha*x*y**H + alpha*y*x**H + A
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d6/d44/chpr2_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
+     *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
+     *       'a' to packed matrix 'b'.
+     *           k = 0
+     *           for i in range(0, n):
+     *              for j in range(i, n):
+     *                  b[k++] = a[i, j]
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part is to be supplied in the packed form.
+     * @param alpha The scalar alpha.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F32_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param Y The input allocation contains vector y, supported elements type {@link Element#F32_2}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     * @param Ap The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
+     */
     public void CHPR2(@Uplo int Uplo, Float2 alpha, Allocation X, int incX, Allocation Y, int incY, Allocation Ap) {
         // same as SPR2
         int N = validateSPR2(Element.F32_2(mRS), Uplo, X, incX, Y, incY, Ap);
@@ -1445,6 +2540,22 @@
         }
         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_chpr2, 0, 0, 0, Uplo, 0, 0, N, 0, alpha.x, alpha.y, xID, yID, 0, 0, apID, incX, incY, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * ZHEMV performs the matrix-vector operation
+     * y := alpha*A*x + beta*y
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d0/ddd/zhemv_8f.html
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param beta The scalar beta.
+     * @param Y The input allocation contains vector y, supported elements type {@link Element#F64_2}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     */
     public void ZHEMV(@Uplo int Uplo, Double2 alpha, Allocation A, Allocation X, int incX, Double2 beta, Allocation Y, int incY) {
         // HEMV is the same as SYR2 validation-wise
         int N = validateSYR2(Element.F64_2(mRS), Uplo, X, incX, Y, incY, A);
@@ -1460,6 +2571,30 @@
         }
         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_zhemv, 0, 0, 0, Uplo, 0, 0, N, 0, alpha.x, alpha.y, aID, xID, beta.x, beta.y, yID, incX, incY, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * ZHBMV performs the matrix-vector operation
+     * y := alpha*A*x + beta*y
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d3/d1a/zhbmv_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should also be of size N*N (dimY = N, dimX = N),
+     *       but only the region N*(K+1) will be referenced. The following subroutine can is an
+     *       example showing how to convert a UPPER trianglar matrix 'a' to row-based band matrix 'b'.
+     *           for i in range(0, n):
+     *              for j in range(i, min(i+k+1, n)):
+     *                  b[i, j-i] = a[i, j]
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part of the band matrix A is being supplied.
+     * @param K The number of off-diagonals of the matrix A
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param beta The scalar beta.
+     * @param Y The input allocation contains vector y, supported elements type {@link Element#F64_2}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     */
     public void ZHBMV(@Uplo int Uplo, int K, Double2 alpha, Allocation A, Allocation X, int incX, Double2 beta, Allocation Y, int incY) {
         // HBMV is the same as SYR2 validation-wise
         int N = validateSYR2(Element.F64_2(mRS), Uplo, X, incX, Y, incY, A);
@@ -1478,6 +2613,30 @@
         }
         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_zhbmv, 0, 0, 0, Uplo, 0, 0, N, K, alpha.x, alpha.y, aID, xID, beta.x, beta.y, yID, incX, incY, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * ZHPMV performs the matrix-vector operation
+     * y := alpha*A*x + beta*y
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d0/d60/zhpmv_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
+     *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
+     *       'a' to packed matrix 'b'.
+     *           k = 0
+     *           for i in range(0, n):
+     *              for j in range(i, n):
+     *                  b[k++] = a[i, j]
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part of the matrix A is supplied in packed form.
+     * @param alpha The scalar alpha.
+     * @param Ap The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param beta The scalar beta.
+     * @param Y The input allocation contains vector y, supported elements type {@link Element#F64_2}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     */
     public void ZHPMV(@Uplo int Uplo, Double2 alpha, Allocation Ap, Allocation X, int incX, Double2 beta, Allocation Y, int incY) {
         // HPMV is the same as SPR2
         int N = validateSPR2(Element.F64_2(mRS), Uplo, X, incX, Y, incY, Ap);
@@ -1493,6 +2652,20 @@
         }
         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_zhpmv, 0, 0, 0, Uplo, 0, 0, N, 0, alpha.x, alpha.y, apID, xID, beta.x, beta.y, yID, incX, incY, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * ZGERU performs the rank 1 operation
+     * A := alpha*x*y**T + A
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d7/d12/zgeru_8f.html
+     *
+     * @param alpha The scalar alpha.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param Y The input allocation contains vector y, supported elements type {@link Element#F64_2}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
+     */
     public void ZGERU(Double2 alpha, Allocation X, int incX, Allocation Y, int incY, Allocation A) {
         validateGERU(Element.F64_2(mRS), X, incX, Y, incY, A);
         int M = A.getType().getY();
@@ -1509,6 +2682,20 @@
         }
         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_zgeru, 0, 0, 0, 0, 0, M, N, 0, alpha.x, alpha.y, xID, yID, 0, 0, aID, incX, incY, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * ZGERC performs the rank 1 operation
+     * A := alpha*x*y**H + A
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d3/dad/zgerc_8f.html
+     *
+     * @param alpha The scalar alpha.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param Y The input allocation contains vector y, supported elements type {@link Element#F64_2}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
+     */
     public void ZGERC(Double2 alpha, Allocation X, int incX, Allocation Y, int incY, Allocation A) {
         // same as GERU
         validateGERU(Element.F64_2(mRS), X, incX, Y, incY, A);
@@ -1526,6 +2713,19 @@
         }
         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_zgerc, 0, 0, 0, 0, 0, M, N, 0, alpha.x, alpha.y, xID, yID, 0, 0, aID, incX, incY, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * ZHER performs the rank 1 operation
+     * A := alpha*x*x**H + A
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/de/d0e/zher_8f.html
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
+     * @param alpha The scalar alpha.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
+     */
     public void ZHER(@Uplo int Uplo, double alpha, Allocation X, int incX, Allocation A) {
         // same as SYR
         int N = validateSYR(Element.F64_2(mRS), Uplo, X, incX, A);
@@ -1539,6 +2739,27 @@
         }
         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_zher, 0, 0, 0, Uplo, 0, 0, N, 0, alpha, 0, xID, 0, 0, 0, aID, incX, 0, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * ZHPR performs the rank 1 operation
+     * A := alpha*x*x**H + A
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/de/de1/zhpr_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
+     *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
+     *       'a' to packed matrix 'b'.
+     *           k = 0
+     *           for i in range(0, n):
+     *              for j in range(i, n):
+     *                  b[k++] = a[i, j]
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part is to be supplied in the packed form.
+     * @param alpha The scalar alpha.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param Ap The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
+     */
     public void ZHPR(@Uplo int Uplo, double alpha, Allocation X, int incX, Allocation Ap) {
         // equivalent to SPR for validation
         int N = validateSPR(Element.F64_2(mRS), Uplo, X, incX, Ap);
@@ -1552,6 +2773,21 @@
         }
         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_zhpr, 0, 0, 0, Uplo, 0, 0, N, 0, alpha, 0, xID, 0, 0, 0, apID, incX, 0, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * ZHER2 performs the symmetric rank 2 operation
+     * A := alpha*x*y**H + alpha*y*x**H + A
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/da/d8a/zher2_8f.html
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
+     * @param alpha The scalar alpha.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param Y The input allocation contains vector y, supported elements type {@link Element#F64_2}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
+     */
     public void ZHER2(@Uplo int Uplo, Double2 alpha, Allocation X, int incX, Allocation Y, int incY, Allocation A) {
         // same as SYR2
         int N = validateSYR2(Element.F64_2(mRS), Uplo, X, incX, Y, incY, A);
@@ -1567,6 +2803,29 @@
         }
         mRS.nScriptIntrinsicBLAS_Z(getID(mRS), RsBlas_zher2, 0, 0, 0, Uplo, 0, 0, N, 0, alpha.x, alpha.y, xID, yID, 0, 0, aID, incX, incY, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * ZHPR2 performs the symmetric rank 2 operation
+     * A := alpha*x*y**H + alpha*y*x**H + A
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d5/d52/zhpr2_8f.html
+     *
+     * Note: For a N*N matrix, the input Allocation should be a 1D allocation of size dimX = N*(N+1)/2,
+     *       The following subroutine can is an example showing how to convert a UPPER trianglar matrix
+     *       'a' to packed matrix 'b'.
+     *           k = 0
+     *           for i in range(0, n):
+     *              for j in range(i, n):
+     *                  b[k++] = a[i, j]
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part is to be supplied in the packed form.
+     * @param alpha The scalar alpha.
+     * @param X The input allocation contains vector x, supported elements type {@link Element#F64_2}.
+     * @param incX The increment for the elements of vector x, must be larger than zero.
+     * @param Y The input allocation contains vector y, supported elements type {@link Element#F64_2}.
+     * @param incY The increment for the elements of vector y, must be larger than zero.
+     * @param Ap The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
+     */
     public void ZHPR2(@Uplo int Uplo, Double2 alpha, Allocation X, int incX, Allocation Y, int incY, Allocation Ap) {
         // same as SPR2
         int N = validateSPR2(Element.F64_2(mRS), Uplo, X, incX, Y, incY, Ap);
@@ -1655,6 +2914,20 @@
 
     }
 
+    /**
+     * SGEMM performs one of the matrix-matrix operations
+     * C := alpha*op(A)*op(B) + beta*C   where op(X) is one of op(X) = X  or  op(X) = X**T
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d4/de2/sgemm_8f.html
+     *
+     * @param TransA The type of transpose applied to matrix A.
+     * @param TransB The type of transpose applied to matrix B.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F32}.
+     * @param B The input allocation contains matrix B, supported elements type {@link Element#F32}.
+     * @param beta The scalar beta.
+     * @param C The input allocation contains matrix C, supported elements type {@link Element#F32}.
+     */
     public void SGEMM(@Transpose int TransA, @Transpose int TransB, float alpha, Allocation A,
                       Allocation B, float beta, Allocation C) {
         validateTranspose(TransA);
@@ -1687,6 +2960,21 @@
         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_sgemm, TransA, TransB, 0, 0, 0, M, N, K,  alpha, aID, bID,
                                         beta, cID, 0, 0, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * DGEMM performs one of the matrix-matrix operations
+     * C := alpha*op(A)*op(B) + beta*C   where op(X) is one of op(X) = X  or  op(X) = X**T
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d7/d2b/dgemm_8f.html
+     *
+     * @param TransA The type of transpose applied to matrix A.
+     * @param TransB The type of transpose applied to matrix B.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F64}.
+     * @param B The input allocation contains matrix B, supported elements type {@link Element#F64}.
+     * @param beta The scalar beta.
+     * @param C The input allocation contains matrix C, supported elements type {@link Element#F64}.
+     */
     public void DGEMM(@Transpose int TransA, @Transpose int TransB, double alpha, Allocation A,
                       Allocation B, double beta, Allocation C) {
         validateTranspose(TransA);
@@ -1718,6 +3006,21 @@
         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dgemm, TransA, TransB, 0, 0, 0, M, N, K,  alpha, aID, bID,
                                         beta, cID, 0, 0, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * CGEMM performs one of the matrix-matrix operations
+     * C := alpha*op(A)*op(B) + beta*C   where op(X) is one of op(X) = X  or  op(X) = X**T  or  op(X) = X**H
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d6/d5b/cgemm_8f.html
+     *
+     * @param TransA The type of transpose applied to matrix A.
+     * @param TransB The type of transpose applied to matrix B.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
+     * @param B The input allocation contains matrix B, supported elements type {@link Element#F32_2}.
+     * @param beta The scalar beta.
+     * @param C The input allocation contains matrix C, supported elements type {@link Element#F32_2}.
+     */
     public void CGEMM(@Transpose int TransA, @Transpose int TransB, Float2 alpha, Allocation A,
                       Allocation B, Float2 beta, Allocation C) {
         validateTranspose(TransA);
@@ -1750,6 +3053,20 @@
                                          beta.x, beta.y, cID, 0, 0, 0, 0, mUseIncSupp);
     }
 
+    /**
+     * ZGEMM performs one of the matrix-matrix operations
+     * C := alpha*op(A)*op(B) + beta*C   where op(X) is one of op(X) = X  or  op(X) = X**T  or  op(X) = X**H
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d7/d76/zgemm_8f.html
+     *
+     * @param TransA The type of transpose applied to matrix A.
+     * @param TransB The type of transpose applied to matrix B.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2
+     * @param B The input allocation contains matrix B, supported elements type {@link Element#F64_2
+     * @param beta The scalar beta.
+     * @param C The input allocation contains matrix C, supported elements type {@link Element#F64_2
+     */
     public void ZGEMM(@Transpose int TransA, @Transpose int TransB, Double2 alpha, Allocation A,
                       Allocation B, Double2 beta, Allocation C) {
         validateTranspose(TransA);
@@ -1782,6 +3099,20 @@
                                    beta.x, beta.y, cID, 0, 0, 0, 0, mUseIncSupp);
     }
 
+    /**
+     * SSYMM performs one of the matrix-matrix operations
+     * C := alpha*A*B + beta*C   or   C := alpha*B*A + beta*C
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d7/d42/ssymm_8f.html
+     *
+     * @param Side Specifies whether the symmetric matrix A appears on the left or right.
+     * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F32}.
+     * @param B The input allocation contains matrix B, supported elements type {@link Element#F32}.
+     * @param beta The scalar beta.
+     * @param C The input allocation contains matrix C, supported elements type {@link Element#F32}.
+     */
     public void SSYMM(@Side int Side, @Uplo int Uplo, float alpha, Allocation A,
                       Allocation B, float beta, Allocation C) {
         validateSide(Side);
@@ -1804,6 +3135,21 @@
         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_ssymm, 0, 0, Side, Uplo, 0, C.getType().getY(), C.getType().getX(), 0, alpha, aID, bID,
                                         beta, cID, 0, 0, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * DSYMM performs one of the matrix-matrix operations
+     * C := alpha*A*B + beta*C   or   C := alpha*B*A + beta*C
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d8/db0/dsymm_8f.html
+     *
+     * @param Side Specifies whether the symmetric matrix A appears on the left or right.
+     * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F64}.
+     * @param B The input allocation contains matrix B, supported elements type {@link Element#F64}.
+     * @param beta The scalar beta.
+     * @param C The input allocation contains matrix C, supported elements type {@link Element#F64}.
+     */
     public void DSYMM(@Side int Side, @Uplo int Uplo, double alpha, Allocation A,
                       Allocation B, double beta, Allocation C) {
         validateSide(Side);
@@ -1825,6 +3171,21 @@
         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dsymm, 0, 0, Side, Uplo, 0, C.getType().getY(), C.getType().getX(), 0, alpha, aID, bID,
                                         beta, cID, 0, 0, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * CSYMM performs one of the matrix-matrix operations
+     * C := alpha*A*B + beta*C   or   C := alpha*B*A + beta*C
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/db/d59/csymm_8f.html
+     *
+     * @param Side Specifies whether the symmetric matrix A appears on the left or right.
+     * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
+     * @param B The input allocation contains matrix B, supported elements type {@link Element#F32_2}.
+     * @param beta The scalar beta.
+     * @param C The input allocation contains matrix C, supported elements type {@link Element#F32_2}.
+     */
     public void CSYMM(@Side int Side, @Uplo int Uplo, Float2 alpha, Allocation A,
                       Allocation B, Float2 beta, Allocation C) {
         validateSide(Side);
@@ -1846,6 +3207,21 @@
         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_csymm, 0, 0, Side, Uplo, 0, C.getType().getY(), C.getType().getX(), 0, alpha.x, alpha.y, aID, bID,
                                          beta.x, beta.y, cID, 0, 0, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * ZSYMM performs one of the matrix-matrix operations
+     * C := alpha*A*B + beta*C   or   C := alpha*B*A + beta*C
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/df/d51/zsymm_8f.html
+     *
+     * @param Side Specifies whether the symmetric matrix A appears on the left or right.
+     * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
+     * @param B The input allocation contains matrix B, supported elements type {@link Element#F64_2}.
+     * @param beta The scalar beta.
+     * @param C The input allocation contains matrix C, supported elements type {@link Element#F64_2}.
+     */
     public void ZSYMM(@Side int Side, @Uplo int Uplo, Double2 alpha, Allocation A,
                       Allocation B, Double2 beta, Allocation C) {
         validateSide(Side);
@@ -1868,6 +3244,19 @@
                                    beta.x, beta.y, cID, 0, 0, 0, 0, mUseIncSupp);
     }
 
+    /**
+     * SSYRK performs one of the symmetric rank k operations
+     * C := alpha*A*A**T + beta*C   or   C := alpha*A**T*A + beta*C
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d0/d40/ssyrk_8f.html
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part of C is to be referenced.
+     * @param Trans The type of transpose applied to the operation.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F32}.
+     * @param beta The scalar beta.
+     * @param C The input allocation contains matrix C, supported elements type {@link Element#F32}.
+     */
     public void SSYRK(@Uplo int Uplo, @Transpose int Trans, float alpha, Allocation A, float beta, Allocation C) {
         validateTranspose(Trans);
         validateUplo(Uplo);
@@ -1889,6 +3278,19 @@
         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_ssyrk, Trans, 0, 0, Uplo, 0, 0, C.getType().getX(), K, alpha, aID, 0, beta, cID, 0, 0, 0, 0, mUseIncSupp);
     }
 
+    /**
+     * DSYRK performs one of the symmetric rank k operations
+     * C := alpha*A*A**T + beta*C   or   C := alpha*A**T*A + beta*C
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/dc/d05/dsyrk_8f.html
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part of C is to be referenced.
+     * @param Trans The type of transpose applied to the operation.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F64}.
+     * @param beta The scalar beta.
+     * @param C The input allocation contains matrix C, supported elements type {@link Element#F64}.
+     */
     public void DSYRK(@Uplo int Uplo, @Transpose int Trans, double alpha, Allocation A, double beta, Allocation C) {
         validateTranspose(Trans);
         validateUplo(Uplo);
@@ -1909,6 +3311,20 @@
         }
         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dsyrk, Trans, 0, 0, Uplo, 0, 0, C.getType().getX(), K, alpha, aID, 0, beta, cID, 0, 0, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * CSYRK performs one of the symmetric rank k operations
+     * C := alpha*A*A**T + beta*C   or   C := alpha*A**T*A + beta*C
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d3/d6a/csyrk_8f.html
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part of C is to be referenced.
+     * @param Trans The type of transpose applied to the operation.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
+     * @param beta The scalar beta.
+     * @param C The input allocation contains matrix C, supported elements type {@link Element#F32_2}.
+     */
     public void CSYRK(@Uplo int Uplo, @Transpose int Trans, Float2 alpha, Allocation A, Float2 beta, Allocation C) {
         validateTranspose(Trans);
         validateUplo(Uplo);
@@ -1930,6 +3346,20 @@
         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_csyrk, Trans, 0, 0, Uplo, 0, 0, C.getType().getX(), K, alpha.x, alpha.y, aID, 0, beta.x, beta.y,
                                          C.getID(mRS), 0, 0, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * ZSYRK performs one of the symmetric rank k operations
+     * C := alpha*A*A**T + beta*C   or   C := alpha*A**T*A + beta*C
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/de/d54/zsyrk_8f.html
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part of C is to be referenced.
+     * @param Trans The type of transpose applied to the operation.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
+     * @param beta The scalar beta.
+     * @param C The input allocation contains matrix C, supported elements type {@link Element#F64_2}.
+     */
     public void ZSYRK(@Uplo int Uplo, @Transpose int Trans, Double2 alpha, Allocation A, Double2 beta, Allocation C) {
         validateTranspose(Trans);
         validateUplo(Uplo);
@@ -1977,6 +3407,21 @@
             throw new RSRuntimeException("Invalid A and B in SYR2K");
         }
     }
+
+    /**
+     * SSYR2K performs one of the symmetric rank 2k operations
+     * C := alpha*A*B**T + alpha*B*A**T + beta*C   or   C := alpha*A**T*B + alpha*B**T*A + beta*C
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/df/d3d/ssyr2k_8f.html
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part of C is to be referenced.
+     * @param Trans The type of transpose applied to the operation.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F32}.
+     * @param B The input allocation contains matrix B, supported elements type {@link Element#F32}.
+     * @param beta The scalar beta.
+     * @param C The input allocation contains matrix C, supported elements type {@link Element#F32}.
+     */
     public void SSYR2K(@Uplo int Uplo, @Transpose int Trans, float alpha, Allocation A, Allocation B, float beta, Allocation C) {
         validateUplo(Uplo);
         validateSYR2K(Element.F32(mRS), Trans, A, B, C);
@@ -1998,6 +3443,21 @@
         }
         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_ssyr2k, Trans, 0, 0, Uplo, 0, 0, C.getType().getX(), K, alpha, aID, bID, beta, cID, 0, 0, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * DSYR2K performs one of the symmetric rank 2k operations
+     * C := alpha*A*B**T + alpha*B*A**T + beta*C   or   C := alpha*A**T*B + alpha*B**T*A + beta*C
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d1/dec/dsyr2k_8f.html
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part of C is to be referenced.
+     * @param Trans The type of transpose applied to the operation.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F64}.
+     * @param B The input allocation contains matrix B, supported elements type {@link Element#F64}.
+     * @param beta The scalar beta.
+     * @param C The input allocation contains matrix C, supported elements type {@link Element#F64}.
+     */
     public void DSYR2K(@Uplo int Uplo, @Transpose int Trans, double alpha, Allocation A, Allocation B, double beta, Allocation C) {
         validateUplo(Uplo);
         validateSYR2K(Element.F64(mRS), Trans, A, B, C);
@@ -2019,6 +3479,21 @@
         }
         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dsyr2k, Trans, 0, 0, Uplo, 0, 0, C.getType().getX(), K, alpha, aID, bID, beta, cID, 0, 0, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * CSYR2K performs one of the symmetric rank 2k operations
+     * C := alpha*A*B**T + alpha*B*A**T + beta*C   or   C := alpha*A**T*B + alpha*B**T*A + beta*C
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/de/d7e/csyr2k_8f.html
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part of C is to be referenced.
+     * @param Trans The type of transpose applied to the operation.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
+     * @param B The input allocation contains matrix B, supported elements type {@link Element#F32_2}.
+     * @param beta The scalar beta.
+     * @param C The input allocation contains matrix C, supported elements type {@link Element#F32_2}.
+     */
     public void CSYR2K(@Uplo int Uplo, @Transpose int Trans, Float2 alpha, Allocation A, Allocation B, Float2 beta, Allocation C) {
         validateUplo(Uplo);
         validateSYR2K(Element.F32_2(mRS), Trans, A, B, C);
@@ -2040,6 +3515,21 @@
         }
         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_csyr2k, Trans, 0, 0, Uplo, 0, 0, C.getType().getX(), K, alpha.x, alpha.y, aID, bID, beta.x, beta.y, cID, 0, 0, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * ZSYR2K performs one of the symmetric rank 2k operations
+     * C := alpha*A*B**T + alpha*B*A**T + beta*C   or   C := alpha*A**T*B + alpha*B**T*A + beta*C
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/df/d20/zsyr2k_8f.html
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part of C is to be referenced.
+     * @param Trans The type of transpose applied to the operation.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
+     * @param B The input allocation contains matrix B, supported elements type {@link Element#F64_2}.
+     * @param beta The scalar beta.
+     * @param C The input allocation contains matrix C, supported elements type {@link Element#F64_2}.
+     */
     public void ZSYR2K(@Uplo int Uplo, @Transpose int Trans, Double2 alpha, Allocation A, Allocation B, Double2 beta, Allocation C) {
         validateUplo(Uplo);
         validateSYR2K(Element.F64_2(mRS), Trans, A, B, C);
@@ -2089,6 +3579,22 @@
             }
         }
     }
+
+    /**
+     * STRMM performs one of the matrix-matrix operations
+     * B := alpha*op(A)*B   or   B := alpha*B*op(A)
+     * op(A) is one of  op(A) = A  or  op(A) = A**T
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/df/d01/strmm_8f.html
+     *
+     * @param Side Specifies whether the symmetric matrix A appears on the left or right.
+     * @param Uplo Specifies whether matrix A is upper or lower triangular.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F32}.
+     * @param B The input allocation contains matrix B, supported elements type {@link Element#F32}.
+     */
     public void STRMM(@Side int Side, @Uplo int Uplo, @Transpose int TransA, @Diag int Diag, float alpha, Allocation A, Allocation B) {
         validateUplo(Uplo);
         validateDiag(Diag);
@@ -2104,6 +3610,22 @@
         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_strmm, TransA, 0, Side, Uplo, Diag, B.getType().getY(), B.getType().getX(), 0,
                                         alpha, aID, bID, 0.f, 0, 0, 0, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * DTRMM performs one of the matrix-matrix operations
+     * B := alpha*op(A)*B   or   B := alpha*B*op(A)
+     * op(A) is one of  op(A) = A  or  op(A) = A**T
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/dd/d19/dtrmm_8f.html
+     *
+     * @param Side Specifies whether the symmetric matrix A appears on the left or right.
+     * @param Uplo Specifies whether matrix A is upper or lower triangular.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F64}.
+     * @param B The input allocation contains matrix B, supported elements type {@link Element#F64}.
+     */
     public void DTRMM(@Side int Side, @Uplo int Uplo, @Transpose int TransA, @Diag int Diag, double alpha, Allocation A, Allocation B) {
         validateUplo(Uplo);
         validateDiag(Diag);
@@ -2119,6 +3641,22 @@
         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dtrmm, TransA, 0, Side, Uplo, Diag, B.getType().getY(), B.getType().getX(), 0,
                                         alpha, aID, bID, 0, 0, 0, 0, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * CTRMM performs one of the matrix-matrix operations
+     * B := alpha*op(A)*B   or   B := alpha*B*op(A)
+     * op(A) is one of  op(A) = A  or  op(A) = A**T  or  op(A) = A**H
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d4/d9b/ctrmm_8f.html
+     *
+     * @param Side Specifies whether the symmetric matrix A appears on the left or right.
+     * @param Uplo Specifies whether matrix A is upper or lower triangular.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
+     * @param B The input allocation contains matrix B, supported elements type {@link Element#F32_2}.
+     */
     public void CTRMM(@Side int Side, @Uplo int Uplo, @Transpose int TransA, @Diag int Diag, Float2 alpha, Allocation A, Allocation B) {
         validateUplo(Uplo);
         validateDiag(Diag);
@@ -2134,6 +3672,22 @@
         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_ctrmm, TransA, 0, Side, Uplo, Diag, B.getType().getY(), B.getType().getX(), 0,
                                          alpha.x, alpha.y, aID, bID, 0, 0, 0, 0, 0, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * ZTRMM performs one of the matrix-matrix operations
+     * B := alpha*op(A)*B   or   B := alpha*B*op(A)
+     * op(A) is one of  op(A) = A  or  op(A) = A**T  or  op(A) = A**H
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d8/de1/ztrmm_8f.html
+     *
+     * @param Side Specifies whether the symmetric matrix A appears on the left or right.
+     * @param Uplo Specifies whether matrix A is upper or lower triangular.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
+     * @param B The input allocation contains matrix B, supported elements type {@link Element#F64_2}.
+     */
     public void ZTRMM(@Side int Side, @Uplo int Uplo, @Transpose int TransA, @Diag int Diag, Double2 alpha, Allocation A, Allocation B) {
         validateUplo(Uplo);
         validateDiag(Diag);
@@ -2179,6 +3733,22 @@
             }
         }
     }
+
+    /**
+     * STRSM solves one of the matrix equations
+     * op(A)*X := alpha*B   or   X*op(A) := alpha*B
+     * op(A) is one of  op(A) = A  or  op(A) = A**T
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d2/d8b/strsm_8f.html
+     *
+     * @param Side Specifies whether the symmetric matrix A appears on the left or right.
+     * @param Uplo Specifies whether matrix A is upper or lower triangular.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F32}.
+     * @param B The input allocation contains matrix B, supported elements type {@link Element#F32}.
+     */
     public void STRSM(@Side int Side, @Uplo int Uplo, @Transpose int TransA, @Diag int Diag, float alpha, Allocation A, Allocation B) {
         validateUplo(Uplo);
         validateDiag(Diag);
@@ -2194,6 +3764,22 @@
         mRS.nScriptIntrinsicBLAS_Single(getID(mRS), RsBlas_strsm, TransA, 0, Side, Uplo, Diag, B.getType().getY(), B.getType().getX(), 0,
                                         alpha, aID, bID, 0, 0, 0, 0, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * DTRSM solves one of the matrix equations
+     * op(A)*X := alpha*B   or   X*op(A) := alpha*B
+     * op(A) is one of  op(A) = A  or  op(A) = A**T
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/de/da7/dtrsm_8f.html
+     *
+     * @param Side Specifies whether the symmetric matrix A appears on the left or right.
+     * @param Uplo Specifies whether matrix A is upper or lower triangular.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F64}.
+     * @param B The input allocation contains matrix B, supported elements type {@link Element#F64}.
+     */
     public void DTRSM(@Side int Side, @Uplo int Uplo, @Transpose int TransA, @Diag int Diag, double alpha, Allocation A, Allocation B) {
         validateUplo(Uplo);
         validateDiag(Diag);
@@ -2209,6 +3795,22 @@
         mRS.nScriptIntrinsicBLAS_Double(getID(mRS), RsBlas_dtrsm, TransA, 0, Side, Uplo, Diag, B.getType().getY(), B.getType().getX(), 0,
                                         alpha, aID, bID, 0, 0, 0, 0, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * CTRSM solves one of the matrix equations
+     * op(A)*X := alpha*B   or   X*op(A) := alpha*B
+     * op(A) is one of  op(A) = A  or  op(A) = A**T  or  op(A) = A**H
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/de/d30/ctrsm_8f.html
+     *
+     * @param Side Specifies whether the symmetric matrix A appears on the left or right.
+     * @param Uplo Specifies whether matrix A is upper or lower triangular.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
+     * @param B The input allocation contains matrix B, supported elements type {@link Element#F32_2}.
+     */
     public void CTRSM(@Side int Side, @Uplo int Uplo, @Transpose int TransA, @Diag int Diag, Float2 alpha, Allocation A, Allocation B) {
         validateUplo(Uplo);
         validateDiag(Diag);
@@ -2224,6 +3826,22 @@
         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_ctrsm, TransA, 0, Side, Uplo, Diag, B.getType().getY(), B.getType().getX(), 0,
                                          alpha.x, alpha.y, aID, bID, 0, 0, 0, 0, 0, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * ZTRSM solves one of the matrix equations
+     * op(A)*X := alpha*B   or   X*op(A) := alpha*B
+     * op(A) is one of  op(A) = A  or  op(A) = A**T  or  op(A) = A**H
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d1/d39/ztrsm_8f.html
+     *
+     * @param Side Specifies whether the symmetric matrix A appears on the left or right.
+     * @param Uplo Specifies whether matrix A is upper or lower triangular.
+     * @param TransA The type of transpose applied to matrix A.
+     * @param Diag Specifies whether or not A is unit triangular.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
+     * @param B The input allocation contains matrix B, supported elements type {@link Element#F64_2}.
+     */
     public void ZTRSM(@Side int Side, @Uplo int Uplo, @Transpose int TransA, @Diag int Diag, Double2 alpha, Allocation A, Allocation B) {
         validateUplo(Uplo);
         validateDiag(Diag);
@@ -2263,6 +3881,21 @@
             throw new RSRuntimeException("Called HEMM with mismatched B and C");
         }
     }
+
+    /**
+     * CHEMM performs one of the matrix-matrix operations
+     * C := alpha*A*B + beta*C   or   C := alpha*B*A + beta*C
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d3/d66/chemm_8f.html
+     *
+     * @param Side Specifies whether the symmetric matrix A appears on the left or right.
+     * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
+     * @param B The input allocation contains matrix B, supported elements type {@link Element#F32_2}.
+     * @param beta The scalar beta.
+     * @param C The input allocation contains matrix C, supported elements type {@link Element#F32_2}.
+     */
     public void CHEMM(@Side int Side, @Uplo int Uplo, Float2 alpha, Allocation A, Allocation B, Float2 beta, Allocation C) {
         validateUplo(Uplo);
         validateHEMM(Element.F32_2(mRS), Side, A, B, C);
@@ -2279,6 +3912,21 @@
         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_chemm, 0, 0, Side, Uplo, 0, C.getType().getY(), C.getType().getX(), 0,
                                          alpha.x, alpha.y, aID, bID, beta.x, beta.y, cID, 0, 0, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * ZHEMM performs one of the matrix-matrix operations
+     * C := alpha*A*B + beta*C   or   C := alpha*B*A + beta*C
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d6/d3e/zhemm_8f.html
+     *
+     * @param Side Specifies whether the symmetric matrix A appears on the left or right.
+     * @param Uplo Specifies whether the upper or lower triangular part is to be referenced.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
+     * @param B The input allocation contains matrix B, supported elements type {@link Element#F64_2}.
+     * @param beta The scalar beta.
+     * @param C The input allocation contains matrix C, supported elements type {@link Element#F64_2}.
+     */
     public void ZHEMM(@Side int Side, @Uplo int Uplo, Double2 alpha, Allocation A, Allocation B, Double2 beta, Allocation C) {
         validateUplo(Uplo);
         validateHEMM(Element.F64_2(mRS), Side, A, B, C);
@@ -2316,6 +3964,20 @@
             }
         }
     }
+
+    /**
+     * CHERK performs one of the hermitian rank k operations
+     * C := alpha*A*A**H + beta*C   or   C := alpha*A**H*A + beta*C
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d8/d52/cherk_8f.html
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part of C is to be referenced.
+     * @param Trans The type of transpose applied to the operation.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
+     * @param beta The scalar beta.
+     * @param C The input allocation contains matrix C, supported elements type {@link Element#F32_2}.
+     */
     public void CHERK(@Uplo int Uplo, @Transpose int Trans, float alpha, Allocation A, float beta, Allocation C) {
         validateUplo(Uplo);
         validateHERK(Element.F32_2(mRS), Trans, A, C);
@@ -2336,6 +3998,20 @@
         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_cherk, Trans, 0, 0, Uplo, 0, 0, C.getType().getX(), k,
                                          alpha, 0, aID, 0, beta, 0, cID, 0, 0, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * ZHERK performs one of the hermitian rank k operations
+     * C := alpha*A*A**H + beta*C   or   C := alpha*A**H*A + beta*C
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d1/db1/zherk_8f.html
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part of C is to be referenced.
+     * @param Trans The type of transpose applied to the operation.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
+     * @param beta The scalar beta.
+     * @param C The input allocation contains matrix C, supported elements type {@link Element#F64_2}.
+     */
     public void ZHERK(@Uplo int Uplo, @Transpose int Trans, double alpha, Allocation A, double beta, Allocation C) {
         validateUplo(Uplo);
         validateHERK(Element.F64_2(mRS), Trans, A, C);
@@ -2381,6 +4057,21 @@
             throw new RSRuntimeException("Called HER2K with invalid A and B matrices");
         }
     }
+
+    /**
+     * CHER2K performs one of the hermitian rank 2k operations
+     * C := alpha*A*B**H + conjg( alpha )*B*A**H + beta*C   or   C := alpha*A**H*B + conjg( alpha )*B**H*A + beta*C
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d1/d82/cher2k_8f.html
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part of C is to be referenced.
+     * @param Trans The type of transpose applied to the operation.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F32_2}.
+     * @param B The input allocation contains matrix B, supported elements type {@link Element#F32_2}.
+     * @param beta The scalar beta.
+     * @param C The input allocation contains matrix C, supported elements type {@link Element#F32_2}.
+     */
     public void CHER2K(@Uplo int Uplo, @Transpose int Trans, Float2 alpha, Allocation A, Allocation B, float beta, Allocation C) {
         validateUplo(Uplo);
         validateHER2K(Element.F32_2(mRS), Trans, A, B, C);
@@ -2403,6 +4094,21 @@
         mRS.nScriptIntrinsicBLAS_Complex(getID(mRS), RsBlas_cher2k, Trans, 0, 0, Uplo, 0, 0, C.getType().getX(), k, alpha.x, alpha.y,
                                          A.getID(mRS), bID, beta, 0, cID, 0, 0, 0, 0, mUseIncSupp);
     }
+
+    /**
+     * ZHER2K performs one of the hermitian rank 2k operations
+     * C := alpha*A*B**H + conjg( alpha )*B*A**H + beta*C   or   C := alpha*A**H*B + conjg( alpha )*B**H*A + beta*C
+     *
+     * Details: http://www.netlib.org/lapack/explore-html/d7/dfa/zher2k_8f.html
+     *
+     * @param Uplo Specifies whether the upper or lower triangular part of C is to be referenced.
+     * @param Trans The type of transpose applied to the operation.
+     * @param alpha The scalar alpha.
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#F64_2}.
+     * @param B The input allocation contains matrix B, supported elements type {@link Element#F64_2}.
+     * @param beta The scalar beta.
+     * @param C The input allocation contains matrix C, supported elements type {@link Element#F64_2}.
+     */
     public void ZHER2K(@Uplo int Uplo, @Transpose int Trans, Double2 alpha, Allocation A, Allocation B, double beta, Allocation C) {
         validateUplo(Uplo);
         validateHER2K(Element.F64_2(mRS), Trans, A, B, C);
@@ -2428,9 +4134,19 @@
 
 
     /**
-     *
-     * 8-bit GEMM-like operation for neural networks
-     *
+     * 8-bit GEMM-like operation for neural networks: C = B.transposed() * A
+     * Calculations are done in 1.10.21 fixed-point format for the final output,
+     * just before there's a shift down to drop the fractional parts. The output
+     * values are gated to 0 to 255 to fit in a byte, but the 10-bit format
+     * gives some headroom to avoid wrapping around on small overflows.
+     * 
+     * @param A The input allocation contains matrix A, supported elements type {@link Element#U8}.
+     * @param a_offset The offset for all values in matrix A, e.g A[i,j] = A[i,j] - a_offset.
+     * @param B The input allocation contains matrix B, supported elements type {@link Element#U8}.
+     * @param b_offset The offset for all values in matrix B, e.g B[i,j] = B[i,j] - b_offset.
+     * @param C The input allocation contains matrix C, supported elements type {@link Element#U8}.
+     * @param c_offset The offset for all values in matrix C.
+     * @param c_mult The multiplier for all values in matrix C, e.g C[i,j] = (C[i,j] + c_offset) * c_mult.
      **/
     public void BNNM(Allocation A, int a_offset, Allocation B, int b_offset, Allocation C, int c_offset, int c_mult) {
         validateL3(Element.U8(mRS), NO_TRANSPOSE, TRANSPOSE, 0, A, B, C);