/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package android.renderscript.cts;

import android.renderscript.*;
import android.util.Log;
import java.util.ArrayList;

public class IntrinsicBLAS extends IntrinsicBase {
    private ScriptIntrinsicBLAS mBLAS;
    private BLASData mBLASData;
    private boolean mInitialized = false;

    private ArrayList<Allocation> mMatrixS;
    private final float alphaS = 1.0f;
    private final float betaS = 1.0f;

    private ArrayList<Allocation> mMatrixD;
    private final double alphaD = 1.0;
    private final double betaD = 1.0;

    private ArrayList<Allocation> mMatrixC;
    private final Float2 alphaC = new Float2(1.0f, 0.0f);
    private final Float2 betaC = new Float2(1.0f, 0.0f);

    private ArrayList<Allocation> mMatrixZ;
    private final Double2 alphaZ = new Double2(1.0, 0.0);
    private final Double2 betaZ = new Double2(1.0, 0.0);

    private int[] mTranspose = {ScriptIntrinsicBLAS.NO_TRANSPOSE,
                                ScriptIntrinsicBLAS.TRANSPOSE,
                                ScriptIntrinsicBLAS.CONJ_TRANSPOSE,
                                0};

    private int[] mUplo = {ScriptIntrinsicBLAS.UPPER,
                           ScriptIntrinsicBLAS.LOWER,
                           0};

    private int[] mDiag = {ScriptIntrinsicBLAS.NON_UNIT,
                           ScriptIntrinsicBLAS.UNIT,
                           0};

    private int[] mSide = {ScriptIntrinsicBLAS.LEFT,
                           ScriptIntrinsicBLAS.RIGHT,
                           0};

    private int[] mInc = {0, 1, 2};
    private int[] mK = {-1, 0, 1};
    private int[] mDim = {1, 2, 3, 256};

    @Override
    protected void setUp() throws Exception {
        super.setUp();

        // Now populate the test Matrixes and Vectors.
        if (!mInitialized) {
            mBLASData = new BLASData();
            mBLASData.loadData(mCtx);
            mBLAS = ScriptIntrinsicBLAS.create(mRS);
            mMatrixS = new ArrayList<Allocation>();
            mMatrixD = new ArrayList<Allocation>();
            mMatrixC = new ArrayList<Allocation>();
            mMatrixZ = new ArrayList<Allocation>();
            for (int x : mDim) {
                for (int y : mDim) {
                    mMatrixS.add(Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), x, y)));
                    mMatrixD.add(Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), x, y)));
                    mMatrixC.add(Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), x, y)));
                    mMatrixZ.add(Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), x, y)));
                }
            }
            // Also need Allocation with mismatch Element.
            Allocation misAlloc = Allocation.createTyped(mRS, Type.createXY(mRS, Element.U8(mRS), 1, 1));
            mMatrixS.add(misAlloc);
            mMatrixD.add(misAlloc);
            mMatrixC.add(misAlloc);
            mMatrixZ.add(misAlloc);
            mInitialized = true;
        }
    }

    @Override
    protected void tearDown() throws Exception {
        super.tearDown();
    }

    // Calculate the square of the L2 norm of a matrix.
    private double calcL2Norm(float[] input) {
        double l2Norm = 0;
        for (int i = 0; i < input.length; ++i) {
            l2Norm += input[i] * input[i];
        }
        return l2Norm;
    }

    private double calcL2Norm(double[] input) {
        double l2Norm = 0;
        for (int i = 0; i < input.length; ++i) {
            l2Norm += input[i] * input[i];
        }
        return l2Norm;
    }

    // Routine to verify if matrix are equivalent.
    private void verifyMatrix(Allocation ref, Allocation out) {
        verifyMatrix(ref, out, false);
    }

    // Use L2 norm of a matrix as the scale to determine whether two matrices are equivalent:
    // if the absolute square error of any elements is smaller than the average L2 Norm
    // per element times an allowed error range (1e-6), then the two matrices are considered equivalent.
    // Criterion: (a[i,j] - a'[i,j])^2 < epsilon * ||A||/(M*N)
    // M, N: the dimensions of the matrix; epsilon: allowed relative error.
    private void verifyMatrix(Allocation ref, Allocation out, boolean isUpperMatrix) {
        double l2Norm;
        int size;
        Element e = ref.getType().getElement();
        if (e.isCompatible(Element.F32(mRS)) || e.isCompatible(Element.F32_2(mRS))) {
            size = out.getBytesSize() / 4;
            float[] outArr = new float[size];
            float[] refArr = new float[size];
            out.copyTo(outArr);
            ref.copyTo(refArr);

            double l2NormOut = calcL2Norm(outArr);
            double l2NormRef = calcL2Norm(refArr);
            l2Norm = (l2NormOut < l2NormRef ? l2NormOut : l2NormRef) / size;
        } else {
            size = out.getBytesSize() / 8;
            double[] outArr = new double[size];
            double[] refArr = new double[size];
            out.copyTo(outArr);
            ref.copyTo(refArr);

            double l2NormOut = calcL2Norm(outArr);
            double l2NormRef = calcL2Norm(refArr);
            l2Norm = (l2NormOut < l2NormRef ? l2NormOut : l2NormRef) / size;
        }
        mVerify.invoke_verifyMatrix(ref, out, l2Norm, isUpperMatrix);
    }


    private boolean validateSide(int Side) {
        if (Side != ScriptIntrinsicBLAS.LEFT && Side != ScriptIntrinsicBLAS.RIGHT) {
            return false;
        }
        return true;
    }

    private boolean validateTranspose(int Trans) {
        if (Trans != ScriptIntrinsicBLAS.NO_TRANSPOSE &&
            Trans != ScriptIntrinsicBLAS.TRANSPOSE &&
            Trans != ScriptIntrinsicBLAS.CONJ_TRANSPOSE) {
            return false;
        }
        return true;
    }

    private boolean validateConjTranspose(int Trans) {
        if (Trans != ScriptIntrinsicBLAS.NO_TRANSPOSE &&
            Trans != ScriptIntrinsicBLAS.CONJ_TRANSPOSE) {
            return false;
        }
        return true;
    }

    private boolean validateDiag(int Diag) {
        if (Diag != ScriptIntrinsicBLAS.NON_UNIT &&
            Diag != ScriptIntrinsicBLAS.UNIT) {
            return false;
        }
        return true;
    }

    private boolean validateUplo(int Uplo) {
        if (Uplo != ScriptIntrinsicBLAS.UPPER &&
            Uplo != ScriptIntrinsicBLAS.LOWER) {
            return false;
        }
        return true;
    }

    private boolean validateVecInput(Allocation X) {
        if (X.getType().getY() > 2) {
            // For testing vector, need a mismatch Y for complete test coverage.
            return false;
        }
        return true;
    }

    private boolean validateGEMV(Element e, int TransA, Allocation A, Allocation X, int incX, Allocation Y, int incY) {
        if (!validateTranspose(TransA)) {
            return false;
        }
        int M = A.getType().getY();
        int N = A.getType().getX();
        if (!A.getType().getElement().isCompatible(e) ||
            !X.getType().getElement().isCompatible(e) ||
            !Y.getType().getElement().isCompatible(e)) {
            return false;
        }
        if (X.getType().getY() > 1 || Y.getType().getY() > 1) {
            return false;
        }

        if (incX <= 0 || incY <= 0) {
            return false;
        }
        int expectedXDim = -1, expectedYDim = -1;
        if (TransA == ScriptIntrinsicBLAS.NO_TRANSPOSE) {
            expectedXDim = 1 + (N - 1) * incX;
            expectedYDim = 1 + (M - 1) * incY;
        } else {
            expectedXDim = 1 + (M - 1) * incX;
            expectedYDim = 1 + (N - 1) * incY;
        }
        if (X.getType().getX() != expectedXDim ||
            Y.getType().getX() != expectedYDim) {
            return false;
        }
        return true;
    }

    private void xGEMV_API_test(int trans, int incX, int incY, ArrayList<Allocation> mMatrix) {
        for (Allocation matA : mMatrix) {
            for (Allocation vecX : mMatrix) {
                if (!validateVecInput(vecX)) {
                    continue;
                }
                for (Allocation vecY : mMatrix) {
                    if (!validateVecInput(vecY)) {
                        continue;
                    }
                    Element elemA = matA.getType().getElement();
                    if (validateGEMV(elemA, trans, matA, vecX, incX, vecY, incY)) {
                        try {
                            if (elemA.isCompatible(Element.F32(mRS))) {
                                mBLAS.SGEMV(trans, alphaS, matA, vecX, incX, betaS, vecY, incY);
                            } else if (elemA.isCompatible(Element.F64(mRS))) {
                                mBLAS.DGEMV(trans, alphaD, matA, vecX, incX, betaD, vecY, incY);
                            } else if (elemA.isCompatible(Element.F32_2(mRS))) {
                                mBLAS.CGEMV(trans, alphaC, matA, vecX, incX, betaC, vecY, incY);
                            } else if (elemA.isCompatible(Element.F64_2(mRS))) {
                                mBLAS.ZGEMV(trans, alphaZ, matA, vecX, incX, betaZ, vecY, incY);
                            }
                        } catch (RSRuntimeException e) {
                            fail("should NOT throw RSRuntimeException");
                        }
                    } else {
                        try {
                            mBLAS.SGEMV(trans, alphaS, matA, vecX, incX, betaS, vecY, incY);
                            fail("should throw RSRuntimeException for SGEMV");
                        } catch (RSRuntimeException e) {
                        }
                        try {
                            mBLAS.DGEMV(trans, alphaD, matA, vecX, incX, betaD, vecY, incY);
                            fail("should throw RSRuntimeException for DGEMV");
                        } catch (RSRuntimeException e) {
                        }
                        try {
                            mBLAS.CGEMV(trans, alphaC, matA, vecX, incX, betaC, vecY, incY);
                            fail("should throw RSRuntimeException for CGEMV");
                        } catch (RSRuntimeException e) {
                        }
                        try {
                            mBLAS.ZGEMV(trans, alphaZ, matA, vecX, incX, betaZ, vecY, incY);
                            fail("should throw RSRuntimeException for ZGEMV");
                        } catch (RSRuntimeException e) {
                        }
                    }
                }
            }
        }
    }

    public void L2_xGEMV_API(ArrayList<Allocation> mMatrix) {
        for (int trans : mTranspose) {
            for (int incX : mInc) {
                xGEMV_API_test(trans, incX, incX, mMatrix);
            }
        }
    }

    public void test_L2_SGEMV_API() {
        L2_xGEMV_API(mMatrixS);
    }

    public void test_L2_DGEMV_API() {
        L2_xGEMV_API(mMatrixD);
    }

    public void test_L2_CGEMV_API() {
        L2_xGEMV_API(mMatrixC);
    }

    public void test_L2_ZGEMV_API() {
        L2_xGEMV_API(mMatrixZ);
    }

    public void test_L2_SGEMV_Correctness() {
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int incX = 1;
        int incY = 1;

        // Populate input allocations
        Allocation matrixAS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, mBLASData.dM));
        Allocation vectorXS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, 1));
        Allocation vectorYS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dM, 1));
        matrixAS.copyFrom(mBLASData.L2_sGEMV_A_mn);
        vectorXS.copyFrom(mBLASData.L2_sGEMV_x_n1);
        vectorYS.copyFrom(mBLASData.L2_sGEMV_y_m1);

        // Test for the default case: NO_TRANS
        mBLAS.SGEMV(trans, alphaS, matrixAS, vectorXS, incX, betaS, vectorYS, incY);
        Allocation vectorYRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dM, 1));
        vectorYRef.copyFrom(mBLASData.L2_sGEMV_o_N);
        verifyMatrix(vectorYRef, vectorYS);

        // Test for trans cases: TRANSPOSE, CONJ_TRANSPOSE
        trans = ScriptIntrinsicBLAS.TRANSPOSE;
        // Reload vector Y, since it was overwritten by BLAS.
        vectorYS.copyFrom(mBLASData.L2_sGEMV_y_m1);
        // After Transpose matrixA, vectorX and vectorY are exchanged to match the dim of A.T
        mBLAS.SGEMV(trans, alphaS, matrixAS, vectorYS, incY, betaS, vectorXS, incX);
        vectorYRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, 1));
        vectorYRef.copyFrom(mBLASData.L2_sGEMV_o_T);
        verifyMatrix(vectorYRef, vectorXS);

        trans = ScriptIntrinsicBLAS.CONJ_TRANSPOSE;
        vectorXS.copyFrom(mBLASData.L2_sGEMV_x_n1);
        mBLAS.SGEMV(trans, alphaS, matrixAS, vectorYS, incY, betaS, vectorXS, incX);
        vectorYRef.copyFrom(mBLASData.L2_sGEMV_o_H);
        verifyMatrix(vectorYRef, vectorXS);

        // Test for incX = 2 & incY = 3;
        trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        incX = 2;
        incY = 3;
        int dimX = 1 + (mBLASData.dN - 1) * incX;
        int dimY = 1 + (mBLASData.dM - 1) * incY;
        vectorXS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), dimX, 1));
        vectorYS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), dimY, 1));
        vectorXS.copyFrom(mBLASData.L2_sGEMV_x_n2);
        vectorYS.copyFrom(mBLASData.L2_sGEMV_y_m2);

        mBLAS.SGEMV(trans, alphaS, matrixAS, vectorXS, incX, betaS, vectorYS, incY);
        vectorYRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), dimY, 1));
        vectorYRef.copyFrom(mBLASData.L2_sGEMV_o_N2);
        verifyMatrix(vectorYRef, vectorYS);

        mRS.finish();
        checkError();
    }

    public void test_L2_DGEMV_Correctness() {
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int incX = 1;
        int incY = 1;

        // Populate input allocations
        Allocation matrixAD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, mBLASData.dM));
        Allocation vectorXD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, 1));
        Allocation vectorYD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dM, 1));
        matrixAD.copyFrom(mBLASData.L2_dGEMV_A_mn);
        vectorXD.copyFrom(mBLASData.L2_dGEMV_x_n1);
        vectorYD.copyFrom(mBLASData.L2_dGEMV_y_m1);

        // Test for the default case: NO_TRANS
        mBLAS.DGEMV(trans, alphaD, matrixAD, vectorXD, incX, betaD, vectorYD, incY);
        Allocation vectorYRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dM, 1));
        vectorYRef.copyFrom(mBLASData.L2_dGEMV_o_N);
        verifyMatrix(vectorYRef, vectorYD);

        // Test for trans cases: TRANSPOSE, CONJ_TRANSPOSE
        trans = ScriptIntrinsicBLAS.TRANSPOSE;
        // Reload vector Y, since it was overwritten by BLAS.
        vectorYD.copyFrom(mBLASData.L2_dGEMV_y_m1);
        // After Transpose matrixA, vectorX and vectorY are exchanged to match the dim of A.T
        mBLAS.DGEMV(trans, alphaD, matrixAD, vectorYD, incY, betaD, vectorXD, incX);
        vectorYRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, 1));
        vectorYRef.copyFrom(mBLASData.L2_dGEMV_o_T);
        verifyMatrix(vectorYRef, vectorXD);

        trans = ScriptIntrinsicBLAS.CONJ_TRANSPOSE;
        vectorXD.copyFrom(mBLASData.L2_dGEMV_x_n1);
        mBLAS.DGEMV(trans, alphaD, matrixAD, vectorYD, incY, betaD, vectorXD, incX);
        vectorYRef.copyFrom(mBLASData.L2_dGEMV_o_H);
        verifyMatrix(vectorYRef, vectorXD);

        // Test for incX = 2 & incY = 3;
        trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        incX = 2;
        incY = 3;
        int dimX = 1 + (mBLASData.dN - 1) * incX;
        int dimY = 1 + (mBLASData.dM - 1) * incY;
        vectorXD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), dimX, 1));
        vectorYD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), dimY, 1));
        vectorXD.copyFrom(mBLASData.L2_dGEMV_x_n2);
        vectorYD.copyFrom(mBLASData.L2_dGEMV_y_m2);

        mBLAS.DGEMV(trans, alphaD, matrixAD, vectorXD, incX, betaD, vectorYD, incY);
        vectorYRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), dimY, 1));
        vectorYRef.copyFrom(mBLASData.L2_dGEMV_o_N2);
        verifyMatrix(vectorYRef, vectorYD);

        mRS.finish();
        checkError();
    }

    public void test_L2_CGEMV_Correctness() {
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int incX = 1;
        int incY = 1;

        // Populate input allocations
        Allocation matrixAC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, mBLASData.dM));
        Allocation vectorXC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, 1));
        Allocation vectorYC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dM, 1));
        matrixAC.copyFrom(mBLASData.L2_cGEMV_A_mn);
        vectorXC.copyFrom(mBLASData.L2_cGEMV_x_n1);
        vectorYC.copyFrom(mBLASData.L2_cGEMV_y_m1);

        // Test for the default case: NO_TRANS
        mBLAS.CGEMV(trans, alphaC, matrixAC, vectorXC, incX, betaC, vectorYC, incY);
        Allocation vectorYRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dM, 1));
        vectorYRef.copyFrom(mBLASData.L2_cGEMV_o_N);
        verifyMatrix(vectorYRef, vectorYC);

        // Test for trans cases: TRANSPOSE, CONJ_TRANSPOSE
        trans = ScriptIntrinsicBLAS.TRANSPOSE;
        // Reload vector Y, since it was overwritten by BLAS.
        vectorYC.copyFrom(mBLASData.L2_cGEMV_y_m1);
        // After Transpose matrixA, vectorX and vectorY are exchanged to match the dim of A.T
        mBLAS.CGEMV(trans, alphaC, matrixAC, vectorYC, incY, betaC, vectorXC, incX);
        vectorYRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, 1));
        vectorYRef.copyFrom(mBLASData.L2_cGEMV_o_T);
        verifyMatrix(vectorYRef, vectorXC);

        trans = ScriptIntrinsicBLAS.CONJ_TRANSPOSE;
        vectorXC.copyFrom(mBLASData.L2_cGEMV_x_n1);
        mBLAS.CGEMV(trans, alphaC, matrixAC, vectorYC, incY, betaC, vectorXC, incX);
        vectorYRef.copyFrom(mBLASData.L2_cGEMV_o_H);
        verifyMatrix(vectorYRef, vectorXC);

        // Test for incX = 2 & incY = 3;
        trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        incX = 2;
        incY = 3;
        int dimX = 1 + (mBLASData.dN - 1) * incX;
        int dimY = 1 + (mBLASData.dM - 1) * incY;
        vectorXC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), dimX, 1));
        vectorYC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), dimY, 1));
        vectorXC.copyFrom(mBLASData.L2_cGEMV_x_n2);
        vectorYC.copyFrom(mBLASData.L2_cGEMV_y_m2);

        mBLAS.CGEMV(trans, alphaC, matrixAC, vectorXC, incX, betaC, vectorYC, incY);
        vectorYRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), dimY, 1));
        vectorYRef.copyFrom(mBLASData.L2_cGEMV_o_N2);
        verifyMatrix(vectorYRef, vectorYC);

        mRS.finish();
        checkError();
    }

    public void test_L2_ZGEMV_Correctness() {
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int incX = 1;
        int incY = 1;

        // Populate input allocations
        Allocation matrixAZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, mBLASData.dM));
        Allocation vectorXZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, 1));
        Allocation vectorYZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dM, 1));
        matrixAZ.copyFrom(mBLASData.L2_zGEMV_A_mn);
        vectorXZ.copyFrom(mBLASData.L2_zGEMV_x_n1);
        vectorYZ.copyFrom(mBLASData.L2_zGEMV_y_m1);

        // Test for the default case: NO_TRANS
        mBLAS.ZGEMV(trans, alphaZ, matrixAZ, vectorXZ, incX, betaZ, vectorYZ, incY);
        Allocation vectorYRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dM, 1));
        vectorYRef.copyFrom(mBLASData.L2_zGEMV_o_N);
        verifyMatrix(vectorYRef, vectorYZ);

        // Test for trans cases: TRANSPOSE, CONJ_TRANSPOSE
        trans = ScriptIntrinsicBLAS.TRANSPOSE;
        // Reload vector Y, since it was overwritten by BLAS.
        vectorYZ.copyFrom(mBLASData.L2_zGEMV_y_m1);
        // After Transpose matrixA, vectorX and vectorY are exchanged to match the dim of A.T
        mBLAS.ZGEMV(trans, alphaZ, matrixAZ, vectorYZ, incY, betaZ, vectorXZ, incX);
        vectorYRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, 1));
        vectorYRef.copyFrom(mBLASData.L2_zGEMV_o_T);
        verifyMatrix(vectorYRef, vectorXZ);

        trans = ScriptIntrinsicBLAS.CONJ_TRANSPOSE;
        vectorXZ.copyFrom(mBLASData.L2_zGEMV_x_n1);
        mBLAS.ZGEMV(trans, alphaZ, matrixAZ, vectorYZ, incY, betaZ, vectorXZ, incX);
        vectorYRef.copyFrom(mBLASData.L2_zGEMV_o_H);
        verifyMatrix(vectorYRef, vectorXZ);

        // Test for incX = 2 & incY = 3;
        trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        incX = 2;
        incY = 3;
        int dimX = 1 + (mBLASData.dN - 1) * incX;
        int dimY = 1 + (mBLASData.dM - 1) * incY;
        vectorXZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), dimX, 1));
        vectorYZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), dimY, 1));
        vectorXZ.copyFrom(mBLASData.L2_zGEMV_x_n2);
        vectorYZ.copyFrom(mBLASData.L2_zGEMV_y_m2);

        mBLAS.ZGEMV(trans, alphaZ, matrixAZ, vectorXZ, incX, betaZ, vectorYZ, incY);
        vectorYRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), dimY, 1));
        vectorYRef.copyFrom(mBLASData.L2_zGEMV_o_N2);
        verifyMatrix(vectorYRef, vectorYZ);

        mRS.finish();
        checkError();
    }



    private void xGBMV_API_test(int trans, int KL, int KU, int incX, int incY, ArrayList<Allocation> mMatrix) {
        for (Allocation matA : mMatrix) {
            for (Allocation vecX : mMatrix) {
                if (!validateVecInput(vecX)) {
                    continue;
                }
                for (Allocation vecY : mMatrix) {
                    if (!validateVecInput(vecY)) {
                        continue;
                    }
                    Element elemA = matA.getType().getElement();
                    if (validateGEMV(elemA, trans, matA, vecX, incX, vecY, incY) && KU >= 0 && KL >= 0) {
                        try {
                            if (elemA.isCompatible(Element.F32(mRS))) {
                                mBLAS.SGBMV(trans, KL, KU, alphaS, matA, vecX, incX, betaS, vecY, incY);
                            } else if (elemA.isCompatible(Element.F64(mRS))) {
                                mBLAS.DGBMV(trans, KL, KU, alphaD, matA, vecX, incX, betaD, vecY, incY);
                            } else if (elemA.isCompatible(Element.F32_2(mRS))) {
                                mBLAS.CGBMV(trans, KL, KU, alphaC, matA, vecX, incX, betaC, vecY, incY);
                            } else if (elemA.isCompatible(Element.F64_2(mRS))) {
                                mBLAS.ZGBMV(trans, KL, KU, alphaZ, matA, vecX, incX, betaZ, vecY, incY);
                            }
                        } catch (RSRuntimeException e) {
                            fail("should NOT throw RSRuntimeException");
                        }
                    } else {
                        try {
                            mBLAS.SGBMV(trans, KL, KU, alphaS, matA, vecX, incX, betaS, vecY, incY);
                            fail("should throw RSRuntimeException for SGBMV");
                        } catch (RSRuntimeException e) {
                        }
                        try {
                            mBLAS.DGBMV(trans, KL, KU, alphaD, matA, vecX, incX, betaD, vecY, incY);
                            fail("should throw RSRuntimeException for DGBMV");
                        } catch (RSRuntimeException e) {
                        }
                        try {
                            mBLAS.CGBMV(trans, KL, KU, alphaC, matA, vecX, incX, betaC, vecY, incY);
                            fail("should throw RSRuntimeException for CGBMV");
                        } catch (RSRuntimeException e) {
                        }
                        try {
                            mBLAS.ZGBMV(trans, KL, KU, alphaZ, matA, vecX, incX, betaZ, vecY, incY);
                            fail("should throw RSRuntimeException for ZGBMV");
                        } catch (RSRuntimeException e) {
                        }
                    }
                }
            }
        }
    }

    public void L2_xGBMV_API(ArrayList<Allocation> mMatrix) {
        for (int trans : mTranspose) {
            for (int incX : mInc) {
                for (int K : mK) {
                    xGBMV_API_test(trans, K, K, incX, incX, mMatrix);
                }
            }
        }
    }

    public void test_L2_SGBMV_API() {
        L2_xGBMV_API(mMatrixS);
    }

    public void test_L2_DGBMV_API() {
        L2_xGBMV_API(mMatrixD);
    }

    public void test_L2_CGBMV_API() {
        L2_xGBMV_API(mMatrixC);
    }

    public void test_L2_ZGBMV_API() {
        L2_xGBMV_API(mMatrixZ);
    }

    public void test_L2_SGBMV_Correctness() {
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int incX = 1;
        int incY = 1;

        // Populate input allocations
        Allocation matrixAS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, mBLASData.dM));
        Allocation vectorXS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, 1));
        Allocation vectorYS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dM, 1));
        matrixAS.copy2DRangeFrom(0, 0, mBLASData.KL + mBLASData.KU + 1, mBLASData.dM, mBLASData.L2_sGBMV_A_mn);
        vectorXS.copyFrom(mBLASData.L2_sGBMV_x_n1);
        vectorYS.copyFrom(mBLASData.L2_sGBMV_y_m1);

        // Test for the default case: NO_TRANS
        mBLAS.SGBMV(trans, mBLASData.KL, mBLASData.KU, alphaS, matrixAS, vectorXS, incX, betaS, vectorYS, incY);
        Allocation vectorYRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dM, 1));
        vectorYRef.copyFrom(mBLASData.L2_sGBMV_o_N);
        verifyMatrix(vectorYRef, vectorYS);

        // Test for trans cases: TRANSPOSE, CONJ_TRANSPOSE
        trans = ScriptIntrinsicBLAS.TRANSPOSE;
        // Reload vector Y, since it was overwritten by BLAS.
        vectorYS.copyFrom(mBLASData.L2_sGBMV_y_m1);
        // After Transpose matrixA, vectorX and vectorY are exchanged to match the dim of A.T
        mBLAS.SGBMV(trans, mBLASData.KL, mBLASData.KU, alphaS, matrixAS, vectorYS, incY, betaS, vectorXS, incX);
        vectorYRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, 1));
        vectorYRef.copyFrom(mBLASData.L2_sGBMV_o_T);
        verifyMatrix(vectorYRef, vectorXS);

        trans = ScriptIntrinsicBLAS.CONJ_TRANSPOSE;
        vectorXS.copyFrom(mBLASData.L2_sGBMV_x_n1);
        mBLAS.SGBMV(trans, mBLASData.KL, mBLASData.KU, alphaS, matrixAS, vectorYS, incY, betaS, vectorXS, incX);
        vectorYRef.copyFrom(mBLASData.L2_sGBMV_o_H);
        verifyMatrix(vectorYRef, vectorXS);

        // Test for incX = 2 & incY = 3;
        trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        incX = 2;
        incY = 3;
        int dimX = 1 + (mBLASData.dN - 1) * incX;
        int dimY = 1 + (mBLASData.dM - 1) * incY;
        vectorXS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), dimX, 1));
        vectorYS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), dimY, 1));
        vectorXS.copyFrom(mBLASData.L2_sGBMV_x_n2);
        vectorYS.copyFrom(mBLASData.L2_sGBMV_y_m2);

        mBLAS.SGBMV(trans, mBLASData.KL, mBLASData.KU, alphaS, matrixAS, vectorXS, incX, betaS, vectorYS, incY);
        vectorYRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), dimY, 1));
        vectorYRef.copyFrom(mBLASData.L2_sGBMV_o_N2);
        verifyMatrix(vectorYRef, vectorYS);

        mRS.finish();
        checkError();
    }

    public void test_L2_DGBMV_Correctness() {
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int incX = 1;
        int incY = 1;

        // Populate input allocations
        Allocation matrixAD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, mBLASData.dM));
        Allocation vectorXD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, 1));
        Allocation vectorYD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dM, 1));
        matrixAD.copy2DRangeFrom(0, 0, mBLASData.KL + mBLASData.KU + 1, mBLASData.dM, mBLASData.L2_dGBMV_A_mn);
        vectorXD.copyFrom(mBLASData.L2_dGBMV_x_n1);
        vectorYD.copyFrom(mBLASData.L2_dGBMV_y_m1);

        // Test for the default case: NO_TRANS
        mBLAS.DGBMV(trans, mBLASData.KL, mBLASData.KU, alphaD, matrixAD, vectorXD, incX, betaD, vectorYD, incY);
        Allocation vectorYRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dM, 1));
        vectorYRef.copyFrom(mBLASData.L2_dGBMV_o_N);
        verifyMatrix(vectorYRef, vectorYD);

        // Test for trans cases: TRANSPOSE, CONJ_TRANSPOSE
        trans = ScriptIntrinsicBLAS.TRANSPOSE;
        // Reload vector Y, since it was overwritten by BLAS.
        vectorYD.copyFrom(mBLASData.L2_dGBMV_y_m1);
        // After Transpose matrixA, vectorX and vectorY are exchanged to match the dim of A.T
        mBLAS.DGBMV(trans, mBLASData.KL, mBLASData.KU, alphaD, matrixAD, vectorYD, incY, betaD, vectorXD, incX);
        vectorYRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, 1));
        vectorYRef.copyFrom(mBLASData.L2_dGBMV_o_T);
        verifyMatrix(vectorYRef, vectorXD);

        trans = ScriptIntrinsicBLAS.CONJ_TRANSPOSE;
        vectorXD.copyFrom(mBLASData.L2_dGBMV_x_n1);
        mBLAS.DGBMV(trans, mBLASData.KL, mBLASData.KU, alphaD, matrixAD, vectorYD, incY, betaD, vectorXD, incX);
        vectorYRef.copyFrom(mBLASData.L2_dGBMV_o_H);
        verifyMatrix(vectorYRef, vectorXD);

        // Test for incX = 2 & incY = 3;
        trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        incX = 2;
        incY = 3;
        int dimX = 1 + (mBLASData.dN - 1) * incX;
        int dimY = 1 + (mBLASData.dM - 1) * incY;
        vectorXD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), dimX, 1));
        vectorYD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), dimY, 1));
        vectorXD.copyFrom(mBLASData.L2_dGBMV_x_n2);
        vectorYD.copyFrom(mBLASData.L2_dGBMV_y_m2);

        mBLAS.DGBMV(trans, mBLASData.KL, mBLASData.KU, alphaD, matrixAD, vectorXD, incX, betaD, vectorYD, incY);
        vectorYRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), dimY, 1));
        vectorYRef.copyFrom(mBLASData.L2_dGBMV_o_N2);
        verifyMatrix(vectorYRef, vectorYD);

        mRS.finish();
        checkError();
    }

    public void test_L2_CGBMV_Correctness() {
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int incX = 1;
        int incY = 1;

        // Populate input allocations
        Allocation matrixAC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, mBLASData.dM));
        Allocation vectorXC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, 1));
        Allocation vectorYC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dM, 1));
        matrixAC.copy2DRangeFrom(0, 0, mBLASData.KL + mBLASData.KU + 1, mBLASData.dM, mBLASData.L2_cGBMV_A_mn);
        vectorXC.copyFrom(mBLASData.L2_cGBMV_x_n1);
        vectorYC.copyFrom(mBLASData.L2_cGBMV_y_m1);

        // Test for the default case: NO_TRANS
        mBLAS.CGBMV(trans, mBLASData.KL, mBLASData.KU, alphaC, matrixAC, vectorXC, incX, betaC, vectorYC, incY);
        Allocation vectorYRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dM, 1));
        vectorYRef.copyFrom(mBLASData.L2_cGBMV_o_N);
        verifyMatrix(vectorYRef, vectorYC);

        // Test for trans cases: TRANSPOSE, CONJ_TRANSPOSE
        trans = ScriptIntrinsicBLAS.TRANSPOSE;
        // Reload vector Y, since it was overwritten by BLAS.
        vectorYC.copyFrom(mBLASData.L2_cGBMV_y_m1);
        // After Transpose matrixA, vectorX and vectorY are exchanged to match the dim of A.T
        mBLAS.CGBMV(trans, mBLASData.KL, mBLASData.KU, alphaC, matrixAC, vectorYC, incY, betaC, vectorXC, incX);
        vectorYRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, 1));
        vectorYRef.copyFrom(mBLASData.L2_cGBMV_o_T);
        verifyMatrix(vectorYRef, vectorXC);

        trans = ScriptIntrinsicBLAS.CONJ_TRANSPOSE;
        vectorXC.copyFrom(mBLASData.L2_cGBMV_x_n1);
        mBLAS.CGBMV(trans, mBLASData.KL, mBLASData.KU, alphaC, matrixAC, vectorYC, incY, betaC, vectorXC, incX);
        vectorYRef.copyFrom(mBLASData.L2_cGBMV_o_H);
        verifyMatrix(vectorYRef, vectorXC);

        // Test for incX = 2 & incY = 3;
        trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        incX = 2;
        incY = 3;
        int dimX = 1 + (mBLASData.dN - 1) * incX;
        int dimY = 1 + (mBLASData.dM - 1) * incY;
        vectorXC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), dimX, 1));
        vectorYC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), dimY, 1));
        vectorXC.copyFrom(mBLASData.L2_cGBMV_x_n2);
        vectorYC.copyFrom(mBLASData.L2_cGBMV_y_m2);

        mBLAS.CGBMV(trans, mBLASData.KL, mBLASData.KU, alphaC, matrixAC, vectorXC, incX, betaC, vectorYC, incY);
        vectorYRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), dimY, 1));
        vectorYRef.copyFrom(mBLASData.L2_cGBMV_o_N2);
        verifyMatrix(vectorYRef, vectorYC);

        mRS.finish();
        checkError();
    }

    public void test_L2_ZGBMV_Correctness() {
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int incX = 1;
        int incY = 1;

        // Populate input allocations
        Allocation matrixAZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, mBLASData.dM));
        Allocation vectorXZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, 1));
        Allocation vectorYZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dM, 1));
        matrixAZ.copy2DRangeFrom(0, 0, mBLASData.KL + mBLASData.KU + 1, mBLASData.dM, mBLASData.L2_zGBMV_A_mn);
        vectorXZ.copyFrom(mBLASData.L2_zGBMV_x_n1);
        vectorYZ.copyFrom(mBLASData.L2_zGBMV_y_m1);

        // Test for the default case: NO_TRANS
        mBLAS.ZGBMV(trans, mBLASData.KL, mBLASData.KU, alphaZ, matrixAZ, vectorXZ, incX, betaZ, vectorYZ, incY);
        Allocation vectorYRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dM, 1));
        vectorYRef.copyFrom(mBLASData.L2_zGBMV_o_N);
        verifyMatrix(vectorYRef, vectorYZ);

        // Test for trans cases: TRANSPOSE, CONJ_TRANSPOSE
        trans = ScriptIntrinsicBLAS.TRANSPOSE;
        // Reload vector Y, since it was overwritten by BLAS.
        vectorYZ.copyFrom(mBLASData.L2_zGBMV_y_m1);
        // After Transpose matrixA, vectorX and vectorY are exchanged to match the dim of A.T
        mBLAS.ZGBMV(trans, mBLASData.KL, mBLASData.KU, alphaZ, matrixAZ, vectorYZ, incY, betaZ, vectorXZ, incX);
        vectorYRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, 1));
        vectorYRef.copyFrom(mBLASData.L2_zGBMV_o_T);
        verifyMatrix(vectorYRef, vectorXZ);

        trans = ScriptIntrinsicBLAS.CONJ_TRANSPOSE;
        vectorXZ.copyFrom(mBLASData.L2_zGBMV_x_n1);
        mBLAS.ZGBMV(trans, mBLASData.KL, mBLASData.KU, alphaZ, matrixAZ, vectorYZ, incX, betaZ, vectorXZ, incY);
        vectorYRef.copyFrom(mBLASData.L2_zGBMV_o_H);
        verifyMatrix(vectorYRef, vectorXZ);

        // Test for incX = 2 & incY = 3;
        trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        incX = 2;
        incY = 3;
        int dimX = 1 + (mBLASData.dN - 1) * incX;
        int dimY = 1 + (mBLASData.dM - 1) * incY;
        vectorXZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), dimX, 1));
        vectorYZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), dimY, 1));
        vectorXZ.copyFrom(mBLASData.L2_zGBMV_x_n2);
        vectorYZ.copyFrom(mBLASData.L2_zGBMV_y_m2);

        mBLAS.ZGBMV(trans, mBLASData.KL, mBLASData.KU, alphaZ, matrixAZ, vectorXZ, incX, betaZ, vectorYZ, incY);
        vectorYRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), dimY, 1));
        vectorYRef.copyFrom(mBLASData.L2_zGBMV_o_N2);
        verifyMatrix(vectorYRef, vectorYZ);

        mRS.finish();
        checkError();
    }


    private void xHEMV_API_test(int Uplo, int incX, int incY, ArrayList<Allocation> mMatrix) {
        for (Allocation matA : mMatrix) {
            for (Allocation vecX : mMatrix) {
                if (!validateVecInput(vecX)) {
                    continue;
                }
                for (Allocation vecY : mMatrix) {
                    if (!validateVecInput(vecY)) {
                        continue;
                    }
                    Element elemA = matA.getType().getElement();
                    if (validateSYR2(elemA, Uplo, vecX, incX, vecY, incY, matA)) {
                        try {
                            if (elemA.isCompatible(Element.F32_2(mRS))) {
                                mBLAS.CHEMV(Uplo, alphaC, matA, vecX, incX, betaC, vecY, incY);
                            } else if (elemA.isCompatible(Element.F64_2(mRS))) {
                                mBLAS.ZHEMV(Uplo, alphaZ, matA, vecX, incX, betaZ, vecY, incY);
                            }
                        } catch (RSRuntimeException e) {
                            fail("should NOT throw RSRuntimeException");
                        }
                    } else {
                        try {
                            mBLAS.CHEMV(Uplo, alphaC, matA, vecX, incX, betaC, vecY, incY);
                            fail("should throw RSRuntimeException for CHEMV");
                        } catch (RSRuntimeException e) {
                        }
                        try {
                            mBLAS.ZHEMV(Uplo, alphaZ, matA, vecX, incX, betaZ, vecY, incY);
                            fail("should throw RSRuntimeException for ZHEMV");
                        } catch (RSRuntimeException e) {
                        }
                    }
                }
            }
        }
    }

    public void L2_xHEMV_API(ArrayList<Allocation> mMatrix) {
        for (int Uplo : mUplo) {
            for (int incX : mInc) {
                xHEMV_API_test(Uplo, incX, incX, mMatrix);
            }
        }
    }

    public void test_L2_CHEMV_API() {
        L2_xHEMV_API(mMatrixC);
    }

    public void test_L2_ZHEMV_API() {
        L2_xHEMV_API(mMatrixZ);
    }

    public void test_L2_CHEMV_Correctness() {
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int incX = 1;
        int incY = 1;

        // Populate input allocations
        Allocation matrixAC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, mBLASData.dN));
        Allocation vectorXC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, 1));
        Allocation vectorYC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, 1));
        matrixAC.copyFrom(mBLASData.L2_cHEMV_A_nn);
        vectorXC.copyFrom(mBLASData.L2_cHEMV_x_n1);
        vectorYC.copyFrom(mBLASData.L2_cHEMV_y_n1);

        // Test for the default case:
        mBLAS.CHEMV(uplo, alphaC, matrixAC, vectorXC, incX, betaC, vectorYC, incY);
        Allocation vectorYRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, 1));
        vectorYRef.copyFrom(mBLASData.L2_cHEMV_o_N);
        verifyMatrix(vectorYRef, vectorYC);

        // Test for incX = 2 & incY = 3;
        incX = 2;
        incY = 3;
        int dimX = 1 + (mBLASData.dN - 1) * incX;
        int dimY = 1 + (mBLASData.dN - 1) * incY;
        vectorXC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), dimX, 1));
        vectorYC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), dimY, 1));
        vectorXC.copyFrom(mBLASData.L2_cHEMV_x_n2);
        vectorYC.copyFrom(mBLASData.L2_cHEMV_y_n2);

        mBLAS.CHEMV(uplo, alphaC, matrixAC, vectorXC, incX, betaC, vectorYC, incY);
        vectorYRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), dimY, 1));
        vectorYRef.copyFrom(mBLASData.L2_cHEMV_o_N2);
        verifyMatrix(vectorYRef, vectorYC);

        mRS.finish();
        checkError();
    }

    public void test_L2_ZHEMV_Correctness() {
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int incX = 1;
        int incY = 1;

        // Populate input allocations
        Allocation matrixAZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, mBLASData.dN));
        Allocation vectorXZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, 1));
        Allocation vectorYZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, 1));
        matrixAZ.copyFrom(mBLASData.L2_zHEMV_A_nn);
        vectorXZ.copyFrom(mBLASData.L2_zHEMV_x_n1);
        vectorYZ.copyFrom(mBLASData.L2_zHEMV_y_n1);

        // Test for the default case: NO_TRANS
        mBLAS.ZHEMV(uplo, alphaZ, matrixAZ, vectorXZ, incX, betaZ, vectorYZ, incY);
        Allocation vectorYRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, 1));
        vectorYRef.copyFrom(mBLASData.L2_zHEMV_o_N);
        verifyMatrix(vectorYRef, vectorYZ);

        // Test for incX = 2 & incY = 3;
        incX = 2;
        incY = 3;
        int dimX = 1 + (mBLASData.dN - 1) * incX;
        int dimY = 1 + (mBLASData.dN - 1) * incY;
        vectorXZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), dimX, 1));
        vectorYZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), dimY, 1));
        vectorXZ.copyFrom(mBLASData.L2_zHEMV_x_n2);
        vectorYZ.copyFrom(mBLASData.L2_zHEMV_y_n2);

        mBLAS.ZHEMV(uplo, alphaZ, matrixAZ, vectorXZ, incX, betaZ, vectorYZ, incY);
        vectorYRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), dimY, 1));
        vectorYRef.copyFrom(mBLASData.L2_zHEMV_o_N2);
        verifyMatrix(vectorYRef, vectorYZ);

        mRS.finish();
        checkError();
    }



    private void xHBMV_API_test(int Uplo, int K, int incX, int incY, ArrayList<Allocation> mMatrix) {
        for (Allocation matA : mMatrix) {
            for (Allocation vecX : mMatrix) {
                if (!validateVecInput(vecX)) {
                    continue;
                }
                for (Allocation vecY : mMatrix) {
                    if (!validateVecInput(vecY)) {
                        continue;
                    }
                    Element elemA = matA.getType().getElement();
                    if (validateSYR2(elemA, Uplo, vecX, incX, vecY, incY, matA) && K >= 0) {
                        try {
                            if (elemA.isCompatible(Element.F32_2(mRS))) {
                                mBLAS.CHBMV(Uplo, K, alphaC, matA, vecX, incX, betaC, vecY, incY);
                            } else if (elemA.isCompatible(Element.F64_2(mRS))) {
                                mBLAS.ZHBMV(Uplo, K, alphaZ, matA, vecX, incX, betaZ, vecY, incY);
                            }
                        } catch (RSRuntimeException e) {
                            fail("should NOT throw RSRuntimeException");
                        }
                    } else {
                        try {
                            mBLAS.CHBMV(Uplo, K, alphaC, matA, vecX, incX, betaC, vecY, incY);
                            fail("should throw RSRuntimeException for CHBMV");
                        } catch (RSRuntimeException e) {
                        }
                        try {
                            mBLAS.ZHBMV(Uplo, K, alphaZ, matA, vecX, incX, betaZ, vecY, incY);
                            fail("should throw RSRuntimeException for ZHBMV");
                        } catch (RSRuntimeException e) {
                        }
                    }
                }
            }
        }
    }

    public void L2_xHBMV_API(ArrayList<Allocation> mMatrix) {
        for (int Uplo : mUplo) {
            for (int K : mK) {
                for (int incX : mInc) {
                        xHBMV_API_test(Uplo, K, incX, incX, mMatrix);
                }
            }
        }
    }

    public void test_L2_CHBMV_API() {
        L2_xHBMV_API(mMatrixC);
    }

    public void test_L2_ZHBMV_API() {
        L2_xHBMV_API(mMatrixZ);
    }

    public void test_L2_CHBMV_Correctness() {
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int incX = 1;
        int incY = 1;

        // Populate input allocations
        Allocation matrixAC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, mBLASData.dN));
        Allocation vectorXC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, 1));
        Allocation vectorYC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, 1));
        matrixAC.copy2DRangeFrom(0, 0, mBLASData.KL + 1, mBLASData.dN, mBLASData.L2_cHBMV_A_nn);
        vectorXC.copyFrom(mBLASData.L2_cHBMV_x_n1);
        vectorYC.copyFrom(mBLASData.L2_cHBMV_y_n1);

        // Test for the default case:
        mBLAS.CHBMV(uplo, mBLASData.KL, alphaC, matrixAC, vectorXC, incX, betaC, vectorYC, incY);
        Allocation vectorYRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, 1));
        vectorYRef.copyFrom(mBLASData.L2_cHBMV_o_N);
        verifyMatrix(vectorYRef, vectorYC);

        // Test for incX = 2 & incY = 3;
        incX = 2;
        incY = 3;
        int dimX = 1 + (mBLASData.dN - 1) * incX;
        int dimY = 1 + (mBLASData.dN - 1) * incY;
        vectorXC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), dimX, 1));
        vectorYC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), dimY, 1));
        vectorXC.copyFrom(mBLASData.L2_cHBMV_x_n2);
        vectorYC.copyFrom(mBLASData.L2_cHBMV_y_n2);

        mBLAS.CHBMV(uplo, mBLASData.KL, alphaC, matrixAC, vectorXC, incX, betaC, vectorYC, incY);
        vectorYRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), dimY, 1));
        vectorYRef.copyFrom(mBLASData.L2_cHBMV_o_N2);
        verifyMatrix(vectorYRef, vectorYC);

        mRS.finish();
        checkError();
    }

    public void test_L2_ZHBMV_Correctness() {
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int incX = 1;
        int incY = 1;

        // Populate input allocations
        Allocation matrixAZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, mBLASData.dN));
        Allocation vectorXZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, 1));
        Allocation vectorYZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, 1));
        matrixAZ.copy2DRangeFrom(0, 0, mBLASData.KL + 1, mBLASData.dN, mBLASData.L2_zHBMV_A_nn);
        vectorXZ.copyFrom(mBLASData.L2_zHBMV_x_n1);
        vectorYZ.copyFrom(mBLASData.L2_zHBMV_y_n1);

        // Test for the default case: NO_TRANS
        mBLAS.ZHBMV(uplo, mBLASData.KL, alphaZ, matrixAZ, vectorXZ, incX, betaZ, vectorYZ, incY);
        Allocation vectorYRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, 1));
        vectorYRef.copyFrom(mBLASData.L2_zHBMV_o_N);
        verifyMatrix(vectorYRef, vectorYZ);

        // Test for incX = 2 & incY = 3;
        incX = 2;
        incY = 3;
        int dimX = 1 + (mBLASData.dN - 1) * incX;
        int dimY = 1 + (mBLASData.dN - 1) * incY;
        vectorXZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), dimX, 1));
        vectorYZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), dimY, 1));
        vectorXZ.copyFrom(mBLASData.L2_zHBMV_x_n2);
        vectorYZ.copyFrom(mBLASData.L2_zHBMV_y_n2);

        mBLAS.ZHBMV(uplo, mBLASData.KL, alphaZ, matrixAZ, vectorXZ, incX, betaZ, vectorYZ, incY);
        vectorYRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), dimY, 1));
        vectorYRef.copyFrom(mBLASData.L2_zHBMV_o_N2);
        verifyMatrix(vectorYRef, vectorYZ);

        mRS.finish();
        checkError();
    }


    private void xHPMV_API_test(int Uplo, int incX, int incY, ArrayList<Allocation> mMatrix) {
        for (Allocation matA : mMatrix) {
            for (Allocation vecX : mMatrix) {
                if (!validateVecInput(vecX)) {
                    continue;
                }
                for (Allocation vecY : mMatrix) {
                    if (!validateVecInput(vecY)) {
                        continue;
                    }
                    Element elemA = matA.getType().getElement();
                    if (validateSPR2(elemA, Uplo, vecX, incX, vecY, incY, matA)) {
                        try {
                            if (elemA.isCompatible(Element.F32_2(mRS))) {
                                mBLAS.CHPMV(Uplo, alphaC, matA, vecX, incX, betaC, vecY, incY);
                            } else if (elemA.isCompatible(Element.F64_2(mRS))) {
                                mBLAS.ZHPMV(Uplo, alphaZ, matA, vecX, incX, betaZ, vecY, incY);
                            }
                        } catch (RSRuntimeException e) {
                            fail("should NOT throw RSRuntimeException");
                        }
                    } else {
                        try {
                            mBLAS.CHPMV(Uplo, alphaC, matA, vecX, incX, betaC, vecY, incY);
                            fail("should throw RSRuntimeException for CHPMV");
                        } catch (RSRuntimeException e) {
                        }
                        try {
                            mBLAS.ZHPMV(Uplo, alphaZ, matA, vecX, incX, betaZ, vecY, incY);
                            fail("should throw RSRuntimeException for ZHPMV");
                        } catch (RSRuntimeException e) {
                        }
                    }
                }
            }
        }
    }

    public void L2_xHPMV_API(ArrayList<Allocation> mMatrix) {
        for (int Uplo : mUplo) {
            for (int incX : mInc) {
                xHPMV_API_test(Uplo, incX, incX, mMatrix);
            }
        }
    }

    public void test_L2_CHPMV_API() {
        L2_xHPMV_API(mMatrixC);
    }

    public void test_L2_ZHPMV_API() {
        L2_xHPMV_API(mMatrixZ);
    }

    public void test_L2_CHPMV_Correctness() {
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int incX = 1;
        int incY = 1;

        // Populate input allocations
        int N = mBLASData.dN;
        Allocation matrixAC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), N * (N+1) / 2, 1));
        Allocation vectorXC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), N, 1));
        Allocation vectorYC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), N, 1));
        matrixAC.copyFrom(mBLASData.L2_cHEMV_A_nn_pu);
        vectorXC.copyFrom(mBLASData.L2_cHEMV_x_n1);
        vectorYC.copyFrom(mBLASData.L2_cHEMV_y_n1);

        // Test for the default case:
        mBLAS.CHPMV(uplo, alphaC, matrixAC, vectorXC, incX, betaC, vectorYC, incY);
        Allocation vectorYRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), N, 1));
        vectorYRef.copyFrom(mBLASData.L2_cHEMV_o_N);
        verifyMatrix(vectorYRef, vectorYC);

        // Test for incX = 2 & incY = 3;
        incX = 2;
        incY = 3;
        int dimX = 1 + (N - 1) * incX;
        int dimY = 1 + (N - 1) * incY;
        vectorXC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), dimX, 1));
        vectorYC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), dimY, 1));
        vectorXC.copyFrom(mBLASData.L2_cHEMV_x_n2);
        vectorYC.copyFrom(mBLASData.L2_cHEMV_y_n2);

        mBLAS.CHPMV(uplo, alphaC, matrixAC, vectorXC, incX, betaC, vectorYC, incY);
        vectorYRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), dimY, 1));
        vectorYRef.copyFrom(mBLASData.L2_cHEMV_o_N2);
        verifyMatrix(vectorYRef, vectorYC);

        mRS.finish();
        checkError();
    }

    public void test_L2_ZHPMV_Correctness() {
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int incX = 1;
        int incY = 1;

        // Populate input allocations
        int N = mBLASData.dN;
        Allocation matrixAZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), N * (N+1) / 2, 1));
        Allocation vectorXZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), N, 1));
        Allocation vectorYZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), N, 1));
        matrixAZ.copyFrom(mBLASData.L2_zHEMV_A_nn_pu);
        vectorXZ.copyFrom(mBLASData.L2_zHEMV_x_n1);
        vectorYZ.copyFrom(mBLASData.L2_zHEMV_y_n1);

        // Test for the default case: NO_TRANS
        mBLAS.ZHPMV(uplo, alphaZ, matrixAZ, vectorXZ, incX, betaZ, vectorYZ, incY);
        Allocation vectorYRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), N, 1));
        vectorYRef.copyFrom(mBLASData.L2_zHEMV_o_N);
        verifyMatrix(vectorYRef, vectorYZ);

        // Test for incX = 2 & incY = 3;
        incX = 2;
        incY = 3;
        int dimX = 1 + (N - 1) * incX;
        int dimY = 1 + (N - 1) * incY;
        vectorXZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), dimX, 1));
        vectorYZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), dimY, 1));
        vectorXZ.copyFrom(mBLASData.L2_zHEMV_x_n2);
        vectorYZ.copyFrom(mBLASData.L2_zHEMV_y_n2);

        mBLAS.ZHPMV(uplo, alphaZ, matrixAZ, vectorXZ, incX, betaZ, vectorYZ, incY);
        vectorYRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), dimY, 1));
        vectorYRef.copyFrom(mBLASData.L2_zHEMV_o_N2);
        verifyMatrix(vectorYRef, vectorYZ);

        mRS.finish();
        checkError();
    }


    private boolean validateSYMV(Element e, int Uplo, Allocation A, Allocation X, int incX, Allocation Y, int incY) {
        if (!validateUplo(Uplo)) {
            return false;
        }
        int N = A.getType().getY();
        if (A.getType().getX() != N) {
            return false;
        }
        if (!A.getType().getElement().isCompatible(e) ||
            !X.getType().getElement().isCompatible(e) ||
            !Y.getType().getElement().isCompatible(e) ) {
            return false;
        }
        if (X.getType().getY() > 1 || Y.getType().getY() > 1) {
            return false;
        }

        if (incX <= 0 || incY <= 0) {
            return false;
        }
        int expectedXDim = 1 + (N - 1) * incX;
        if (X.getType().getX() != expectedXDim) {
            return false;
        }
        int expectedYDim = 1 + (N - 1) * incY;
        if (Y.getType().getX() != expectedYDim) {
            return false;
        }
        return true;
    }

    private void xSYMV_API_test(int Uplo, int incX, int incY, ArrayList<Allocation> mMatrix) {
        for (Allocation matA : mMatrix) {
            for (Allocation vecX : mMatrix) {
                if (!validateVecInput(vecX)) {
                    continue;
                }
                for (Allocation vecY : mMatrix) {
                    if (!validateVecInput(vecY)) {
                        continue;
                    }
                    Element elemA = matA.getType().getElement();
                    if (validateSYMV(elemA, Uplo, matA, vecX, incX, vecY, incY)) {
                        try {
                            if (elemA.isCompatible(Element.F32(mRS))) {
                                mBLAS.SSYMV(Uplo, alphaS, matA, vecX, incX, betaS, vecY, incY);
                            } else if (elemA.isCompatible(Element.F64(mRS))) {
                                mBLAS.DSYMV(Uplo, alphaD, matA, vecX, incX, betaD, vecY, incY);
                            }
                        } catch (RSRuntimeException e) {
                            fail("should NOT throw RSRuntimeException");
                        }
                    } else {
                        try {
                            mBLAS.SSYMV(Uplo, alphaS, matA, vecX, incX, betaS, vecY, incY);
                            fail("should throw RSRuntimeException for SSYMV");
                        } catch (RSRuntimeException e) {
                        }
                        try {
                            mBLAS.DSYMV(Uplo, alphaD, matA, vecX, incX, betaD, vecY, incY);
                            fail("should throw RSRuntimeException for DSYMV");
                        } catch (RSRuntimeException e) {
                        }
                    }
                }
            }
        }
    }

    public void L2_xSYMV_API(ArrayList<Allocation> mMatrix) {
        for (int Uplo : mUplo) {
            for (int incX : mInc) {
                xSYMV_API_test(Uplo, incX, incX, mMatrix);
            }
        }
    }

    public void test_L2_SSYMV_API() {
        L2_xSYMV_API(mMatrixS);
    }

    public void test_L2_DSYMV_API() {
        L2_xSYMV_API(mMatrixD);
    }

    public void test_L2_SSYMV_Correctness() {
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int incX = 1;
        int incY = 1;

        // Populate input allocations
        Allocation matrixAS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, mBLASData.dN));
        Allocation vectorXS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, 1));
        Allocation vectorYS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, 1));
        matrixAS.copyFrom(mBLASData.L2_sSYMV_A_nn);
        vectorXS.copyFrom(mBLASData.L2_sSYMV_x_n1);
        vectorYS.copyFrom(mBLASData.L2_sSYMV_y_n1);

        // Test for the default case:
        mBLAS.SSYMV(uplo, alphaS, matrixAS, vectorXS, incX, betaS, vectorYS, incY);
        Allocation vectorYRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, 1));
        vectorYRef.copyFrom(mBLASData.L2_sSYMV_o_N);
        verifyMatrix(vectorYRef, vectorYS);

        // Test for incX = 2 & incY = 3;
        incX = 2;
        incY = 3;
        int dimX = 1 + (mBLASData.dN - 1) * incX;
        int dimY = 1 + (mBLASData.dN - 1) * incY;
        vectorXS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), dimX, 1));
        vectorYS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), dimY, 1));
        vectorXS.copyFrom(mBLASData.L2_sSYMV_x_n2);
        vectorYS.copyFrom(mBLASData.L2_sSYMV_y_n2);

        mBLAS.SSYMV(uplo, alphaS, matrixAS, vectorXS, incX, betaS, vectorYS, incY);
        vectorYRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), dimY, 1));
        vectorYRef.copyFrom(mBLASData.L2_sSYMV_o_N2);
        verifyMatrix(vectorYRef, vectorYS);

        mRS.finish();
        checkError();
    }

    public void test_L2_DSYMV_Correctness() {
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int incX = 1;
        int incY = 1;

        // Populate input allocations
        Allocation matrixAD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, mBLASData.dN));
        Allocation vectorXD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, 1));
        Allocation vectorYD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, 1));
        matrixAD.copyFrom(mBLASData.L2_dSYMV_A_nn);
        vectorXD.copyFrom(mBLASData.L2_dSYMV_x_n1);
        vectorYD.copyFrom(mBLASData.L2_dSYMV_y_n1);

        // Test for the default case:
        mBLAS.DSYMV(uplo, alphaD, matrixAD, vectorXD, incX, betaD, vectorYD, incY);
        Allocation vectorYRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, 1));
        vectorYRef.copyFrom(mBLASData.L2_dSYMV_o_N);
        verifyMatrix(vectorYRef, vectorYD);

        // Test for incX = 2 & incY = 3;
        incX = 2;
        incY = 3;
        int dimX = 1 + (mBLASData.dN - 1) * incX;
        int dimY = 1 + (mBLASData.dN - 1) * incY;
        vectorXD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), dimX, 1));
        vectorYD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), dimY, 1));
        vectorXD.copyFrom(mBLASData.L2_dSYMV_x_n2);
        vectorYD.copyFrom(mBLASData.L2_dSYMV_y_n2);

        mBLAS.DSYMV(uplo, alphaD, matrixAD, vectorXD, incX, betaD, vectorYD, incY);
        vectorYRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), dimY, 1));
        vectorYRef.copyFrom(mBLASData.L2_dSYMV_o_N2);
        verifyMatrix(vectorYRef, vectorYD);

        mRS.finish();
        checkError();
    }



    private void xSBMV_API_test(int Uplo, int K, int incX, int incY, ArrayList<Allocation> mMatrix) {
        for (Allocation matA : mMatrix) {
            for (Allocation vecX : mMatrix) {
                if (!validateVecInput(vecX)) {
                    continue;
                }
                for (Allocation vecY : mMatrix) {
                    if (!validateVecInput(vecY)) {
                        continue;
                    }
                    Element elemA = matA.getType().getElement();
                    if (validateSYMV(elemA, Uplo, matA, vecX, incX, vecY, incY) && K >= 0) {
                        try {
                            if (elemA.isCompatible(Element.F32(mRS))) {
                                mBLAS.SSBMV(Uplo, K, alphaS, matA, vecX, incX, betaS, vecY, incY);
                            } else if (elemA.isCompatible(Element.F64(mRS))) {
                                mBLAS.DSBMV(Uplo, K, alphaD, matA, vecX, incX, betaD, vecY, incY);
                            }
                        } catch (RSRuntimeException e) {
                            fail("should NOT throw RSRuntimeException");
                        }
                    } else {
                        try {
                            mBLAS.SSBMV(Uplo, K, alphaS, matA, vecX, incX, betaS, vecY, incY);
                            fail("should throw RSRuntimeException for SSBMV");
                        } catch (RSRuntimeException e) {
                        }
                        try {
                            mBLAS.DSBMV(Uplo, K, alphaD, matA, vecX, incX, betaD, vecY, incY);
                            fail("should throw RSRuntimeException for DSBMV");
                        } catch (RSRuntimeException e) {
                        }
                    }
                }
            }
        }
    }

    public void L2_xSBMV_API(ArrayList<Allocation> mMatrix) {
        for (int Uplo : mUplo) {
            for (int K : mK) {
                for (int incX : mInc) {
                    xSBMV_API_test(Uplo, K, incX, incX, mMatrix);
                }
            }
        }
    }

    public void test_L2_SSBMV_API() {
        L2_xSBMV_API(mMatrixS);
    }

    public void test_L2_DSBMV_API() {
        L2_xSBMV_API(mMatrixD);
    }

    public void test_L2_SSBMV_Correctness() {
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int incX = 1;
        int incY = 1;

        // Populate input allocations
        Allocation matrixAS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, mBLASData.dN));
        Allocation vectorXS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, 1));
        Allocation vectorYS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, 1));
        matrixAS.copy2DRangeFrom(0, 0, mBLASData.KL + 1, mBLASData.dN, mBLASData.L2_sSBMV_A_nn);
        vectorXS.copyFrom(mBLASData.L2_sSBMV_x_n1);
        vectorYS.copyFrom(mBLASData.L2_sSBMV_y_n1);

        // Test for the default case:
        mBLAS.SSBMV(uplo, mBLASData.KL, alphaS, matrixAS, vectorXS, incX, betaS, vectorYS, incY);
        Allocation vectorYRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, 1));
        vectorYRef.copyFrom(mBLASData.L2_sSBMV_o_N);
        verifyMatrix(vectorYRef, vectorYS);

        // Test for incX = 2 & incY = 3;
        incX = 2;
        incY = 3;
        int dimX = 1 + (mBLASData.dN - 1) * incX;
        int dimY = 1 + (mBLASData.dN - 1) * incY;
        vectorXS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), dimX, 1));
        vectorYS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), dimY, 1));
        vectorXS.copyFrom(mBLASData.L2_sSBMV_x_n2);
        vectorYS.copyFrom(mBLASData.L2_sSBMV_y_n2);

        mBLAS.SSBMV(uplo, mBLASData.KL, alphaS, matrixAS, vectorXS, incX, betaS, vectorYS, incY);
        vectorYRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), dimY, 1));
        vectorYRef.copyFrom(mBLASData.L2_sSBMV_o_N2);
        verifyMatrix(vectorYRef, vectorYS);

        mRS.finish();
        checkError();
    }

    public void test_L2_DSBMV_Correctness() {
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int incX = 1;
        int incY = 1;

        // Populate input allocations
        Allocation matrixAD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, mBLASData.dN));
        Allocation vectorXD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, 1));
        Allocation vectorYD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, 1));
        matrixAD.copy2DRangeFrom(0, 0, mBLASData.KL + 1, mBLASData.dN, mBLASData.L2_dSBMV_A_nn);
        vectorXD.copyFrom(mBLASData.L2_dSBMV_x_n1);
        vectorYD.copyFrom(mBLASData.L2_dSBMV_y_n1);

        // Test for the default case:
        mBLAS.DSBMV(uplo, mBLASData.KL, alphaD, matrixAD, vectorXD, incX, betaD, vectorYD, incY);
        Allocation vectorYRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, 1));
        vectorYRef.copyFrom(mBLASData.L2_dSBMV_o_N);
        verifyMatrix(vectorYRef, vectorYD);

        // Test for incX = 2 & incY = 3;
        incX = 2;
        incY = 3;
        int dimX = 1 + (mBLASData.dN - 1) * incX;
        int dimY = 1 + (mBLASData.dN - 1) * incY;
        vectorXD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), dimX, 1));
        vectorYD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), dimY, 1));
        vectorXD.copyFrom(mBLASData.L2_dSBMV_x_n2);
        vectorYD.copyFrom(mBLASData.L2_dSBMV_y_n2);

        mBLAS.DSBMV(uplo, mBLASData.KL, alphaD, matrixAD, vectorXD, incX, betaD, vectorYD, incY);
        vectorYRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), dimY, 1));
        vectorYRef.copyFrom(mBLASData.L2_dSBMV_o_N2);
        verifyMatrix(vectorYRef, vectorYD);

        mRS.finish();
        checkError();
    }


    private boolean validateSPMV(Element e, int Uplo, Allocation Ap, Allocation X, int incX, Allocation Y, int incY) {
        if (!validateUplo(Uplo)) {
            return false;
        }
        if (!Ap.getType().getElement().isCompatible(e) ||
            !X.getType().getElement().isCompatible(e) ||
            !Y.getType().getElement().isCompatible(e)) {
            return false;
        }
        if (X.getType().getY() > 1 || Y.getType().getY() > 1) {
            return false;
        }

        if (Ap.getType().getY() > 1) {
            return false;
        }

        int N = (int)Math.sqrt((double)Ap.getType().getX() * 2);
        if (Ap.getType().getX() != ((N * (N+1)) / 2)) {
            return false;
        }
        if (incX <= 0 || incY <= 0) {
            return false;
        }
        int expectedXDim = 1 + (N - 1) * incX;
        if (X.getType().getX() != expectedXDim) {
            return false;
        }
        int expectedYDim = 1 + (N - 1) * incY;
        if (Y.getType().getX() != expectedYDim) {
            return false;
        }

        return true;
    }

    private void xSPMV_API_test(int Uplo, int incX, int incY, ArrayList<Allocation> mMatrix) {
        for (Allocation matA : mMatrix) {
            for (Allocation vecX : mMatrix) {
                if (!validateVecInput(vecX)) {
                    continue;
                }
                for (Allocation vecY : mMatrix) {
                    if (!validateVecInput(vecY)) {
                        continue;
                    }
                    Element elemA = matA.getType().getElement();
                    if (validateSPMV(elemA, Uplo, matA, vecX, incX, vecY, incY)) {
                        try {
                            if (elemA.isCompatible(Element.F32(mRS))) {
                                mBLAS.SSPMV(Uplo, alphaS, matA, vecX, incX, betaS, vecY, incY);
                            } else if (elemA.isCompatible(Element.F64(mRS))) {
                                mBLAS.DSPMV(Uplo, alphaD, matA, vecX, incX, betaD, vecY, incY);
                            }
                        } catch (RSRuntimeException e) {
                            fail("should NOT throw RSRuntimeException");
                        }
                    } else {
                        try {
                            mBLAS.SSPMV(Uplo, alphaS, matA, vecX, incX, betaS, vecY, incY);
                            fail("should throw RSRuntimeException for SSPMV");
                        } catch (RSRuntimeException e) {
                        }
                        try {
                            mBLAS.DSPMV(Uplo, alphaD, matA, vecX, incX, betaD, vecY, incY);
                            fail("should throw RSRuntimeException for DSPMV");
                        } catch (RSRuntimeException e) {
                        }
                    }
                }
            }
        }
    }

    public void L2_xSPMV_API(ArrayList<Allocation> mMatrix) {
        for (int Uplo : mUplo) {
            for (int incX : mInc) {
                xSPMV_API_test(Uplo, incX, incX, mMatrix);
            }
        }
    }

    public void test_L2_SSPMV_API() {
        L2_xSPMV_API(mMatrixS);
    }

    public void test_L2_DSPMV_API() {
        L2_xSPMV_API(mMatrixD);
    }

    public void test_L2_SSPMV_Correctness() {
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int incX = 1;
        int incY = 1;

        // Populate input allocations
        int N = mBLASData.dN;
        Allocation matrixAS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), N * (N+1) / 2, 1));
        Allocation vectorXS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), N, 1));
        Allocation vectorYS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), N, 1));
        matrixAS.copyFrom(mBLASData.L2_sSYMV_A_nn_pu);
        vectorXS.copyFrom(mBLASData.L2_sSYMV_x_n1);
        vectorYS.copyFrom(mBLASData.L2_sSYMV_y_n1);

        // Test for the default case:
        mBLAS.SSPMV(uplo, alphaS, matrixAS, vectorXS, incX, betaS, vectorYS, incY);
        Allocation vectorYRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), N, 1));
        vectorYRef.copyFrom(mBLASData.L2_sSYMV_o_N);
        verifyMatrix(vectorYRef, vectorYS);

        // Test for incX = 2 & incY = 3;
        incX = 2;
        incY = 3;
        int dimX = 1 + (N - 1) * incX;
        int dimY = 1 + (N - 1) * incY;
        vectorXS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), dimX, 1));
        vectorYS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), dimY, 1));
        vectorXS.copyFrom(mBLASData.L2_sSYMV_x_n2);
        vectorYS.copyFrom(mBLASData.L2_sSYMV_y_n2);

        mBLAS.SSPMV(uplo, alphaS, matrixAS, vectorXS, incX, betaS, vectorYS, incY);
        vectorYRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), dimY, 1));
        vectorYRef.copyFrom(mBLASData.L2_sSYMV_o_N2);
        verifyMatrix(vectorYRef, vectorYS);

        mRS.finish();
        checkError();
    }

    public void test_L2_DSPMV_Correctness() {
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int incX = 1;
        int incY = 1;

        // Populate input allocations
        int N = mBLASData.dN;
        Allocation matrixAD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), N * (N+1) / 2, 1));
        Allocation vectorXD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), N, 1));
        Allocation vectorYD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), N, 1));
        matrixAD.copyFrom(mBLASData.L2_dSYMV_A_nn_pu);
        vectorXD.copyFrom(mBLASData.L2_dSYMV_x_n1);
        vectorYD.copyFrom(mBLASData.L2_dSYMV_y_n1);

        // Test for the default case:
        mBLAS.DSPMV(uplo, alphaD, matrixAD, vectorXD, incX, betaD, vectorYD, incY);
        Allocation vectorYRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), N, 1));
        vectorYRef.copyFrom(mBLASData.L2_dSYMV_o_N);
        verifyMatrix(vectorYRef, vectorYD);

        // Test for incX = 2 & incY = 3;
        incX = 2;
        incY = 3;
        int dimX = 1 + (N - 1) * incX;
        int dimY = 1 + (N - 1) * incY;
        vectorXD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), dimX, 1));
        vectorYD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), dimY, 1));
        vectorXD.copyFrom(mBLASData.L2_dSYMV_x_n2);
        vectorYD.copyFrom(mBLASData.L2_dSYMV_y_n2);

        mBLAS.DSPMV(uplo, alphaD, matrixAD, vectorXD, incX, betaD, vectorYD, incY);
        vectorYRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), dimY, 1));
        vectorYRef.copyFrom(mBLASData.L2_dSYMV_o_N2);
        verifyMatrix(vectorYRef, vectorYD);

        mRS.finish();
        checkError();
    }



    private boolean validateTRMV(Element e, int Uplo, int TransA, int Diag, Allocation A, Allocation X, int incX) {
        if (!validateUplo(Uplo)) {
            return false;
        }
        if (!validateTranspose(TransA)) {
            return false;
        }
        if (!validateDiag(Diag)) {
            return false;
        }
        int N = A.getType().getY();
        if (A.getType().getX() != N) {
            return false;
        }
        if (!A.getType().getElement().isCompatible(e) ||
            !X.getType().getElement().isCompatible(e)) {
            return false;
        }
        if (X.getType().getY() > 1) {
            return false;
        }

        if (incX <= 0) {
            return false;
        }
        int expectedXDim = 1 + (N - 1) * incX;
        if (X.getType().getX() != expectedXDim) {
            return false;
        }
        return true;
    }

    private void xTRMV_API_test(int Uplo, int TransA, int Diag, int incX, ArrayList<Allocation> mMatrix) {
        for (Allocation matA : mMatrix) {
            for (Allocation vecX : mMatrix) {
                if (!validateVecInput(vecX)) {
                    continue;
                }
                Element elemA = matA.getType().getElement();
                if (validateTRMV(elemA, Uplo, TransA, Diag, matA, vecX, incX)) {
                    try {
                        if (elemA.isCompatible(Element.F32(mRS))) {
                            mBLAS.STRMV(Uplo, TransA, Diag, matA, vecX, incX);
                        } else if (elemA.isCompatible(Element.F64(mRS))) {
                            mBLAS.DTRMV(Uplo, TransA, Diag, matA, vecX, incX);
                        } else if (elemA.isCompatible(Element.F32_2(mRS))) {
                            mBLAS.CTRMV(Uplo, TransA, Diag, matA, vecX, incX);
                        } else if (elemA.isCompatible(Element.F64_2(mRS))) {
                            mBLAS.ZTRMV(Uplo, TransA, Diag, matA, vecX, incX);
                        }
                    } catch (RSRuntimeException e) {
                        fail("should NOT throw RSRuntimeException");
                    }
                } else {
                    try {
                        mBLAS.STRMV(Uplo, TransA, Diag, matA, vecX, incX);
                        fail("should throw RSRuntimeException for STRMV");
                    } catch (RSRuntimeException e) {
                    }
                    try {
                        mBLAS.DTRMV(Uplo, TransA, Diag, matA, vecX, incX);
                        fail("should throw RSRuntimeException for DTRMV");
                    } catch (RSRuntimeException e) {
                    }
                    try {
                        mBLAS.CTRMV(Uplo, TransA, Diag, matA, vecX, incX);
                        fail("should throw RSRuntimeException for CTRMV");
                    } catch (RSRuntimeException e) {
                    }
                    try {
                        mBLAS.ZTRMV(Uplo, TransA, Diag, matA, vecX, incX);
                        fail("should throw RSRuntimeException for ZTRMV");
                    } catch (RSRuntimeException e) {
                    }
                }
            }
        }
    }

    public void L2_xTRMV_API(ArrayList<Allocation> mMatrix) {
        for (int Uplo : mUplo) {
            for (int TransA : mTranspose) {
                for (int Diag : mDiag) {
                    for (int incX : mInc) {
                        xTRMV_API_test(Uplo, TransA, Diag, incX, mMatrix);
                    }
                }
            }
        }
    }

    public void test_L2_STRMV_API() {
        L2_xTRMV_API(mMatrixS);
    }

    public void test_L2_DTRMV_API() {
        L2_xTRMV_API(mMatrixD);
    }

    public void test_L2_CTRMV_API() {
        L2_xTRMV_API(mMatrixC);
    }

    public void test_L2_ZTRMV_API() {
        L2_xTRMV_API(mMatrixZ);
    }

    public void test_L2_STRMV_Correctness() {
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int diag = ScriptIntrinsicBLAS.NON_UNIT;
        int incX = 1;

        // Populate input allocations
        Allocation matrixAS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, mBLASData.dN));
        Allocation vectorXS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, 1));
        matrixAS.copyFrom(mBLASData.L2_sTRMV_A_nn);
        vectorXS.copyFrom(mBLASData.L2_sTRMV_x_n1);

        // Test for the default case: NO_TRANS
        mBLAS.STRMV(uplo, trans, diag, matrixAS, vectorXS, incX);
        Allocation vectorXRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, 1));
        vectorXRef.copyFrom(mBLASData.L2_sTRMV_o_UN);
        verifyMatrix(vectorXRef, vectorXS);

        // Test for trans cases: TRANSPOSE, CONJ_TRANSPOSE
        trans = ScriptIntrinsicBLAS.TRANSPOSE;
        // Reload vector X, since it was overwritten by BLAS.
        vectorXS.copyFrom(mBLASData.L2_sTRMV_x_n1);
        mBLAS.STRMV(uplo, trans, diag, matrixAS, vectorXS, incX);
        vectorXRef.copyFrom(mBLASData.L2_sTRMV_o_UT);
        verifyMatrix(vectorXRef, vectorXS);

        trans = ScriptIntrinsicBLAS.CONJ_TRANSPOSE;
        vectorXS.copyFrom(mBLASData.L2_sTRMV_x_n1);
        mBLAS.STRMV(uplo, trans, diag, matrixAS, vectorXS, incX);
        vectorXRef.copyFrom(mBLASData.L2_sTRMV_o_UH);
        verifyMatrix(vectorXRef, vectorXS);

        // Test for incX = 2;
        trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        incX = 2;
        int dimX = 1 + (mBLASData.dN - 1) * incX;
        vectorXS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), dimX, 1));
        vectorXS.copyFrom(mBLASData.L2_sTRMV_x_n2);

        mBLAS.STRMV(uplo, trans, diag, matrixAS, vectorXS, incX);
        vectorXRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), dimX, 1));
        vectorXRef.copyFrom(mBLASData.L2_sTRMV_o_UN2);
        verifyMatrix(vectorXRef, vectorXS);

        mRS.finish();
        checkError();
    }

    public void test_L2_DTRMV_Correctness() {
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int diag = ScriptIntrinsicBLAS.NON_UNIT;
        int incX = 1;

        // Populate input allocations
        Allocation matrixAD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, mBLASData.dN));
        Allocation vectorXD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, 1));
        matrixAD.copyFrom(mBLASData.L2_dTRMV_A_nn);
        vectorXD.copyFrom(mBLASData.L2_dTRMV_x_n1);

        // Test for the default case: NO_TRANS
        mBLAS.DTRMV(uplo, trans, diag, matrixAD, vectorXD, incX);
        Allocation vectorXRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, 1));
        vectorXRef.copyFrom(mBLASData.L2_dTRMV_o_UN);
        verifyMatrix(vectorXRef, vectorXD);

        // Test for trans cases: TRANSPOSE, CONJ_TRANSPOSE
        trans = ScriptIntrinsicBLAS.TRANSPOSE;
        // Reload vector X, since it was overwritten by BLAS.
        vectorXD.copyFrom(mBLASData.L2_dTRMV_x_n1);
        mBLAS.DTRMV(uplo, trans, diag, matrixAD, vectorXD, incX);
        vectorXRef.copyFrom(mBLASData.L2_dTRMV_o_UT);
        verifyMatrix(vectorXRef, vectorXD);

        trans = ScriptIntrinsicBLAS.CONJ_TRANSPOSE;
        vectorXD.copyFrom(mBLASData.L2_dTRMV_x_n1);
        mBLAS.DTRMV(uplo, trans, diag, matrixAD, vectorXD, incX);
        vectorXRef.copyFrom(mBLASData.L2_dTRMV_o_UH);
        verifyMatrix(vectorXRef, vectorXD);

        // Test for incX = 2;
        trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        incX = 2;
        int dimX = 1 + (mBLASData.dN - 1) * incX;
        vectorXD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), dimX, 1));
        vectorXD.copyFrom(mBLASData.L2_dTRMV_x_n2);

        mBLAS.DTRMV(uplo, trans, diag, matrixAD, vectorXD, incX);
        vectorXRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), dimX, 1));
        vectorXRef.copyFrom(mBLASData.L2_dTRMV_o_UN2);
        verifyMatrix(vectorXRef, vectorXD);

        mRS.finish();
        checkError();
    }

    public void test_L2_CTRMV_Correctness() {
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int diag = ScriptIntrinsicBLAS.NON_UNIT;
        int incX = 1;

        // Populate input allocations
        Allocation matrixAC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, mBLASData.dN));
        Allocation vectorXC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, 1));
        matrixAC.copyFrom(mBLASData.L2_cTRMV_A_nn);
        vectorXC.copyFrom(mBLASData.L2_cTRMV_x_n1);

        // Test for the default case: NO_TRANS
        mBLAS.CTRMV(uplo, trans, diag, matrixAC, vectorXC, incX);
        Allocation vectorXRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, 1));
        vectorXRef.copyFrom(mBLASData.L2_cTRMV_o_UN);
        verifyMatrix(vectorXRef, vectorXC);

        // Test for trans cases: TRANSPOSE, CONJ_TRANSPOSE
        trans = ScriptIntrinsicBLAS.TRANSPOSE;
        // Reload vector X, since it was overwritten by BLAS.
        vectorXC.copyFrom(mBLASData.L2_cTRMV_x_n1);
        mBLAS.CTRMV(uplo, trans, diag, matrixAC, vectorXC, incX);
        vectorXRef.copyFrom(mBLASData.L2_cTRMV_o_UT);
        verifyMatrix(vectorXRef, vectorXC);

        trans = ScriptIntrinsicBLAS.CONJ_TRANSPOSE;
        vectorXC.copyFrom(mBLASData.L2_cTRMV_x_n1);
        mBLAS.CTRMV(uplo, trans, diag, matrixAC, vectorXC, incX);
        vectorXRef.copyFrom(mBLASData.L2_cTRMV_o_UH);
        verifyMatrix(vectorXRef, vectorXC);

        // Test for incX = 2;
        trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        incX = 2;
        int dimX = 1 + (mBLASData.dN - 1) * incX;
        vectorXC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), dimX, 1));
        vectorXC.copyFrom(mBLASData.L2_cTRMV_x_n2);

        mBLAS.CTRMV(uplo, trans, diag, matrixAC, vectorXC, incX);
        vectorXRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), dimX, 1));
        vectorXRef.copyFrom(mBLASData.L2_cTRMV_o_UN2);
        verifyMatrix(vectorXRef, vectorXC);

        mRS.finish();
        checkError();
    }

    public void test_L2_ZTRMV_Correctness() {
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int diag = ScriptIntrinsicBLAS.NON_UNIT;
        int incX = 1;

        // Populate input allocations
        Allocation matrixAZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, mBLASData.dN));
        Allocation vectorXZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, 1));
        matrixAZ.copyFrom(mBLASData.L2_zTRMV_A_nn);
        vectorXZ.copyFrom(mBLASData.L2_zTRMV_x_n1);

        // Test for the default case: NO_TRANS
        mBLAS.ZTRMV(uplo, trans, diag, matrixAZ, vectorXZ, incX);
        Allocation vectorXRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, 1));
        vectorXRef.copyFrom(mBLASData.L2_zTRMV_o_UN);
        verifyMatrix(vectorXRef, vectorXZ);

        // Test for trans cases: TRANSPOSE, CONJ_TRANSPOSE
        trans = ScriptIntrinsicBLAS.TRANSPOSE;
        // Reload vector X, since it was overwritten by BLAS.
        vectorXZ.copyFrom(mBLASData.L2_zTRMV_x_n1);
        mBLAS.ZTRMV(uplo, trans, diag, matrixAZ, vectorXZ, incX);
        vectorXRef.copyFrom(mBLASData.L2_zTRMV_o_UT);
        verifyMatrix(vectorXRef, vectorXZ);

        trans = ScriptIntrinsicBLAS.CONJ_TRANSPOSE;
        vectorXZ.copyFrom(mBLASData.L2_zTRMV_x_n1);
        mBLAS.ZTRMV(uplo, trans, diag, matrixAZ, vectorXZ, incX);
        vectorXRef.copyFrom(mBLASData.L2_zTRMV_o_UH);
        verifyMatrix(vectorXRef, vectorXZ);

        // Test for incX = 2;
        trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        incX = 2;
        int dimX = 1 + (mBLASData.dN - 1) * incX;
        vectorXZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), dimX, 1));
        vectorXZ.copyFrom(mBLASData.L2_zTRMV_x_n2);

        mBLAS.ZTRMV(uplo, trans, diag, matrixAZ, vectorXZ, incX);
        vectorXRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), dimX, 1));
        vectorXRef.copyFrom(mBLASData.L2_zTRMV_o_UN2);
        verifyMatrix(vectorXRef, vectorXZ);

        mRS.finish();
        checkError();
    }



    private void xTBMV_API_test(int Uplo, int TransA, int Diag, int K, int incX, ArrayList<Allocation> mMatrix) {
        for (Allocation matA : mMatrix) {
            for (Allocation vecX : mMatrix) {
                Element elemA = matA.getType().getElement();
                if (validateTRMV(elemA, Uplo, TransA, Diag, matA, vecX, incX) && K >= 0) {
                    try {
                        if (elemA.isCompatible(Element.F32(mRS))) {
                            mBLAS.STBMV(Uplo, TransA, Diag, K, matA, vecX, incX);
                        } else if (elemA.isCompatible(Element.F64(mRS))) {
                            mBLAS.DTBMV(Uplo, TransA, Diag, K, matA, vecX, incX);
                        } else if (elemA.isCompatible(Element.F32_2(mRS))) {
                            mBLAS.CTBMV(Uplo, TransA, Diag, K, matA, vecX, incX);
                        } else if (elemA.isCompatible(Element.F64_2(mRS))) {
                            mBLAS.ZTBMV(Uplo, TransA, Diag, K, matA, vecX, incX);
                        }
                    } catch (RSRuntimeException e) {
                        fail("should NOT throw RSRuntimeException");
                    }
                } else {
                    try {
                        mBLAS.STBMV(Uplo, TransA, Diag, K, matA, vecX, incX);
                        fail("should throw RSRuntimeException for STBMV");
                    } catch (RSRuntimeException e) {
                    }
                    try {
                        mBLAS.DTBMV(Uplo, TransA, Diag, K, matA, vecX, incX);
                        fail("should throw RSRuntimeException for DTBMV");
                    } catch (RSRuntimeException e) {
                    }
                    try {
                        mBLAS.CTBMV(Uplo, TransA, Diag, K, matA, vecX, incX);
                        fail("should throw RSRuntimeException for CTBMV");
                    } catch (RSRuntimeException e) {
                    }
                    try {
                        mBLAS.ZTBMV(Uplo, TransA, Diag, K, matA, vecX, incX);
                        fail("should throw RSRuntimeException for ZTBMV");
                    } catch (RSRuntimeException e) {
                    }
                }
            }
        }
    }

    public void L2_xTBMV_API(ArrayList<Allocation> mMatrix) {
        for (int Uplo : mUplo) {
            for (int TransA : mTranspose) {
                for (int Diag : mDiag) {
                    for (int K : mK) {
                        for (int incX : mInc) {
                            xTBMV_API_test(Uplo, TransA, Diag, K, incX, mMatrix);
                        }
                    }
                }
            }
        }
    }

    public void test_L2_STBMV_API() {
        L2_xTBMV_API(mMatrixS);
    }

    public void test_L2_DTBMV_API() {
        L2_xTBMV_API(mMatrixD);
    }

    public void test_L2_CTBMV_API() {
        L2_xTBMV_API(mMatrixC);
    }

    public void test_L2_ZTBMV_API() {
        L2_xTBMV_API(mMatrixZ);
    }

    public void test_L2_STBMV_Correctness() {
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int diag = ScriptIntrinsicBLAS.NON_UNIT;
        int incX = 1;

        // Populate input allocations
        Allocation matrixAS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, mBLASData.dN));
        Allocation vectorXS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, 1));
        matrixAS.copy2DRangeFrom(0, 0, mBLASData.KL + 1, mBLASData.dN, mBLASData.L2_sTBMV_A_nn);
        vectorXS.copyFrom(mBLASData.L2_sTBMV_x_n1);

        // Test for the default case: NO_TRANS
        mBLAS.STBMV(uplo, trans, diag, mBLASData.KL, matrixAS, vectorXS, incX);
        Allocation vectorXRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, 1));
        vectorXRef.copyFrom(mBLASData.L2_sTBMV_o_UN);
        verifyMatrix(vectorXRef, vectorXS);

        // Test for trans cases: TRANSPOSE, CONJ_TRANSPOSE
        trans = ScriptIntrinsicBLAS.TRANSPOSE;
        // Reload vector X, since it was overwritten by BLAS.
        vectorXS.copyFrom(mBLASData.L2_sTBMV_x_n1);
        mBLAS.STBMV(uplo, trans, diag, mBLASData.KL, matrixAS, vectorXS, incX);
        vectorXRef.copyFrom(mBLASData.L2_sTBMV_o_UT);
        verifyMatrix(vectorXRef, vectorXS);

        trans = ScriptIntrinsicBLAS.CONJ_TRANSPOSE;
        vectorXS.copyFrom(mBLASData.L2_sTBMV_x_n1);
        mBLAS.STBMV(uplo, trans, diag, mBLASData.KL, matrixAS, vectorXS, incX);
        vectorXRef.copyFrom(mBLASData.L2_sTBMV_o_UH);
        verifyMatrix(vectorXRef, vectorXS);

        // Test for incX = 2;
        trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        incX = 2;
        int dimX = 1 + (mBLASData.dN - 1) * incX;
        vectorXS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), dimX, 1));
        vectorXS.copyFrom(mBLASData.L2_sTBMV_x_n2);

        mBLAS.STBMV(uplo, trans, diag, mBLASData.KL, matrixAS, vectorXS, incX);
        vectorXRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), dimX, 1));
        vectorXRef.copyFrom(mBLASData.L2_sTBMV_o_UN2);
        verifyMatrix(vectorXRef, vectorXS);

        mRS.finish();
        checkError();
    }

    public void test_L2_DTBMV_Correctness() {
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int diag = ScriptIntrinsicBLAS.NON_UNIT;
        int incX = 1;

        // Populate input allocations
        Allocation matrixAD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, mBLASData.dN));
        Allocation vectorXD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, 1));
        matrixAD.copy2DRangeFrom(0, 0, mBLASData.KL + 1, mBLASData.dN, mBLASData.L2_dTBMV_A_nn);
        vectorXD.copyFrom(mBLASData.L2_dTBMV_x_n1);

        // Test for the default case: NO_TRANS
        mBLAS.DTBMV(uplo, trans, diag, mBLASData.KL, matrixAD, vectorXD, incX);
        Allocation vectorXRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, 1));
        vectorXRef.copyFrom(mBLASData.L2_dTBMV_o_UN);
        verifyMatrix(vectorXRef, vectorXD);

        // Test for trans cases: TRANSPOSE, CONJ_TRANSPOSE
        trans = ScriptIntrinsicBLAS.TRANSPOSE;
        // Reload vector X, since it was overwritten by BLAS.
        vectorXD.copyFrom(mBLASData.L2_dTBMV_x_n1);
        mBLAS.DTBMV(uplo, trans, diag, mBLASData.KL, matrixAD, vectorXD, incX);
        vectorXRef.copyFrom(mBLASData.L2_dTBMV_o_UT);
        verifyMatrix(vectorXRef, vectorXD);

        trans = ScriptIntrinsicBLAS.CONJ_TRANSPOSE;
        vectorXD.copyFrom(mBLASData.L2_dTBMV_x_n1);
        mBLAS.DTBMV(uplo, trans, diag, mBLASData.KL, matrixAD, vectorXD, incX);
        vectorXRef.copyFrom(mBLASData.L2_dTBMV_o_UH);
        verifyMatrix(vectorXRef, vectorXD);

        // Test for incX = 2;
        trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        incX = 2;
        int dimX = 1 + (mBLASData.dN - 1) * incX;
        vectorXD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), dimX, 1));
        vectorXD.copyFrom(mBLASData.L2_dTBMV_x_n2);

        mBLAS.DTBMV(uplo, trans, diag, mBLASData.KL, matrixAD, vectorXD, incX);
        vectorXRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), dimX, 1));
        vectorXRef.copyFrom(mBLASData.L2_dTBMV_o_UN2);
        verifyMatrix(vectorXRef, vectorXD);

        mRS.finish();
        checkError();
    }

    public void test_L2_CTBMV_Correctness() {
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int diag = ScriptIntrinsicBLAS.NON_UNIT;
        int incX = 1;

        // Populate input allocations
        Allocation matrixAC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, mBLASData.dN));
        Allocation vectorXC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, 1));
        matrixAC.copy2DRangeFrom(0, 0, mBLASData.KL + 1, mBLASData.dN, mBLASData.L2_cTBMV_A_nn);
        vectorXC.copyFrom(mBLASData.L2_cTBMV_x_n1);

        // Test for the default case: NO_TRANS
        mBLAS.CTBMV(uplo, trans, diag, mBLASData.KL, matrixAC, vectorXC, incX);
        Allocation vectorXRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, 1));
        vectorXRef.copyFrom(mBLASData.L2_cTBMV_o_UN);
        verifyMatrix(vectorXRef, vectorXC);

        // Test for trans cases: TRANSPOSE, CONJ_TRANSPOSE
        trans = ScriptIntrinsicBLAS.TRANSPOSE;
        // Reload vector X, since it was overwritten by BLAS.
        vectorXC.copyFrom(mBLASData.L2_cTBMV_x_n1);
        mBLAS.CTBMV(uplo, trans, diag, mBLASData.KL, matrixAC, vectorXC, incX);
        vectorXRef.copyFrom(mBLASData.L2_cTBMV_o_UT);
        verifyMatrix(vectorXRef, vectorXC);

        trans = ScriptIntrinsicBLAS.CONJ_TRANSPOSE;
        vectorXC.copyFrom(mBLASData.L2_cTBMV_x_n1);
        mBLAS.CTBMV(uplo, trans, diag, mBLASData.KL, matrixAC, vectorXC, incX);
        vectorXRef.copyFrom(mBLASData.L2_cTBMV_o_UH);
        verifyMatrix(vectorXRef, vectorXC);

        // Test for incX = 2;
        trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        incX = 2;
        int dimX = 1 + (mBLASData.dN - 1) * incX;
        vectorXC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), dimX, 1));
        vectorXC.copyFrom(mBLASData.L2_cTBMV_x_n2);

        mBLAS.CTBMV(uplo, trans, diag, mBLASData.KL, matrixAC, vectorXC, incX);
        vectorXRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), dimX, 1));
        vectorXRef.copyFrom(mBLASData.L2_cTBMV_o_UN2);
        verifyMatrix(vectorXRef, vectorXC);

        mRS.finish();
        checkError();
    }

    public void test_L2_ZTBMV_Correctness() {
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int diag = ScriptIntrinsicBLAS.NON_UNIT;
        int incX = 1;

        // Populate input allocations
        Allocation matrixAZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, mBLASData.dN));
        Allocation vectorXZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, 1));
        matrixAZ.copy2DRangeFrom(0, 0, mBLASData.KL + 1, mBLASData.dN, mBLASData.L2_zTBMV_A_nn);
        vectorXZ.copyFrom(mBLASData.L2_zTBMV_x_n1);

        // Test for the default case: NO_TRANS
        mBLAS.ZTBMV(uplo, trans, diag, mBLASData.KL, matrixAZ, vectorXZ, incX);
        Allocation vectorXRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, 1));
        vectorXRef.copyFrom(mBLASData.L2_zTBMV_o_UN);
        verifyMatrix(vectorXRef, vectorXZ);

        // Test for trans cases: TRANSPOSE, CONJ_TRANSPOSE
        trans = ScriptIntrinsicBLAS.TRANSPOSE;
        // Reload vector X, since it was overwritten by BLAS.
        vectorXZ.copyFrom(mBLASData.L2_zTBMV_x_n1);
        mBLAS.ZTBMV(uplo, trans, diag, mBLASData.KL, matrixAZ, vectorXZ, incX);
        vectorXRef.copyFrom(mBLASData.L2_zTBMV_o_UT);
        verifyMatrix(vectorXRef, vectorXZ);

        trans = ScriptIntrinsicBLAS.CONJ_TRANSPOSE;
        vectorXZ.copyFrom(mBLASData.L2_zTBMV_x_n1);
        mBLAS.ZTBMV(uplo, trans, diag, mBLASData.KL, matrixAZ, vectorXZ, incX);
        vectorXRef.copyFrom(mBLASData.L2_zTBMV_o_UH);
        verifyMatrix(vectorXRef, vectorXZ);

        // Test for incX = 2;
        trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        incX = 2;
        int dimX = 1 + (mBLASData.dN - 1) * incX;
        vectorXZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), dimX, 1));
        vectorXZ.copyFrom(mBLASData.L2_zTBMV_x_n2);

        mBLAS.ZTBMV(uplo, trans, diag, mBLASData.KL, matrixAZ, vectorXZ, incX);
        vectorXRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), dimX, 1));
        vectorXRef.copyFrom(mBLASData.L2_zTBMV_o_UN2);
        verifyMatrix(vectorXRef, vectorXZ);

        mRS.finish();
        checkError();
    }


    private boolean validateTPMV(Element e, int Uplo, int TransA, int Diag, Allocation Ap, Allocation X, int incX) {
        if (!validateUplo(Uplo)) {
            return false;
        }
        if (!validateTranspose(TransA)) {
            return false;
        }
        if (!validateDiag(Diag)) {
            return false;
        }
        if (!Ap.getType().getElement().isCompatible(e) ||
            !X.getType().getElement().isCompatible(e)) {
            return false;
        }
        if (X.getType().getY() > 1) {
            return false;
        }

        if (Ap.getType().getY() > 1) {
            return false;
        }

        int N = (int)Math.sqrt((double)Ap.getType().getX() * 2);
        if (Ap.getType().getX() != ((N * (N+1)) / 2)) {
            return false;
        }
        if (incX <= 0) {
            return false;
        }
        int expectedXDim = 1 + (N - 1) * incX;
        if (X.getType().getX() != expectedXDim) {
            return false;
        }

        return true;
    }

    private void xTPMV_API_test(int Uplo, int TransA, int Diag, int incX, ArrayList<Allocation> mMatrix) {
        for (Allocation matA : mMatrix) {
            for (Allocation vecX : mMatrix) {
                if (!validateVecInput(vecX)) {
                    continue;
                }
                Element elemA = matA.getType().getElement();
                if (validateTPMV(elemA, Uplo, TransA, Diag, matA, vecX, incX)) {
                    try {
                        if (elemA.isCompatible(Element.F32(mRS))) {
                            mBLAS.STPMV(Uplo, TransA, Diag, matA, vecX, incX);
                        } else if (elemA.isCompatible(Element.F64(mRS))) {
                            mBLAS.DTPMV(Uplo, TransA, Diag, matA, vecX, incX);
                        } else if (elemA.isCompatible(Element.F32_2(mRS))) {
                            mBLAS.CTPMV(Uplo, TransA, Diag, matA, vecX, incX);
                        } else if (elemA.isCompatible(Element.F64_2(mRS))) {
                            mBLAS.ZTPMV(Uplo, TransA, Diag, matA, vecX, incX);
                        }
                    } catch (RSRuntimeException e) {
                        fail("should NOT throw RSRuntimeException");
                    }
                } else {
                    try {
                        mBLAS.STPMV(Uplo, TransA, Diag, matA, vecX, incX);
                        fail("should throw RSRuntimeException for STPMV");
                    } catch (RSRuntimeException e) {
                    }
                    try {
                        mBLAS.DTPMV(Uplo, TransA, Diag, matA, vecX, incX);
                        fail("should throw RSRuntimeException for DTPMV");
                    } catch (RSRuntimeException e) {
                    }
                    try {
                        mBLAS.CTPMV(Uplo, TransA, Diag, matA, vecX, incX);
                        fail("should throw RSRuntimeException for CTPMV");
                    } catch (RSRuntimeException e) {
                    }
                    try {
                        mBLAS.ZTPMV(Uplo, TransA, Diag, matA, vecX, incX);
                        fail("should throw RSRuntimeException for ZTPMV");
                    } catch (RSRuntimeException e) {
                    }
                }
            }
        }
    }

    public void L2_xTPMV_API(ArrayList<Allocation> mMatrix) {
        for (int Uplo : mUplo) {
            for (int TransA : mTranspose) {
                for (int Diag : mDiag) {
                    for (int incX : mInc) {
                        xTPMV_API_test(Uplo, TransA, Diag, incX, mMatrix);
                    }
                }
            }
        }
    }

    public void test_L2_STPMV_API() {
        L2_xTPMV_API(mMatrixS);
    }

    public void test_L2_DTPMV_API() {
        L2_xTPMV_API(mMatrixD);
    }

    public void test_L2_CTPMV_API() {
        L2_xTPMV_API(mMatrixC);
    }

    public void test_L2_ZTPMV_API() {
        L2_xTPMV_API(mMatrixZ);
    }

    public void test_L2_STPMV_Correctness() {
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int diag = ScriptIntrinsicBLAS.NON_UNIT;
        int incX = 1;

        // Populate input allocations
        int N = mBLASData.dN;
        Allocation matrixAS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), N * (N+1) / 2, 1));
        Allocation vectorXS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), N, 1));
        matrixAS.copyFrom(mBLASData.L2_sTRMV_A_nn_pu);
        vectorXS.copyFrom(mBLASData.L2_sTRMV_x_n1);

        // Test for the default case: NO_TRANS
        mBLAS.STPMV(uplo, trans, diag, matrixAS, vectorXS, incX);
        Allocation vectorXRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), N, 1));
        vectorXRef.copyFrom(mBLASData.L2_sTRMV_o_UN);
        verifyMatrix(vectorXRef, vectorXS);

        // Test for trans cases: TRANSPOSE, CONJ_TRANSPOSE
        trans = ScriptIntrinsicBLAS.TRANSPOSE;
        // Reload vector X, since it was overwritten by BLAS.
        vectorXS.copyFrom(mBLASData.L2_sTRMV_x_n1);
        mBLAS.STPMV(uplo, trans, diag, matrixAS, vectorXS, incX);
        vectorXRef.copyFrom(mBLASData.L2_sTRMV_o_UT);
        verifyMatrix(vectorXRef, vectorXS);

        trans = ScriptIntrinsicBLAS.CONJ_TRANSPOSE;
        vectorXS.copyFrom(mBLASData.L2_sTRMV_x_n1);
        mBLAS.STPMV(uplo, trans, diag, matrixAS, vectorXS, incX);
        vectorXRef.copyFrom(mBLASData.L2_sTRMV_o_UH);
        verifyMatrix(vectorXRef, vectorXS);

        // Test for incX = 2;
        trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        incX = 2;
        int dimX = 1 + (N - 1) * incX;
        vectorXS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), dimX, 1));
        vectorXS.copyFrom(mBLASData.L2_sTRMV_x_n2);

        mBLAS.STPMV(uplo, trans, diag, matrixAS, vectorXS, incX);
        vectorXRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), dimX, 1));
        vectorXRef.copyFrom(mBLASData.L2_sTRMV_o_UN2);
        verifyMatrix(vectorXRef, vectorXS);

        mRS.finish();
        checkError();
    }

    public void test_L2_DTPMV_Correctness() {
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int diag = ScriptIntrinsicBLAS.NON_UNIT;
        int incX = 1;

        // Populate input allocations
        int N = mBLASData.dN;
        Allocation matrixAD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), N * (N+1) / 2, 1));
        Allocation vectorXD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), N, 1));
        matrixAD.copyFrom(mBLASData.L2_dTRMV_A_nn_pu);
        vectorXD.copyFrom(mBLASData.L2_dTRMV_x_n1);

        // Test for the default case: NO_TRANS
        mBLAS.DTPMV(uplo, trans, diag, matrixAD, vectorXD, incX);
        Allocation vectorXRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), N, 1));
        vectorXRef.copyFrom(mBLASData.L2_dTRMV_o_UN);
        verifyMatrix(vectorXRef, vectorXD);

        // Test for trans cases: TRANSPOSE, CONJ_TRANSPOSE
        trans = ScriptIntrinsicBLAS.TRANSPOSE;
        // Reload vector X, since it was overwritten by BLAS.
        vectorXD.copyFrom(mBLASData.L2_dTRMV_x_n1);
        mBLAS.DTPMV(uplo, trans, diag, matrixAD, vectorXD, incX);
        vectorXRef.copyFrom(mBLASData.L2_dTRMV_o_UT);
        verifyMatrix(vectorXRef, vectorXD);

        trans = ScriptIntrinsicBLAS.CONJ_TRANSPOSE;
        vectorXD.copyFrom(mBLASData.L2_dTRMV_x_n1);
        mBLAS.DTPMV(uplo, trans, diag, matrixAD, vectorXD, incX);
        vectorXRef.copyFrom(mBLASData.L2_dTRMV_o_UH);
        verifyMatrix(vectorXRef, vectorXD);

        // Test for incX = 2;
        trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        incX = 2;
        int dimX = 1 + (N - 1) * incX;
        vectorXD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), dimX, 1));
        vectorXD.copyFrom(mBLASData.L2_dTRMV_x_n2);

        mBLAS.DTPMV(uplo, trans, diag, matrixAD, vectorXD, incX);
        vectorXRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), dimX, 1));
        vectorXRef.copyFrom(mBLASData.L2_dTRMV_o_UN2);
        verifyMatrix(vectorXRef, vectorXD);

        mRS.finish();
        checkError();
    }

    public void test_L2_CTPMV_Correctness() {
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int diag = ScriptIntrinsicBLAS.NON_UNIT;
        int incX = 1;

        // Populate input allocations
        int N = mBLASData.dN;
        Allocation matrixAC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), N * (N+1) / 2, 1));
        Allocation vectorXC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), N, 1));
        matrixAC.copyFrom(mBLASData.L2_cTRMV_A_nn_pu);
        vectorXC.copyFrom(mBLASData.L2_cTRMV_x_n1);

        // Test for the default case: NO_TRANS
        mBLAS.CTPMV(uplo, trans, diag, matrixAC, vectorXC, incX);
        Allocation vectorXRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), N, 1));
        vectorXRef.copyFrom(mBLASData.L2_cTRMV_o_UN);
        verifyMatrix(vectorXRef, vectorXC);

        // Test for trans cases: TRANSPOSE, CONJ_TRANSPOSE
        trans = ScriptIntrinsicBLAS.TRANSPOSE;
        // Reload vector X, since it was overwritten by BLAS.
        vectorXC.copyFrom(mBLASData.L2_cTRMV_x_n1);
        mBLAS.CTPMV(uplo, trans, diag, matrixAC, vectorXC, incX);
        vectorXRef.copyFrom(mBLASData.L2_cTRMV_o_UT);
        verifyMatrix(vectorXRef, vectorXC);

        trans = ScriptIntrinsicBLAS.CONJ_TRANSPOSE;
        vectorXC.copyFrom(mBLASData.L2_cTRMV_x_n1);
        mBLAS.CTPMV(uplo, trans, diag, matrixAC, vectorXC, incX);
        vectorXRef.copyFrom(mBLASData.L2_cTRMV_o_UH);
        verifyMatrix(vectorXRef, vectorXC);

        // Test for incX = 2;
        trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        incX = 2;
        int dimX = 1 + (N - 1) * incX;
        vectorXC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), dimX, 1));
        vectorXC.copyFrom(mBLASData.L2_cTRMV_x_n2);

        mBLAS.CTPMV(uplo, trans, diag, matrixAC, vectorXC, incX);
        vectorXRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), dimX, 1));
        vectorXRef.copyFrom(mBLASData.L2_cTRMV_o_UN2);
        verifyMatrix(vectorXRef, vectorXC);

        mRS.finish();
        checkError();
    }

    public void test_L2_ZTPMV_Correctness() {
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int diag = ScriptIntrinsicBLAS.NON_UNIT;
        int incX = 1;

        // Populate input allocations
        int N = mBLASData.dN;
        Allocation matrixAZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), N * (N+1) / 2, 1));
        Allocation vectorXZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), N, 1));
        matrixAZ.copyFrom(mBLASData.L2_zTRMV_A_nn_pu);
        vectorXZ.copyFrom(mBLASData.L2_zTRMV_x_n1);

        // Test for the default case: NO_TRANS
        mBLAS.ZTPMV(uplo, trans, diag, matrixAZ, vectorXZ, incX);
        Allocation vectorXRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), N, 1));
        vectorXRef.copyFrom(mBLASData.L2_zTRMV_o_UN);
        verifyMatrix(vectorXRef, vectorXZ);

        // Test for trans cases: TRANSPOSE, CONJ_TRANSPOSE
        trans = ScriptIntrinsicBLAS.TRANSPOSE;
        // Reload vector X, since it was overwritten by BLAS.
        vectorXZ.copyFrom(mBLASData.L2_zTRMV_x_n1);
        mBLAS.ZTPMV(uplo, trans, diag, matrixAZ, vectorXZ, incX);
        vectorXRef.copyFrom(mBLASData.L2_zTRMV_o_UT);
        verifyMatrix(vectorXRef, vectorXZ);

        trans = ScriptIntrinsicBLAS.CONJ_TRANSPOSE;
        vectorXZ.copyFrom(mBLASData.L2_zTRMV_x_n1);
        mBLAS.ZTPMV(uplo, trans, diag, matrixAZ, vectorXZ, incX);
        vectorXRef.copyFrom(mBLASData.L2_zTRMV_o_UH);
        verifyMatrix(vectorXRef, vectorXZ);

        // Test for incX = 2;
        trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        incX = 2;
        int dimX = 1 + (N - 1) * incX;
        vectorXZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), dimX, 1));
        vectorXZ.copyFrom(mBLASData.L2_zTRMV_x_n2);

        mBLAS.ZTPMV(uplo, trans, diag, matrixAZ, vectorXZ, incX);
        vectorXRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), dimX, 1));
        vectorXRef.copyFrom(mBLASData.L2_zTRMV_o_UN2);
        verifyMatrix(vectorXRef, vectorXZ);

        mRS.finish();
        checkError();
    }


    private void xTRSV_API_test(int Uplo, int TransA, int Diag, int incX, ArrayList<Allocation> mMatrix) {
        for (Allocation matA : mMatrix) {
            for (Allocation vecX : mMatrix) {
                if (!validateVecInput(vecX)) {
                    continue;
                }
                Element elemA = matA.getType().getElement();
                if (validateTRMV(elemA, Uplo, TransA, Diag, matA, vecX, incX)) {
                    try {
                        if (elemA.isCompatible(Element.F32(mRS))) {
                            mBLAS.STRSV(Uplo, TransA, Diag, matA, vecX, incX);
                        } else if (elemA.isCompatible(Element.F64(mRS))) {
                            mBLAS.DTRSV(Uplo, TransA, Diag, matA, vecX, incX);
                        } else if (elemA.isCompatible(Element.F32_2(mRS))) {
                            mBLAS.CTRSV(Uplo, TransA, Diag, matA, vecX, incX);
                        } else if (elemA.isCompatible(Element.F64_2(mRS))) {
                            mBLAS.ZTRSV(Uplo, TransA, Diag, matA, vecX, incX);
                        }
                    } catch (RSRuntimeException e) {
                        fail("should NOT throw RSRuntimeException");
                    }
                } else {
                    try {
                        mBLAS.STRSV(Uplo, TransA, Diag, matA, vecX, incX);
                        fail("should throw RSRuntimeException for STRSV");
                    } catch (RSRuntimeException e) {
                    }
                    try {
                        mBLAS.DTRSV(Uplo, TransA, Diag, matA, vecX, incX);
                        fail("should throw RSRuntimeException for DTRSV");
                    } catch (RSRuntimeException e) {
                    }
                    try {
                        mBLAS.CTRSV(Uplo, TransA, Diag, matA, vecX, incX);
                        fail("should throw RSRuntimeException for CTRSV");
                    } catch (RSRuntimeException e) {
                    }
                    try {
                        mBLAS.ZTRSV(Uplo, TransA, Diag, matA, vecX, incX);
                        fail("should throw RSRuntimeException for ZTRSV");
                    } catch (RSRuntimeException e) {
                    }
                }
            }
        }
    }

    public void L2_xTRSV_API(ArrayList<Allocation> mMatrix) {
        for (int Uplo : mUplo) {
            for (int TransA : mTranspose) {
                for (int Diag : mDiag) {
                    for (int incX : mInc) {
                        xTRSV_API_test(Uplo, TransA, Diag, incX, mMatrix);
                    }
                }
            }
        }
    }

    public void test_L2_STRSV_API() {
        L2_xTRSV_API(mMatrixS);
    }

    public void test_L2_DTRSV_API() {
        L2_xTRSV_API(mMatrixD);
    }

    public void test_L2_CTRSV_API() {
        L2_xTRSV_API(mMatrixC);
    }

    public void test_L2_ZTRSV_API() {
        L2_xTRSV_API(mMatrixZ);
    }

    public void test_L2_STRSV_Correctness() {
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int diag = ScriptIntrinsicBLAS.NON_UNIT;
        int incX = 1;

        // Populate input allocations
        Allocation matrixAS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, mBLASData.dN));
        Allocation vectorXS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, 1));
        matrixAS.copyFrom(mBLASData.L2_sTRSV_A_nn);
        vectorXS.copyFrom(mBLASData.L2_sTRSV_x_n1);

        // Test for the default case: NO_TRANS
        mBLAS.STRSV(uplo, trans, diag, matrixAS, vectorXS, incX);
        Allocation vectorXRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, 1));
        vectorXRef.copyFrom(mBLASData.L2_sTRSV_o_UN);
        verifyMatrix(vectorXRef, vectorXS);

        // Test for trans cases: TRANSPOSE, CONJ_TRANSPOSE
        trans = ScriptIntrinsicBLAS.TRANSPOSE;
        // Reload vector X, since it was overwritten by BLAS.
        vectorXS.copyFrom(mBLASData.L2_sTRSV_x_n1);
        mBLAS.STRSV(uplo, trans, diag, matrixAS, vectorXS, incX);
        vectorXRef.copyFrom(mBLASData.L2_sTRSV_o_UT);
        verifyMatrix(vectorXRef, vectorXS);

        trans = ScriptIntrinsicBLAS.CONJ_TRANSPOSE;
        vectorXS.copyFrom(mBLASData.L2_sTRSV_x_n1);
        mBLAS.STRSV(uplo, trans, diag, matrixAS, vectorXS, incX);
        vectorXRef.copyFrom(mBLASData.L2_sTRSV_o_UH);
        verifyMatrix(vectorXRef, vectorXS);

        // Test for incX = 2;
        trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        incX = 2;
        int dimX = 1 + (mBLASData.dN - 1) * incX;
        vectorXS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), dimX, 1));
        vectorXS.copyFrom(mBLASData.L2_sTRSV_x_n2);

        mBLAS.STRSV(uplo, trans, diag, matrixAS, vectorXS, incX);
        vectorXRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), dimX, 1));
        vectorXRef.copyFrom(mBLASData.L2_sTRSV_o_UN2);
        verifyMatrix(vectorXRef, vectorXS);

        mRS.finish();
        checkError();
    }

    public void test_L2_DTRSV_Correctness() {
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int diag = ScriptIntrinsicBLAS.NON_UNIT;
        int incX = 1;

        // Populate input allocations
        Allocation matrixAD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, mBLASData.dN));
        Allocation vectorXD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, 1));
        matrixAD.copyFrom(mBLASData.L2_dTRSV_A_nn);
        vectorXD.copyFrom(mBLASData.L2_dTRSV_x_n1);

        // Test for the default case: NO_TRANS
        mBLAS.DTRSV(uplo, trans, diag, matrixAD, vectorXD, incX);
        Allocation vectorXRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, 1));
        vectorXRef.copyFrom(mBLASData.L2_dTRSV_o_UN);
        verifyMatrix(vectorXRef, vectorXD);

        // Test for trans cases: TRANSPOSE, CONJ_TRANSPOSE
        trans = ScriptIntrinsicBLAS.TRANSPOSE;
        // Reload vector X, since it was overwritten by BLAS.
        vectorXD.copyFrom(mBLASData.L2_dTRSV_x_n1);
        mBLAS.DTRSV(uplo, trans, diag, matrixAD, vectorXD, incX);
        vectorXRef.copyFrom(mBLASData.L2_dTRSV_o_UT);
        verifyMatrix(vectorXRef, vectorXD);

        trans = ScriptIntrinsicBLAS.CONJ_TRANSPOSE;
        vectorXD.copyFrom(mBLASData.L2_dTRSV_x_n1);
        mBLAS.DTRSV(uplo, trans, diag, matrixAD, vectorXD, incX);
        vectorXRef.copyFrom(mBLASData.L2_dTRSV_o_UH);
        verifyMatrix(vectorXRef, vectorXD);

        // Test for incX = 2;
        trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        incX = 2;
        int dimX = 1 + (mBLASData.dN - 1) * incX;
        vectorXD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), dimX, 1));
        vectorXD.copyFrom(mBLASData.L2_dTRSV_x_n2);

        mBLAS.DTRSV(uplo, trans, diag, matrixAD, vectorXD, incX);
        vectorXRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), dimX, 1));
        vectorXRef.copyFrom(mBLASData.L2_dTRSV_o_UN2);
        verifyMatrix(vectorXRef, vectorXD);

        mRS.finish();
        checkError();
    }

    public void test_L2_CTRSV_Correctness() {
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int diag = ScriptIntrinsicBLAS.NON_UNIT;
        int incX = 1;

        // Populate input allocations
        Allocation matrixAC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, mBLASData.dN));
        Allocation vectorXC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, 1));
        matrixAC.copyFrom(mBLASData.L2_cTRSV_A_nn);
        vectorXC.copyFrom(mBLASData.L2_cTRSV_x_n1);

        // Test for the default case: NO_TRANS
        mBLAS.CTRSV(uplo, trans, diag, matrixAC, vectorXC, incX);
        Allocation vectorXRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, 1));
        vectorXRef.copyFrom(mBLASData.L2_cTRSV_o_UN);
        verifyMatrix(vectorXRef, vectorXC);

        // Test for trans cases: TRANSPOSE, CONJ_TRANSPOSE
        trans = ScriptIntrinsicBLAS.TRANSPOSE;
        // Reload vector X, since it was overwritten by BLAS.
        vectorXC.copyFrom(mBLASData.L2_cTRSV_x_n1);
        mBLAS.CTRSV(uplo, trans, diag, matrixAC, vectorXC, incX);
        vectorXRef.copyFrom(mBLASData.L2_cTRSV_o_UT);
        verifyMatrix(vectorXRef, vectorXC);

        trans = ScriptIntrinsicBLAS.CONJ_TRANSPOSE;
        vectorXC.copyFrom(mBLASData.L2_cTRSV_x_n1);
        mBLAS.CTRSV(uplo, trans, diag, matrixAC, vectorXC, incX);
        vectorXRef.copyFrom(mBLASData.L2_cTRSV_o_UH);
        verifyMatrix(vectorXRef, vectorXC);

        // Test for incX = 2;
        trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        incX = 2;
        int dimX = 1 + (mBLASData.dN - 1) * incX;
        vectorXC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), dimX, 1));
        vectorXC.copyFrom(mBLASData.L2_cTRSV_x_n2);

        mBLAS.CTRSV(uplo, trans, diag, matrixAC, vectorXC, incX);
        vectorXRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), dimX, 1));
        vectorXRef.copyFrom(mBLASData.L2_cTRSV_o_UN2);
        verifyMatrix(vectorXRef, vectorXC);

        mRS.finish();
        checkError();
    }

    public void test_L2_ZTRSV_Correctness() {
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int diag = ScriptIntrinsicBLAS.NON_UNIT;
        int incX = 1;

        // Populate input allocations
        Allocation matrixAZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, mBLASData.dN));
        Allocation vectorXZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, 1));
        matrixAZ.copyFrom(mBLASData.L2_zTRSV_A_nn);
        vectorXZ.copyFrom(mBLASData.L2_zTRSV_x_n1);

        // Test for the default case: NO_TRANS
        mBLAS.ZTRSV(uplo, trans, diag, matrixAZ, vectorXZ, incX);
        Allocation vectorXRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, 1));
        vectorXRef.copyFrom(mBLASData.L2_zTRSV_o_UN);
        verifyMatrix(vectorXRef, vectorXZ);

        // Test for trans cases: TRANSPOSE, CONJ_TRANSPOSE
        trans = ScriptIntrinsicBLAS.TRANSPOSE;
        // Reload vector X, since it was overwritten by BLAS.
        vectorXZ.copyFrom(mBLASData.L2_zTRSV_x_n1);
        mBLAS.ZTRSV(uplo, trans, diag, matrixAZ, vectorXZ, incX);
        vectorXRef.copyFrom(mBLASData.L2_zTRSV_o_UT);
        verifyMatrix(vectorXRef, vectorXZ);

        trans = ScriptIntrinsicBLAS.CONJ_TRANSPOSE;
        vectorXZ.copyFrom(mBLASData.L2_zTRSV_x_n1);
        mBLAS.ZTRSV(uplo, trans, diag, matrixAZ, vectorXZ, incX);
        vectorXRef.copyFrom(mBLASData.L2_zTRSV_o_UH);
        verifyMatrix(vectorXRef, vectorXZ);

        // Test for incX = 2;
        trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        incX = 2;
        int dimX = 1 + (mBLASData.dN - 1) * incX;
        vectorXZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), dimX, 1));
        vectorXZ.copyFrom(mBLASData.L2_zTRSV_x_n2);

        mBLAS.ZTRSV(uplo, trans, diag, matrixAZ, vectorXZ, incX);
        vectorXRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), dimX, 1));
        vectorXRef.copyFrom(mBLASData.L2_zTRSV_o_UN2);
        verifyMatrix(vectorXRef, vectorXZ);

        mRS.finish();
        checkError();
    }


    private void xTBSV_API_test(int Uplo, int TransA, int Diag, int K, int incX, ArrayList<Allocation> mMatrix) {
        for (Allocation matA : mMatrix) {
            for (Allocation vecX : mMatrix) {
                if (!validateVecInput(vecX)) {
                    continue;
                }
                Element elemA = matA.getType().getElement();
                if (validateTRMV(elemA, Uplo, TransA, Diag, matA, vecX, incX) && K >= 0) {
                    try {
                        if (elemA.isCompatible(Element.F32(mRS))) {
                            mBLAS.STBSV(Uplo, TransA, Diag, K, matA, vecX, incX);
                        } else if (elemA.isCompatible(Element.F64(mRS))) {
                            mBLAS.DTBSV(Uplo, TransA, Diag, K, matA, vecX, incX);
                        } else if (elemA.isCompatible(Element.F32_2(mRS))) {
                            mBLAS.CTBSV(Uplo, TransA, Diag, K, matA, vecX, incX);
                        } else if (elemA.isCompatible(Element.F64_2(mRS))) {
                            mBLAS.ZTBSV(Uplo, TransA, Diag, K, matA, vecX, incX);
                        }
                    } catch (RSRuntimeException e) {
                        fail("should NOT throw RSRuntimeException");
                    }
                } else {
                    try {
                        mBLAS.STBSV(Uplo, TransA, Diag, K, matA, vecX, incX);
                        fail("should throw RSRuntimeException for STBSV");
                    } catch (RSRuntimeException e) {
                    }
                    try {
                        mBLAS.DTBSV(Uplo, TransA, Diag, K, matA, vecX, incX);
                        fail("should throw RSRuntimeException for DTBSV");
                    } catch (RSRuntimeException e) {
                    }
                    try {
                        mBLAS.CTBSV(Uplo, TransA, Diag, K, matA, vecX, incX);
                        fail("should throw RSRuntimeException for CTBSV");
                    } catch (RSRuntimeException e) {
                    }
                    try {
                        mBLAS.ZTBSV(Uplo, TransA, Diag, K, matA, vecX, incX);
                        fail("should throw RSRuntimeException for ZTBSV");
                    } catch (RSRuntimeException e) {
                    }
                }
            }
        }
    }

    public void L2_xTBSV_API(ArrayList<Allocation> mMatrix) {
        for (int Uplo : mUplo) {
            for (int TransA : mTranspose) {
                for (int Diag : mDiag) {
                    for (int K : mK) {
                        for (int incX : mInc) {
                            xTBSV_API_test(Uplo, TransA, Diag, K, incX, mMatrix);
                        }
                    }
                }
            }
        }
    }

    public void test_L2_STBSV_API() {
        L2_xTBSV_API(mMatrixS);
    }

    public void test_L2_DTBSV_API() {
        L2_xTBSV_API(mMatrixD);
    }

    public void test_L2_CTBSV_API() {
        L2_xTBSV_API(mMatrixC);
    }

    public void test_L2_ZTBSV_API() {
        L2_xTBSV_API(mMatrixZ);
    }

    public void test_L2_STBSV_Correctness() {
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int diag = ScriptIntrinsicBLAS.NON_UNIT;
        int incX = 1;

        // Populate input allocations
        Allocation matrixAS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, mBLASData.dN));
        Allocation vectorXS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, 1));
        matrixAS.copy2DRangeFrom(0, 0, mBLASData.KL + 1, mBLASData.dN, mBLASData.L2_sTBSV_A_nn);
        vectorXS.copyFrom(mBLASData.L2_sTBSV_x_n1);

        // Test for the default case: NO_TRANS
        mBLAS.STBSV(uplo, trans, diag, mBLASData.KL, matrixAS, vectorXS, incX);
        Allocation vectorXRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, 1));
        vectorXRef.copyFrom(mBLASData.L2_sTBSV_o_UN);
        verifyMatrix(vectorXRef, vectorXS);

        // Test for trans cases: TRANSPOSE, CONJ_TRANSPOSE
        trans = ScriptIntrinsicBLAS.TRANSPOSE;
        // Reload vector X, since it was overwritten by BLAS.
        vectorXS.copyFrom(mBLASData.L2_sTBSV_x_n1);
        mBLAS.STBSV(uplo, trans, diag, mBLASData.KL, matrixAS, vectorXS, incX);
        vectorXRef.copyFrom(mBLASData.L2_sTBSV_o_UT);
        verifyMatrix(vectorXRef, vectorXS);

        trans = ScriptIntrinsicBLAS.CONJ_TRANSPOSE;
        vectorXS.copyFrom(mBLASData.L2_sTBSV_x_n1);
        mBLAS.STBSV(uplo, trans, diag, mBLASData.KL, matrixAS, vectorXS, incX);
        vectorXRef.copyFrom(mBLASData.L2_sTBSV_o_UH);
        verifyMatrix(vectorXRef, vectorXS);

        // Test for incX = 2;
        trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        incX = 2;
        int dimX = 1 + (mBLASData.dN - 1) * incX;
        vectorXS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), dimX, 1));
        vectorXS.copyFrom(mBLASData.L2_sTBSV_x_n2);

        mBLAS.STBSV(uplo, trans, diag, mBLASData.KL, matrixAS, vectorXS, incX);
        vectorXRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), dimX, 1));
        vectorXRef.copyFrom(mBLASData.L2_sTBSV_o_UN2);
        verifyMatrix(vectorXRef, vectorXS);

        mRS.finish();
        checkError();
    }

    public void test_L2_DTBSV_Correctness() {
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int diag = ScriptIntrinsicBLAS.NON_UNIT;
        int incX = 1;

        // Populate input allocations
        Allocation matrixAD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, mBLASData.dN));
        Allocation vectorXD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, 1));
        matrixAD.copy2DRangeFrom(0, 0, mBLASData.KL + 1, mBLASData.dN, mBLASData.L2_dTBSV_A_nn);
        vectorXD.copyFrom(mBLASData.L2_dTBSV_x_n1);

        // Test for the default case: NO_TRANS
        mBLAS.DTBSV(uplo, trans, diag, mBLASData.KL, matrixAD, vectorXD, incX);
        Allocation vectorXRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, 1));
        vectorXRef.copyFrom(mBLASData.L2_dTBSV_o_UN);
        verifyMatrix(vectorXRef, vectorXD);

        // Test for trans cases: TRANSPOSE, CONJ_TRANSPOSE
        trans = ScriptIntrinsicBLAS.TRANSPOSE;
        // Reload vector X, since it was overwritten by BLAS.
        vectorXD.copyFrom(mBLASData.L2_dTBSV_x_n1);
        mBLAS.DTBSV(uplo, trans, diag, mBLASData.KL, matrixAD, vectorXD, incX);
        vectorXRef.copyFrom(mBLASData.L2_dTBSV_o_UT);
        verifyMatrix(vectorXRef, vectorXD);

        trans = ScriptIntrinsicBLAS.CONJ_TRANSPOSE;
        vectorXD.copyFrom(mBLASData.L2_dTBSV_x_n1);
        mBLAS.DTBSV(uplo, trans, diag, mBLASData.KL, matrixAD, vectorXD, incX);
        vectorXRef.copyFrom(mBLASData.L2_dTBSV_o_UH);
        verifyMatrix(vectorXRef, vectorXD);

        // Test for incX = 2;
        trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        incX = 2;
        int dimX = 1 + (mBLASData.dN - 1) * incX;
        vectorXD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), dimX, 1));
        vectorXD.copyFrom(mBLASData.L2_dTBSV_x_n2);

        mBLAS.DTBSV(uplo, trans, diag, mBLASData.KL, matrixAD, vectorXD, incX);
        vectorXRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), dimX, 1));
        vectorXRef.copyFrom(mBLASData.L2_dTBSV_o_UN2);
        verifyMatrix(vectorXRef, vectorXD);

        mRS.finish();
        checkError();
    }

    public void test_L2_CTBSV_Correctness() {
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int diag = ScriptIntrinsicBLAS.NON_UNIT;
        int incX = 1;

        // Populate input allocations
        Allocation matrixAC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, mBLASData.dN));
        Allocation vectorXC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, 1));
        matrixAC.copy2DRangeFrom(0, 0, mBLASData.KL + 1, mBLASData.dN, mBLASData.L2_cTBSV_A_nn);
        vectorXC.copyFrom(mBLASData.L2_cTBSV_x_n1);

        // Test for the default case: NO_TRANS
        mBLAS.CTBSV(uplo, trans, diag, mBLASData.KL, matrixAC, vectorXC, incX);
        Allocation vectorXRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, 1));
        vectorXRef.copyFrom(mBLASData.L2_cTBSV_o_UN);
        verifyMatrix(vectorXRef, vectorXC);

        // Test for trans cases: TRANSPOSE, CONJ_TRANSPOSE
        trans = ScriptIntrinsicBLAS.TRANSPOSE;
        // Reload vector X, since it was overwritten by BLAS.
        vectorXC.copyFrom(mBLASData.L2_cTBSV_x_n1);
        mBLAS.CTBSV(uplo, trans, diag, mBLASData.KL, matrixAC, vectorXC, incX);
        vectorXRef.copyFrom(mBLASData.L2_cTBSV_o_UT);
        verifyMatrix(vectorXRef, vectorXC);

        trans = ScriptIntrinsicBLAS.CONJ_TRANSPOSE;
        vectorXC.copyFrom(mBLASData.L2_cTBSV_x_n1);
        mBLAS.CTBSV(uplo, trans, diag, mBLASData.KL, matrixAC, vectorXC, incX);
        vectorXRef.copyFrom(mBLASData.L2_cTBSV_o_UH);
        verifyMatrix(vectorXRef, vectorXC);

        // Test for incX = 2;
        trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        incX = 2;
        int dimX = 1 + (mBLASData.dN - 1) * incX;
        vectorXC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), dimX, 1));
        vectorXC.copyFrom(mBLASData.L2_cTBSV_x_n2);

        mBLAS.CTBSV(uplo, trans, diag, mBLASData.KL, matrixAC, vectorXC, incX);
        vectorXRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), dimX, 1));
        vectorXRef.copyFrom(mBLASData.L2_cTBSV_o_UN2);
        verifyMatrix(vectorXRef, vectorXC);

        mRS.finish();
        checkError();
    }

    public void test_L2_ZTBSV_Correctness() {
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int diag = ScriptIntrinsicBLAS.NON_UNIT;
        int incX = 1;

        // Populate input allocations
        Allocation matrixAZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, mBLASData.dN));
        Allocation vectorXZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, 1));
        matrixAZ.copy2DRangeFrom(0, 0, mBLASData.KL + 1, mBLASData.dN, mBLASData.L2_zTBSV_A_nn);
        vectorXZ.copyFrom(mBLASData.L2_zTBSV_x_n1);

        // Test for the default case: NO_TRANS
        mBLAS.ZTBSV(uplo, trans, diag, mBLASData.KL, matrixAZ, vectorXZ, incX);
        Allocation vectorXRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, 1));
        vectorXRef.copyFrom(mBLASData.L2_zTBSV_o_UN);
        verifyMatrix(vectorXRef, vectorXZ);

        // Test for trans cases: TRANSPOSE, CONJ_TRANSPOSE
        trans = ScriptIntrinsicBLAS.TRANSPOSE;
        // Reload vector X, since it was overwritten by BLAS.
        vectorXZ.copyFrom(mBLASData.L2_zTBSV_x_n1);
        mBLAS.ZTBSV(uplo, trans, diag, mBLASData.KL, matrixAZ, vectorXZ, incX);
        vectorXRef.copyFrom(mBLASData.L2_zTBSV_o_UT);
        verifyMatrix(vectorXRef, vectorXZ);

        trans = ScriptIntrinsicBLAS.CONJ_TRANSPOSE;
        vectorXZ.copyFrom(mBLASData.L2_zTBSV_x_n1);
        mBLAS.ZTBSV(uplo, trans, diag, mBLASData.KL, matrixAZ, vectorXZ, incX);
        vectorXRef.copyFrom(mBLASData.L2_zTBSV_o_UH);
        verifyMatrix(vectorXRef, vectorXZ);

        // Test for incX = 2;
        trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        incX = 2;
        int dimX = 1 + (mBLASData.dN - 1) * incX;
        vectorXZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), dimX, 1));
        vectorXZ.copyFrom(mBLASData.L2_zTBSV_x_n2);

        mBLAS.ZTBSV(uplo, trans, diag, mBLASData.KL, matrixAZ, vectorXZ, incX);
        vectorXRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), dimX, 1));
        vectorXRef.copyFrom(mBLASData.L2_zTBSV_o_UN2);
        verifyMatrix(vectorXRef, vectorXZ);

        mRS.finish();
        checkError();
    }


    private void xTPSV_API_test(int Uplo, int TransA, int Diag, int incX, ArrayList<Allocation> mMatrix) {
        for (Allocation matA : mMatrix) {
            for (Allocation vecX : mMatrix) {
                if (!validateVecInput(vecX)) {
                    continue;
                }
                Element elemA = matA.getType().getElement();
                if (validateTPMV(elemA, Uplo, TransA, Diag, matA, vecX, incX)) {
                    try {
                        if (elemA.isCompatible(Element.F32(mRS))) {
                            mBLAS.STPSV(Uplo, TransA, Diag, matA, vecX, incX);
                        } else if (elemA.isCompatible(Element.F64(mRS))) {
                            mBLAS.DTPSV(Uplo, TransA, Diag, matA, vecX, incX);
                        } else if (elemA.isCompatible(Element.F32_2(mRS))) {
                            mBLAS.CTPSV(Uplo, TransA, Diag, matA, vecX, incX);
                        } else if (elemA.isCompatible(Element.F64_2(mRS))) {
                            mBLAS.ZTPSV(Uplo, TransA, Diag, matA, vecX, incX);
                        }
                    } catch (RSRuntimeException e) {
                        fail("should NOT throw RSRuntimeException");
                    }
                } else {
                    try {
                        mBLAS.STPSV(Uplo, TransA, Diag, matA, vecX, incX);
                        fail("should throw RSRuntimeException for STPSV");
                    } catch (RSRuntimeException e) {
                    }
                    try {
                        mBLAS.DTPSV(Uplo, TransA, Diag, matA, vecX, incX);
                        fail("should throw RSRuntimeException for DTPSV");
                    } catch (RSRuntimeException e) {
                    }
                    try {
                        mBLAS.CTPSV(Uplo, TransA, Diag, matA, vecX, incX);
                        fail("should throw RSRuntimeException for CTPSV");
                    } catch (RSRuntimeException e) {
                    }
                    try {
                        mBLAS.ZTPSV(Uplo, TransA, Diag, matA, vecX, incX);
                        fail("should throw RSRuntimeException for ZTPSV");
                    } catch (RSRuntimeException e) {
                    }
                }
            }
        }
    }

    public void L2_xTPSV_API(ArrayList<Allocation> mMatrix) {
        for (int Uplo : mUplo) {
            for (int TransA : mTranspose) {
                for (int Diag : mDiag) {
                    for (int incX : mInc) {
                        xTPSV_API_test(Uplo, TransA, Diag, incX, mMatrix);
                    }
                }
            }
        }
    }

    public void test_L2_STPSV_API() {
        L2_xTPSV_API(mMatrixS);
    }

    public void test_L2_DTPSV_API() {
        L2_xTPSV_API(mMatrixD);
    }

    public void test_L2_CTPSV_API() {
        L2_xTPSV_API(mMatrixC);
    }

    public void test_L2_ZTPSV_API() {
        L2_xTPSV_API(mMatrixZ);
    }

    public void test_L2_STPSV_Correctness() {
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int diag = ScriptIntrinsicBLAS.NON_UNIT;
        int incX = 1;

        // Populate input allocations
        int N = mBLASData.dN;
        Allocation matrixAS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), N * (N+1) / 2, 1));
        Allocation vectorXS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), N, 1));
        matrixAS.copyFrom(mBLASData.L2_sTRSV_A_nn_pu);
        vectorXS.copyFrom(mBLASData.L2_sTRSV_x_n1);

        // Test for the default case: NO_TRANS
        mBLAS.STPSV(uplo, trans, diag, matrixAS, vectorXS, incX);
        Allocation vectorXRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), N, 1));
        vectorXRef.copyFrom(mBLASData.L2_sTRSV_o_UN);
        verifyMatrix(vectorXRef, vectorXS);

        // Test for trans cases: TRANSPOSE, CONJ_TRANSPOSE
        trans = ScriptIntrinsicBLAS.TRANSPOSE;
        // Reload vector X, since it was overwritten by BLAS.
        vectorXS.copyFrom(mBLASData.L2_sTRSV_x_n1);
        mBLAS.STPSV(uplo, trans, diag, matrixAS, vectorXS, incX);
        vectorXRef.copyFrom(mBLASData.L2_sTRSV_o_UT);
        verifyMatrix(vectorXRef, vectorXS);

        trans = ScriptIntrinsicBLAS.CONJ_TRANSPOSE;
        vectorXS.copyFrom(mBLASData.L2_sTRSV_x_n1);
        mBLAS.STPSV(uplo, trans, diag, matrixAS, vectorXS, incX);
        vectorXRef.copyFrom(mBLASData.L2_sTRSV_o_UH);
        verifyMatrix(vectorXRef, vectorXS);

        // Test for incX = 2;
        trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        incX = 2;
        int dimX = 1 + (N - 1) * incX;
        vectorXS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), dimX, 1));
        vectorXS.copyFrom(mBLASData.L2_sTRSV_x_n2);

        mBLAS.STPSV(uplo, trans, diag, matrixAS, vectorXS, incX);
        vectorXRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), dimX, 1));
        vectorXRef.copyFrom(mBLASData.L2_sTRSV_o_UN2);
        verifyMatrix(vectorXRef, vectorXS);

        mRS.finish();
        checkError();
    }

    public void test_L2_DTPSV_Correctness() {
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int diag = ScriptIntrinsicBLAS.NON_UNIT;
        int incX = 1;

        // Populate input allocations
        int N = mBLASData.dN;
        Allocation matrixAD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), N * (N+1) / 2, 1));
        Allocation vectorXD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), N, 1));
        matrixAD.copyFrom(mBLASData.L2_dTRSV_A_nn_pu);
        vectorXD.copyFrom(mBLASData.L2_dTRSV_x_n1);

        // Test for the default case: NO_TRANS
        mBLAS.DTPSV(uplo, trans, diag, matrixAD, vectorXD, incX);
        Allocation vectorXRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), N, 1));
        vectorXRef.copyFrom(mBLASData.L2_dTRSV_o_UN);
        verifyMatrix(vectorXRef, vectorXD);

        // Test for trans cases: TRANSPOSE, CONJ_TRANSPOSE
        trans = ScriptIntrinsicBLAS.TRANSPOSE;
        // Reload vector X, since it was overwritten by BLAS.
        vectorXD.copyFrom(mBLASData.L2_dTRSV_x_n1);
        mBLAS.DTPSV(uplo, trans, diag, matrixAD, vectorXD, incX);
        vectorXRef.copyFrom(mBLASData.L2_dTRSV_o_UT);
        verifyMatrix(vectorXRef, vectorXD);

        trans = ScriptIntrinsicBLAS.CONJ_TRANSPOSE;
        vectorXD.copyFrom(mBLASData.L2_dTRSV_x_n1);
        mBLAS.DTPSV(uplo, trans, diag, matrixAD, vectorXD, incX);
        vectorXRef.copyFrom(mBLASData.L2_dTRSV_o_UH);
        verifyMatrix(vectorXRef, vectorXD);

        // Test for incX = 2;
        trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        incX = 2;
        int dimX = 1 + (N - 1) * incX;
        vectorXD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), dimX, 1));
        vectorXD.copyFrom(mBLASData.L2_dTRSV_x_n2);

        mBLAS.DTPSV(uplo, trans, diag, matrixAD, vectorXD, incX);
        vectorXRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), dimX, 1));
        vectorXRef.copyFrom(mBLASData.L2_dTRSV_o_UN2);
        verifyMatrix(vectorXRef, vectorXD);

        mRS.finish();
        checkError();
    }

    public void test_L2_CTPSV_Correctness() {
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int diag = ScriptIntrinsicBLAS.NON_UNIT;
        int incX = 1;

        // Populate input allocations
        int N = mBLASData.dN;
        Allocation matrixAC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), N * (N+1) / 2, 1));
        Allocation vectorXC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), N, 1));
        matrixAC.copyFrom(mBLASData.L2_cTRSV_A_nn_pu);
        vectorXC.copyFrom(mBLASData.L2_cTRSV_x_n1);

        // Test for the default case: NO_TRANS
        mBLAS.CTPSV(uplo, trans, diag, matrixAC, vectorXC, incX);
        Allocation vectorXRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), N, 1));
        vectorXRef.copyFrom(mBLASData.L2_cTRSV_o_UN);
        verifyMatrix(vectorXRef, vectorXC);

        // Test for trans cases: TRANSPOSE, CONJ_TRANSPOSE
        trans = ScriptIntrinsicBLAS.TRANSPOSE;
        // Reload vector X, since it was overwritten by BLAS.
        vectorXC.copyFrom(mBLASData.L2_cTRSV_x_n1);
        mBLAS.CTPSV(uplo, trans, diag, matrixAC, vectorXC, incX);
        vectorXRef.copyFrom(mBLASData.L2_cTRSV_o_UT);
        verifyMatrix(vectorXRef, vectorXC);

        trans = ScriptIntrinsicBLAS.CONJ_TRANSPOSE;
        vectorXC.copyFrom(mBLASData.L2_cTRSV_x_n1);
        mBLAS.CTPSV(uplo, trans, diag, matrixAC, vectorXC, incX);
        vectorXRef.copyFrom(mBLASData.L2_cTRSV_o_UH);
        verifyMatrix(vectorXRef, vectorXC);

        // Test for incX = 2;
        trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        incX = 2;
        int dimX = 1 + (N - 1) * incX;
        vectorXC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), dimX, 1));
        vectorXC.copyFrom(mBLASData.L2_cTRSV_x_n2);

        mBLAS.CTPSV(uplo, trans, diag, matrixAC, vectorXC, incX);
        vectorXRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), dimX, 1));
        vectorXRef.copyFrom(mBLASData.L2_cTRSV_o_UN2);
        verifyMatrix(vectorXRef, vectorXC);

        mRS.finish();
        checkError();
    }

    public void test_L2_ZTPSV_Correctness() {
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int diag = ScriptIntrinsicBLAS.NON_UNIT;
        int incX = 1;

        // Populate input allocations
        int N = mBLASData.dN;
        Allocation matrixAZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), N * (N+1) / 2, 1));
        Allocation vectorXZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), N, 1));
        matrixAZ.copyFrom(mBLASData.L2_zTRSV_A_nn_pu);
        vectorXZ.copyFrom(mBLASData.L2_zTRSV_x_n1);

        // Test for the default case: NO_TRANS
        mBLAS.ZTPSV(uplo, trans, diag, matrixAZ, vectorXZ, incX);
        Allocation vectorXRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), N, 1));
        vectorXRef.copyFrom(mBLASData.L2_zTRSV_o_UN);
        verifyMatrix(vectorXRef, vectorXZ);

        // Test for trans cases: TRANSPOSE, CONJ_TRANSPOSE
        trans = ScriptIntrinsicBLAS.TRANSPOSE;
        // Reload vector X, since it was overwritten by BLAS.
        vectorXZ.copyFrom(mBLASData.L2_zTRSV_x_n1);
        mBLAS.ZTPSV(uplo, trans, diag, matrixAZ, vectorXZ, incX);
        vectorXRef.copyFrom(mBLASData.L2_zTRSV_o_UT);
        verifyMatrix(vectorXRef, vectorXZ);

        trans = ScriptIntrinsicBLAS.CONJ_TRANSPOSE;
        vectorXZ.copyFrom(mBLASData.L2_zTRSV_x_n1);
        mBLAS.ZTPSV(uplo, trans, diag, matrixAZ, vectorXZ, incX);
        vectorXRef.copyFrom(mBLASData.L2_zTRSV_o_UH);
        verifyMatrix(vectorXRef, vectorXZ);

        // Test for incX = 2;
        trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        incX = 2;
        int dimX = 1 + (N - 1) * incX;
        vectorXZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), dimX, 1));
        vectorXZ.copyFrom(mBLASData.L2_zTRSV_x_n2);

        mBLAS.ZTPSV(uplo, trans, diag, matrixAZ, vectorXZ, incX);
        vectorXRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), dimX, 1));
        vectorXRef.copyFrom(mBLASData.L2_zTRSV_o_UN2);
        verifyMatrix(vectorXRef, vectorXZ);

        mRS.finish();
        checkError();
    }


    private boolean validateGER(Element e, Allocation X, int incX, Allocation Y, int incY, Allocation A) {
        if (!A.getType().getElement().isCompatible(e) ||
            !X.getType().getElement().isCompatible(e) ||
            !Y.getType().getElement().isCompatible(e) ) {
            return false;
        }

        if (X.getType().getY() > 1 || Y.getType().getY() > 1) {
            return false;
        }

        int M = A.getType().getY();
        int N = A.getType().getX();

        if (N < 1 || M < 1) {
            return false;
        }
        if (incX <= 0 || incY <= 0) {
            return false;
        }
        int expectedXDim = 1 + (M - 1) * incX;
        if (X.getType().getX() != expectedXDim) {
            return false;
        }
        int expectedYDim = 1 + (N - 1) * incY;
        if (Y.getType().getX() != expectedYDim) {
            return false;
        }
        return true;
    }


    private void xGER_API_test(int incX, int incY, ArrayList<Allocation> mMatrix) {
        for (Allocation matA : mMatrix) {
            for (Allocation vecX : mMatrix) {
                if (!validateVecInput(vecX)) {
                    continue;
                }
                for (Allocation vecY : mMatrix) {
                    if (!validateVecInput(vecY)) {
                        continue;
                    }
                    Element elemA = matA.getType().getElement();
                    if (validateGER(elemA, vecX, incX, vecY, incY, matA)) {
                        try {
                            if (elemA.isCompatible(Element.F32(mRS))) {
                                mBLAS.SGER(alphaS, vecX, incX, vecY, incY, matA);
                            } else if (elemA.isCompatible(Element.F64(mRS))) {
                                mBLAS.DGER(alphaD, vecX, incX, vecY, incY, matA);
                            }
                        } catch (RSRuntimeException e) {
                            fail("should NOT throw RSRuntimeException");
                        }
                    } else {
                        try {
                            mBLAS.SGER(alphaS, vecX, incX, vecY, incY, matA);
                            fail("should throw RSRuntimeException for SGER");
                        } catch (RSRuntimeException e) {
                        }
                        try {
                            mBLAS.DGER(alphaD, vecX, incX, vecY, incY, matA);
                            fail("should throw RSRuntimeException for DGER");
                        } catch (RSRuntimeException e) {
                        }
                    }
                }
            }
        }
    }

    private void L2_xGER_API(ArrayList<Allocation> mMatrix) {
        for (int incX : mInc) {
            for (int incY : mInc) {
                xGERU_API_test(incX, incY, mMatrix);
            }
        }
    }

    public void test_L2_SGER_API() {
        L2_xGER_API(mMatrixS);
    }

    public void test_L2_DGER_API() {
        L2_xGER_API(mMatrixD);
    }

    public void test_L2_SGER_Correctness() {
        int incX = 1;
        int incY = 1;

        // Populate input allocations
        Allocation matrixAS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, mBLASData.dM));
        Allocation vectorXS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dM, 1));
        Allocation vectorYS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, 1));
        matrixAS.copyFrom(mBLASData.L2_sGER_A_mn);
        vectorXS.copyFrom(mBLASData.L2_sGER_x_m1);
        vectorYS.copyFrom(mBLASData.L2_sGER_y_n1);

        // Test for the default case: NO_TRANS
        mBLAS.SGER(alphaS, vectorXS, incX, vectorYS, incY, matrixAS);
        Allocation matrixARef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, mBLASData.dM));
        matrixARef.copyFrom(mBLASData.L2_sGER_o_N);
        verifyMatrix(matrixARef, matrixAS);

        // Test for incX = 2 & incY = 3;
        incX = 2;
        incY = 3;
        int dimX = 1 + (mBLASData.dM - 1) * incX;
        int dimY = 1 + (mBLASData.dN - 1) * incY;
        vectorXS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), dimX, 1));
        vectorYS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), dimY, 1));
        vectorXS.copyFrom(mBLASData.L2_sGER_x_m2);
        vectorYS.copyFrom(mBLASData.L2_sGER_y_n2);
        matrixAS.copyFrom(mBLASData.L2_sGER_A_mn);

        mBLAS.SGER(alphaS, vectorXS, incX, vectorYS, incY, matrixAS);
        verifyMatrix(matrixARef, matrixAS);

        mRS.finish();
        checkError();
    }

    public void test_L2_DGER_Correctness() {
        int incX = 1;
        int incY = 1;

        // Populate input allocations
        Allocation matrixAD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, mBLASData.dM));
        Allocation vectorXD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dM, 1));
        Allocation vectorYD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, 1));
        matrixAD.copyFrom(mBLASData.L2_dGER_A_mn);
        vectorXD.copyFrom(mBLASData.L2_dGER_x_m1);
        vectorYD.copyFrom(mBLASData.L2_dGER_y_n1);

        // Test for the default case: NO_TRANS
        mBLAS.DGER(alphaD, vectorXD, incX, vectorYD, incY, matrixAD);
        Allocation matrixARef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, mBLASData.dM));
        matrixARef.copyFrom(mBLASData.L2_dGER_o_N);
        verifyMatrix(matrixARef, matrixAD);

        // Test for incX = 2 & incY = 3;
        incX = 2;
        incY = 3;
        int dimX = 1 + (mBLASData.dM - 1) * incX;
        int dimY = 1 + (mBLASData.dN - 1) * incY;
        vectorXD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), dimX, 1));
        vectorYD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), dimY, 1));
        vectorXD.copyFrom(mBLASData.L2_dGER_x_m2);
        vectorYD.copyFrom(mBLASData.L2_dGER_y_n2);
        matrixAD.copyFrom(mBLASData.L2_dGER_A_mn);

        mBLAS.DGER(alphaD, vectorXD, incX, vectorYD, incY, matrixAD);
        verifyMatrix(matrixARef, matrixAD);

        mRS.finish();
        checkError();
    }


    private boolean validateGERU(Element e, Allocation X, int incX, Allocation Y, int incY, Allocation A) {
        if (!A.getType().getElement().isCompatible(e) ||
            !X.getType().getElement().isCompatible(e) ||
            !Y.getType().getElement().isCompatible(e)) {
            return false;
        }
        if (X.getType().getY() > 1 || Y.getType().getY() > 1) {
            return false;
        }

        int M = A.getType().getY();
        int N = A.getType().getX();
        if (incX <= 0 || incY <= 0) {
            return false;
        }
        int expectedXDim = 1 + (M - 1) * incX;
        if (X.getType().getX() != expectedXDim) {
            return false;
        }
        int expectedYDim = 1 + (N - 1) * incY;
        if (Y.getType().getX() != expectedYDim) {
            return false;
        }
        return true;
    }

    private void xGERU_API_test(int incX, int incY, ArrayList<Allocation> mMatrix) {
        for (Allocation matA : mMatrix) {
            for (Allocation vecX : mMatrix) {
                if (!validateVecInput(vecX)) {
                    continue;
                }
                for (Allocation vecY : mMatrix) {
                    if (!validateVecInput(vecY)) {
                        continue;
                    }
                    Element elemA = matA.getType().getElement();
                    if (validateGERU(elemA, vecX, incX, vecY, incY, matA)) {
                        try {
                            if (elemA.isCompatible(Element.F32_2(mRS))) {
                                mBLAS.CGERU(alphaC, vecX, incX, vecY, incY, matA);
                            } else if (elemA.isCompatible(Element.F64_2(mRS))) {
                                mBLAS.ZGERU(alphaZ, vecX, incX, vecY, incY, matA);
                            }
                        } catch (RSRuntimeException e) {
                            fail("should NOT throw RSRuntimeException");
                        }
                    } else {
                        try {
                            mBLAS.CGERU(alphaC, vecX, incX, vecY, incY, matA);
                            fail("should throw RSRuntimeException for CGERU");
                        } catch (RSRuntimeException e) {
                        }
                        try {
                            mBLAS.ZGERU(alphaZ, vecX, incX, vecY, incY, matA);
                            fail("should throw RSRuntimeException for ZGERU");
                        } catch (RSRuntimeException e) {
                        }
                    }
                }
            }
        }
    }

    private void L2_xGERU_API(ArrayList<Allocation> mMatrix) {
        for (int incX : mInc) {
            for (int incY : mInc) {
                xGERU_API_test(incX, incY, mMatrix);
            }
        }
    }

    public void test_L2_CGERU_API() {
        L2_xGERU_API(mMatrixC);
    }

    public void test_L2_ZGERU_API() {
        L2_xGERU_API(mMatrixZ);
    }

    public void test_L2_CGERU_Correctness() {
        int incX = 1;
        int incY = 1;

        // Populate input allocations
        Allocation matrixAC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, mBLASData.dM));
        Allocation vectorXC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dM, 1));
        Allocation vectorYC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, 1));
        matrixAC.copyFrom(mBLASData.L2_cGERU_A_mn);
        vectorXC.copyFrom(mBLASData.L2_cGERU_x_m1);
        vectorYC.copyFrom(mBLASData.L2_cGERU_y_n1);

        // Test for the default case: NO_TRANS
        mBLAS.CGERU(alphaC, vectorXC, incX, vectorYC, incY, matrixAC);
        Allocation matrixARef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, mBLASData.dM));
        matrixARef.copyFrom(mBLASData.L2_cGERU_o_N);
        verifyMatrix(matrixARef, matrixAC);

        // Test for incX = 2 & incY = 3;
        incX = 2;
        incY = 3;
        int dimX = 1 + (mBLASData.dM - 1) * incX;
        int dimY = 1 + (mBLASData.dN - 1) * incY;
        vectorXC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), dimX, 1));
        vectorYC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), dimY, 1));
        vectorXC.copyFrom(mBLASData.L2_cGERU_x_m2);
        vectorYC.copyFrom(mBLASData.L2_cGERU_y_n2);
        matrixAC.copyFrom(mBLASData.L2_cGERU_A_mn);

        mBLAS.CGERU(alphaC, vectorXC, incX, vectorYC, incY, matrixAC);
        verifyMatrix(matrixARef, matrixAC);

        mRS.finish();
        checkError();
    }

    public void test_L2_ZGERU_Correctness() {
        int incX = 1;
        int incY = 1;

        // Populate input allocations
        Allocation matrixAZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, mBLASData.dM));
        Allocation vectorXZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dM, 1));
        Allocation vectorYZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, 1));
        matrixAZ.copyFrom(mBLASData.L2_zGERU_A_mn);
        vectorXZ.copyFrom(mBLASData.L2_zGERU_x_m1);
        vectorYZ.copyFrom(mBLASData.L2_zGERU_y_n1);

        // Test for the default case: NO_TRANS
        mBLAS.ZGERU(alphaZ, vectorXZ, incX, vectorYZ, incY, matrixAZ);
        Allocation matrixARef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, mBLASData.dM));
        matrixARef.copyFrom(mBLASData.L2_zGERU_o_N);
        verifyMatrix(matrixARef, matrixAZ);

        // Test for incX = 2 & incY = 3;
        incX = 2;
        incY = 3;
        int dimX = 1 + (mBLASData.dM - 1) * incX;
        int dimY = 1 + (mBLASData.dN - 1) * incY;
        vectorXZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), dimX, 1));
        vectorYZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), dimY, 1));
        vectorXZ.copyFrom(mBLASData.L2_zGERU_x_m2);
        vectorYZ.copyFrom(mBLASData.L2_zGERU_y_n2);
        matrixAZ.copyFrom(mBLASData.L2_zGERU_A_mn);

        mBLAS.ZGERU(alphaZ, vectorXZ, incX, vectorYZ, incY, matrixAZ);
        verifyMatrix(matrixARef, matrixAZ);

        mRS.finish();
        checkError();
    }



    private void xGERC_API_test(int incX, int incY, ArrayList<Allocation> mMatrix) {
        for (Allocation matA : mMatrix) {
            for (Allocation vecX : mMatrix) {
                if (!validateVecInput(vecX)) {
                    continue;
                }
                for (Allocation vecY : mMatrix) {
                    if (!validateVecInput(vecY)) {
                        continue;
                    }
                    Element elemA = matA.getType().getElement();
                    if (validateGERU(elemA, vecX, incX, vecY, incY, matA)) {
                        try {
                            if (elemA.isCompatible(Element.F32_2(mRS))) {
                                mBLAS.CGERC(alphaC, vecX, incX, vecY, incY, matA);
                            } else if (elemA.isCompatible(Element.F64_2(mRS))) {
                                mBLAS.ZGERC(alphaZ, vecX, incX, vecY, incY, matA);
                            }
                        } catch (RSRuntimeException e) {
                            fail("should NOT throw RSRuntimeException");
                        }
                    } else {
                        try {
                            mBLAS.CGERC(alphaC, vecX, incX, vecY, incY, matA);
                            fail("should throw RSRuntimeException for CGERC");
                        } catch (RSRuntimeException e) {
                        }
                        try {
                            mBLAS.ZGERC(alphaZ, vecX, incX, vecY, incY, matA);
                            fail("should throw RSRuntimeException for ZGERC");
                        } catch (RSRuntimeException e) {
                        }
                    }
                }
            }
        }
    }

    private void L2_xGERC_API(ArrayList<Allocation> mMatrix) {
        for (int incX : mInc) {
            for (int incY : mInc) {
                xGERC_API_test(incX, incY, mMatrix);
            }
        }
    }

    public void test_L2_CGERC_API() {
        L2_xGERC_API(mMatrixC);
    }

    public void test_L2_ZGERC_API() {
        L2_xGERC_API(mMatrixZ);
    }

    public void test_L2_CGERC_Correctness() {
        int incX = 1;
        int incY = 1;

        // Populate input allocations
        Allocation matrixAC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, mBLASData.dM));
        Allocation vectorXC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dM, 1));
        Allocation vectorYC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, 1));
        matrixAC.copyFrom(mBLASData.L2_cGERC_A_mn);
        vectorXC.copyFrom(mBLASData.L2_cGERC_x_m1);
        vectorYC.copyFrom(mBLASData.L2_cGERC_y_n1);

        // Test for the default case: NO_TRANS
        mBLAS.CGERC(alphaC, vectorXC, incX, vectorYC, incY, matrixAC);
        Allocation matrixARef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, mBLASData.dM));
        matrixARef.copyFrom(mBLASData.L2_cGERC_o_N);
        verifyMatrix(matrixARef, matrixAC);

        // Test for incX = 2 & incY = 3;
        incX = 2;
        incY = 3;
        int dimX = 1 + (mBLASData.dM - 1) * incX;
        int dimY = 1 + (mBLASData.dN - 1) * incY;
        vectorXC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), dimX, 1));
        vectorYC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), dimY, 1));
        vectorXC.copyFrom(mBLASData.L2_cGERC_x_m2);
        vectorYC.copyFrom(mBLASData.L2_cGERC_y_n2);
        matrixAC.copyFrom(mBLASData.L2_cGERC_A_mn);

        mBLAS.CGERC(alphaC, vectorXC, incX, vectorYC, incY, matrixAC);
        verifyMatrix(matrixARef, matrixAC);

        mRS.finish();
        checkError();
    }

    public void test_L2_ZGERC_Correctness() {
        int incX = 1;
        int incY = 1;

        // Populate input allocations
        Allocation matrixAZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, mBLASData.dM));
        Allocation vectorXZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dM, 1));
        Allocation vectorYZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, 1));
        matrixAZ.copyFrom(mBLASData.L2_zGERC_A_mn);
        vectorXZ.copyFrom(mBLASData.L2_zGERC_x_m1);
        vectorYZ.copyFrom(mBLASData.L2_zGERC_y_n1);

        // Test for the default case: NO_TRANS
        mBLAS.ZGERC(alphaZ, vectorXZ, incX, vectorYZ, incY, matrixAZ);
        Allocation matrixARef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, mBLASData.dM));
        matrixARef.copyFrom(mBLASData.L2_zGERC_o_N);
        verifyMatrix(matrixARef, matrixAZ);

        // Test for incX = 2 & incY = 3;
        incX = 2;
        incY = 3;
        int dimX = 1 + (mBLASData.dM - 1) * incX;
        int dimY = 1 + (mBLASData.dN - 1) * incY;
        vectorXZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), dimX, 1));
        vectorYZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), dimY, 1));
        vectorXZ.copyFrom(mBLASData.L2_zGERC_x_m2);
        vectorYZ.copyFrom(mBLASData.L2_zGERC_y_n2);
        matrixAZ.copyFrom(mBLASData.L2_zGERC_A_mn);

        mBLAS.ZGERC(alphaZ, vectorXZ, incX, vectorYZ, incY, matrixAZ);
        verifyMatrix(matrixARef, matrixAZ);

        mRS.finish();
        checkError();
    }


    private void xHER_API_test(int Uplo, int incX, ArrayList<Allocation> mMatrix) {
        for (Allocation matA : mMatrix) {
            for (Allocation vecX : mMatrix) {
                if (!validateVecInput(vecX)) {
                    continue;
                }
                Element elemA = matA.getType().getElement();
                if (validateSYR(elemA, Uplo, vecX, incX, matA)) {
                    try {
                        if (elemA.isCompatible(Element.F32_2(mRS))) {
                            mBLAS.CHER(Uplo, alphaS, vecX, incX, matA);
                        } else if (elemA.isCompatible(Element.F64_2(mRS))) {
                            mBLAS.ZHER(Uplo, alphaD, vecX, incX, matA);
                        }
                    } catch (RSRuntimeException e) {
                        fail("should NOT throw RSRuntimeException");
                    }
                } else {
                    try {
                        mBLAS.CHER(Uplo, alphaS, vecX, incX, matA);
                        fail("should throw RSRuntimeException for CHER");
                    } catch (RSRuntimeException e) {
                    }
                    try {
                        mBLAS.ZHER(Uplo, alphaD, vecX, incX, matA);
                        fail("should throw RSRuntimeException for ZHER");
                    } catch (RSRuntimeException e) {
                    }
                }
            }
        }
    }

    public void L2_xHER_API(ArrayList<Allocation> mMatrix) {
        for (int Uplo : mUplo) {
            for (int incX : mInc) {
                xHER_API_test(Uplo, incX, mMatrix);
            }
        }
    }

    public void test_L2_CHER_API() {
        L2_xHER_API(mMatrixC);
    }

    public void test_L2_ZHER_API() {
        L2_xHER_API(mMatrixZ);
    }

    public void test_L2_CHER_Correctness() {
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int incX = 1;

        // Populate input allocations
        Allocation matrixAC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, mBLASData.dN));
        Allocation vectorXC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, 1));
        matrixAC.copyFrom(mBLASData.L2_cHER_A_nn);
        vectorXC.copyFrom(mBLASData.L2_cHER_x_n1);

        // Test for the default case: NO_TRANS
        mBLAS.CHER(uplo, alphaS, vectorXC, incX, matrixAC);
        Allocation matrixARef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, mBLASData.dN));
        matrixARef.copyFrom(mBLASData.L2_cHER_o_N);
        verifyMatrix(matrixARef, matrixAC, true);

        // Test for incX = 2;
        incX = 2;
        int dimX = 1 + (mBLASData.dN - 1) * incX;
        vectorXC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), dimX, 1));
        vectorXC.copyFrom(mBLASData.L2_cHER_x_n2);
        matrixAC.copyFrom(mBLASData.L2_cHER_A_nn);

        mBLAS.CHER(uplo, alphaS, vectorXC, incX, matrixAC);
        verifyMatrix(matrixARef, matrixAC, true);

        mRS.finish();
        checkError();
    }

    public void test_L2_ZHER_Correctness() {
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int incX = 1;

        // Populate input allocations
        Allocation matrixAZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, mBLASData.dN));
        Allocation vectorXZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, 1));
        matrixAZ.copyFrom(mBLASData.L2_zHER_A_nn);
        vectorXZ.copyFrom(mBLASData.L2_zHER_x_n1);

        // Test for the default case: NO_TRANS
        mBLAS.ZHER(uplo, alphaD, vectorXZ, incX, matrixAZ);
        Allocation matrixARef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, mBLASData.dN));
        matrixARef.copyFrom(mBLASData.L2_zHER_o_N);
        verifyMatrix(matrixARef, matrixAZ, true);

        // Test for incX = 2;
        incX = 2;
        int dimX = 1 + (mBLASData.dN - 1) * incX;
        vectorXZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), dimX, 1));
        vectorXZ.copyFrom(mBLASData.L2_zHER_x_n2);
        matrixAZ.copyFrom(mBLASData.L2_zHER_A_nn);

        mBLAS.ZHER(uplo, alphaD, vectorXZ, incX, matrixAZ);
        verifyMatrix(matrixARef, matrixAZ, true);

        mRS.finish();
        checkError();
    }


    private void xHPR_API_test(int Uplo, int incX, ArrayList<Allocation> mMatrix) {
        for (Allocation matA : mMatrix) {
            for (Allocation vecX : mMatrix) {
                if (!validateVecInput(vecX)) {
                    continue;
                }
                Element elemA = matA.getType().getElement();
                if (validateSPR(elemA, Uplo, vecX, incX, matA)) {
                    try {
                        if (elemA.isCompatible(Element.F32_2(mRS))) {
                            mBLAS.CHPR(Uplo, alphaS, vecX, incX, matA);
                        } else if (elemA.isCompatible(Element.F64_2(mRS))) {
                            mBLAS.ZHPR(Uplo, alphaD, vecX, incX, matA);
                        }
                    } catch (RSRuntimeException e) {
                        fail("should NOT throw RSRuntimeException");
                    }
                } else {
                    try {
                        mBLAS.CHPR(Uplo, alphaS, vecX, incX, matA);
                        fail("should throw RSRuntimeException for CHPR");
                    } catch (RSRuntimeException e) {
                    }
                    try {
                        mBLAS.ZHPR(Uplo, alphaD, vecX, incX, matA);
                        fail("should throw RSRuntimeException for ZHPR");
                    } catch (RSRuntimeException e) {
                    }
                }
            }
        }
    }

    public void L2_xHPR_API(ArrayList<Allocation> mMatrix) {
        for (int Uplo : mUplo) {
            for (int incX : mInc) {
                xHPR_API_test(Uplo, incX, mMatrix);
            }
        }
    }

    public void test_L2_CHPR_API() {
        L2_xHPR_API(mMatrixC);
    }

    public void test_L2_ZHPR_API() {
        L2_xHPR_API(mMatrixZ);
    }

    public void test_L2_CHPR_Correctness() {
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int incX = 1;

        // Populate input allocations
        int N = mBLASData.dN;
        Allocation matrixAC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), N * (N+1) / 2, 1));
        Allocation vectorXC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), N, 1));
        matrixAC.copyFrom(mBLASData.L2_cHER_A_nn_pu);
        vectorXC.copyFrom(mBLASData.L2_cHER_x_n1);

        // Test for the default case: NO_TRANS
        mBLAS.CHPR(uplo, alphaS, vectorXC, incX, matrixAC);
        Allocation matrixARef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), N * (N+1) / 2, 1));
        matrixARef.copyFrom(mBLASData.L2_cHER_o_N_pu);
        verifyMatrix(matrixARef, matrixAC, true);

        // Test for incX = 2;
        incX = 2;
        int dimX = 1 + (N - 1) * incX;
        vectorXC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), dimX, 1));
        vectorXC.copyFrom(mBLASData.L2_cHER_x_n2);
        matrixAC.copyFrom(mBLASData.L2_cHER_A_nn_pu);

        mBLAS.CHPR(uplo, alphaS, vectorXC, incX, matrixAC);
        verifyMatrix(matrixARef, matrixAC, true);

        mRS.finish();
        checkError();
    }

    public void test_L2_ZHPR_Correctness() {
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int incX = 1;

        // Populate input allocations
        int N = mBLASData.dN;
        Allocation matrixAZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), N * (N+1) / 2, 1));
        Allocation vectorXZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), N, 1));
        matrixAZ.copyFrom(mBLASData.L2_zHER_A_nn_pu);
        vectorXZ.copyFrom(mBLASData.L2_zHER_x_n1);

        // Test for the default case: NO_TRANS
        mBLAS.ZHPR(uplo, alphaD, vectorXZ, incX, matrixAZ);
        Allocation matrixARef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), N * (N+1) / 2, 1));
        matrixARef.copyFrom(mBLASData.L2_zHER_o_N_pu);
        verifyMatrix(matrixARef, matrixAZ, true);

        // Test for incX = 2;
        incX = 2;
        int dimX = 1 + (N - 1) * incX;
        vectorXZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), dimX, 1));
        vectorXZ.copyFrom(mBLASData.L2_zHER_x_n2);
        matrixAZ.copyFrom(mBLASData.L2_zHER_A_nn_pu);

        mBLAS.ZHPR(uplo, alphaD, vectorXZ, incX, matrixAZ);
        verifyMatrix(matrixARef, matrixAZ, true);

        mRS.finish();
        checkError();
    }


    private void xHER2_API_test(int Uplo, int incX, int incY, ArrayList<Allocation> mMatrix) {
        for (Allocation matA : mMatrix) {
            for (Allocation vecX : mMatrix) {
                if (!validateVecInput(vecX)) {
                    continue;
                }
                for (Allocation vecY : mMatrix) {
                    if (!validateVecInput(vecY)) {
                        continue;
                    }
                    Element elemA = matA.getType().getElement();
                    if (validateSYR2(elemA, Uplo, vecX, incX, vecY, incY, matA)) {
                        try {
                            if (elemA.isCompatible(Element.F32_2(mRS))) {
                                mBLAS.CHER2(Uplo, alphaC, vecX, incX, vecY, incY, matA);
                            } else if (elemA.isCompatible(Element.F64_2(mRS))) {
                                mBLAS.ZHER2(Uplo, alphaZ, vecX, incX, vecY, incY, matA);
                            }
                        } catch (RSRuntimeException e) {
                            fail("should NOT throw RSRuntimeException");
                        }
                    } else {
                        try {
                            mBLAS.CHER2(Uplo, alphaC, vecX, incX, vecY, incY, matA);
                            fail("should throw RSRuntimeException for CHER2");
                        } catch (RSRuntimeException e) {
                        }
                        try {
                            mBLAS.ZHER2(Uplo, alphaZ, vecX, incX, vecY, incY, matA);
                            fail("should throw RSRuntimeException for ZHER2");
                        } catch (RSRuntimeException e) {
                        }
                    }
                }
            }
        }
    }

    public void L2_xHER2_API(ArrayList<Allocation> mMatrix) {
        for (int Uplo : mUplo) {
            for (int incX : mInc) {
                xHER2_API_test(Uplo, incX, incX, mMatrix);
            }
        }
    }

    public void test_L2_CHER2_API() {
        L2_xHER2_API(mMatrixC);
    }

    public void test_L2_ZHER2_API() {
        L2_xHER2_API(mMatrixZ);
    }

    public void test_L2_CHER2_Correctness() {
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int incX = 1;
        int incY = 1;

        // Populate input allocations
        Allocation matrixAC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, mBLASData.dN));
        Allocation vectorXC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, 1));
        Allocation vectorYC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, 1));
        matrixAC.copyFrom(mBLASData.L2_cHER2_A_nn);
        vectorXC.copyFrom(mBLASData.L2_cHER2_x_n1);
        vectorYC.copyFrom(mBLASData.L2_cHER2_y_n1);

        // Test for the default case: NO_TRANS
        mBLAS.CHER2(uplo, alphaC, vectorXC, incX, vectorYC, incY, matrixAC);
        Allocation matrixARef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, mBLASData.dN));
        matrixARef.copyFrom(mBLASData.L2_cHER2_o_N);
        verifyMatrix(matrixARef, matrixAC, true);

        // Test for incX = 2 & incY = 3;
        incX = 2;
        incY = 3;
        int dimX = 1 + (mBLASData.dN - 1) * incX;
        int dimY = 1 + (mBLASData.dN - 1) * incY;
        vectorXC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), dimX, 1));
        vectorYC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), dimY, 1));
        vectorXC.copyFrom(mBLASData.L2_cHER2_x_n2);
        vectorYC.copyFrom(mBLASData.L2_cHER2_y_n2);
        matrixAC.copyFrom(mBLASData.L2_cHER2_A_nn);

        mBLAS.CHER2(uplo, alphaC, vectorXC, incX, vectorYC, incY, matrixAC);
        verifyMatrix(matrixARef, matrixAC, true);

        mRS.finish();
        checkError();
    }

    public void test_L2_ZHER2_Correctness() {
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int incX = 1;
        int incY = 1;

        // Populate input allocations
        Allocation matrixAZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, mBLASData.dN));
        Allocation vectorXZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, 1));
        Allocation vectorYZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, 1));
        matrixAZ.copyFrom(mBLASData.L2_zHER2_A_nn);
        vectorXZ.copyFrom(mBLASData.L2_zHER2_x_n1);
        vectorYZ.copyFrom(mBLASData.L2_zHER2_y_n1);

        // Test for the default case: NO_TRANS
        mBLAS.ZHER2(uplo, alphaZ, vectorXZ, incX, vectorYZ, incY, matrixAZ);
        Allocation matrixARef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, mBLASData.dN));
        matrixARef.copyFrom(mBLASData.L2_zHER2_o_N);
        verifyMatrix(matrixARef, matrixAZ, true);

        // Test for incX = 2 & incY = 3;
        incX = 2;
        incY = 3;
        int dimX = 1 + (mBLASData.dN - 1) * incX;
        int dimY = 1 + (mBLASData.dN - 1) * incY;
        vectorXZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), dimX, 1));
        vectorYZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), dimY, 1));
        vectorXZ.copyFrom(mBLASData.L2_zHER2_x_n2);
        vectorYZ.copyFrom(mBLASData.L2_zHER2_y_n2);
        matrixAZ.copyFrom(mBLASData.L2_zHER2_A_nn);

        mBLAS.ZHER2(uplo, alphaZ, vectorXZ, incX, vectorYZ, incY, matrixAZ);
        verifyMatrix(matrixARef, matrixAZ, true);

        mRS.finish();
        checkError();
    }



    private void xHPR2_API_test(int Uplo, int incX, int incY, ArrayList<Allocation> mMatrix) {
        for (Allocation matA : mMatrix) {
            for (Allocation vecX : mMatrix) {
                if (!validateVecInput(vecX)) {
                    continue;
                }
                for (Allocation vecY : mMatrix) {
                    if (!validateVecInput(vecY)) {
                        continue;
                    }
                    Element elemA = matA.getType().getElement();
                    if (validateSPR2(elemA, Uplo, vecX, incX, vecY, incY, matA)) {
                        try {
                            if (elemA.isCompatible(Element.F32_2(mRS))) {
                                mBLAS.CHPR2(Uplo, alphaC, vecX, incX, vecY, incY, matA);
                            } else if (elemA.isCompatible(Element.F64_2(mRS))) {
                                mBLAS.ZHPR2(Uplo, alphaZ, vecX, incX, vecY, incY, matA);
                            }
                        } catch (RSRuntimeException e) {
                            fail("should NOT throw RSRuntimeException");
                        }
                    } else {
                        try {
                            mBLAS.CHPR2(Uplo, alphaC, vecX, incX, vecY, incY, matA);
                            fail("should throw RSRuntimeException for CHPR2");
                        } catch (RSRuntimeException e) {
                        }
                        try {
                            mBLAS.ZHPR2(Uplo, alphaZ, vecX, incX, vecY, incY, matA);
                            fail("should throw RSRuntimeException for ZHPR2");
                        } catch (RSRuntimeException e) {
                        }
                    }
                }
            }
        }
    }

    public void L2_xHPR2_API(ArrayList<Allocation> mMatrix) {
        for (int Uplo : mUplo) {
            for (int incX : mInc) {
                xHPR2_API_test(Uplo, incX, incX, mMatrix);
            }
        }
    }

    public void test_L2_CHPR2_API() {
        L2_xHPR2_API(mMatrixC);
    }

    public void test_L2_ZHPR2_API() {
        L2_xHPR2_API(mMatrixZ);
    }

    public void test_L2_CHPR2_Correctness() {
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int incX = 1;
        int incY = 1;

        // Populate input allocations
        int N = mBLASData.dN;
        Allocation matrixAC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), N * (N+1) / 2, 1));
        Allocation vectorXC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), N, 1));
        Allocation vectorYC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), N, 1));
        matrixAC.copyFrom(mBLASData.L2_cHER2_A_nn_pu);
        vectorXC.copyFrom(mBLASData.L2_cHER2_x_n1);
        vectorYC.copyFrom(mBLASData.L2_cHER2_y_n1);

        // Test for the default case: NO_TRANS
        mBLAS.CHPR2(uplo, alphaC, vectorXC, incX, vectorYC, incY, matrixAC);
        Allocation matrixARef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), N * (N+1) / 2, 1));
        matrixARef.copyFrom(mBLASData.L2_cHER2_o_N_pu);
        verifyMatrix(matrixARef, matrixAC, true);

        // Test for incX = 2 & incY = 3;
        incX = 2;
        incY = 3;
        int dimX = 1 + (N - 1) * incX;
        int dimY = 1 + (N - 1) * incY;
        vectorXC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), dimX, 1));
        vectorYC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), dimY, 1));
        vectorXC.copyFrom(mBLASData.L2_cHER2_x_n2);
        vectorYC.copyFrom(mBLASData.L2_cHER2_y_n2);
        matrixAC.copyFrom(mBLASData.L2_cHER2_A_nn_pu);

        mBLAS.CHPR2(uplo, alphaC, vectorXC, incX, vectorYC, incY, matrixAC);
        verifyMatrix(matrixARef, matrixAC, true);

        mRS.finish();
        checkError();
    }

    public void test_L2_ZHPR2_Correctness() {
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int incX = 1;
        int incY = 1;

        // Populate input allocations
        int N = mBLASData.dN;
        Allocation matrixAZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), N * (N+1) / 2, 1));
        Allocation vectorXZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), N, 1));
        Allocation vectorYZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), N, 1));
        matrixAZ.copyFrom(mBLASData.L2_zHER2_A_nn_pu);
        vectorXZ.copyFrom(mBLASData.L2_zHER2_x_n1);
        vectorYZ.copyFrom(mBLASData.L2_zHER2_y_n1);

        // Test for the default case: NO_TRANS
        mBLAS.ZHPR2(uplo, alphaZ, vectorXZ, incX, vectorYZ, incY, matrixAZ);
        Allocation matrixARef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), N * (N+1) / 2, 1));
        matrixARef.copyFrom(mBLASData.L2_zHER2_o_N_pu);
        verifyMatrix(matrixARef, matrixAZ, true);

        // Test for incX = 2 & incY = 3;
        incX = 2;
        incY = 3;
        int dimX = 1 + (N - 1) * incX;
        int dimY = 1 + (N - 1) * incY;
        vectorXZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), dimX, 1));
        vectorYZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), dimY, 1));
        vectorXZ.copyFrom(mBLASData.L2_zHER2_x_n2);
        vectorYZ.copyFrom(mBLASData.L2_zHER2_y_n2);
        matrixAZ.copyFrom(mBLASData.L2_zHER2_A_nn_pu);

        mBLAS.ZHPR2(uplo, alphaZ, vectorXZ, incX, vectorYZ, incY, matrixAZ);
        verifyMatrix(matrixARef, matrixAZ, true);

        mRS.finish();
        checkError();
    }



    private boolean validateSYR(Element e, int Uplo, Allocation X, int incX, Allocation A) {
        if (!validateUplo(Uplo)) {
            return false;
        }
        if (!A.getType().getElement().isCompatible(e) ||
            !X.getType().getElement().isCompatible(e)) {
            return false;
        }

        int N = A.getType().getX();

        if (X.getType().getY() > 1) {
            return false;
        }
        if (N != A.getType().getY()) {
            return false;
        }
        if (incX <= 0) {
            return false;
        }
        int expectedXDim = 1 + (N - 1) * incX;
        if (X.getType().getX() != expectedXDim) {
            return false;
        }
        return true;
    }

    private void xSYR_API_test(int Uplo, int incX, ArrayList<Allocation> mMatrix) {
        for (Allocation matA : mMatrix) {
            for (Allocation vecX : mMatrix) {
                if (!validateVecInput(vecX)) {
                    continue;
                }
                Element elemA = matA.getType().getElement();
                if (validateSYR(elemA, Uplo, vecX, incX, matA)) {
                    try {
                        if (elemA.isCompatible(Element.F32(mRS))) {
                            mBLAS.SSYR(Uplo, alphaS, vecX, incX, matA);
                        } else if (elemA.isCompatible(Element.F64(mRS))) {
                            mBLAS.DSYR(Uplo, alphaD, vecX, incX, matA);
                        }
                    } catch (RSRuntimeException e) {
                        fail("should NOT throw RSRuntimeException");
                    }
                } else {
                    try {
                        mBLAS.SSYR(Uplo, alphaS, vecX, incX, matA);
                        fail("should throw RSRuntimeException for SSYR");
                    } catch (RSRuntimeException e) {
                    }
                    try {
                        mBLAS.DSYR(Uplo, alphaD, vecX, incX, matA);
                        fail("should throw RSRuntimeException for DSYR");
                    } catch (RSRuntimeException e) {
                    }
                }
            }
        }
    }

    public void L2_xSYR_API(ArrayList<Allocation> mMatrix) {
        for (int Uplo : mUplo) {
            for (int incX : mInc) {
                xSYR_API_test(Uplo, incX, mMatrix);
            }
        }
    }

    public void test_L2_SSYR_API() {
        L2_xSYR_API(mMatrixS);
    }

    public void test_L2_DSYR_API() {
        L2_xSYR_API(mMatrixD);
    }

    public void test_L2_SSYR_Correctness() {
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int incX = 1;

        // Populate input allocations
        Allocation matrixAS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, mBLASData.dN));
        Allocation vectorXS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, 1));
        matrixAS.copyFrom(mBLASData.L2_sSYR_A_nn);
        vectorXS.copyFrom(mBLASData.L2_sSYR_x_n1);

        // Test for the default case: NO_TRANS
        mBLAS.SSYR(uplo, alphaS, vectorXS, incX, matrixAS);
        Allocation matrixARef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, mBLASData.dN));
        matrixARef.copyFrom(mBLASData.L2_sSYR_o_N);
        verifyMatrix(matrixARef, matrixAS, true);

        // Test for incX = 2;
        incX = 2;
        int dimX = 1 + (mBLASData.dN - 1) * incX;
        vectorXS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), dimX, 1));
        vectorXS.copyFrom(mBLASData.L2_sSYR_x_n2);
        matrixAS.copyFrom(mBLASData.L2_sSYR_A_nn);

        mBLAS.SSYR(uplo, alphaS, vectorXS, incX, matrixAS);
        verifyMatrix(matrixARef, matrixAS, true);

        mRS.finish();
        checkError();
    }

    public void test_L2_DSYR_Correctness() {
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int incX = 1;

        // Populate input allocations
        Allocation matrixAD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, mBLASData.dN));
        Allocation vectorXD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, 1));
        matrixAD.copyFrom(mBLASData.L2_dSYR_A_nn);
        vectorXD.copyFrom(mBLASData.L2_dSYR_x_n1);

        // Test for the default case: NO_TRANS
        mBLAS.DSYR(uplo, alphaD, vectorXD, incX, matrixAD);
        Allocation matrixARef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, mBLASData.dN));
        matrixARef.copyFrom(mBLASData.L2_dSYR_o_N);
        verifyMatrix(matrixARef, matrixAD, true);

        // Test for incX = 2;
        incX = 2;
        int dimX = 1 + (mBLASData.dN - 1) * incX;
        vectorXD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), dimX, 1));
        vectorXD.copyFrom(mBLASData.L2_dSYR_x_n2);
        matrixAD.copyFrom(mBLASData.L2_dSYR_A_nn);

        mBLAS.DSYR(uplo, alphaD, vectorXD, incX, matrixAD);
        verifyMatrix(matrixARef, matrixAD, true);

        mRS.finish();
        checkError();
    }


    private boolean validateSPR(Element e, int Uplo, Allocation X, int incX, Allocation Ap) {
        if (!validateUplo(Uplo)) {
            return false;
        }
        if (!Ap.getType().getElement().isCompatible(e) ||
            !X.getType().getElement().isCompatible(e)) {
            return false;
        }
        if (X.getType().getY() > 1) {
            return false;
        }

        if (Ap.getType().getY() > 1) {
            return false;
        }

        int N = (int)Math.sqrt((double)Ap.getType().getX() * 2);
        if (Ap.getType().getX() != ((N * (N+1)) / 2)) {
            return false;
        }
        if (incX <= 0) {
            return false;
        }
        int expectedXDim = 1 + (N - 1) * incX;
        if (X.getType().getX() != expectedXDim) {
            return false;
        }

        return true;
    }

    private void xSPR_API_test(int Uplo, int incX, ArrayList<Allocation> mMatrix) {
        for (Allocation matA : mMatrix) {
            for (Allocation vecX : mMatrix) {
                if (!validateVecInput(vecX)) {
                    continue;
                }
                Element elemA = matA.getType().getElement();
                if (validateSPR(elemA, Uplo, vecX, incX, matA)) {
                    try {
                        if (elemA.isCompatible(Element.F32(mRS))) {
                            mBLAS.SSPR(Uplo, alphaS, vecX, incX, matA);
                        } else if (elemA.isCompatible(Element.F64(mRS))) {
                            mBLAS.DSPR(Uplo, alphaD, vecX, incX, matA);
                        }
                    } catch (RSRuntimeException e) {
                        fail("should NOT throw RSRuntimeException");
                    }
                } else {
                    try {
                        mBLAS.SSPR(Uplo, alphaS, vecX, incX, matA);
                        fail("should throw RSRuntimeException for SSPR");
                    } catch (RSRuntimeException e) {
                    }
                    try {
                        mBLAS.DSPR(Uplo, alphaD, vecX, incX, matA);
                        fail("should throw RSRuntimeException for DSPR");
                    } catch (RSRuntimeException e) {
                    }
                }
            }
        }
    }

    public void L2_xSPR_API(ArrayList<Allocation> mMatrix) {
        for (int Uplo : mUplo) {
            for (int incX : mInc) {
                xSPR_API_test(Uplo, incX, mMatrix);
            }
        }
    }

    public void test_L2_SSPR_API() {
        L2_xSPR_API(mMatrixS);
    }

    public void test_L2_DSPR_API() {
        L2_xSPR_API(mMatrixD);
    }

    public void test_L2_SSPR_Correctness() {
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int incX = 1;

        // Populate input allocations
        int N = mBLASData.dN;
        Allocation matrixAS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), N * (N+1) / 2, 1));
        Allocation vectorXS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), N, 1));
        matrixAS.copyFrom(mBLASData.L2_sSYR_A_nn_pu);
        vectorXS.copyFrom(mBLASData.L2_sSYR_x_n1);

        // Test for the default case: NO_TRANS
        mBLAS.SSPR(uplo, alphaS, vectorXS, incX, matrixAS);
        Allocation matrixARef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), N * (N+1) / 2, 1));
        matrixARef.copyFrom(mBLASData.L2_sSYR_o_N_pu);
        verifyMatrix(matrixARef, matrixAS, true);

        // Test for incX = 2;
        incX = 2;
        int dimX = 1 + (N - 1) * incX;
        vectorXS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), dimX, 1));
        vectorXS.copyFrom(mBLASData.L2_sSYR_x_n2);
        matrixAS.copyFrom(mBLASData.L2_sSYR_A_nn_pu);

        mBLAS.SSPR(uplo, alphaS, vectorXS, incX, matrixAS);
        verifyMatrix(matrixARef, matrixAS, true);

        mRS.finish();
        checkError();
    }

    public void test_L2_DSPR_Correctness() {
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int incX = 1;

        // Populate input allocations
        int N = mBLASData.dN;
        Allocation matrixAD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), N * (N+1) / 2, 1));
        Allocation vectorXD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), N, 1));
        matrixAD.copyFrom(mBLASData.L2_dSYR_A_nn_pu);
        vectorXD.copyFrom(mBLASData.L2_dSYR_x_n1);

        // Test for the default case: NO_TRANS
        mBLAS.DSPR(uplo, alphaD, vectorXD, incX, matrixAD);
        Allocation matrixARef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), N * (N+1) / 2, 1));
        matrixARef.copyFrom(mBLASData.L2_dSYR_o_N_pu);
        verifyMatrix(matrixARef, matrixAD, true);

        // Test for incX = 2;
        incX = 2;
        int dimX = 1 + (N - 1) * incX;
        vectorXD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), dimX, 1));
        vectorXD.copyFrom(mBLASData.L2_dSYR_x_n2);
        matrixAD.copyFrom(mBLASData.L2_dSYR_A_nn_pu);

        mBLAS.DSPR(uplo, alphaD, vectorXD, incX, matrixAD);
        verifyMatrix(matrixARef, matrixAD, true);

        mRS.finish();
        checkError();
    }


    private boolean validateSYR2(Element e, int Uplo, Allocation X, int incX, Allocation Y, int incY, Allocation A) {
        if (!validateUplo(Uplo)) {
            return false;
        }
        if (!A.getType().getElement().isCompatible(e) ||
            !X.getType().getElement().isCompatible(e) ||
            !Y.getType().getElement().isCompatible(e)) {
            return false;
        }

        if (X.getType().getY() > 1 || Y.getType().getY() > 1) {
            return false;
        }

        int N = A.getType().getX();

        if (N != A.getType().getY()) {
            return false;
        }
        if (incX <= 0 || incY <= 0) {
            return false;
        }
        int expectedXDim = 1 + (N - 1) * incX;
        int expectedYDim = 1 + (N - 1) * incY;
        if (X.getType().getX() != expectedXDim || Y.getType().getX() != expectedYDim) {
            return false;
        }
        return true;
    }

    private void xSYR2_API_test(int Uplo, int incX, int incY, ArrayList<Allocation> mMatrix) {
        for (Allocation matA : mMatrix) {
            for (Allocation vecX : mMatrix) {
                if (!validateVecInput(vecX)) {
                    continue;
                }
                for (Allocation vecY : mMatrix) {
                    if (!validateVecInput(vecY)) {
                        continue;
                    }
                    Element elemA = matA.getType().getElement();
                    if (validateSYR2(elemA, Uplo, vecX, incX, vecY, incY, matA)) {
                        try {
                            if (elemA.isCompatible(Element.F32(mRS))) {
                                mBLAS.SSYR2(Uplo, alphaS, vecX, incX, vecY, incY, matA);
                            } else if (elemA.isCompatible(Element.F64(mRS))) {
                                mBLAS.DSYR2(Uplo, alphaD, vecX, incX, vecY, incY, matA);
                            }
                        } catch (RSRuntimeException e) {
                            fail("should NOT throw RSRuntimeException");
                        }
                    } else {
                        try {
                            mBLAS.SSYR2(Uplo, alphaS, vecX, incX, vecY, incY, matA);
                            fail("should throw RSRuntimeException for SSYR2");
                        } catch (RSRuntimeException e) {
                        }
                        try {
                            mBLAS.DSYR2(Uplo, alphaD, vecX, incX, vecY, incY, matA);
                            fail("should throw RSRuntimeException for DSYR2");
                        } catch (RSRuntimeException e) {
                        }
                    }
                }
            }
        }
    }

    public void L2_xSYR2_API(ArrayList<Allocation> mMatrix) {
        for (int Uplo : mUplo) {
            for (int incX : mInc) {
                xSYR2_API_test(Uplo, incX, incX, mMatrix);
            }
        }
    }

    public void test_L2_SSYR2_API() {
        L2_xSYR2_API(mMatrixS);
    }

    public void test_L2_DSYR2_API() {
        L2_xSYR2_API(mMatrixD);
    }

    public void test_L2_SSYR2_Correctness() {
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int incX = 1;
        int incY = 1;

        // Populate input allocations
        Allocation matrixAS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, mBLASData.dN));
        Allocation vectorXS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, 1));
        Allocation vectorYS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, 1));
        matrixAS.copyFrom(mBLASData.L2_sSYR2_A_nn);
        vectorXS.copyFrom(mBLASData.L2_sSYR2_x_n1);
        vectorYS.copyFrom(mBLASData.L2_sSYR2_y_n1);

        // Test for the default case: NO_TRANS
        mBLAS.SSYR2(uplo, alphaS, vectorXS, incX, vectorYS, incY, matrixAS);
        Allocation matrixARef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, mBLASData.dN));
        matrixARef.copyFrom(mBLASData.L2_sSYR2_o_N);
        verifyMatrix(matrixARef, matrixAS, true);

        // Test for incX = 2 & incY = 3;
        incX = 2;
        incY = 3;
        int dimX = 1 + (mBLASData.dN - 1) * incX;
        int dimY = 1 + (mBLASData.dN - 1) * incY;
        vectorXS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), dimX, 1));
        vectorYS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), dimY, 1));
        vectorXS.copyFrom(mBLASData.L2_sSYR2_x_n2);
        vectorYS.copyFrom(mBLASData.L2_sSYR2_y_n2);
        matrixAS.copyFrom(mBLASData.L2_sSYR2_A_nn);

        mBLAS.SSYR2(uplo, alphaS, vectorXS, incX, vectorYS, incY, matrixAS);
        verifyMatrix(matrixARef, matrixAS, true);

        mRS.finish();
        checkError();
    }

    public void test_L2_DSYR2_Correctness() {
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int incX = 1;
        int incY = 1;

        // Populate input allocations
        Allocation matrixAD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, mBLASData.dN));
        Allocation vectorXD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, 1));
        Allocation vectorYD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, 1));
        matrixAD.copyFrom(mBLASData.L2_dSYR2_A_nn);
        vectorXD.copyFrom(mBLASData.L2_dSYR2_x_n1);
        vectorYD.copyFrom(mBLASData.L2_dSYR2_y_n1);

        // Test for the default case: NO_TRANS
        mBLAS.DSYR2(uplo, alphaD, vectorXD, incX, vectorYD, incY, matrixAD);
        Allocation matrixARef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, mBLASData.dN));
        matrixARef.copyFrom(mBLASData.L2_dSYR2_o_N);
        verifyMatrix(matrixARef, matrixAD, true);

        // Test for incX = 2 & incY = 3;
        incX = 2;
        incY = 3;
        int dimX = 1 + (mBLASData.dN - 1) * incX;
        int dimY = 1 + (mBLASData.dN - 1) * incY;
        vectorXD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), dimX, 1));
        vectorYD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), dimY, 1));
        vectorXD.copyFrom(mBLASData.L2_dSYR2_x_n2);
        vectorYD.copyFrom(mBLASData.L2_dSYR2_y_n2);
        matrixAD.copyFrom(mBLASData.L2_dSYR2_A_nn);

        mBLAS.DSYR2(uplo, alphaD, vectorXD, incX, vectorYD, incY, matrixAD);
        verifyMatrix(matrixARef, matrixAD, true);

        mRS.finish();
        checkError();
    }


    private boolean validateSPR2(Element e, int Uplo, Allocation X, int incX, Allocation Y, int incY, Allocation Ap) {
        if (!validateUplo(Uplo)) {
            return false;
        }
        if (!Ap.getType().getElement().isCompatible(e) ||
            !X.getType().getElement().isCompatible(e) ||
            !Y.getType().getElement().isCompatible(e)) {
            return false;
        }
        if (X.getType().getY() > 1 || Y.getType().getY() > 1) {
            return false;
        }

        if (Ap.getType().getY() > 1) {
            return false;
        }

        int N = (int)Math.sqrt((double)Ap.getType().getX() * 2);
        if (Ap.getType().getX() != ((N * (N+1)) / 2)) {
            return false;
        }
        if (incX <= 0 || incY <= 0) {
            return false;
        }
        int expectedXDim = 1 + (N - 1) * incX;
        int expectedYDim = 1 + (N - 1) * incY;
        if (X.getType().getX() != expectedXDim || Y.getType().getX() != expectedYDim) {
            return false;
        }

        return true;
    }

    private void xSPR2_API_test(int Uplo, int incX, int incY, ArrayList<Allocation> mMatrix) {
        for (Allocation matA : mMatrix) {
            for (Allocation vecX : mMatrix) {
                if (!validateVecInput(vecX)) {
                    continue;
                }
                for (Allocation vecY : mMatrix) {
                    if (!validateVecInput(vecY)) {
                        continue;
                    }
                    Element elemA = matA.getType().getElement();
                    if (validateSPR2(elemA, Uplo, vecX, incX, vecY, incY, matA)) {
                        try {
                            if (elemA.isCompatible(Element.F32(mRS))) {
                                mBLAS.SSPR2(Uplo, alphaS, vecX, incX, vecY, incY, matA);
                            } else if (elemA.isCompatible(Element.F64(mRS))) {
                                mBLAS.DSPR2(Uplo, alphaD, vecX, incX, vecY, incY, matA);
                            }
                        } catch (RSRuntimeException e) {
                            fail("should NOT throw RSRuntimeException");
                        }
                    } else {
                        try {
                            mBLAS.SSPR2(Uplo, alphaS, vecX, incX, vecY, incY, matA);
                            fail("should throw RSRuntimeException for SSPR2");
                        } catch (RSRuntimeException e) {
                        }
                        try {
                            mBLAS.DSPR2(Uplo, alphaD, vecX, incX, vecY, incY, matA);
                            fail("should throw RSRuntimeException for DSPR2");
                        } catch (RSRuntimeException e) {
                        }
                    }
                }
            }
        }
    }

    public void L2_xSPR2_API(ArrayList<Allocation> mMatrix) {
        for (int Uplo : mUplo) {
            for (int incX : mInc) {
                xSPR2_API_test(Uplo, incX, incX, mMatrix);
            }
        }
    }

    public void test_L2_SSPR2_API() {
        L2_xSPR2_API(mMatrixS);
    }

    public void test_L2_DSPR2_API() {
        L2_xSPR2_API(mMatrixD);
    }

    public void test_L2_SSPR2_Correctness() {
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int incX = 1;
        int incY = 1;

        // Populate input allocations
        int N = mBLASData.dN;
        Allocation matrixAS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), N * (N+1) / 2, 1));
        Allocation vectorXS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), N, 1));
        Allocation vectorYS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), N, 1));
        matrixAS.copyFrom(mBLASData.L2_sSYR2_A_nn_pu);
        vectorXS.copyFrom(mBLASData.L2_sSYR2_x_n1);
        vectorYS.copyFrom(mBLASData.L2_sSYR2_y_n1);

        // Test for the default case: NO_TRANS
        mBLAS.SSPR2(uplo, alphaS, vectorXS, incX, vectorYS, incY, matrixAS);
        Allocation matrixARef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), N * (N+1) / 2, 1));
        matrixARef.copyFrom(mBLASData.L2_sSYR2_o_N_pu);
        verifyMatrix(matrixARef, matrixAS, true);

        // Test for incX = 2 & incY = 3;
        incX = 2;
        incY = 3;
        int dimX = 1 + (N - 1) * incX;
        int dimY = 1 + (N - 1) * incY;
        vectorXS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), dimX, 1));
        vectorYS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), dimY, 1));
        vectorXS.copyFrom(mBLASData.L2_sSYR2_x_n2);
        vectorYS.copyFrom(mBLASData.L2_sSYR2_y_n2);
        matrixAS.copyFrom(mBLASData.L2_sSYR2_A_nn_pu);

        mBLAS.SSPR2(uplo, alphaS, vectorXS, incX, vectorYS, incY, matrixAS);
        verifyMatrix(matrixARef, matrixAS, true);

        mRS.finish();
        checkError();
    }

    public void test_L2_DSPR2_Correctness() {
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int incX = 1;
        int incY = 1;

        // Populate input allocations
        int N = mBLASData.dN;
        Allocation matrixAD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), N * (N+1) / 2, 1));
        Allocation vectorXD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), N, 1));
        Allocation vectorYD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), N, 1));
        matrixAD.copyFrom(mBLASData.L2_dSYR2_A_nn_pu);
        vectorXD.copyFrom(mBLASData.L2_dSYR2_x_n1);
        vectorYD.copyFrom(mBLASData.L2_dSYR2_y_n1);

        // Test for the default case: NO_TRANS
        mBLAS.DSPR2(uplo, alphaD, vectorXD, incX, vectorYD, incY, matrixAD);
        Allocation matrixARef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), N * (N+1) / 2, 1));
        matrixARef.copyFrom(mBLASData.L2_dSYR2_o_N_pu);
        verifyMatrix(matrixARef, matrixAD, true);

        // Test for incX = 2 & incY = 3;
        incX = 2;
        incY = 3;
        int dimX = 1 + (N - 1) * incX;
        int dimY = 1 + (N - 1) * incY;
        vectorXD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), dimX, 1));
        vectorYD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), dimY, 1));
        vectorXD.copyFrom(mBLASData.L2_dSYR2_x_n2);
        vectorYD.copyFrom(mBLASData.L2_dSYR2_y_n2);
        matrixAD.copyFrom(mBLASData.L2_dSYR2_A_nn_pu);

        mBLAS.DSPR2(uplo, alphaD, vectorXD, incX, vectorYD, incY, matrixAD);
        verifyMatrix(matrixARef, matrixAD, true);

        mRS.finish();
        checkError();
    }



    private boolean validateL3(Element e, int TransA, int TransB, int Side, Allocation A, Allocation B, Allocation C) {
        int aM = -1, aN = -1, bM = -1, bN = -1, cM = -1, cN = -1;
        if ((A != null && !A.getType().getElement().isCompatible(e)) ||
            (B != null && !B.getType().getElement().isCompatible(e)) ||
            (C != null && !C.getType().getElement().isCompatible(e))) {
            return false;
        }
        if (C == null) {
            //since matrix C is used to store the result, it cannot be null.
            return false;
        }
        cM = C.getType().getY();
        cN = C.getType().getX();

        if (Side == ScriptIntrinsicBLAS.RIGHT) {
            if ((A == null && B != null) || (A != null && B == null)) {
                return false;
            }
            if (B != null) {
                bM = A.getType().getY();
                bN = A.getType().getX();
            }
            if (A != null) {
                aM = B.getType().getY();
                aN = B.getType().getX();
            }
        } else {
            if (A != null) {
                if (TransA == ScriptIntrinsicBLAS.TRANSPOSE ||
                    TransA == ScriptIntrinsicBLAS.CONJ_TRANSPOSE ) {
                    aN = A.getType().getY();
                    aM = A.getType().getX();
                } else {
                    aM = A.getType().getY();
                    aN = A.getType().getX();
                }
            }
            if (B != null) {
                if (TransB == ScriptIntrinsicBLAS.TRANSPOSE ||
                    TransB == ScriptIntrinsicBLAS.CONJ_TRANSPOSE ) {
                    bN = B.getType().getY();
                    bM = B.getType().getX();
                } else {
                    bM = B.getType().getY();
                    bN = B.getType().getX();
                }
            }
        }
        if (A != null && B != null && C != null) {
            if (aN != bM || aM != cM || bN != cN) {
                return false;
            }
        } else if (A != null && C != null) {
            // A and C only, for SYRK
            if (cM != cN) {
                return false;
            }
            if (aM != cM) {
                return false;
            }
        } else if (A != null && B != null) {
            // A and B only
            if (aN != bM) {
                return false;
            }
        }

        return true;
    }

    private boolean validateL3_xGEMM(Element e, int TransA, int TransB, Allocation A, Allocation B, Allocation C) {
        boolean result = true;
        result &= validateTranspose(TransA);
        result &= validateTranspose(TransB);
        result &= validateL3(e, TransA, TransB, 0, A, B, C);

        return result;
    }

    private void xGEMM_API_test(int transA, int transB, ArrayList<Allocation> mMatrix) {
        for (Allocation matA : mMatrix) {
            for (Allocation matB : mMatrix) {
                for (Allocation matC : mMatrix) {
                    Element elemA = matA.getType().getElement();
                    if (validateL3_xGEMM(elemA, transA, transB, matA, matB, matC)) {
                        try {
                            if (elemA.isCompatible(Element.F32(mRS))) {
                                mBLAS.SGEMM(transA, transB, alphaS, matA, matB, betaS, matC);
                            } else if (elemA.isCompatible(Element.F64(mRS))) {
                                mBLAS.DGEMM(transA, transB, alphaD, matA, matB, betaD, matC);
                            } else if (elemA.isCompatible(Element.F32_2(mRS))) {
                                mBLAS.CGEMM(transA, transB, alphaC, matA, matB, betaC, matC);
                            } else if (elemA.isCompatible(Element.F64_2(mRS))) {
                                mBLAS.ZGEMM(transA, transB, alphaZ, matA, matB, betaZ, matC);
                            }
                        } catch (RSRuntimeException e) {
                            fail("should NOT throw RSRuntimeException");
                        }
                    } else {
                        try {
                            mBLAS.SGEMM(transA, transB, alphaS, matA, matB, betaS, matC);
                            fail("should throw RSRuntimeException for SGEMM");
                        } catch (RSRuntimeException e) {
                        }
                        try {
                            mBLAS.DGEMM(transA, transB, alphaD, matA, matB, betaD, matC);
                            fail("should throw RSRuntimeException for DGEMM");
                        } catch (RSRuntimeException e) {
                        }
                        try {
                            mBLAS.CGEMM(transA, transB, alphaC, matA, matB, betaC, matC);
                            fail("should throw RSRuntimeException for CGEMM");
                        } catch (RSRuntimeException e) {
                        }
                        try {
                            mBLAS.ZGEMM(transA, transB, alphaZ, matA, matB, betaZ, matC);
                            fail("should throw RSRuntimeException for ZGEMM");
                        } catch (RSRuntimeException e) {
                        }
                    }
                }
            }
        }
    }

    private void L3_xGEMM_API(ArrayList<Allocation> mMatrix) {
        for (int transA : mTranspose) {
            for (int transB : mTranspose) {
                xGEMM_API_test(transA, transB, mMatrix);
            }
        }
    }

    public void test_L3_SGEMM_API() {
        L3_xGEMM_API(mMatrixS);
    }

    public void test_L3_DGEMM_API() {
        L3_xGEMM_API(mMatrixD);
    }

    public void test_L3_CGEMM_API() {
        L3_xGEMM_API(mMatrixC);
    }

    public void test_L3_ZGEMM_API() {
        L3_xGEMM_API(mMatrixZ);
    }


    public void test_L3_SGEMM_Correctness() {
        int transA = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int transB = ScriptIntrinsicBLAS.NO_TRANSPOSE;

        // Populate input allocations
        Allocation matrixAS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dK, mBLASData.dM));
        Allocation matrixBS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, mBLASData.dK));
        Allocation matrixCS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, mBLASData.dM));
        matrixAS.copyFrom(mBLASData.L3_sGEMM_A_mk);
        matrixBS.copyFrom(mBLASData.L3_sGEMM_B_kn);
        matrixCS.copyFrom(mBLASData.L3_sGEMM_C_mn);

        // Test for the default case: NO_TRANS
        mBLAS.SGEMM(transA, transB, alphaS, matrixAS, matrixBS, betaS, matrixCS);
        Allocation matrixCRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, mBLASData.dM));
        matrixCRef.copyFrom(mBLASData.L3_sGEMM_o_NN);
        verifyMatrix(matrixCRef, matrixCS);

        // Test for trans cases: TRANSPOSE, CONJ_TRANSPOSE
        matrixAS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dM, mBLASData.dK));
        matrixBS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dK, mBLASData.dN));
        matrixAS.copyFrom(mBLASData.L3_sGEMM_A_km);
        matrixBS.copyFrom(mBLASData.L3_sGEMM_B_nk);

        transA = ScriptIntrinsicBLAS.TRANSPOSE;
        transB = ScriptIntrinsicBLAS.TRANSPOSE;
        // Reload matrix C, since it was overwritten by BLAS.
        matrixCS.copyFrom(mBLASData.L3_sGEMM_C_mn);
        mBLAS.SGEMM(transA, transB, alphaS, matrixAS, matrixBS, betaS, matrixCS);
        matrixCRef.copyFrom(mBLASData.L3_sGEMM_o_TT);
        verifyMatrix(matrixCRef, matrixCS);

        transA = ScriptIntrinsicBLAS.CONJ_TRANSPOSE;
        transB = ScriptIntrinsicBLAS.CONJ_TRANSPOSE;
        matrixCS.copyFrom(mBLASData.L3_sGEMM_C_mn);
        mBLAS.SGEMM(transA, transB, alphaS, matrixAS, matrixBS, betaS, matrixCS);
        matrixCRef.copyFrom(mBLASData.L3_sGEMM_o_HH);
        verifyMatrix(matrixCRef, matrixCS);

        mRS.finish();
        checkError();
    }

    public void test_L3_DGEMM_Correctness() {
        int transA = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int transB = ScriptIntrinsicBLAS.NO_TRANSPOSE;

        // Populate input allocations
        Allocation matrixAD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dK, mBLASData.dM));
        Allocation matrixBD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, mBLASData.dK));
        Allocation matrixCD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, mBLASData.dM));
        matrixAD.copyFrom(mBLASData.L3_dGEMM_A_mk);
        matrixBD.copyFrom(mBLASData.L3_dGEMM_B_kn);
        matrixCD.copyFrom(mBLASData.L3_dGEMM_C_mn);
        // Test for the default case: NO_TRANS
        mBLAS.DGEMM(transA, transB, alphaD, matrixAD, matrixBD, betaD, matrixCD);
        Allocation matrixCRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, mBLASData.dM));
        matrixCRef.copyFrom(mBLASData.L3_dGEMM_o_NN);
        verifyMatrix(matrixCRef, matrixCD);

        // Test for trans cases: TRANSPOSE, CONJ_TRANSPOSE
        matrixAD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dM, mBLASData.dK));
        matrixBD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dK, mBLASData.dN));
        matrixAD.copyFrom(mBLASData.L3_dGEMM_A_km);
        matrixBD.copyFrom(mBLASData.L3_dGEMM_B_nk);

        transA = ScriptIntrinsicBLAS.TRANSPOSE;
        transB = ScriptIntrinsicBLAS.TRANSPOSE;
        // Reload matrix C, since it was overwritten by BLAS.
        matrixCD.copyFrom(mBLASData.L3_dGEMM_C_mn);
        mBLAS.DGEMM(transA, transB, alphaD, matrixAD, matrixBD, betaD, matrixCD);
        matrixCRef.copyFrom(mBLASData.L3_dGEMM_o_TT);
        verifyMatrix(matrixCRef, matrixCD);

        transA = ScriptIntrinsicBLAS.CONJ_TRANSPOSE;
        transB = ScriptIntrinsicBLAS.CONJ_TRANSPOSE;
        matrixCD.copyFrom(mBLASData.L3_dGEMM_C_mn);
        mBLAS.DGEMM(transA, transB, alphaD, matrixAD, matrixBD, betaD, matrixCD);
        matrixCRef.copyFrom(mBLASData.L3_dGEMM_o_HH);
        verifyMatrix(matrixCRef, matrixCD);

        mRS.finish();
        checkError();
    }

    public void test_L3_CGEMM_Correctness() {
        int transA = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int transB = ScriptIntrinsicBLAS.NO_TRANSPOSE;

        // Populate input allocations
        Allocation matrixAC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dK, mBLASData.dM));
        Allocation matrixBC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, mBLASData.dK));
        Allocation matrixCC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, mBLASData.dM));
        matrixAC.copyFrom(mBLASData.L3_cGEMM_A_mk);
        matrixBC.copyFrom(mBLASData.L3_cGEMM_B_kn);
        matrixCC.copyFrom(mBLASData.L3_cGEMM_C_mn);

        // Test for the default case: NO_TRANS
        mBLAS.CGEMM(transA, transB, alphaC, matrixAC, matrixBC, betaC, matrixCC);
        Allocation matrixCRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, mBLASData.dM));
        matrixCRef.copyFrom(mBLASData.L3_cGEMM_o_NN);
        verifyMatrix(matrixCRef, matrixCC);

        // Test for trans cases: TRANSPOSE, CONJ_TRANSPOSE
        matrixAC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dM, mBLASData.dK));
        matrixBC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dK, mBLASData.dN));
        matrixAC.copyFrom(mBLASData.L3_cGEMM_A_km);
        matrixBC.copyFrom(mBLASData.L3_cGEMM_B_nk);

        transA = ScriptIntrinsicBLAS.TRANSPOSE;
        transB = ScriptIntrinsicBLAS.TRANSPOSE;
        // Reload matrix C, since it was overwritten by BLAS.
        matrixCC.copyFrom(mBLASData.L3_cGEMM_C_mn);
        mBLAS.CGEMM(transA, transB, alphaC, matrixAC, matrixBC, betaC, matrixCC);
        matrixCRef.copyFrom(mBLASData.L3_cGEMM_o_TT);
        verifyMatrix(matrixCRef, matrixCC);

        transA = ScriptIntrinsicBLAS.CONJ_TRANSPOSE;
        transB = ScriptIntrinsicBLAS.CONJ_TRANSPOSE;
        matrixCC.copyFrom(mBLASData.L3_cGEMM_C_mn);
        mBLAS.CGEMM(transA, transB, alphaC, matrixAC, matrixBC, betaC, matrixCC);
        matrixCRef.copyFrom(mBLASData.L3_cGEMM_o_HH);
        verifyMatrix(matrixCRef, matrixCC);

        mRS.finish();
        checkError();
    }

    public void test_L3_ZGEMM_Correctness() {
        int transA = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int transB = ScriptIntrinsicBLAS.NO_TRANSPOSE;

        // Populate input allocations
        Allocation matrixAZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dK, mBLASData.dM));
        Allocation matrixBZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, mBLASData.dK));
        Allocation matrixCZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, mBLASData.dM));
        matrixAZ.copyFrom(mBLASData.L3_zGEMM_A_mk);
        matrixBZ.copyFrom(mBLASData.L3_zGEMM_B_kn);
        matrixCZ.copyFrom(mBLASData.L3_zGEMM_C_mn);

        // Test for the default case: NO_TRANS
        mBLAS.ZGEMM(transA, transB, alphaZ, matrixAZ, matrixBZ, betaZ, matrixCZ);
        Allocation matrixCRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, mBLASData.dM));
        matrixCRef.copyFrom(mBLASData.L3_zGEMM_o_NN);
        verifyMatrix(matrixCRef, matrixCZ);

        // Test for trans cases: TRANSPOSE, CONJ_TRANSPOSE
        matrixAZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dM, mBLASData.dK));
        matrixBZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dK, mBLASData.dN));
        matrixAZ.copyFrom(mBLASData.L3_zGEMM_A_km);
        matrixBZ.copyFrom(mBLASData.L3_zGEMM_B_nk);

        transA = ScriptIntrinsicBLAS.TRANSPOSE;
        transB = ScriptIntrinsicBLAS.TRANSPOSE;
        // Reload matrix C, since it was overwritten by BLAS.
        matrixCZ.copyFrom(mBLASData.L3_zGEMM_C_mn);
        mBLAS.ZGEMM(transA, transB, alphaZ, matrixAZ, matrixBZ, betaZ, matrixCZ);
        matrixCRef.copyFrom(mBLASData.L3_zGEMM_o_TT);
        verifyMatrix(matrixCRef, matrixCZ);

        transA = ScriptIntrinsicBLAS.CONJ_TRANSPOSE;
        transB = ScriptIntrinsicBLAS.CONJ_TRANSPOSE;
        matrixCZ.copyFrom(mBLASData.L3_zGEMM_C_mn);
        mBLAS.ZGEMM(transA, transB, alphaZ, matrixAZ, matrixBZ, betaZ, matrixCZ);
        matrixCRef.copyFrom(mBLASData.L3_zGEMM_o_HH);
        verifyMatrix(matrixCRef, matrixCZ);

        mRS.finish();
        checkError();
    }



    private boolean validateL3_xSYMM(Element e, int Side, int Uplo, Allocation A, Allocation B, Allocation C) {
        boolean result = true;
        result &= validateSide(Side);
        result &= validateUplo(Uplo);
        result &= validateL3(e, 0, 0, Side, A, B, C);
        result &= (A.getType().getX() == A.getType().getY());
        return result;
    }

    private void xSYMM_API_test(int Side, int Uplo, ArrayList<Allocation> mMatrix) {
        for (Allocation matA : mMatrix) {
            for (Allocation matB : mMatrix) {
                for (Allocation matC : mMatrix) {
                    Element elemA = matA.getType().getElement();
                    if (validateL3_xSYMM(elemA, Side, Uplo, matA, matB, matC)) {
                        try {
                            if (elemA.isCompatible(Element.F32(mRS))) {
                                mBLAS.SSYMM(Side, Uplo, alphaS, matA, matB, betaS, matC);
                            } else if (elemA.isCompatible(Element.F64(mRS))) {
                                mBLAS.DSYMM(Side, Uplo, alphaD, matA, matB, betaD, matC);
                            } else if (elemA.isCompatible(Element.F32_2(mRS))) {
                                mBLAS.CSYMM(Side, Uplo, alphaC, matA, matB, betaC, matC);
                            } else if (elemA.isCompatible(Element.F64_2(mRS))) {
                                mBLAS.ZSYMM(Side, Uplo, alphaZ, matA, matB, betaZ, matC);
                            }
                        } catch (RSRuntimeException e) {
                            fail("should NOT throw RSRuntimeException");
                        }
                    } else {
                        try {
                            mBLAS.SSYMM(Side, Uplo, alphaS, matA, matB, betaS, matC);
                            fail("should throw RSRuntimeException for SSYMM");
                        } catch (RSRuntimeException e) {
                        }
                        try {
                            mBLAS.DSYMM(Side, Uplo, alphaD, matA, matB, betaD, matC);
                            fail("should throw RSRuntimeException for DSYMM");
                        } catch (RSRuntimeException e) {
                        }
                        try {
                            mBLAS.CSYMM(Side, Uplo, alphaC, matA, matB, betaC, matC);
                            fail("should throw RSRuntimeException for CSYMM");
                        } catch (RSRuntimeException e) {
                        }
                        try {
                            mBLAS.ZSYMM(Side, Uplo, alphaZ, matA, matB, betaZ, matC);
                            fail("should throw RSRuntimeException for ZSYMM");
                        } catch (RSRuntimeException e) {
                        }
                    }
                }
            }
        }
    }

    private void L3_xSYMM_API(ArrayList<Allocation> mMatrix) {
        for (int Side : mSide) {
            for (int Uplo : mUplo) {
                xSYMM_API_test(Side, Uplo, mMatrix);
            }
        }
    }

    public void test_L3_SSYMM_API() {
        L3_xSYMM_API(mMatrixS);
    }

    public void test_L3_DSYMM_API() {
        L3_xSYMM_API(mMatrixD);
    }

    public void test_L3_CSYMM_API() {
        L3_xSYMM_API(mMatrixC);
    }

    public void test_L3_ZSYMM_API() {
        L3_xSYMM_API(mMatrixZ);
    }


    public void test_L3_SSYMM_Correctness() {
        int side = ScriptIntrinsicBLAS.LEFT;
        int uplo = ScriptIntrinsicBLAS.UPPER;

        // Populate input allocations
        Allocation matrixAS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dM, mBLASData.dM));
        Allocation matrixBS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, mBLASData.dM));
        Allocation matrixCS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, mBLASData.dM));
        matrixAS.copyFrom(mBLASData.L3_sSYMM_A_mm);
        matrixBS.copyFrom(mBLASData.L3_sSYMM_B_mn);
        matrixCS.copyFrom(mBLASData.L3_sSYMM_C_mn);

        // Default case: SIDE = LEFT
        mBLAS.SSYMM(side, uplo, alphaS, matrixAS, matrixBS, betaS, matrixCS);
        Allocation matrixCRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, mBLASData.dM));
        matrixCRef.copyFrom(mBLASData.L3_sSYMM_o_L);
        verifyMatrix(matrixCRef, matrixCS);

        // SIDE = RIGHT
        side = ScriptIntrinsicBLAS.RIGHT;
        matrixAS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, mBLASData.dN));
        matrixAS.copyFrom(mBLASData.L3_sSYMM_A_nn);
        // Reload matrix C, since it was overwritten by BLAS.
        matrixCS.copyFrom(mBLASData.L3_sSYMM_C_mn);
        mBLAS.SSYMM(side, uplo, alphaS, matrixAS, matrixBS, betaS, matrixCS);
        matrixCRef.copyFrom(mBLASData.L3_sSYMM_o_R);
        verifyMatrix(matrixCRef, matrixCS);

        mRS.finish();
        checkError();
    }

    public void test_L3_DSYMM_Correctness() {
        int side = ScriptIntrinsicBLAS.LEFT;
        int uplo = ScriptIntrinsicBLAS.UPPER;

        // Populate input allocations
        Allocation matrixAD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dM, mBLASData.dM));
        Allocation matrixBD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, mBLASData.dM));
        Allocation matrixCD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, mBLASData.dM));
        matrixAD.copyFrom(mBLASData.L3_dSYMM_A_mm);
        matrixBD.copyFrom(mBLASData.L3_dSYMM_B_mn);
        matrixCD.copyFrom(mBLASData.L3_dSYMM_C_mn);

        // Default case: SIDE = LEFT
        mBLAS.DSYMM(side, uplo, alphaD, matrixAD, matrixBD, betaD, matrixCD);
        Allocation matrixCRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, mBLASData.dM));
        matrixCRef.copyFrom(mBLASData.L3_dSYMM_o_L);
        verifyMatrix(matrixCRef, matrixCD);

        // SIDE = RIGHT
        side = ScriptIntrinsicBLAS.RIGHT;
        matrixAD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, mBLASData.dN));
        matrixAD.copyFrom(mBLASData.L3_dSYMM_A_nn);
        // Reload matrix C, since it was overwritten by BLAS.
        matrixCD.copyFrom(mBLASData.L3_dSYMM_C_mn);
        mBLAS.DSYMM(side, uplo, alphaD, matrixAD, matrixBD, betaD, matrixCD);
        matrixCRef.copyFrom(mBLASData.L3_dSYMM_o_R);
        verifyMatrix(matrixCRef, matrixCD);

        mRS.finish();
        checkError();
    }

    public void test_L3_CSYMM_Correctness() {
        int side = ScriptIntrinsicBLAS.LEFT;
        int uplo = ScriptIntrinsicBLAS.UPPER;

        // Populate input allocations
        Allocation matrixAC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dM, mBLASData.dM));
        Allocation matrixBC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, mBLASData.dM));
        Allocation matrixCC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, mBLASData.dM));
        matrixAC.copyFrom(mBLASData.L3_cSYMM_A_mm);
        matrixBC.copyFrom(mBLASData.L3_cSYMM_B_mn);
        matrixCC.copyFrom(mBLASData.L3_cSYMM_C_mn);

        // Default case: SIDE = LEFT
        mBLAS.CSYMM(side, uplo, alphaC, matrixAC, matrixBC, betaC, matrixCC);
        Allocation matrixCRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, mBLASData.dM));
        matrixCRef.copyFrom(mBLASData.L3_cSYMM_o_L);
        verifyMatrix(matrixCRef, matrixCC);

        // SIDE = RIGHT
        side = ScriptIntrinsicBLAS.RIGHT;
        matrixAC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, mBLASData.dN));
        matrixAC.copyFrom(mBLASData.L3_cSYMM_A_nn);
        // Reload matrix C, since it was overwritten by BLAS.
        matrixCC.copyFrom(mBLASData.L3_cSYMM_C_mn);
        mBLAS.CSYMM(side, uplo, alphaC, matrixAC, matrixBC, betaC, matrixCC);
        matrixCRef.copyFrom(mBLASData.L3_cSYMM_o_R);
        verifyMatrix(matrixCRef, matrixCC);

        mRS.finish();
        checkError();
    }

    public void test_L3_ZSYMM_Correctness() {
        int side = ScriptIntrinsicBLAS.LEFT;
        int uplo = ScriptIntrinsicBLAS.UPPER;

        // Populate input allocations
        Allocation matrixAZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dM, mBLASData.dM));
        Allocation matrixBZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, mBLASData.dM));
        Allocation matrixCZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, mBLASData.dM));
        matrixAZ.copyFrom(mBLASData.L3_zSYMM_A_mm);
        matrixBZ.copyFrom(mBLASData.L3_zSYMM_B_mn);
        matrixCZ.copyFrom(mBLASData.L3_zSYMM_C_mn);

        // Default case: SIDE = LEFT
        mBLAS.ZSYMM(side, uplo, alphaZ, matrixAZ, matrixBZ, betaZ, matrixCZ);
        Allocation matrixCRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, mBLASData.dM));
        matrixCRef.copyFrom(mBLASData.L3_zSYMM_o_L);
        verifyMatrix(matrixCRef, matrixCZ);

        // SIDE = RIGHT
        side = ScriptIntrinsicBLAS.RIGHT;
        matrixAZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, mBLASData.dN));
        matrixAZ.copyFrom(mBLASData.L3_zSYMM_A_nn);
        // Reload matrix C, since it was overwritten by BLAS.
        matrixCZ.copyFrom(mBLASData.L3_zSYMM_C_mn);
        mBLAS.ZSYMM(side, uplo, alphaZ, matrixAZ, matrixBZ, betaZ, matrixCZ);
        matrixCRef.copyFrom(mBLASData.L3_zSYMM_o_R);
        verifyMatrix(matrixCRef, matrixCZ);

        mRS.finish();
        checkError();
    }


    private boolean validateHEMM(Element e, int Side, int Uplo, Allocation A, Allocation B, Allocation C) {
        if (!validateSide(Side)) {
            return false;
        }

        if (!validateUplo(Uplo)) {
            return false;
        }

        if (!A.getType().getElement().isCompatible(e) ||
            !B.getType().getElement().isCompatible(e) ||
            !C.getType().getElement().isCompatible(e)) {
            return false;
        }

        // A must be square; can potentially be relaxed similar to TRSM
        int adim = A.getType().getX();
        if (adim != A.getType().getY()) {
            return false;
        }
        if ((Side == ScriptIntrinsicBLAS.LEFT && adim != B.getType().getY()) ||
            (Side == ScriptIntrinsicBLAS.RIGHT && adim != B.getType().getX())) {
            return false;
        }
        if (B.getType().getX() != C.getType().getX() ||
            B.getType().getY() != C.getType().getY()) {
            return false;
        }

        return true;
    }

    private void xHEMM_API_test(int Side, int Uplo, ArrayList<Allocation> mMatrix) {
        for (Allocation matA : mMatrix) {
            for (Allocation matB : mMatrix) {
                for (Allocation matC : mMatrix) {
                    Element elemA = matA.getType().getElement();
                    if (validateHEMM(elemA, Side, Uplo, matA, matB, matC)) {
                        try {
                            if (elemA.isCompatible(Element.F32_2(mRS))) {
                                mBLAS.CHEMM(Side, Uplo, alphaC, matA, matB, betaC, matC);
                            } else if (elemA.isCompatible(Element.F64_2(mRS))) {
                                mBLAS.ZHEMM(Side, Uplo, alphaZ, matA, matB, betaZ, matC);
                            }
                        } catch (RSRuntimeException e) {
                            fail("should NOT throw RSRuntimeException");
                        }
                    } else {
                        try {
                            mBLAS.CHEMM(Side, Uplo, alphaC, matA, matB, betaC, matC);
                            fail("should throw RSRuntimeException for CHEMM");
                        } catch (RSRuntimeException e) {
                        }
                        try {
                            mBLAS.ZHEMM(Side, Uplo, alphaZ, matA, matB, betaZ, matC);
                            fail("should throw RSRuntimeException for ZHEMM");
                        } catch (RSRuntimeException e) {
                        }
                    }
                }
            }
        }
    }

    public void L3_xHEMM_API(ArrayList<Allocation> mMatrix) {
        for (int Side : mSide) {
            for (int Uplo : mUplo) {
                xHEMM_API_test(Side, Uplo, mMatrix);
            }
        }
    }

    public void test_L3_CHEMM_API() {
        L3_xHEMM_API(mMatrixC);
    }

    public void test_L3_ZHEMM_API() {
        L3_xHEMM_API(mMatrixZ);
    }

    public void test_L3_CHEMM_Correctness() {
        int side = ScriptIntrinsicBLAS.LEFT;
        int uplo = ScriptIntrinsicBLAS.UPPER;

        // Populate input allocations
        Allocation matrixAC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dM, mBLASData.dM));
        Allocation matrixBC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, mBLASData.dM));
        Allocation matrixCC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, mBLASData.dM));
        matrixAC.copyFrom(mBLASData.L3_cHEMM_A_mm);
        matrixBC.copyFrom(mBLASData.L3_cHEMM_B_mn);
        matrixCC.copyFrom(mBLASData.L3_cHEMM_C_mn);

        // Default case: SIDE = LEFT
        mBLAS.CHEMM(side, uplo, alphaC, matrixAC, matrixBC, betaC, matrixCC);
        Allocation matrixCRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, mBLASData.dM));
        matrixCRef.copyFrom(mBLASData.L3_cHEMM_o_L);
        verifyMatrix(matrixCRef, matrixCC);

        // SIDE = RIGHT
        side = ScriptIntrinsicBLAS.RIGHT;
        matrixAC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, mBLASData.dN));
        matrixAC.copyFrom(mBLASData.L3_cHEMM_A_nn);
        // Reload matrix C, since it was overwritten by BLAS.
        matrixCC.copyFrom(mBLASData.L3_cHEMM_C_mn);
        mBLAS.CHEMM(side, uplo, alphaC, matrixAC, matrixBC, betaC, matrixCC);
        matrixCRef.copyFrom(mBLASData.L3_cHEMM_o_R);
        verifyMatrix(matrixCRef, matrixCC);

        mRS.finish();
        checkError();
    }

    public void test_L3_ZHEMM_Correctness() {
        int side = ScriptIntrinsicBLAS.LEFT;
        int uplo = ScriptIntrinsicBLAS.UPPER;

        // Populate input allocations
        Allocation matrixAZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dM, mBLASData.dM));
        Allocation matrixBZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, mBLASData.dM));
        Allocation matrixCZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, mBLASData.dM));
        matrixAZ.copyFrom(mBLASData.L3_zHEMM_A_mm);
        matrixBZ.copyFrom(mBLASData.L3_zHEMM_B_mn);
        matrixCZ.copyFrom(mBLASData.L3_zHEMM_C_mn);

        // Default case: SIDE = LEFT
        mBLAS.ZHEMM(side, uplo, alphaZ, matrixAZ, matrixBZ, betaZ, matrixCZ);
        Allocation matrixCRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, mBLASData.dM));
        matrixCRef.copyFrom(mBLASData.L3_zHEMM_o_L);
        verifyMatrix(matrixCRef, matrixCZ);

        // SIDE = RIGHT
        side = ScriptIntrinsicBLAS.RIGHT;
        matrixAZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, mBLASData.dN));
        matrixAZ.copyFrom(mBLASData.L3_zHEMM_A_nn);
        // Reload matrix C, since it was overwritten by BLAS.
        matrixCZ.copyFrom(mBLASData.L3_zHEMM_C_mn);
        mBLAS.ZHEMM(side, uplo, alphaZ, matrixAZ, matrixBZ, betaZ, matrixCZ);
        matrixCRef.copyFrom(mBLASData.L3_zHEMM_o_R);
        verifyMatrix(matrixCRef, matrixCZ);

        mRS.finish();
        checkError();
    }



    private boolean validateL3_xSYRK(Element e, int Uplo, int Trans, Allocation A, Allocation C) {
        boolean result = true;
        result &= validateTranspose(Trans);
        result &= validateUplo(Uplo);
        result &= validateL3(e, Trans, 0, 0, A, null, C);

        return result;
    }

    private void xSYRK_API_test(int Uplo, int Trans, ArrayList<Allocation> mMatrix) {
        for (Allocation matA : mMatrix) {
            for (Allocation matC : mMatrix) {
                Element elemA = matA.getType().getElement();
                if (validateL3_xSYRK(elemA, Uplo, Trans, matA, matC)) {
                    try {
                        if (elemA.isCompatible(Element.F32(mRS))) {
                            mBLAS.SSYRK(Uplo, Trans, alphaS, matA, betaS, matC);
                        } else if (elemA.isCompatible(Element.F64(mRS))) {
                            mBLAS.DSYRK(Uplo, Trans, alphaD, matA, betaD, matC);
                        } else if (elemA.isCompatible(Element.F32_2(mRS))) {
                            mBLAS.CSYRK(Uplo, Trans, alphaC, matA, betaC, matC);
                        } else if (elemA.isCompatible(Element.F64_2(mRS))) {
                            mBLAS.ZSYRK(Uplo, Trans, alphaZ, matA, betaZ, matC);
                        }
                    } catch (RSRuntimeException e) {
                        fail("should NOT throw RSRuntimeException");
                    }
                } else {
                    try {
                        mBLAS.SSYRK(Uplo, Trans, alphaS, matA, betaS, matC);
                        fail("should throw RSRuntimeException for SSYRK");
                    } catch (RSRuntimeException e) {
                    }
                    try {
                        mBLAS.DSYRK(Uplo, Trans, alphaD, matA, betaD, matC);
                        fail("should throw RSRuntimeException for DSYRK");
                    } catch (RSRuntimeException e) {
                    }
                    try {
                        mBLAS.CSYRK(Uplo, Trans, alphaC, matA, betaC, matC);
                        fail("should throw RSRuntimeException for CSYRK");
                    } catch (RSRuntimeException e) {
                    }
                    try {
                        mBLAS.ZSYRK(Uplo, Trans, alphaZ, matA, betaZ, matC);
                        fail("should throw RSRuntimeException for ZSYRK");
                    } catch (RSRuntimeException e) {
                    }
                }
            }
        }
    }

    public void L3_xSYRK_API(ArrayList<Allocation> mMatrix) {
        for (int Uplo : mUplo) {
            for (int Trans : mTranspose) {
                xSYRK_API_test(Uplo, Trans, mMatrix);
            }
        }
    }

    public void test_L3_SSYRK_API() {
        L3_xSYRK_API(mMatrixS);
    }

    public void test_L3_DSYRK_API() {
        L3_xSYRK_API(mMatrixD);
    }

    public void test_L3_CSYRK_API() {
        L3_xSYRK_API(mMatrixC);
    }

    public void test_L3_ZSYRK_API() {
        L3_xSYRK_API(mMatrixZ);
    }


    public void test_L3_SSYRK_Correctness() {
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int uplo = ScriptIntrinsicBLAS.UPPER;

        // Populate input allocations
        Allocation matrixAS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dK, mBLASData.dN));
        Allocation matrixCS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, mBLASData.dN));
        matrixAS.copyFrom(mBLASData.L3_sSYRK_A_nk);
        matrixCS.copyFrom(mBLASData.L3_sSYRK_C_nn);

        // Default case: NO_TRANSPOSE
        mBLAS.SSYRK(uplo, trans, alphaS, matrixAS, betaS, matrixCS);
        Allocation matrixCRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, mBLASData.dN));
        matrixCRef.copyFrom(mBLASData.L3_sSYRK_o_N);
        verifyMatrix(matrixCRef, matrixCS, true);

        // Case: TRANSPOSE
        matrixAS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, mBLASData.dK));
        matrixAS.copyFrom(mBLASData.L3_sSYRK_A_kn);
        // Reload matrix C, since it was overwritten by BLAS.
        matrixCS.copyFrom(mBLASData.L3_sSYRK_C_nn);

        trans = ScriptIntrinsicBLAS.TRANSPOSE;
        mBLAS.SSYRK(uplo, trans, alphaS, matrixAS, betaS, matrixCS);
        matrixCRef.copyFrom(mBLASData.L3_sSYRK_o_T);
        verifyMatrix(matrixCRef, matrixCS, true);

        mRS.finish();
        checkError();
    }

    public void test_L3_DSYRK_Correctness() {
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int uplo = ScriptIntrinsicBLAS.UPPER;

        // Populate input allocations
        Allocation matrixAD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dK, mBLASData.dN));
        Allocation matrixCD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, mBLASData.dN));
        matrixAD.copyFrom(mBLASData.L3_dSYRK_A_nk);
        matrixCD.copyFrom(mBLASData.L3_dSYRK_C_nn);

        // Default case: NO_TRANSPOSE
        mBLAS.DSYRK(uplo, trans, alphaD, matrixAD, betaD, matrixCD);
        Allocation matrixCRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, mBLASData.dN));
        matrixCRef.copyFrom(mBLASData.L3_dSYRK_o_N);
        verifyMatrix(matrixCRef, matrixCD, true);

        // Case: TRANSPOSE
        matrixAD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, mBLASData.dK));
        matrixAD.copyFrom(mBLASData.L3_dSYRK_A_kn);
        // Reload matrix C, since it was overwritten by BLAS.
        matrixCD.copyFrom(mBLASData.L3_dSYRK_C_nn);

        trans = ScriptIntrinsicBLAS.TRANSPOSE;
        mBLAS.DSYRK(uplo, trans, alphaD, matrixAD, betaD, matrixCD);
        matrixCRef.copyFrom(mBLASData.L3_dSYRK_o_T);
        verifyMatrix(matrixCRef, matrixCD, true);

        mRS.finish();
        checkError();
    }

    public void test_L3_CSYRK_Correctness() {
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int uplo = ScriptIntrinsicBLAS.UPPER;

        // Populate input allocations
        Allocation matrixAC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dK, mBLASData.dN));
        Allocation matrixCC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, mBLASData.dN));
        matrixAC.copyFrom(mBLASData.L3_cSYRK_A_nk);
        matrixCC.copyFrom(mBLASData.L3_cSYRK_C_nn);

        // Default case: NO_TRANSPOSE
        mBLAS.CSYRK(uplo, trans, alphaC, matrixAC, betaC, matrixCC);
        Allocation matrixCRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, mBLASData.dN));
        matrixCRef.copyFrom(mBLASData.L3_cSYRK_o_N);
        verifyMatrix(matrixCRef, matrixCC, true);

        // Case: TRANSPOSE
        matrixAC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, mBLASData.dK));
        matrixAC.copyFrom(mBLASData.L3_cSYRK_A_kn);
        // Reload matrix C, since it was overwritten by BLAS.
        matrixCC.copyFrom(mBLASData.L3_cSYRK_C_nn);

        trans = ScriptIntrinsicBLAS.TRANSPOSE;
        mBLAS.CSYRK(uplo, trans, alphaC, matrixAC, betaC, matrixCC);
        matrixCRef.copyFrom(mBLASData.L3_cSYRK_o_T);
        verifyMatrix(matrixCRef, matrixCC, true);

        mRS.finish();
        checkError();
    }

    public void test_L3_ZSYRK_Correctness() {
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int uplo = ScriptIntrinsicBLAS.UPPER;

        // Populate input allocations
        Allocation matrixAZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dK, mBLASData.dN));
        Allocation matrixCZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, mBLASData.dN));
        matrixAZ.copyFrom(mBLASData.L3_zSYRK_A_nk);
        matrixCZ.copyFrom(mBLASData.L3_zSYRK_C_nn);

        // Default case: NO_TRANSPOSE
        mBLAS.ZSYRK(uplo, trans, alphaZ, matrixAZ, betaZ, matrixCZ);
        Allocation matrixCRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, mBLASData.dN));
        matrixCRef.copyFrom(mBLASData.L3_zSYRK_o_N);
        verifyMatrix(matrixCRef, matrixCZ, true);

        // Case: TRANSPOSE
        matrixAZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, mBLASData.dK));
        matrixAZ.copyFrom(mBLASData.L3_zSYRK_A_kn);
        // Reload matrix C, since it was overwritten by BLAS.
        matrixCZ.copyFrom(mBLASData.L3_zSYRK_C_nn);

        trans = ScriptIntrinsicBLAS.TRANSPOSE;
        mBLAS.ZSYRK(uplo, trans, alphaZ, matrixAZ, betaZ, matrixCZ);
        matrixCRef.copyFrom(mBLASData.L3_zSYRK_o_T);
        verifyMatrix(matrixCRef, matrixCZ, true);

        mRS.finish();
        checkError();
    }


    private boolean validateHERK(Element e, int Uplo, int Trans, Allocation A, Allocation C) {
        if (!validateUplo(Uplo)) {
            return false;
        }
        if (!A.getType().getElement().isCompatible(e) ||
            !C.getType().getElement().isCompatible(e)) {
            return false;
        }
        if (!validateConjTranspose(Trans)) {
            return false;
        }
        int cdim = C.getType().getX();
        if (cdim != C.getType().getY()) {
            return false;
        }
        if (Trans == ScriptIntrinsicBLAS.NO_TRANSPOSE) {
            if (cdim != A.getType().getY()) {
                return false;
            }
        } else {
            if (cdim != A.getType().getX()) {
                return false;
            }
        }
        return true;
    }

    private void xHERK_API_test(int Uplo, int Trans, ArrayList<Allocation> mMatrix) {
        for (Allocation matA : mMatrix) {
            for (Allocation matC : mMatrix) {
                Element elemA = matA.getType().getElement();
                if (validateHERK(elemA, Uplo, Trans, matA, matC)) {
                    try {
                        if (elemA.isCompatible(Element.F32_2(mRS))) {
                            mBLAS.CHERK(Uplo, Trans, alphaS, matA, betaS, matC);
                        } else if (elemA.isCompatible(Element.F64_2(mRS))) {
                            mBLAS.ZHERK(Uplo, Trans, alphaD, matA, betaD, matC);
                        }
                    } catch (RSRuntimeException e) {
                        fail("should NOT throw RSRuntimeException");
                    }
                } else {
                    try {
                        mBLAS.CHERK(Uplo, Trans, alphaS, matA, betaS, matC);
                        fail("should throw RSRuntimeException for CHERK");
                    } catch (RSRuntimeException e) {
                    }
                    try {
                        mBLAS.ZHERK(Uplo, Trans, alphaD, matA, betaD, matC);
                        fail("should throw RSRuntimeException for ZHERK");
                    } catch (RSRuntimeException e) {
                    }
                }
            }
        }
    }

    public void L3_xHERK_API(ArrayList<Allocation> mMatrix) {
        for (int Uplo : mUplo) {
            for (int Trans : mTranspose) {
                xHERK_API_test(Uplo, Trans, mMatrix);
            }
        }
    }

    public void test_L3_CHERK_API() {
        L3_xHERK_API(mMatrixC);
    }

    public void test_L3_ZHERK_API() {
        L3_xHERK_API(mMatrixZ);
    }

    public void test_L3_CHERK_Correctness() {
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int uplo = ScriptIntrinsicBLAS.UPPER;

        // Populate input allocations
        Allocation matrixAC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dK, mBLASData.dN));
        Allocation matrixCC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, mBLASData.dN));
        matrixAC.copyFrom(mBLASData.L3_cHERK_A_nk);
        matrixCC.copyFrom(mBLASData.L3_cHERK_C_nn);

        // Default case: NO_TRANSPOSE
        mBLAS.CHERK(uplo, trans, alphaS, matrixAC, betaS, matrixCC);
        Allocation matrixCRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, mBLASData.dN));
        matrixCRef.copyFrom(mBLASData.L3_cHERK_o_N);
        verifyMatrix(matrixCRef, matrixCC, true);

        // Case: TRANSPOSE
        matrixAC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, mBLASData.dK));
        matrixAC.copyFrom(mBLASData.L3_cHERK_A_kn);
        // Reload matrix C, since it was overwritten by BLAS.
        matrixCC.copyFrom(mBLASData.L3_cHERK_C_nn);

        trans = ScriptIntrinsicBLAS.CONJ_TRANSPOSE;
        mBLAS.CHERK(uplo, trans, alphaS, matrixAC, betaS, matrixCC);
        matrixCRef.copyFrom(mBLASData.L3_cHERK_o_H);
        verifyMatrix(matrixCRef, matrixCC, true);

        mRS.finish();
        checkError();
    }

    public void test_L3_ZHERK_Correctness() {
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int uplo = ScriptIntrinsicBLAS.UPPER;

        // Populate input allocations
        Allocation matrixAZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dK, mBLASData.dN));
        Allocation matrixCZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, mBLASData.dN));
        matrixAZ.copyFrom(mBLASData.L3_zHERK_A_nk);
        matrixCZ.copyFrom(mBLASData.L3_zHERK_C_nn);

        // Default case: NO_TRANSPOSE
        mBLAS.ZHERK(uplo, trans, alphaD, matrixAZ, betaD, matrixCZ);
        Allocation matrixCRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, mBLASData.dN));
        matrixCRef.copyFrom(mBLASData.L3_zHERK_o_N);
        verifyMatrix(matrixCRef, matrixCZ, true);

        // Case: TRANSPOSE
        matrixAZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, mBLASData.dK));
        matrixAZ.copyFrom(mBLASData.L3_zHERK_A_kn);
        // Reload matrix C, since it was overwritten by BLAS.
        matrixCZ.copyFrom(mBLASData.L3_zHERK_C_nn);

        trans = ScriptIntrinsicBLAS.CONJ_TRANSPOSE;
        mBLAS.ZHERK(uplo, trans, alphaD, matrixAZ, betaD, matrixCZ);
        matrixCRef.copyFrom(mBLASData.L3_zHERK_o_H);
        verifyMatrix(matrixCRef, matrixCZ, true);

        mRS.finish();
        checkError();
    }


    private boolean validateSYR2K(Element e, int Uplo, int Trans, Allocation A, Allocation B, Allocation C) {
        if (!validateTranspose(Trans)) {
            return false;
        }
        if (!validateUplo(Uplo)) {
            return false;
        }

        if (!A.getType().getElement().isCompatible(e) ||
            !B.getType().getElement().isCompatible(e) ||
            !C.getType().getElement().isCompatible(e)) {
            return false;
        }
        int Cdim = -1;
        // A is n x k if no transpose, k x n if transpose
        // C is n x n
        if (Trans == ScriptIntrinsicBLAS.TRANSPOSE) {
            // check columns versus C
            Cdim = A.getType().getX();
        } else {
            // check rows versus C
            Cdim = A.getType().getY();
        }
        if (C.getType().getX() != Cdim || C.getType().getY() != Cdim) {
            return false;
        }
        // A dims == B dims
        if (A.getType().getX() != B.getType().getX() || A.getType().getY() != B.getType().getY()) {
            return false;
        }
        return true;
    }

    private void xSYR2K_API_test(int Uplo, int Trans, ArrayList<Allocation> mMatrix) {
        for (Allocation matA : mMatrix) {
            for (Allocation matB : mMatrix) {
                for (Allocation matC : mMatrix) {
                    Element elemA = matA.getType().getElement();
                    if (validateSYR2K(elemA, Uplo, Trans, matA, matB, matC)) {
                        try {
                            if (elemA.isCompatible(Element.F32(mRS))) {
                                mBLAS.SSYR2K(Uplo, Trans, alphaS, matA, matB, betaS, matC);
                            } else if (elemA.isCompatible(Element.F64(mRS))) {
                                mBLAS.DSYR2K(Uplo, Trans, alphaD, matA, matB, betaD, matC);
                            } else if (elemA.isCompatible(Element.F32_2(mRS))) {
                                mBLAS.CSYR2K(Uplo, Trans, alphaC, matA, matB, betaC, matC);
                            } else if (elemA.isCompatible(Element.F64_2(mRS))) {
                                mBLAS.ZSYR2K(Uplo, Trans, alphaZ, matA, matB, betaZ, matC);
                            }
                        } catch (RSRuntimeException e) {
                            fail("should NOT throw RSRuntimeException");
                        }
                    } else {
                        try {
                            mBLAS.SSYR2K(Uplo, Trans, alphaS, matA, matB, betaS, matC);
                            fail("should throw RSRuntimeException for SSYR2K");
                        } catch (RSRuntimeException e) {
                        }
                        try {
                            mBLAS.DSYR2K(Uplo, Trans, alphaD, matA, matB, betaD, matC);
                            fail("should throw RSRuntimeException for DSYR2K");
                        } catch (RSRuntimeException e) {
                        }
                        try {
                            mBLAS.CSYR2K(Uplo, Trans, alphaC, matA, matB, betaC, matC);
                            fail("should throw RSRuntimeException for CSYR2K");
                        } catch (RSRuntimeException e) {
                        }
                        try {
                            mBLAS.ZSYR2K(Uplo, Trans, alphaZ, matA, matB, betaZ, matC);
                            fail("should throw RSRuntimeException for ZSYR2K");
                        } catch (RSRuntimeException e) {
                        }
                    }
                }
            }
        }
    }

    public void L3_xSYR2K_API(ArrayList<Allocation> mMatrix) {
        for (int Uplo : mUplo) {
            for (int Trans : mTranspose) {
                xSYR2K_API_test(Uplo, Trans, mMatrix);
            }
        }
    }

    public void test_L3_SSYR2K_API() {
        L3_xSYR2K_API(mMatrixS);
    }

    public void test_L3_DSYR2K_API() {
        L3_xSYR2K_API(mMatrixD);
    }

    public void test_L3_CSYR2K_API() {
        L3_xSYR2K_API(mMatrixC);
    }

    public void test_L3_ZSYR2K_API() {
        L3_xSYR2K_API(mMatrixZ);
    }


    public void test_L3_SSYR2K_Correctness() {
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int uplo = ScriptIntrinsicBLAS.UPPER;

        // Populate input allocations
        Allocation matrixAS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dK, mBLASData.dN));
        Allocation matrixBS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dK, mBLASData.dN));
        Allocation matrixCS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, mBLASData.dN));
        matrixAS.copyFrom(mBLASData.L3_sSYR2K_A_nk);
        matrixBS.copyFrom(mBLASData.L3_sSYR2K_B_nk);
        matrixCS.copyFrom(mBLASData.L3_sSYR2K_C_nn);

        // Default case: NO_TRANSPOSE
        mBLAS.SSYR2K(uplo, trans, alphaS, matrixAS, matrixBS, betaS, matrixCS);
        Allocation matrixCRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, mBLASData.dN));
        matrixCRef.copyFrom(mBLASData.L3_sSYR2K_o_N);
        verifyMatrix(matrixCRef, matrixCS, true);

        // Case: TRANSPOSE
        matrixAS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, mBLASData.dK));
        matrixBS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, mBLASData.dK));
        matrixAS.copyFrom(mBLASData.L3_sSYR2K_A_kn);
        matrixBS.copyFrom(mBLASData.L3_sSYR2K_B_kn);
        // Reload matrix C, since it was overwritten by BLAS.
        matrixCS.copyFrom(mBLASData.L3_sSYR2K_C_nn);

        trans = ScriptIntrinsicBLAS.TRANSPOSE;
        mBLAS.SSYR2K(uplo, trans, alphaS, matrixAS, matrixBS, betaS, matrixCS);
        matrixCRef.copyFrom(mBLASData.L3_sSYR2K_o_T);
        verifyMatrix(matrixCRef, matrixCS, true);

        mRS.finish();
        checkError();
    }

    public void test_L3_DSYR2K_Correctness() {
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int uplo = ScriptIntrinsicBLAS.UPPER;

        // Populate input allocations
        Allocation matrixAD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dK, mBLASData.dN));
        Allocation matrixBD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dK, mBLASData.dN));
        Allocation matrixCD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, mBLASData.dN));
        matrixAD.copyFrom(mBLASData.L3_dSYR2K_A_nk);
        matrixBD.copyFrom(mBLASData.L3_dSYR2K_B_nk);
        matrixCD.copyFrom(mBLASData.L3_dSYR2K_C_nn);

        // Default case: NO_TRANSPOSE
        mBLAS.DSYR2K(uplo, trans, alphaD, matrixAD, matrixBD, betaD, matrixCD);
        Allocation matrixCRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, mBLASData.dN));
        matrixCRef.copyFrom(mBLASData.L3_dSYR2K_o_N);
        verifyMatrix(matrixCRef, matrixCD, true);

        // Case: TRANSPOSE
        matrixAD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, mBLASData.dK));
        matrixBD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, mBLASData.dK));
        matrixAD.copyFrom(mBLASData.L3_dSYR2K_A_kn);
        matrixBD.copyFrom(mBLASData.L3_dSYR2K_B_kn);
        // Reload matrix C, since it was overwritten by BLAS.
        matrixCD.copyFrom(mBLASData.L3_dSYR2K_C_nn);

        trans = ScriptIntrinsicBLAS.TRANSPOSE;
        mBLAS.DSYR2K(uplo, trans, alphaD, matrixAD, matrixBD, betaD, matrixCD);
        matrixCRef.copyFrom(mBLASData.L3_dSYR2K_o_T);
        verifyMatrix(matrixCRef, matrixCD, true);

        mRS.finish();
        checkError();
    }

    public void test_L3_CSYR2K_Correctness() {
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int uplo = ScriptIntrinsicBLAS.UPPER;

        // Populate input allocations
        Allocation matrixAC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dK, mBLASData.dN));
        Allocation matrixBC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dK, mBLASData.dN));
        Allocation matrixCC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, mBLASData.dN));
        matrixAC.copyFrom(mBLASData.L3_cSYR2K_A_nk);
        matrixBC.copyFrom(mBLASData.L3_cSYR2K_B_nk);
        matrixCC.copyFrom(mBLASData.L3_cSYR2K_C_nn);

        // Default case: NO_TRANSPOSE
        mBLAS.CSYR2K(uplo, trans, alphaC, matrixAC, matrixBC, betaC, matrixCC);
        Allocation matrixCRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, mBLASData.dN));
        matrixCRef.copyFrom(mBLASData.L3_cSYR2K_o_N);
        verifyMatrix(matrixCRef, matrixCC, true);

        // Case: TRANSPOSE
        matrixAC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, mBLASData.dK));
        matrixBC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, mBLASData.dK));
        matrixAC.copyFrom(mBLASData.L3_cSYR2K_A_kn);
        matrixBC.copyFrom(mBLASData.L3_cSYR2K_B_kn);
        // Reload matrix C, since it was overwritten by BLAS.
        matrixCC.copyFrom(mBLASData.L3_cSYR2K_C_nn);

        trans = ScriptIntrinsicBLAS.TRANSPOSE;
        mBLAS.CSYR2K(uplo, trans, alphaC, matrixAC, matrixBC, betaC, matrixCC);
        matrixCRef.copyFrom(mBLASData.L3_cSYR2K_o_T);
        verifyMatrix(matrixCRef, matrixCC, true);

        mRS.finish();
        checkError();
    }

    public void test_L3_ZSYR2K_Correctness() {
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int uplo = ScriptIntrinsicBLAS.UPPER;

        // Populate input allocations
        Allocation matrixAZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dK, mBLASData.dN));
        Allocation matrixBZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dK, mBLASData.dN));
        Allocation matrixCZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, mBLASData.dN));
        matrixAZ.copyFrom(mBLASData.L3_zSYR2K_A_nk);
        matrixBZ.copyFrom(mBLASData.L3_zSYR2K_B_nk);
        matrixCZ.copyFrom(mBLASData.L3_zSYR2K_C_nn);

        // Default case: NO_TRANSPOSE
        mBLAS.ZSYR2K(uplo, trans, alphaZ, matrixAZ, matrixBZ, betaZ, matrixCZ);
        Allocation matrixCRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, mBLASData.dN));
        matrixCRef.copyFrom(mBLASData.L3_zSYR2K_o_N);
        verifyMatrix(matrixCRef, matrixCZ, true);

        // Case: TRANSPOSE
        matrixAZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, mBLASData.dK));
        matrixBZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, mBLASData.dK));
        matrixAZ.copyFrom(mBLASData.L3_zSYR2K_A_kn);
        matrixBZ.copyFrom(mBLASData.L3_zSYR2K_B_kn);
        // Reload matrix C, since it was overwritten by BLAS.
        matrixCZ.copyFrom(mBLASData.L3_zSYR2K_C_nn);

        trans = ScriptIntrinsicBLAS.TRANSPOSE;
        mBLAS.ZSYR2K(uplo, trans, alphaZ, matrixAZ, matrixBZ, betaZ, matrixCZ);
        matrixCRef.copyFrom(mBLASData.L3_zSYR2K_o_T);
        verifyMatrix(matrixCRef, matrixCZ, true);

        mRS.finish();
        checkError();
    }


    private boolean validateHER2K(Element e, int Uplo, int Trans, Allocation A, Allocation B, Allocation C) {
        if (!validateUplo(Uplo)) {
            return false;
        }
        if (!A.getType().getElement().isCompatible(e) ||
            !B.getType().getElement().isCompatible(e) ||
            !C.getType().getElement().isCompatible(e)) {
            return false;
        }
        if (!validateConjTranspose(Trans)) {
            return false;
        }
        int cdim = C.getType().getX();
        if (cdim != C.getType().getY()) {
            return false;
        }
        if (Trans == ScriptIntrinsicBLAS.NO_TRANSPOSE) {
            if (A.getType().getY() != cdim) {
                return false;
            }
        } else {
            if (A.getType().getX() != cdim) {
                return false;
            }
        }
        if (A.getType().getX() != B.getType().getX() || A.getType().getY() != B.getType().getY()) {
            return false;
        }
        return true;
    }

    private void xHER2K_API_test(int Uplo, int Trans, ArrayList<Allocation> mMatrix) {
        for (Allocation matA : mMatrix) {
            for (Allocation matB : mMatrix) {
                for (Allocation matC : mMatrix) {
                    Element elemA = matA.getType().getElement();
                    if (validateHER2K(elemA, Uplo, Trans, matA, matB, matC)) {
                        try {
                            if (elemA.isCompatible(Element.F32_2(mRS))) {
                                mBLAS.CHER2K(Uplo, Trans, alphaC, matA, matB, betaS, matC);
                            } else if (elemA.isCompatible(Element.F64_2(mRS))) {
                                mBLAS.ZHER2K(Uplo, Trans, alphaZ, matA, matB, betaD, matC);
                            }
                        } catch (RSRuntimeException e) {
                            fail("should NOT throw RSRuntimeException");
                        }
                    } else {
                        try {
                            mBLAS.CHER2K(Uplo, Trans, alphaC, matA, matB, betaS, matC);
                            fail("should throw RSRuntimeException for CHER2K");
                        } catch (RSRuntimeException e) {
                        }
                        try {
                            mBLAS.ZHER2K(Uplo, Trans, alphaZ, matA, matB, betaD, matC);
                            fail("should throw RSRuntimeException for ZHER2K");
                        } catch (RSRuntimeException e) {
                        }
                    }
                }
            }
        }
    }

    public void L3_xHER2K_API(ArrayList<Allocation> mMatrix) {
        for (int Uplo : mUplo) {
            for (int Trans : mTranspose) {
                xHER2K_API_test(Uplo, Trans, mMatrix);
            }
        }
    }

    public void test_L3_CHER2K_API() {
        L3_xHER2K_API(mMatrixC);
    }

    public void test_L3_ZHER2K_API() {
        L3_xHER2K_API(mMatrixZ);
    }

    public void test_L3_CHER2K_Correctness() {
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int uplo = ScriptIntrinsicBLAS.UPPER;

        // Populate input allocations
        Allocation matrixAC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dK, mBLASData.dN));
        Allocation matrixBC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dK, mBLASData.dN));
        Allocation matrixCC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, mBLASData.dN));
        matrixAC.copyFrom(mBLASData.L3_cHER2K_A_nk);
        matrixBC.copyFrom(mBLASData.L3_cHER2K_B_nk);
        matrixCC.copyFrom(mBLASData.L3_cHER2K_C_nn);

        // Default case: NO_TRANSPOSE
        mBLAS.CHER2K(uplo, trans, alphaC, matrixAC, matrixBC, betaS, matrixCC);
        Allocation matrixCRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, mBLASData.dN));
        matrixCRef.copyFrom(mBLASData.L3_cHER2K_o_N);
        verifyMatrix(matrixCRef, matrixCC, true);

        // Case: TRANSPOSE
        matrixAC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, mBLASData.dK));
        matrixBC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, mBLASData.dK));
        matrixAC.copyFrom(mBLASData.L3_cHER2K_A_kn);
        matrixBC.copyFrom(mBLASData.L3_cHER2K_B_kn);
        // Reload matrix C, since it was overwritten by BLAS.
        matrixCC.copyFrom(mBLASData.L3_cHER2K_C_nn);

        trans = ScriptIntrinsicBLAS.CONJ_TRANSPOSE;
        mBLAS.CHER2K(uplo, trans, alphaC, matrixAC, matrixBC, betaS, matrixCC);
        matrixCRef.copyFrom(mBLASData.L3_cHER2K_o_H);
        verifyMatrix(matrixCRef, matrixCC, true);

        mRS.finish();
        checkError();
    }

    public void test_L3_ZHER2K_Correctness() {
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int uplo = ScriptIntrinsicBLAS.UPPER;

        // Populate input allocations
        Allocation matrixAZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dK, mBLASData.dN));
        Allocation matrixBZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dK, mBLASData.dN));
        Allocation matrixCZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, mBLASData.dN));
        matrixAZ.copyFrom(mBLASData.L3_zHER2K_A_nk);
        matrixBZ.copyFrom(mBLASData.L3_zHER2K_B_nk);
        matrixCZ.copyFrom(mBLASData.L3_zHER2K_C_nn);

        // Default case: NO_TRANSPOSE
        mBLAS.ZHER2K(uplo, trans, alphaZ, matrixAZ, matrixBZ, betaD, matrixCZ);
        Allocation matrixCRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, mBLASData.dN));
        matrixCRef.copyFrom(mBLASData.L3_zHER2K_o_N);
        verifyMatrix(matrixCRef, matrixCZ, true);

        // Case: TRANSPOSE
        matrixAZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, mBLASData.dK));
        matrixBZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, mBLASData.dK));
        matrixAZ.copyFrom(mBLASData.L3_zHER2K_A_kn);
        matrixBZ.copyFrom(mBLASData.L3_zHER2K_B_kn);
        // Reload matrix C, since it was overwritten by BLAS.
        matrixCZ.copyFrom(mBLASData.L3_zHER2K_C_nn);

        trans = ScriptIntrinsicBLAS.CONJ_TRANSPOSE;
        mBLAS.ZHER2K(uplo, trans, alphaZ, matrixAZ, matrixBZ, betaD, matrixCZ);
        matrixCRef.copyFrom(mBLASData.L3_zHER2K_o_H);
        verifyMatrix(matrixCRef, matrixCZ, true);

        mRS.finish();
        checkError();
    }


    private boolean validateTRMM(Element e, int Side, int Uplo, int TransA, int Diag, Allocation A, Allocation B) {
        if (!validateSide(Side)) {
            return false;
        }
        if (!validateUplo(Uplo)) {
            return false;
        }
        if (!validateTranspose(TransA)) {
            return false;
        }
        if (!validateDiag(Diag)) {
            return false;
        }
        int aM = -1, aN = -1, bM = -1, bN = -1;
        if (!A.getType().getElement().isCompatible(e) ||
            !B.getType().getElement().isCompatible(e)) {
            return false;
        }

        aM = A.getType().getY();
        aN = A.getType().getX();
        if (aM != aN) {
            return false;
        }

        bM = B.getType().getY();
        bN = B.getType().getX();
        if (Side == ScriptIntrinsicBLAS.LEFT) {
            if (aN != bM) {
                return false;
            }
        } else {
            if (bN != aM) {
                return false;
            }
        }
        return true;
    }

    private void xTRMM_API_test(int Side, int Uplo, int TransA, int Diag, ArrayList<Allocation> mMatrix) {
        for (Allocation matA : mMatrix) {
            for (Allocation matB : mMatrix) {
                Element elemA = matA.getType().getElement();
                if (validateTRMM(elemA, Side, Uplo, TransA, Diag, matA, matB)) {
                    try {
                        if (elemA.isCompatible(Element.F32(mRS))) {
                            mBLAS.STRMM(Side, Uplo, TransA, Diag, alphaS, matA, matB);
                        } else if (elemA.isCompatible(Element.F64(mRS))) {
                            mBLAS.DTRMM(Side, Uplo, TransA, Diag, alphaD, matA, matB);
                        } else if (elemA.isCompatible(Element.F32_2(mRS))) {
                            mBLAS.CTRMM(Side, Uplo, TransA, Diag, alphaC, matA, matB);
                        } else if (elemA.isCompatible(Element.F64_2(mRS))) {
                            mBLAS.ZTRMM(Side, Uplo, TransA, Diag, alphaZ, matA, matB);
                        }
                    } catch (RSRuntimeException e) {
                        fail("should NOT throw RSRuntimeException");
                    }
                } else {
                    try {
                        mBLAS.STRMM(Side, Uplo, TransA, Diag, alphaS, matA, matB);
                        fail("should throw RSRuntimeException for STRMM");
                    } catch (RSRuntimeException e) {
                    }
                    try {
                        mBLAS.DTRMM(Side, Uplo, TransA, Diag, alphaD, matA, matB);
                        fail("should throw RSRuntimeException for DTRMM");
                    } catch (RSRuntimeException e) {
                    }
                    try {
                        mBLAS.CTRMM(Side, Uplo, TransA, Diag, alphaC, matA, matB);
                        fail("should throw RSRuntimeException for CTRMM");
                    } catch (RSRuntimeException e) {
                    }
                    try {
                        mBLAS.ZTRMM(Side, Uplo, TransA, Diag, alphaZ, matA, matB);
                        fail("should throw RSRuntimeException for ZTRMM");
                    } catch (RSRuntimeException e) {
                    }
                }
            }
        }
    }

    public void L3_xTRMM_API(ArrayList<Allocation> mMatrix) {
        for (int Side : mSide) {
            for (int Uplo : mUplo) {
                for (int TransA : mTranspose) {
                    for (int Diag : mDiag) {
                        xTRMM_API_test(Side, Uplo, TransA, Diag, mMatrix);
                    }
                }
            }
        }
    }

    public void test_L3_STRMM_API() {
        L3_xTRMM_API(mMatrixS);
    }

    public void test_L3_DTRMM_API() {
        L3_xTRMM_API(mMatrixD);
    }

    public void test_L3_CTRMM_API() {
        L3_xTRMM_API(mMatrixC);
    }

    public void test_L3_ZTRMM_API() {
        L3_xTRMM_API(mMatrixZ);
    }


    public void test_L3_STRMM_Correctness() {
        int side = ScriptIntrinsicBLAS.LEFT;
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int diag = ScriptIntrinsicBLAS.NON_UNIT;

        // Populate input allocations
        Allocation matrixAS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dM, mBLASData.dM));
        Allocation matrixBS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, mBLASData.dM));
        matrixAS.copyFrom(mBLASData.L3_sTRMM_A_mm);
        matrixBS.copyFrom(mBLASData.L3_sTRMM_B_mn);

        // Default case: LEFT, UPPER, NO_TRANSPOSE
        mBLAS.STRMM(side, uplo, trans, diag, alphaS, matrixAS, matrixBS);
        Allocation matrixBRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, mBLASData.dM));
        matrixBRef.copyFrom(mBLASData.L3_sTRMM_o_LUN);
        verifyMatrix(matrixBRef, matrixBS);

        // Case: RIGHT, LOWER, TRANSPOSE
        matrixAS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, mBLASData.dN));
        matrixAS.copyFrom(mBLASData.L3_sTRMM_A_nn);
        // Reload matrix B, since it was overwritten by BLAS.
        matrixBS.copyFrom(mBLASData.L3_sTRMM_B_mn);

        side = ScriptIntrinsicBLAS.RIGHT;
        trans = ScriptIntrinsicBLAS.TRANSPOSE;
        uplo = ScriptIntrinsicBLAS.LOWER;
        mBLAS.STRMM(side, uplo, trans, diag, alphaS, matrixAS, matrixBS);
        matrixBRef.copyFrom(mBLASData.L3_sTRMM_o_RLT);
        verifyMatrix(matrixBRef, matrixBS);

        mRS.finish();
        checkError();
    }

    public void test_L3_DTRMM_Correctness() {
        int side = ScriptIntrinsicBLAS.LEFT;
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int diag = ScriptIntrinsicBLAS.NON_UNIT;

        // Populate input allocations
        Allocation matrixAD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dM, mBLASData.dM));
        Allocation matrixBD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, mBLASData.dM));
        matrixAD.copyFrom(mBLASData.L3_dTRMM_A_mm);
        matrixBD.copyFrom(mBLASData.L3_dTRMM_B_mn);

        // Default case: LEFT, UPPER, NO_TRANSPOSE
        mBLAS.DTRMM(side, uplo, trans, diag, alphaD, matrixAD, matrixBD);
        Allocation matrixBRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, mBLASData.dM));
        matrixBRef.copyFrom(mBLASData.L3_dTRMM_o_LUN);
        verifyMatrix(matrixBRef, matrixBD);

        // Case: RIGHT, LOWER, TRANSPOSE
        matrixAD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, mBLASData.dN));
        matrixAD.copyFrom(mBLASData.L3_dTRMM_A_nn);
        // Reload matrix B, since it was overwritten by BLAS.
        matrixBD.copyFrom(mBLASData.L3_dTRMM_B_mn);

        side = ScriptIntrinsicBLAS.RIGHT;
        trans = ScriptIntrinsicBLAS.TRANSPOSE;
        uplo = ScriptIntrinsicBLAS.LOWER;
        mBLAS.DTRMM(side, uplo, trans, diag, alphaD, matrixAD, matrixBD);
        matrixBRef.copyFrom(mBLASData.L3_dTRMM_o_RLT);
        verifyMatrix(matrixBRef, matrixBD);

        mRS.finish();
        checkError();
    }

    public void test_L3_CTRMM_Correctness() {
        int side = ScriptIntrinsicBLAS.LEFT;
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int diag = ScriptIntrinsicBLAS.NON_UNIT;

        // Populate input allocations
        Allocation matrixAC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dM, mBLASData.dM));
        Allocation matrixBC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, mBLASData.dM));
        matrixAC.copyFrom(mBLASData.L3_cTRMM_A_mm);
        matrixBC.copyFrom(mBLASData.L3_cTRMM_B_mn);

        // Default case: LEFT, UPPER, NO_TRANSPOSE
        mBLAS.CTRMM(side, uplo, trans, diag, alphaC, matrixAC, matrixBC);
        Allocation matrixBRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, mBLASData.dM));
        matrixBRef.copyFrom(mBLASData.L3_cTRMM_o_LUN);
        verifyMatrix(matrixBRef, matrixBC);

        // Case: RIGHT, LOWER, TRANSPOSE
        matrixAC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, mBLASData.dN));
        matrixAC.copyFrom(mBLASData.L3_cTRMM_A_nn);
        // Reload matrix B, since it was overwritten by BLAS.
        matrixBC.copyFrom(mBLASData.L3_cTRMM_B_mn);

        side = ScriptIntrinsicBLAS.RIGHT;
        trans = ScriptIntrinsicBLAS.TRANSPOSE;
        uplo = ScriptIntrinsicBLAS.LOWER;
        mBLAS.CTRMM(side, uplo, trans, diag, alphaC, matrixAC, matrixBC);
        matrixBRef.copyFrom(mBLASData.L3_cTRMM_o_RLT);
        verifyMatrix(matrixBRef, matrixBC);

        mRS.finish();
        checkError();
    }

    public void test_L3_ZTRMM_Correctness() {
        int side = ScriptIntrinsicBLAS.LEFT;
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int diag = ScriptIntrinsicBLAS.NON_UNIT;

        // Populate input allocations
        Allocation matrixAZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dM, mBLASData.dM));
        Allocation matrixBZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, mBLASData.dM));
        matrixAZ.copyFrom(mBLASData.L3_zTRMM_A_mm);
        matrixBZ.copyFrom(mBLASData.L3_zTRMM_B_mn);

        // Default case: LEFT, UPPER, NO_TRANSPOSE
        mBLAS.ZTRMM(side, uplo, trans, diag, alphaZ, matrixAZ, matrixBZ);
        Allocation matrixBRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, mBLASData.dM));
        matrixBRef.copyFrom(mBLASData.L3_zTRMM_o_LUN);
        verifyMatrix(matrixBRef, matrixBZ);

        // Case: RIGHT, LOWER, TRANSPOSE
        matrixAZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, mBLASData.dN));
        matrixAZ.copyFrom(mBLASData.L3_zTRMM_A_nn);
        // Reload matrix B, since it was overwritten by BLAS.
        matrixBZ.copyFrom(mBLASData.L3_zTRMM_B_mn);

        side = ScriptIntrinsicBLAS.RIGHT;
        trans = ScriptIntrinsicBLAS.TRANSPOSE;
        uplo = ScriptIntrinsicBLAS.LOWER;
        mBLAS.ZTRMM(side, uplo, trans, diag, alphaZ, matrixAZ, matrixBZ);
        matrixBRef.copyFrom(mBLASData.L3_zTRMM_o_RLT);
        verifyMatrix(matrixBRef, matrixBZ);

        mRS.finish();
        checkError();
    }


    private boolean validateTRSM(Element e, int Side, int Uplo, int TransA, int Diag, Allocation A, Allocation B) {
        int adim = -1, bM = -1, bN = -1;
        if (!validateSide(Side)) {
            return false;
        }
        if (!validateTranspose(TransA)) {
            return false;
        }
        if (!validateUplo(Uplo)) {
            return false;
        }
        if (!validateDiag(Diag)) {
            return false;
        }
        if (!A.getType().getElement().isCompatible(e) ||
            !B.getType().getElement().isCompatible(e)) {
            return false;
        }
        adim = A.getType().getX();
        if (adim != A.getType().getY()) {
            // this may be unnecessary, the restriction could potentially be relaxed
            // A needs to contain at least that symmetric matrix but could theoretically be larger
            // for now we assume adapters are sufficient, will reevaluate in the future
            return false;
        }
        bM = B.getType().getY();
        bN = B.getType().getX();
        if (Side == ScriptIntrinsicBLAS.LEFT) {
            // A is M*M
            if (adim != bM) {
                return false;
            }
        } else {
            // A is N*N
            if (adim != bN) {
                return false;
            }
        }
        return true;
    }

    private void xTRSM_API_test(int Side, int Uplo, int TransA, int Diag, ArrayList<Allocation> mMatrix) {
        for (Allocation matA : mMatrix) {
            for (Allocation matB : mMatrix) {
                Element elemA = matA.getType().getElement();
                if (validateTRSM(elemA, Side, Uplo, TransA, Diag, matA, matB)) {
                    try {
                        if (elemA.isCompatible(Element.F32(mRS))) {
                            mBLAS.STRSM(Side, Uplo, TransA, Diag, alphaS, matA, matB);
                        } else if (elemA.isCompatible(Element.F64(mRS))) {
                            mBLAS.DTRSM(Side, Uplo, TransA, Diag, alphaD, matA, matB);
                        } else if (elemA.isCompatible(Element.F32_2(mRS))) {
                            mBLAS.CTRSM(Side, Uplo, TransA, Diag, alphaC, matA, matB);
                        } else if (elemA.isCompatible(Element.F64_2(mRS))) {
                            mBLAS.ZTRSM(Side, Uplo, TransA, Diag, alphaZ, matA, matB);
                        }
                    } catch (RSRuntimeException e) {
                        fail("should NOT throw RSRuntimeException");
                    }
                } else {
                    try {
                        mBLAS.STRSM(Side, Uplo, TransA, Diag, alphaS, matA, matB);
                        fail("should throw RSRuntimeException for STRSM");
                    } catch (RSRuntimeException e) {
                    }
                    try {
                        mBLAS.DTRSM(Side, Uplo, TransA, Diag, alphaD, matA, matB);
                        fail("should throw RSRuntimeException for DTRSM");
                    } catch (RSRuntimeException e) {
                    }
                    try {
                        mBLAS.CTRSM(Side, Uplo, TransA, Diag, alphaC, matA, matB);
                        fail("should throw RSRuntimeException for CTRSM");
                    } catch (RSRuntimeException e) {
                    }
                    try {
                        mBLAS.ZTRSM(Side, Uplo, TransA, Diag, alphaZ, matA, matB);
                        fail("should throw RSRuntimeException for ZTRSM");
                    } catch (RSRuntimeException e) {
                    }
                }
            }
        }
    }

    public void L3_xTRSM_API(ArrayList<Allocation> mMatrix) {
        for (int Side : mSide) {
            for (int Uplo : mUplo) {
                for (int TransA : mTranspose) {
                    for (int Diag : mDiag) {
                        xTRSM_API_test(Side, Uplo, TransA, Diag, mMatrix);
                    }
                }
            }
        }
    }

    public void test_L3_STRSM_API() {
        L3_xTRSM_API(mMatrixS);
    }

    public void test_L3_DTRSM_API() {
        L3_xTRSM_API(mMatrixD);
    }

    public void test_L3_CTRSM_API() {
        L3_xTRSM_API(mMatrixC);
    }

    public void test_L3_ZTRSM_API() {
        L3_xTRSM_API(mMatrixZ);
    }

    public void test_L3_STRSM_Correctness() {
        int side = ScriptIntrinsicBLAS.LEFT;
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int diag = ScriptIntrinsicBLAS.NON_UNIT;

        // Populate input allocations
        Allocation matrixAS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dM, mBLASData.dM));
        Allocation matrixBS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, mBLASData.dM));
        matrixAS.copyFrom(mBLASData.L3_sTRSM_A_mm);
        matrixBS.copyFrom(mBLASData.L3_sTRSM_B_mn);

        // Default case: LEFT, UPPER, NO_TRANSPOSE
        mBLAS.STRSM(side, uplo, trans, diag, alphaS, matrixAS, matrixBS);
        Allocation matrixBRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, mBLASData.dM));
        matrixBRef.copyFrom(mBLASData.L3_sTRSM_o_LUN);
        verifyMatrix(matrixBRef, matrixBS);

        // Case: RIGHT, LOWER, TRANSPOSE
        matrixAS = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32(mRS), mBLASData.dN, mBLASData.dN));
        matrixAS.copyFrom(mBLASData.L3_sTRSM_A_nn);
        // Reload matrix B, since it was overwritten by BLAS.
        matrixBS.copyFrom(mBLASData.L3_sTRSM_B_mn);

        side = ScriptIntrinsicBLAS.RIGHT;
        trans = ScriptIntrinsicBLAS.TRANSPOSE;
        uplo = ScriptIntrinsicBLAS.LOWER;
        mBLAS.STRSM(side, uplo, trans, diag, alphaS, matrixAS, matrixBS);
        matrixBRef.copyFrom(mBLASData.L3_sTRSM_o_RLT);
        verifyMatrix(matrixBRef, matrixBS);

        mRS.finish();
        checkError();
    }

    public void test_L3_DTRSM_Correctness() {
        int side = ScriptIntrinsicBLAS.LEFT;
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int diag = ScriptIntrinsicBLAS.NON_UNIT;

        // Populate input allocations
        Allocation matrixAD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dM, mBLASData.dM));
        Allocation matrixBD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, mBLASData.dM));
        matrixAD.copyFrom(mBLASData.L3_dTRSM_A_mm);
        matrixBD.copyFrom(mBLASData.L3_dTRSM_B_mn);

        // Default case: LEFT, UPPER, NO_TRANSPOSE
        mBLAS.DTRSM(side, uplo, trans, diag, alphaD, matrixAD, matrixBD);
        Allocation matrixBRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, mBLASData.dM));
        matrixBRef.copyFrom(mBLASData.L3_dTRSM_o_LUN);
        verifyMatrix(matrixBRef, matrixBD);

        // Case: RIGHT, LOWER, TRANSPOSE
        matrixAD = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64(mRS), mBLASData.dN, mBLASData.dN));
        matrixAD.copyFrom(mBLASData.L3_dTRSM_A_nn);
        // Reload matrix B, since it was overwritten by BLAS.
        matrixBD.copyFrom(mBLASData.L3_dTRSM_B_mn);

        side = ScriptIntrinsicBLAS.RIGHT;
        trans = ScriptIntrinsicBLAS.TRANSPOSE;
        uplo = ScriptIntrinsicBLAS.LOWER;
        mBLAS.DTRSM(side, uplo, trans, diag, alphaD, matrixAD, matrixBD);
        matrixBRef.copyFrom(mBLASData.L3_dTRSM_o_RLT);
        verifyMatrix(matrixBRef, matrixBD);

        mRS.finish();
        checkError();
    }

    public void test_L3_CTRSM_Correctness() {
        int side = ScriptIntrinsicBLAS.LEFT;
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int diag = ScriptIntrinsicBLAS.NON_UNIT;

        // Populate input allocations
        Allocation matrixAC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dM, mBLASData.dM));
        Allocation matrixBC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, mBLASData.dM));
        matrixAC.copyFrom(mBLASData.L3_cTRSM_A_mm);
        matrixBC.copyFrom(mBLASData.L3_cTRSM_B_mn);

        // Default case: LEFT, UPPER, NO_TRANSPOSE
        mBLAS.CTRSM(side, uplo, trans, diag, alphaC, matrixAC, matrixBC);
        Allocation matrixBRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, mBLASData.dM));
        matrixBRef.copyFrom(mBLASData.L3_cTRSM_o_LUN);
        verifyMatrix(matrixBRef, matrixBC);

        // Case: RIGHT, LOWER, TRANSPOSE
        matrixAC = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F32_2(mRS), mBLASData.dN, mBLASData.dN));
        matrixAC.copyFrom(mBLASData.L3_cTRSM_A_nn);
        // Reload matrix B, since it was overwritten by BLAS.
        matrixBC.copyFrom(mBLASData.L3_cTRSM_B_mn);

        side = ScriptIntrinsicBLAS.RIGHT;
        trans = ScriptIntrinsicBLAS.TRANSPOSE;
        uplo = ScriptIntrinsicBLAS.LOWER;
        mBLAS.CTRSM(side, uplo, trans, diag, alphaC, matrixAC, matrixBC);
        matrixBRef.copyFrom(mBLASData.L3_cTRSM_o_RLT);
        verifyMatrix(matrixBRef, matrixBC);

        mRS.finish();
        checkError();
    }

    public void test_L3_ZTRSM_Correctness() {
        int side = ScriptIntrinsicBLAS.LEFT;
        int trans = ScriptIntrinsicBLAS.NO_TRANSPOSE;
        int uplo = ScriptIntrinsicBLAS.UPPER;
        int diag = ScriptIntrinsicBLAS.NON_UNIT;

        // Populate input allocations
        Allocation matrixAZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dM, mBLASData.dM));
        Allocation matrixBZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, mBLASData.dM));
        matrixAZ.copyFrom(mBLASData.L3_zTRSM_A_mm);
        matrixBZ.copyFrom(mBLASData.L3_zTRSM_B_mn);

        // Default case: LEFT, UPPER, NO_TRANSPOSE
        mBLAS.ZTRSM(side, uplo, trans, diag, alphaZ, matrixAZ, matrixBZ);
        Allocation matrixBRef = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, mBLASData.dM));
        matrixBRef.copyFrom(mBLASData.L3_zTRSM_o_LUN);
        verifyMatrix(matrixBRef, matrixBZ);

        // Case: RIGHT, LOWER, TRANSPOSE
        matrixAZ = Allocation.createTyped(mRS, Type.createXY(mRS, Element.F64_2(mRS), mBLASData.dN, mBLASData.dN));
        matrixAZ.copyFrom(mBLASData.L3_zTRSM_A_nn);
        // Reload matrix B, since it was overwritten by BLAS.
        matrixBZ.copyFrom(mBLASData.L3_zTRSM_B_mn);

        side = ScriptIntrinsicBLAS.RIGHT;
        trans = ScriptIntrinsicBLAS.TRANSPOSE;
        uplo = ScriptIntrinsicBLAS.LOWER;
        mBLAS.ZTRSM(side, uplo, trans, diag, alphaZ, matrixAZ, matrixBZ);
        matrixBRef.copyFrom(mBLASData.L3_zTRSM_o_RLT);
        verifyMatrix(matrixBRef, matrixBZ);

        mRS.finish();
        checkError();
    }
}
