Optimize uniform matrix update

1. Add a fast matrix update function that does a single memcpy when the
uniform matrix source and destination share the same layout. This benefits
4-row non-transposed GLSL matrices and 4-column transposed HLSL matrices.
2. Make the boolean IsColumnMajor a template parameter of the generic
uniform matrix update path, which removes the conditional branch from
the inner loop and improves performance.
3. Add an e2e test that uploads multiple 3x4 GLSL matrices in a single
call, which adds coverage for this CL.

Bug: angleproject:3632
Change-Id: Id1701ef6fbf63ea4b9884254d93ea8eacfe4e16a
Reviewed-on: https://chromium-review.googlesource.com/c/angle/angle/+/1688274
Commit-Queue: Jamie Madill <jmadill@chromium.org>
Reviewed-by: Jamie Madill <jmadill@chromium.org>
diff --git a/src/libANGLE/renderer/d3d/ProgramD3D.cpp b/src/libANGLE/renderer/d3d/ProgramD3D.cpp
index 43f495c..03f208c 100644
--- a/src/libANGLE/renderer/d3d/ProgramD3D.cpp
+++ b/src/libANGLE/renderer/d3d/ProgramD3D.cpp
@@ -2602,9 +2602,9 @@
     {
         if (targetUniform->mShaderData[shaderType])
         {
-            if (SetFloatUniformMatrixHLSL<cols, rows>(arrayElementOffset, elementCount, countIn,
-                                                      transpose, value,
-                                                      targetUniform->mShaderData[shaderType]))
+            if (SetFloatUniformMatrixHLSL<cols, rows>::Run(arrayElementOffset, elementCount,
+                                                           countIn, transpose, value,
+                                                           targetUniform->mShaderData[shaderType]))
             {
                 mShaderUniformsDirty.set(shaderType);
             }
diff --git a/src/libANGLE/renderer/renderer_utils.cpp b/src/libANGLE/renderer/renderer_utils.cpp
index a6c3828..cfbe2f3 100644
--- a/src/libANGLE/renderer/renderer_utils.cpp
+++ b/src/libANGLE/renderer/renderer_utils.cpp
@@ -101,10 +101,10 @@
     colorWriteFunction(reinterpret_cast<const uint8_t *>(&color), destPixelData);
 }
 
-template <int cols, int rows>
-int GetFlattenedIndex(bool isColumnMajor, int col, int row)
+template <int cols, int rows, bool IsColumnMajor>
+inline int GetFlattenedIndex(int col, int row)
 {
-    if (isColumnMajor)
+    if (IsColumnMajor)
     {
         return col * rows + row;
     }
@@ -114,8 +114,14 @@
     }
 }
 
-template <typename T, int colsSrc, int rowsSrc, bool IsDstColumnMajor, int colsDst, int rowsDst>
-bool ExpandMatrix(T *target, const GLfloat *value, bool isSrcColumnMajor)
+template <typename T,
+          bool IsSrcColumnMajor,
+          int colsSrc,
+          int rowsSrc,
+          bool IsDstColumnMajor,
+          int colsDst,
+          int rowsDst>
+bool ExpandMatrix(T *target, const GLfloat *value)
 {
     static_assert(colsSrc <= colsDst && rowsSrc <= rowsDst, "Can only expand!");
 
@@ -126,8 +132,8 @@
     {
         for (int c = 0; c < colsSrc; c++)
         {
-            int srcIndex = GetFlattenedIndex<colsSrc, rowsSrc>(isSrcColumnMajor, c, r);
-            int dstIndex = GetFlattenedIndex<colsDst, rowsDst>(IsDstColumnMajor, c, r);
+            int srcIndex = GetFlattenedIndex<colsSrc, rowsSrc, IsSrcColumnMajor>(c, r);
+            int dstIndex = GetFlattenedIndex<colsDst, rowsDst, IsDstColumnMajor>(c, r);
 
             staging[dstIndex] = static_cast<T>(value[srcIndex]);
         }
@@ -142,11 +148,15 @@
     return true;
 }
 
-template <int colsSrc, int rowsSrc, bool IsDstColumnMajor, int colsDst, int rowsDst>
+template <bool IsSrcColumMajor,
+          int colsSrc,
+          int rowsSrc,
+          bool IsDstColumnMajor,
+          int colsDst,
+          int rowsDst>
 bool SetFloatUniformMatrix(unsigned int arrayElementOffset,
                            unsigned int elementCount,
                            GLsizei countIn,
-                           GLboolean transpose,
                            const GLfloat *value,
                            uint8_t *targetData)
 {
@@ -161,16 +171,39 @@
 
     for (unsigned int i = 0; i < count; i++)
     {
-        const bool isSrcColumnMajor = !transpose;
-        dirty = ExpandMatrix<GLfloat, colsSrc, rowsSrc, IsDstColumnMajor, colsDst, rowsDst>(
-                    target, value, isSrcColumnMajor) ||
+        dirty = ExpandMatrix<GLfloat, IsSrcColumMajor, colsSrc, rowsSrc, IsDstColumnMajor, colsDst,
+                             rowsDst>(target, value) ||
                 dirty;
+
         target += targetMatrixStride;
         value += colsSrc * rowsSrc;
     }
 
     return dirty;
 }
+
+bool SetFloatUniformMatrixFast(unsigned int arrayElementOffset,
+                               unsigned int elementCount,
+                               GLsizei countIn,
+                               size_t matrixSize,
+                               const GLfloat *value,
+                               uint8_t *targetData)
+{
+    const unsigned int count =
+        std::min(elementCount - arrayElementOffset, static_cast<unsigned int>(countIn));
+
+    const uint8_t *valueData = reinterpret_cast<const uint8_t *>(value);
+    targetData               = targetData + arrayElementOffset * matrixSize;
+
+    if (memcmp(targetData, valueData, matrixSize * count) == 0)
+    {
+        return false;
+    }
+
+    memcpy(targetData, valueData, matrixSize * count);
+    return true;
+}
+
 }  // anonymous namespace
 
 PackPixelsParams::PackPixelsParams()
@@ -485,57 +518,161 @@
     return angle::Result::Continue;
 }
 
-#define ANGLE_INSTANTIATE_SET_UNIFORM_MATRIX_FUNC(api, cols, rows)                            \
-    template bool SetFloatUniformMatrix##api<cols, rows>(unsigned int, unsigned int, GLsizei, \
-                                                         GLboolean, const GLfloat *, uint8_t *)
+#define ANGLE_INSTANTIATE_SET_UNIFORM_MATRIX_FUNC(api, cols, rows) \
+    template bool SetFloatUniformMatrix##api<cols, rows>::Run(     \
+        unsigned int, unsigned int, GLsizei, GLboolean, const GLfloat *, uint8_t *)
 
 ANGLE_INSTANTIATE_SET_UNIFORM_MATRIX_FUNC(GLSL, 2, 2);
 ANGLE_INSTANTIATE_SET_UNIFORM_MATRIX_FUNC(GLSL, 3, 3);
-ANGLE_INSTANTIATE_SET_UNIFORM_MATRIX_FUNC(GLSL, 4, 4);
 ANGLE_INSTANTIATE_SET_UNIFORM_MATRIX_FUNC(GLSL, 2, 3);
 ANGLE_INSTANTIATE_SET_UNIFORM_MATRIX_FUNC(GLSL, 3, 2);
-ANGLE_INSTANTIATE_SET_UNIFORM_MATRIX_FUNC(GLSL, 2, 4);
 ANGLE_INSTANTIATE_SET_UNIFORM_MATRIX_FUNC(GLSL, 4, 2);
-ANGLE_INSTANTIATE_SET_UNIFORM_MATRIX_FUNC(GLSL, 3, 4);
 ANGLE_INSTANTIATE_SET_UNIFORM_MATRIX_FUNC(GLSL, 4, 3);
 
 ANGLE_INSTANTIATE_SET_UNIFORM_MATRIX_FUNC(HLSL, 2, 2);
 ANGLE_INSTANTIATE_SET_UNIFORM_MATRIX_FUNC(HLSL, 3, 3);
-ANGLE_INSTANTIATE_SET_UNIFORM_MATRIX_FUNC(HLSL, 4, 4);
 ANGLE_INSTANTIATE_SET_UNIFORM_MATRIX_FUNC(HLSL, 2, 3);
 ANGLE_INSTANTIATE_SET_UNIFORM_MATRIX_FUNC(HLSL, 3, 2);
 ANGLE_INSTANTIATE_SET_UNIFORM_MATRIX_FUNC(HLSL, 2, 4);
-ANGLE_INSTANTIATE_SET_UNIFORM_MATRIX_FUNC(HLSL, 4, 2);
 ANGLE_INSTANTIATE_SET_UNIFORM_MATRIX_FUNC(HLSL, 3, 4);
-ANGLE_INSTANTIATE_SET_UNIFORM_MATRIX_FUNC(HLSL, 4, 3);
 
 #undef ANGLE_INSTANTIATE_SET_UNIFORM_MATRIX_FUNC
 
-template <int cols, int rows>
-bool SetFloatUniformMatrixGLSL(unsigned int arrayElementOffset,
-                               unsigned int elementCount,
-                               GLsizei countIn,
-                               GLboolean transpose,
-                               const GLfloat *value,
-                               uint8_t *targetData)
+#define ANGLE_SPECIALIZATION_ROWS_SET_UNIFORM_MATRIX_FUNC(api, cols, rows)                      \
+    template bool SetFloatUniformMatrix##api<cols, 4>::Run(unsigned int, unsigned int, GLsizei, \
+                                                           GLboolean, const GLfloat *, uint8_t *)
+
+template <int cols>
+struct SetFloatUniformMatrixGLSL<cols, 4>
 {
-    // GLSL expects matrix uniforms to be column-major, and each column is padded to 4 rows.
-    return SetFloatUniformMatrix<cols, rows, true, cols, 4>(arrayElementOffset, elementCount,
-                                                            countIn, transpose, value, targetData);
+    static bool Run(unsigned int arrayElementOffset,
+                    unsigned int elementCount,
+                    GLsizei countIn,
+                    GLboolean transpose,
+                    const GLfloat *value,
+                    uint8_t *targetData);
+};
+
+ANGLE_SPECIALIZATION_ROWS_SET_UNIFORM_MATRIX_FUNC(GLSL, 2, 4);
+ANGLE_SPECIALIZATION_ROWS_SET_UNIFORM_MATRIX_FUNC(GLSL, 3, 4);
+ANGLE_SPECIALIZATION_ROWS_SET_UNIFORM_MATRIX_FUNC(GLSL, 4, 4);
+
+#undef ANGLE_SPECIALIZATION_ROWS_SET_UNIFORM_MATRIX_FUNC
+
+#define ANGLE_SPECIALIZATION_COLS_SET_UNIFORM_MATRIX_FUNC(api, cols, rows)                      \
+    template bool SetFloatUniformMatrix##api<4, rows>::Run(unsigned int, unsigned int, GLsizei, \
+                                                           GLboolean, const GLfloat *, uint8_t *)
+
+template <int rows>
+struct SetFloatUniformMatrixHLSL<4, rows>
+{
+    static bool Run(unsigned int arrayElementOffset,
+                    unsigned int elementCount,
+                    GLsizei countIn,
+                    GLboolean transpose,
+                    const GLfloat *value,
+                    uint8_t *targetData);
+};
+
+ANGLE_SPECIALIZATION_COLS_SET_UNIFORM_MATRIX_FUNC(HLSL, 4, 2);
+ANGLE_SPECIALIZATION_COLS_SET_UNIFORM_MATRIX_FUNC(HLSL, 4, 3);
+ANGLE_SPECIALIZATION_COLS_SET_UNIFORM_MATRIX_FUNC(HLSL, 4, 4);
+
+#undef ANGLE_SPECIALIZATION_COLS_SET_UNIFORM_MATRIX_FUNC
+
+template <int cols>
+bool SetFloatUniformMatrixGLSL<cols, 4>::Run(unsigned int arrayElementOffset,
+                                             unsigned int elementCount,
+                                             GLsizei countIn,
+                                             GLboolean transpose,
+                                             const GLfloat *value,
+                                             uint8_t *targetData)
+{
+    const bool isSrcColumnMajor = !transpose;
+    if (isSrcColumnMajor)
+    {
+        // Both src and dst matrices have the same layout,
+        // so a single memcpy updates all the matrices.
+        constexpr size_t srcMatrixSize = sizeof(GLfloat) * cols * 4;
+        return SetFloatUniformMatrixFast(arrayElementOffset, elementCount, countIn, srcMatrixSize,
+                                         value, targetData);
+    }
+    else
+    {
+        // Fall back to the general case.
+        return SetFloatUniformMatrix<false, cols, 4, true, cols, 4>(
+            arrayElementOffset, elementCount, countIn, value, targetData);
+    }
 }
 
 template <int cols, int rows>
-bool SetFloatUniformMatrixHLSL(unsigned int arrayElementOffset,
-                               unsigned int elementCount,
-                               GLsizei countIn,
-                               GLboolean transpose,
-                               const GLfloat *value,
-                               uint8_t *targetData)
+bool SetFloatUniformMatrixGLSL<cols, rows>::Run(unsigned int arrayElementOffset,
+                                                unsigned int elementCount,
+                                                GLsizei countIn,
+                                                GLboolean transpose,
+                                                const GLfloat *value,
+                                                uint8_t *targetData)
 {
+    const bool isSrcColumnMajor = !transpose;
+    // GLSL expects matrix uniforms to be column-major, and each column is padded to 4 rows.
+    if (isSrcColumnMajor)
+    {
+        return SetFloatUniformMatrix<true, cols, rows, true, cols, 4>(
+            arrayElementOffset, elementCount, countIn, value, targetData);
+    }
+    else
+    {
+        return SetFloatUniformMatrix<false, cols, rows, true, cols, 4>(
+            arrayElementOffset, elementCount, countIn, value, targetData);
+    }
+}
+
+template <int rows>
+bool SetFloatUniformMatrixHLSL<4, rows>::Run(unsigned int arrayElementOffset,
+                                             unsigned int elementCount,
+                                             GLsizei countIn,
+                                             GLboolean transpose,
+                                             const GLfloat *value,
+                                             uint8_t *targetData)
+{
+    const bool isSrcColumnMajor = !transpose;
+    if (!isSrcColumnMajor)
+    {
+        // Both src and dst matrices have the same layout,
+        // so a single memcpy updates all the matrices.
+        constexpr size_t srcMatrixSize = sizeof(GLfloat) * 4 * rows;
+        return SetFloatUniformMatrixFast(arrayElementOffset, elementCount, countIn, srcMatrixSize,
+                                         value, targetData);
+    }
+    else
+    {
+        // Fall back to the general case.
+        return SetFloatUniformMatrix<true, 4, rows, false, 4, rows>(
+            arrayElementOffset, elementCount, countIn, value, targetData);
+    }
+}
+
+template <int cols, int rows>
+bool SetFloatUniformMatrixHLSL<cols, rows>::Run(unsigned int arrayElementOffset,
+                                                unsigned int elementCount,
+                                                GLsizei countIn,
+                                                GLboolean transpose,
+                                                const GLfloat *value,
+                                                uint8_t *targetData)
+{
+    const bool isSrcColumnMajor = !transpose;
     // Internally store matrices as row-major to accomodate HLSL matrix indexing.  Each row is
     // padded to 4 columns.
-    return SetFloatUniformMatrix<cols, rows, false, 4, rows>(arrayElementOffset, elementCount,
-                                                             countIn, transpose, value, targetData);
+    if (!isSrcColumnMajor)
+    {
+        return SetFloatUniformMatrix<false, cols, rows, false, 4, rows>(
+            arrayElementOffset, elementCount, countIn, value, targetData);
+    }
+    else
+    {
+        return SetFloatUniformMatrix<true, cols, rows, false, 4, rows>(
+            arrayElementOffset, elementCount, countIn, value, targetData);
+    }
 }
 
 template void GetMatrixUniform<GLint>(GLenum, GLint *, const GLint *, bool);
diff --git a/src/libANGLE/renderer/renderer_utils.h b/src/libANGLE/renderer/renderer_utils.h
index 448768c..667f3d5 100644
--- a/src/libANGLE/renderer/renderer_utils.h
+++ b/src/libANGLE/renderer/renderer_utils.h
@@ -267,19 +267,26 @@
 // Helpers to set a matrix uniform value based on GLSL or HLSL semantics.
 // The return value indicate if the data was updated or not.
 template <int cols, int rows>
-bool SetFloatUniformMatrixGLSL(unsigned int arrayElementOffset,
-                               unsigned int elementCount,
-                               GLsizei countIn,
-                               GLboolean transpose,
-                               const GLfloat *value,
-                               uint8_t *targetData);
+struct SetFloatUniformMatrixGLSL
+{
+    static bool Run(unsigned int arrayElementOffset,
+                    unsigned int elementCount,
+                    GLsizei countIn,
+                    GLboolean transpose,
+                    const GLfloat *value,
+                    uint8_t *targetData);
+};
+
 template <int cols, int rows>
-bool SetFloatUniformMatrixHLSL(unsigned int arrayElementOffset,
-                               unsigned int elementCount,
-                               GLsizei countIn,
-                               GLboolean transpose,
-                               const GLfloat *value,
-                               uint8_t *targetData);
+struct SetFloatUniformMatrixHLSL
+{
+    static bool Run(unsigned int arrayElementOffset,
+                    unsigned int elementCount,
+                    GLsizei countIn,
+                    GLboolean transpose,
+                    const GLfloat *value,
+                    uint8_t *targetData);
+};
 
 // Helper method to de-tranpose a matrix uniform for an API query.
 void GetMatrixUniform(GLenum type, GLfloat *dataOut, const GLfloat *source, bool transpose);
diff --git a/src/libANGLE/renderer/vulkan/ProgramVk.cpp b/src/libANGLE/renderer/vulkan/ProgramVk.cpp
index eb62701..6c72eeb 100644
--- a/src/libANGLE/renderer/vulkan/ProgramVk.cpp
+++ b/src/libANGLE/renderer/vulkan/ProgramVk.cpp
@@ -783,7 +783,7 @@
             continue;
         }
 
-        bool updated = SetFloatUniformMatrixGLSL<cols, rows>(
+        bool updated = SetFloatUniformMatrixGLSL<cols, rows>::Run(
             locationInfo.arrayIndex, linkedUniform.getArraySizeProduct(), count, transpose, value,
             uniformBlock.uniformData.data() + layoutInfo.offset);
 
diff --git a/src/tests/gl_tests/UniformTest.cpp b/src/tests/gl_tests/UniformTest.cpp
index f616cca..72e8778 100644
--- a/src/tests/gl_tests/UniformTest.cpp
+++ b/src/tests/gl_tests/UniformTest.cpp
@@ -684,6 +684,60 @@
     GLuint mProgram;
 };
 
+// Test that we can set and query a uniform that is an array of matrices.
+TEST_P(UniformTestES3, MatrixArrayUniformStateQuery)
+{
+    constexpr char kFragShader[] =
+        "#version 300 es\n"
+        "precision mediump float;\n"
+        "uniform mat3x4 uniMat3x4[5];\n"
+        "out vec4 fragColor;\n"
+        "void main() {\n"
+        "    fragColor = vec4(uniMat3x4[0]);\n"
+        "    fragColor += vec4(uniMat3x4[1]);\n"
+        "    fragColor += vec4(uniMat3x4[2]);\n"
+        "    fragColor += vec4(uniMat3x4[3]);\n"
+        "    fragColor += vec4(uniMat3x4[4]);\n"
+        "}\n";
+    constexpr unsigned int kArrayCount   = 5;
+    constexpr unsigned int kMatrixStride = 3 * 4;
+
+    mProgram = CompileProgram(essl3_shaders::vs::Zero(), kFragShader);
+    ASSERT_NE(mProgram, 0u);
+
+    glUseProgram(mProgram);
+    GLfloat expected[kArrayCount][kMatrixStride] = {
+        {0.6f, -0.4f, 0.6f, 0.9f, -0.6f, 0.3f, -0.3f, -0.1f, -0.4f, -0.3f, 0.7f, 0.1f},
+        {-0.4f, -0.4f, -0.5f, -0.7f, 0.1f, -0.5f, 0.0f, -0.9f, -0.4f, 0.8f, -0.6f, 0.9f},
+        {0.4f, 0.1f, -0.9f, 1.0f, -0.8f, 0.4f, -0.2f, 0.4f, -0.0f, 0.2f, 0.9f, -0.3f},
+        {0.5f, 0.7f, -0.0f, 1.0f, 0.7f, 0.7f, 0.7f, -0.7f, -0.8f, 0.6f, 0.5f, -0.2f},
+        {-1.0f, 0.8f, 1.0f, -0.4f, 0.7f, 0.5f, 0.5f, 0.8f, 0.6f, 0.1f, 0.4f, -0.9f}};
+
+    GLint baseLocation = glGetUniformLocation(mProgram, "uniMat3x4");
+    ASSERT_NE(-1, baseLocation);
+
+    glUniformMatrix3x4fv(baseLocation, kArrayCount, GL_FALSE, &expected[0][0]);
+
+    for (size_t i = 0; i < kArrayCount; i++)
+    {
+        std::stringstream nameStr;
+        nameStr << "uniMat3x4[" << i << "]";
+        std::string name = nameStr.str();
+        GLint location   = glGetUniformLocation(mProgram, name.c_str());
+        ASSERT_GL_NO_ERROR();
+        ASSERT_NE(-1, location);
+
+        std::vector<GLfloat> results(12, 0);
+        glGetUniformfv(mProgram, location, results.data());
+        ASSERT_GL_NO_ERROR();
+
+        for (size_t compIdx = 0; compIdx < kMatrixStride; compIdx++)
+        {
+            EXPECT_EQ(results[compIdx], expected[i][compIdx]);
+        }
+    }
+}
+
 // Test queries for transposed arrays of non-square matrix uniforms.
 TEST_P(UniformTestES3, TransposedMatrixArrayUniformStateQuery)
 {
diff --git a/src/tests/perf_tests/UniformsPerf.cpp b/src/tests/perf_tests/UniformsPerf.cpp
index 8dc5187..9e139f2 100644
--- a/src/tests/perf_tests/UniformsPerf.cpp
+++ b/src/tests/perf_tests/UniformsPerf.cpp
@@ -14,6 +14,7 @@
 #include <random>
 #include <sstream>
 
+#include "common/debug.h"
 #include "util/Matrix.h"
 #include "util/shader_utils.h"
 
@@ -34,7 +35,9 @@
 enum DataType
 {
     VEC4,
-    MAT4,
+    MAT3x3,
+    MAT3x4,
+    MAT4x4,
 };
 
 // Determines if we state change the program between draws.
@@ -45,6 +48,12 @@
     MULTIPLE,
 };
 
+enum MatrixLayout
+{
+    TRANSPOSE,
+    NO_TRANSPOSE,
+};
+
 struct UniformsParams final : public RenderTestParams
 {
     UniformsParams()
@@ -52,7 +61,7 @@
         iterationsPerStep = kIterationsPerStep;
 
         // Common default params
-        majorVersion = 2;
+        majorVersion = 3;
         minorVersion = 0;
         windowWidth  = 720;
         windowHeight = 720;
@@ -62,9 +71,10 @@
     size_t numVertexUniforms   = 200;
     size_t numFragmentUniforms = 200;
 
-    DataType dataType       = DataType::VEC4;
-    DataMode dataMode       = DataMode::REPEAT;
-    ProgramMode programMode = ProgramMode::SINGLE;
+    DataType dataType         = DataType::VEC4;
+    DataMode dataMode         = DataMode::REPEAT;
+    MatrixLayout matrixLayout = MatrixLayout::NO_TRANSPOSE;
+    ProgramMode programMode   = ProgramMode::SINGLE;
 };
 
 std::ostream &operator<<(std::ostream &os, const UniformsParams &params)
@@ -88,9 +98,22 @@
     {
         strstr << "_" << (numVertexUniforms + numFragmentUniforms) << "_vec4";
     }
+    else if (dataType == DataType::MAT3x3)
+    {
+        strstr << "_" << (numVertexUniforms + numFragmentUniforms) << "_mat3x3";
+    }
+    else if (dataType == DataType::MAT3x4)
+    {
+        strstr << "_" << (numVertexUniforms + numFragmentUniforms) << "_mat3x4";
+    }
     else
     {
-        strstr << "_matrix";
+        strstr << "_" << (numVertexUniforms + numFragmentUniforms) << "_mat4x4";
+    }
+
+    if (matrixLayout == MatrixLayout::TRANSPOSE)
+    {
+        strstr << "_transpose";
     }
 
     if (programMode == ProgramMode::MULTIPLE)
@@ -162,12 +185,33 @@
     glGetIntegerv(GL_MAX_VERTEX_UNIFORM_VECTORS, &maxVertexUniformVectors);
     glGetIntegerv(GL_MAX_FRAGMENT_UNIFORM_VECTORS, &maxFragmentUniformVectors);
 
-    bool isMatrix = params.dataType == DataType::MAT4;
+    GLint vectorCountPerUniform;
+    bool isMatrix;
+    switch (params.dataType)
+    {
+        case DataType::MAT3x3:
+            vectorCountPerUniform = 3;
+            isMatrix              = true;
+            break;
+        case DataType::MAT3x4:
+            // depends on transpose, conservatively set to 4
+            vectorCountPerUniform = 4;
+            isMatrix              = true;
+            break;
+        case DataType::MAT4x4:
+            vectorCountPerUniform = 4;
+            isMatrix              = true;
+            break;
+        default:
+            vectorCountPerUniform = 1;
+            isMatrix              = false;
+            break;
+    }
 
     GLint numVertexUniformVectors =
-        static_cast<GLint>(params.numVertexUniforms) * (isMatrix ? 4 : 1);
+        static_cast<GLint>(params.numVertexUniforms) * vectorCountPerUniform;
     GLint numFragmentUniformVectors =
-        static_cast<GLint>(params.numFragmentUniforms) * (isMatrix ? 4 : 1);
+        static_cast<GLint>(params.numFragmentUniforms) * vectorCountPerUniform;
 
     if (numVertexUniformVectors > maxVertexUniformVectors)
     {
@@ -219,19 +263,38 @@
 void UniformsBenchmark::initShaders()
 {
     const auto &params = GetParam();
-    bool isMatrix      = (params.dataType == DataType::MAT4);
+
+    const std::string kUniformVarPlaceHolder = "%s";
+    std::string typeString;
+    std::string uniformOperationTemplate;
+    switch (params.dataType)
+    {
+        case DataType::VEC4:
+            typeString               = "vec4";
+            uniformOperationTemplate = kUniformVarPlaceHolder;
+            break;
+        case DataType::MAT3x3:
+            typeString = "mat3";
+            uniformOperationTemplate =
+                "mat4(" + kUniformVarPlaceHolder + ") * vec4(1.0, 1.0, 1.0, 1.0)";
+            break;
+        case DataType::MAT3x4:
+            typeString = "mat3x4";
+            uniformOperationTemplate =
+                "mat4(" + kUniformVarPlaceHolder + ") * vec4(1.0, 1.0, 1.0, 1.0)";
+            break;
+        case DataType::MAT4x4:
+            typeString               = "mat4";
+            uniformOperationTemplate = kUniformVarPlaceHolder + "* vec4(1.0, 1.0, 1.0, 1.0)";
+            break;
+        default:
+            UNREACHABLE();
+    }
 
     std::stringstream vstrstr;
+    vstrstr << "#version 300 es\n";
     vstrstr << "precision mediump float;\n";
-    std::string typeString  = isMatrix ? "mat4" : "vec4";
-    std::string constVector = "const vec4 one = vec4(1, 1, 1, 1);\n";
-
-    vstrstr << "attribute vec4 pos;\n";
-
-    if (isMatrix)
-    {
-        vstrstr << constVector;
-    }
+    vstrstr << "in vec4 pos;\n";
 
     for (size_t i = 0; i < params.numVertexUniforms; i++)
     {
@@ -243,22 +306,21 @@
                "    gl_Position = pos;\n";
     for (size_t i = 0; i < params.numVertexUniforms; i++)
     {
-        vstrstr << "    gl_Position += " << GetUniformLocationName(i, true);
-        if (isMatrix)
-        {
-            vstrstr << " * one";
-        }
+        std::string uniformOperation = uniformOperationTemplate;
+        std::size_t pos              = uniformOperation.find(kUniformVarPlaceHolder);
+        ASSERT(pos != std::string::npos);
+        uniformOperation.replace(pos, kUniformVarPlaceHolder.size(),
+                                 GetUniformLocationName(i, true));
+        vstrstr << "    gl_Position += ";
+        vstrstr << uniformOperation;
         vstrstr << ";\n";
     }
     vstrstr << "}";
 
     std::stringstream fstrstr;
+    fstrstr << "#version 300 es\n";
     fstrstr << "precision mediump float;\n";
-
-    if (isMatrix)
-    {
-        fstrstr << constVector;
-    }
+    fstrstr << "out vec4 fragColor;\n";
 
     for (size_t i = 0; i < params.numFragmentUniforms; i++)
     {
@@ -266,14 +328,16 @@
     }
     fstrstr << "void main()\n"
                "{\n"
-               "    gl_FragColor = vec4(0, 0, 0, 0);\n";
+               "    fragColor = vec4(0, 0, 0, 0);\n";
     for (size_t i = 0; i < params.numFragmentUniforms; i++)
     {
-        fstrstr << "    gl_FragColor += " << GetUniformLocationName(i, false);
-        if (isMatrix)
-        {
-            fstrstr << " * one";
-        }
+        std::string uniformOperation = uniformOperationTemplate;
+        std::size_t pos              = uniformOperation.find(kUniformVarPlaceHolder);
+        ASSERT(pos != std::string::npos);
+        uniformOperation.replace(pos, kUniformVarPlaceHolder.size(),
+                                 GetUniformLocationName(i, false));
+        fstrstr << "    fragColor += ";
+        fstrstr << uniformOperation;
         fstrstr << ";\n";
     }
     fstrstr << "}";
@@ -339,32 +403,63 @@
 {
     const auto &params = GetParam();
 
-    if (params.dataType == DataType::MAT4)
-    {
-        auto setFunc = [](const std::vector<GLuint> &locations, const MatrixData &matrixData,
-                          size_t uniform, size_t frameIndex) {
-            glUniformMatrix4fv(locations[uniform], 1, GL_FALSE,
-                               matrixData[frameIndex][uniform].data);
-        };
+    GLboolean transpose = static_cast<GLboolean>(params.matrixLayout == MatrixLayout::TRANSPOSE);
 
-        drawLoop<false>(setFunc);
-    }
-    else
+    switch (params.dataType)
     {
-        auto setFunc = [](const std::vector<GLuint> &locations, const MatrixData &matrixData,
-                          size_t uniform, size_t frameIndex) {
-            float value = static_cast<float>(uniform);
-            glUniform4f(locations[uniform], value, value, value, value);
-        };
+        case DataType::MAT4x4:
+        {
+            auto setFunc = [=](const std::vector<GLuint> &locations, const MatrixData &matrixData,
+                               size_t uniform, size_t frameIndex) {
+                glUniformMatrix4fv(locations[uniform], 1, transpose,
+                                   matrixData[frameIndex][uniform].data);
+            };
 
-        if (params.programMode == ProgramMode::MULTIPLE)
-        {
-            drawLoop<true>(setFunc);
-        }
-        else
-        {
             drawLoop<false>(setFunc);
+            break;
         }
+        case DataType::MAT3x4:
+        {
+            auto setFunc = [=](const std::vector<GLuint> &locations, const MatrixData &matrixData,
+                               size_t uniform, size_t frameIndex) {
+                glUniformMatrix3x4fv(locations[uniform], 1, transpose,
+                                     matrixData[frameIndex][uniform].data);
+            };
+
+            drawLoop<false>(setFunc);
+            break;
+        }
+        case DataType::MAT3x3:
+        {
+            auto setFunc = [=](const std::vector<GLuint> &locations, const MatrixData &matrixData,
+                               size_t uniform, size_t frameIndex) {
+                glUniformMatrix3fv(locations[uniform], 1, transpose,
+                                   matrixData[frameIndex][uniform].data);
+            };
+
+            drawLoop<false>(setFunc);
+            break;
+        }
+        case DataType::VEC4:
+        {
+            auto setFunc = [](const std::vector<GLuint> &locations, const MatrixData &matrixData,
+                              size_t uniform, size_t frameIndex) {
+                float value = static_cast<float>(uniform);
+                glUniform4f(locations[uniform], value, value, value, value);
+            };
+
+            if (params.programMode == ProgramMode::MULTIPLE)
+            {
+                drawLoop<true>(setFunc);
+            }
+            else
+            {
+                drawLoop<false>(setFunc);
+            }
+            break;
+        }
+        default:
+            UNREACHABLE();
     }
 
     ASSERT_GL_NO_ERROR();
@@ -383,12 +478,16 @@
     return params;
 }
 
-UniformsParams MatrixUniforms(const EGLPlatformParameters &egl, DataMode dataMode)
+UniformsParams MatrixUniforms(const EGLPlatformParameters &egl,
+                              DataMode dataMode,
+                              DataType dataType,
+                              MatrixLayout matrixLayout)
 {
     UniformsParams params;
     params.eglParameters = egl;
-    params.dataType      = DataType::MAT4;
+    params.dataType      = dataType;
     params.dataMode      = dataMode;
+    params.matrixLayout  = matrixLayout;
 
     // Reduce the number of uniforms to fit within smaller upper limits on some configs.
     params.numVertexUniforms   = 55;
@@ -404,14 +503,27 @@
     run();
 }
 
-ANGLE_INSTANTIATE_TEST(UniformsBenchmark,
-                       VectorUniforms(D3D9(), DataMode::UPDATE),
-                       VectorUniforms(D3D11(), DataMode::REPEAT),
-                       VectorUniforms(D3D11(), DataMode::UPDATE),
-                       VectorUniforms(D3D11_NULL(), DataMode::UPDATE),
-                       VectorUniforms(OPENGL_OR_GLES(), DataMode::UPDATE),
-                       VectorUniforms(OPENGL_OR_GLES(), DataMode::REPEAT),
-                       VectorUniforms(OPENGL_OR_GLES_NULL(), DataMode::UPDATE),
-                       MatrixUniforms(D3D11(), DataMode::UPDATE),
-                       MatrixUniforms(OPENGL_OR_GLES(), DataMode::UPDATE),
-                       VectorUniforms(D3D11_NULL(), DataMode::REPEAT, ProgramMode::MULTIPLE));
+ANGLE_INSTANTIATE_TEST(
+    UniformsBenchmark,
+    VectorUniforms(D3D11(), DataMode::REPEAT),
+    VectorUniforms(D3D11(), DataMode::UPDATE),
+    VectorUniforms(D3D11_NULL(), DataMode::UPDATE),
+    VectorUniforms(OPENGL_OR_GLES(), DataMode::UPDATE),
+    VectorUniforms(OPENGL_OR_GLES(), DataMode::REPEAT),
+    VectorUniforms(OPENGL_OR_GLES_NULL(), DataMode::UPDATE),
+    MatrixUniforms(D3D11(), DataMode::UPDATE, DataType::MAT4x4, MatrixLayout::NO_TRANSPOSE),
+    MatrixUniforms(OPENGL_OR_GLES(),
+                   DataMode::UPDATE,
+                   DataType::MAT4x4,
+                   MatrixLayout::NO_TRANSPOSE),
+    MatrixUniforms(VULKAN_NULL(), DataMode::UPDATE, DataType::MAT4x4, MatrixLayout::NO_TRANSPOSE),
+    MatrixUniforms(VULKAN_NULL(), DataMode::UPDATE, DataType::MAT4x4, MatrixLayout::TRANSPOSE),
+    MatrixUniforms(VULKAN_NULL(), DataMode::REPEAT, DataType::MAT4x4, MatrixLayout::NO_TRANSPOSE),
+    MatrixUniforms(VULKAN_NULL(), DataMode::UPDATE, DataType::MAT3x4, MatrixLayout::NO_TRANSPOSE),
+    MatrixUniforms(VULKAN_NULL(), DataMode::UPDATE, DataType::MAT3x3, MatrixLayout::TRANSPOSE),
+    MatrixUniforms(VULKAN_NULL(), DataMode::REPEAT, DataType::MAT3x3, MatrixLayout::TRANSPOSE),
+    MatrixUniforms(VULKAN(), DataMode::UPDATE, DataType::MAT4x4, MatrixLayout::NO_TRANSPOSE),
+    MatrixUniforms(VULKAN(), DataMode::REPEAT, DataType::MAT4x4, MatrixLayout::NO_TRANSPOSE),
+    MatrixUniforms(VULKAN(), DataMode::UPDATE, DataType::MAT3x3, MatrixLayout::NO_TRANSPOSE),
+    MatrixUniforms(VULKAN(), DataMode::REPEAT, DataType::MAT3x3, MatrixLayout::NO_TRANSPOSE),
+    VectorUniforms(D3D11_NULL(), DataMode::REPEAT, ProgramMode::MULTIPLE));