Reland "Tightly pack LinkedUniform by using int16_t"

This is a reland of commit 152cf62b38874238095a91307e4ea9bcdedf8f46

Original change's description:
> Tightly pack LinkedUniform by using int16_t
>
> The vector size is checked when we link uniforms, and the maximum
> vector size is 4096 because we clamp maxUniformBlockSize to 64KB. In
> reality, if we exceed this number, program link will take a really long
> time and then fail. So there is no real need to keep all the
> variables as 32-bit integers. This CL changes them to 16-bit integers.
> Further, the sh::BlockMemberInfo and ActiveVariable data members are
> embedded into the LinkedUniform struct as well, so that the unused
> variables can be removed and the data can be tightly packed. This also
> makes LinkedUniform easier to maintain as a simple struct with basic
> data types. With this change, LinkedUniform size is reduced from 108
> bytes down to 60 bytes, a 48-byte reduction. Given that some apps have
> 200-ish uniforms, this CL saves 48 bytes x 200 = ~9K of memory just for
> uniforms per program (which goes through hash compute, decompression,
> and file reads).
>
> Bug: b/275102061
> Change-Id: I7fae20f5b75f3239305e2094a992e3040b8c8e4c
> Reviewed-on: https://chromium-review.googlesource.com/c/angle/angle/+/4754133
> Reviewed-by: Shahbaz Youssefi <syoussefi@chromium.org>
> Commit-Queue: Charlie Lao <cclao@google.com>

Bug: b/275102061
Change-Id: I1cdec9407e930608d3239a104dcbf77c8d8e2113
Reviewed-on: https://chromium-review.googlesource.com/c/angle/angle/+/4791661
Reviewed-by: Shahbaz Youssefi <syoussefi@chromium.org>
Commit-Queue: Charlie Lao <cclao@google.com>
diff --git a/src/common/angleutils.h b/src/common/angleutils.h
index 529f442..8a6ecec 100644
--- a/src/common/angleutils.h
+++ b/src/common/angleutils.h
@@ -60,7 +60,7 @@
           class KeyEqual = std::equal_to<Key>>
 using HashMap = std::unordered_map<Key, T, Hash, KeyEqual>;
 template <typename Key, class Hash = std::hash<Key>, class KeyEqual = std::equal_to<Key>>
-using HashSet  = std::unordered_set<Key, Hash, KeyEqual>;
+using HashSet = std::unordered_set<Key, Hash, KeyEqual>;
 #    if __cpp_lib_generic_unordered_lookup >= 201811L
 #        define ANGLE_HAS_HASH_MAP_GENERIC_LOOKUP 1
 #    else
@@ -401,6 +401,15 @@
     bool mUseMutex;
 };
 
+// Helper macro that casts rhs to the type of lhs, assigns it, then ASSERTs no bits were dropped.
+#define SetBitField(lhs, rhs)                                                         \
+    do                                                                                \
+    {                                                                                 \
+        auto ANGLE_LOCAL_VAR = rhs;                                                   \
+        lhs = static_cast<typename std::decay<decltype(lhs)>::type>(ANGLE_LOCAL_VAR); \
+        ASSERT(static_cast<decltype(ANGLE_LOCAL_VAR)>(lhs) == ANGLE_LOCAL_VAR);       \
+    } while (0)
+
 // snprintf is not defined with MSVC prior to to msvc14
 #if defined(_MSC_VER) && _MSC_VER < 1900
 #    define snprintf _snprintf
diff --git a/src/libANGLE/ProgramExecutable.cpp b/src/libANGLE/ProgramExecutable.cpp
index 9711550..2f87992 100644
--- a/src/libANGLE/ProgramExecutable.cpp
+++ b/src/libANGLE/ProgramExecutable.cpp
@@ -1672,9 +1672,9 @@
         {
             // The arrays of arrays are flattened to arrays, it needs to record the array offset for
             // the correct binding image unit.
-            mImageBindings.emplace_back(ImageBinding(
-                imageUniform.getBinding() + imageUniform.parentArrayIndex() * arraySize,
-                imageUniform.getBasicTypeElementCount(), textureType));
+            mImageBindings.emplace_back(
+                ImageBinding(imageUniform.getBinding() + imageUniform.parentArrayIndex * arraySize,
+                             imageUniform.getBasicTypeElementCount(), textureType));
         }
 
         *combinedImageUniforms += imageUniform.activeShaderCount() * arraySize;
@@ -1711,13 +1711,14 @@
     {
         auto &uniform = mUniforms[index];
 
-        uniform.blockInfo.offset           = uniform.getOffset();
-        uniform.blockInfo.arrayStride      = uniform.isArray() ? 4 : 0;
-        uniform.blockInfo.matrixStride     = 0;
-        uniform.blockInfo.isRowMajorMatrix = false;
+        uniform.blockOffset                    = uniform.getOffset();
+        uniform.blockArrayStride               = uniform.isArray() ? 4 : 0;
+        uniform.blockMatrixStride              = 0;
+        uniform.flagBits.blockIsRowMajorMatrix = false;
+        uniform.flagBits.isBlock               = true;
 
         bool found = false;
-        for (unsigned int bufferIndex = 0; bufferIndex < getActiveAtomicCounterBufferCount();
+        for (uint16_t bufferIndex = 0; bufferIndex < getActiveAtomicCounterBufferCount();
              ++bufferIndex)
         {
             auto &buffer = mAtomicCounterBuffers[bufferIndex];
@@ -1726,7 +1727,7 @@
                 buffer.memberIndexes.push_back(index);
                 uniform.bufferIndex = bufferIndex;
                 found               = true;
-                buffer.unionReferencesWith(uniform.activeVariable);
+                buffer.unionReferencesWith(uniform);
                 break;
             }
         }
@@ -1735,9 +1736,9 @@
             AtomicCounterBuffer atomicCounterBuffer;
             atomicCounterBuffer.binding = uniform.getBinding();
             atomicCounterBuffer.memberIndexes.push_back(index);
-            atomicCounterBuffer.unionReferencesWith(uniform.activeVariable);
+            atomicCounterBuffer.unionReferencesWith(uniform);
             mAtomicCounterBuffers.push_back(atomicCounterBuffer);
-            uniform.bufferIndex = static_cast<int>(getActiveAtomicCounterBufferCount() - 1);
+            uniform.bufferIndex = static_cast<uint16_t>(getActiveAtomicCounterBufferCount() - 1);
         }
     }
 
diff --git a/src/libANGLE/Uniform.cpp b/src/libANGLE/Uniform.cpp
index 5a8e386..187b4ec 100644
--- a/src/libANGLE/Uniform.cpp
+++ b/src/libANGLE/Uniform.cpp
@@ -30,26 +30,21 @@
     mIds[shaderType] = id;
 }
 
-void ActiveVariable::unionReferencesWith(const ActiveVariable &other)
+void ActiveVariable::unionReferencesWith(const LinkedUniform &other)
 {
     mActiveUseBits |= other.mActiveUseBits;
     for (const ShaderType shaderType : AllShaderTypes())
     {
-        ASSERT(mIds[shaderType] == 0 || other.mIds[shaderType] == 0 ||
-               mIds[shaderType] == other.mIds[shaderType]);
+        ASSERT(mIds[shaderType] == 0 || other.getId(shaderType) == 0 ||
+               mIds[shaderType] == other.getId(shaderType));
         if (mIds[shaderType] == 0)
         {
-            mIds[shaderType] = other.mIds[shaderType];
+            mIds[shaderType] = other.getId(shaderType);
         }
     }
 }
 
-LinkedUniform::LinkedUniform()
-{
-    memset(this, 0, sizeof(*this));
-    blockInfo      = sh::BlockMemberInfo();
-    activeVariable = ActiveVariable();
-}
+LinkedUniform::LinkedUniform() {}
 
 LinkedUniform::LinkedUniform(GLenum typeIn,
                              GLenum precisionIn,
@@ -60,26 +55,27 @@
                              const int bufferIndexIn,
                              const sh::BlockMemberInfo &blockInfoIn)
 {
-    memset(this, 0, sizeof(*this));
-    // Note: Ensure every data member is initialized.
-    type                          = typeIn;
-    precision                     = precisionIn;
-    imageUnitFormat               = GL_NONE;
-    location                      = locationIn;
-    binding                       = bindingIn;
-    offset                        = offsetIn;
-    bufferIndex                   = bufferIndexIn;
-    blockInfo                     = blockInfoIn;
-    activeVariable                = ActiveVariable();
-    id                            = 0;
-    flattenedOffsetInParentArrays = -1;
-    outerArraySizeProduct         = 1;
-    outerArrayOffset              = 0;
-    arraySize                     = arraySizesIn.empty() ? 1 : arraySizesIn[0];
-
-    flagBitsAsUInt   = 0;
-    flagBits.isArray = !arraySizesIn.empty();
+    // arrays are always flattened, which means at most 1D array
     ASSERT(arraySizesIn.size() <= 1);
+
+    memset(this, 0, sizeof(*this));
+    SetBitField(type, typeIn);
+    SetBitField(precision, precisionIn);
+    location = locationIn;
+    SetBitField(binding, bindingIn);
+    SetBitField(offset, offsetIn);
+    SetBitField(bufferIndex, bufferIndexIn);
+    outerArraySizeProduct = 1;
+    SetBitField(arraySize, arraySizesIn.empty() ? 1u : arraySizesIn[0]);
+    SetBitField(flagBits.isArray, !arraySizesIn.empty());
+    if (!(blockInfoIn == sh::kDefaultBlockMemberInfo))
+    {
+        flagBits.isBlock               = 1;
+        flagBits.blockIsRowMajorMatrix = blockInfoIn.isRowMajorMatrix;
+        SetBitField(blockOffset, blockInfoIn.offset);
+        SetBitField(blockArrayStride, blockInfoIn.arrayStride);
+        SetBitField(blockMatrixStride, blockInfoIn.matrixStride);
+    }
 }
 
 LinkedUniform::LinkedUniform(const LinkedUniform &other)
@@ -89,32 +85,37 @@
 
 LinkedUniform::LinkedUniform(const UsedUniform &usedUniform)
 {
-    memset(this, 0, sizeof(*this));
-
     ASSERT(!usedUniform.isArrayOfArrays());
     ASSERT(!usedUniform.isStruct());
     ASSERT(usedUniform.active);
+    ASSERT(usedUniform.blockInfo == sh::kDefaultBlockMemberInfo);
 
     // Note: Ensure every data member is initialized.
-    type                          = usedUniform.type;
-    precision                     = usedUniform.precision;
-    imageUnitFormat               = usedUniform.imageUnitFormat;
-    location                      = usedUniform.location;
-    binding                       = usedUniform.binding;
-    offset                        = usedUniform.offset;
-    bufferIndex                   = usedUniform.bufferIndex;
-    blockInfo                     = usedUniform.blockInfo;
-    activeVariable                = usedUniform.activeVariable;
-    id                            = usedUniform.id;
-    flattenedOffsetInParentArrays = usedUniform.getFlattenedOffsetInParentArrays();
-    outerArraySizeProduct         = ArraySizeProduct(usedUniform.outerArraySizes);
-    outerArrayOffset              = usedUniform.outerArrayOffset;
-    arraySize                     = usedUniform.isArray() ? usedUniform.getArraySizeProduct() : 1u;
+    flagBitsAsUByte = 0;
+    SetBitField(type, usedUniform.type);
+    SetBitField(precision, usedUniform.precision);
+    SetBitField(imageUnitFormat, usedUniform.imageUnitFormat);
+    location          = usedUniform.location;
+    blockOffset       = 0;
+    blockArrayStride  = 0;
+    blockMatrixStride = 0;
+    SetBitField(binding, usedUniform.binding);
+    SetBitField(offset, usedUniform.offset);
 
-    flagBitsAsUInt               = 0;
-    flagBits.isFragmentInOut     = usedUniform.isFragmentInOut;
-    flagBits.texelFetchStaticUse = usedUniform.texelFetchStaticUse;
-    flagBits.isArray             = usedUniform.isArray();
+    SetBitField(bufferIndex, usedUniform.bufferIndex);
+    SetBitField(parentArrayIndex, usedUniform.parentArrayIndex());
+    SetBitField(outerArraySizeProduct, ArraySizeProduct(usedUniform.outerArraySizes));
+    SetBitField(outerArrayOffset, usedUniform.outerArrayOffset);
+    SetBitField(arraySize, usedUniform.isArray() ? usedUniform.getArraySizeProduct() : 1u);
+    SetBitField(flagBits.isArray, usedUniform.isArray());
+
+    id             = usedUniform.id;
+    mActiveUseBits = usedUniform.activeVariable.activeShaders();
+    mIds           = usedUniform.activeVariable.getIds();
+
+    SetBitField(flagBits.isFragmentInOut, usedUniform.isFragmentInOut);
+    SetBitField(flagBits.texelFetchStaticUse, usedUniform.texelFetchStaticUse);
+    ASSERT(!usedUniform.isArray() || arraySize == usedUniform.getArraySizeProduct());
 }
 
 LinkedUniform::~LinkedUniform() {}
diff --git a/src/libANGLE/Uniform.h b/src/libANGLE/Uniform.h
index 7a5ff1e..40fc82c 100644
--- a/src/libANGLE/Uniform.h
+++ b/src/libANGLE/Uniform.h
@@ -23,6 +23,7 @@
 class BinaryOutputStream;
 struct UniformTypeInfo;
 struct UsedUniform;
+struct LinkedUniform;
 
 // Note: keep this struct memcpy-able: i.e, a simple struct with basic types only and no virtual
 // functions. LinkedUniform relies on this so that it can use memcpy to initialize uniform for
@@ -40,7 +41,7 @@
         return static_cast<ShaderType>(ScanForward(mActiveUseBits.bits()));
     }
     void setActive(ShaderType shaderType, bool used, uint32_t id);
-    void unionReferencesWith(const ActiveVariable &other);
+    void unionReferencesWith(const LinkedUniform &otherUniform);
     bool isActive(ShaderType shaderType) const
     {
         ASSERT(shaderType != ShaderType::InvalidEnum);
@@ -49,7 +50,6 @@
     const ShaderMap<uint32_t> &getIds() const { return mIds; }
     uint32_t getId(ShaderType shaderType) const { return mIds[shaderType]; }
     ShaderBitSet activeShaders() const { return mActiveUseBits; }
-    GLuint activeShaderCount() const { return static_cast<GLuint>(mActiveUseBits.count()); }
 
   private:
     ShaderBitSet mActiveUseBits;
@@ -62,6 +62,7 @@
 // not put any std::vector or objects with virtual functions in it.
 // Helper struct representing a single shader uniform. Most of this structure's data member and
 // access functions mirrors ShaderVariable; See ShaderVars.h for more info.
+ANGLE_ENABLE_STRUCT_PADDING_WARNINGS
 struct LinkedUniform
 {
     LinkedUniform();
@@ -88,70 +89,82 @@
     bool isFragmentInOut() const { return flagBits.isFragmentInOut; }
 
     bool isArray() const { return flagBits.isArray; }
-    unsigned int getBasicTypeElementCount() const
+    uint16_t getBasicTypeElementCount() const
     {
         ASSERT(flagBits.isArray || arraySize == 1u);
         return arraySize;
     }
 
     GLenum getType() const { return type; }
-    unsigned int getOuterArrayOffset() const { return outerArrayOffset; }
-    unsigned int getOuterArraySizeProduct() const { return outerArraySizeProduct; }
-    int getBinding() const { return binding; }
-    int getOffset() const { return offset; }
-    const sh::BlockMemberInfo &getBlockInfo() const { return blockInfo; }
+    uint16_t getOuterArrayOffset() const { return outerArrayOffset; }
+    uint16_t getOuterArraySizeProduct() const { return outerArraySizeProduct; }
+    int16_t getBinding() const { return binding; }
+    int16_t getOffset() const { return offset; }
     int getBufferIndex() const { return bufferIndex; }
     int getLocation() const { return location; }
     GLenum getImageUnitFormat() const { return imageUnitFormat; }
 
-    int parentArrayIndex() const
-    {
-        return flattenedOffsetInParentArrays != -1 ? flattenedOffsetInParentArrays : 0;
-    }
-
     ShaderType getFirstActiveShaderType() const
     {
-        return activeVariable.getFirstActiveShaderType();
+        return static_cast<ShaderType>(ScanForward(mActiveUseBits.bits()));
     }
     void setActive(ShaderType shaderType, bool used, uint32_t _id)
     {
-        activeVariable.setActive(shaderType, used, _id);
+        mActiveUseBits.set(shaderType, used);
+        mIds[shaderType] = _id;  // use the |_id| argument, not the |id| member
     }
-    bool isActive(ShaderType shaderType) const { return activeVariable.isActive(shaderType); }
-    const ShaderMap<uint32_t> &getIds() const { return activeVariable.getIds(); }
-    uint32_t getId(ShaderType shaderType) const { return activeVariable.getId(shaderType); }
-    ShaderBitSet activeShaders() const { return activeVariable.activeShaders(); }
-    GLuint activeShaderCount() const { return activeVariable.activeShaderCount(); }
+    bool isActive(ShaderType shaderType) const { return mActiveUseBits[shaderType]; }
+    const ShaderMap<uint32_t> &getIds() const { return mIds; }
+    uint32_t getId(ShaderType shaderType) const { return mIds[shaderType]; }
+    ShaderBitSet activeShaders() const { return mActiveUseBits; }
+    GLuint activeShaderCount() const { return static_cast<GLuint>(mActiveUseBits.count()); }
 
-    sh::BlockMemberInfo blockInfo;
-    ActiveVariable activeVariable;
+    uint16_t type;
+    uint16_t precision;
 
-    GLenum type;
-    GLenum precision;
-    GLenum imageUnitFormat;
     int location;
-    int binding;
-    int offset;
-    uint32_t id;
-    int flattenedOffsetInParentArrays;
-    int bufferIndex;
-    unsigned int outerArraySizeProduct;
-    unsigned int outerArrayOffset;
-    unsigned int arraySize;
 
+    // These are from the sh::BlockMemberInfo struct. See blocklayout.h for detail.
+    uint16_t blockOffset;
+    uint16_t blockArrayStride;
+
+    uint16_t blockMatrixStride;
+    uint16_t imageUnitFormat;
+
+    // maxUniformVectorsCount is 4K because we clamp maxUniformBlockSize to 64KB. All of these
+    // variables should fit in 16 bits, which reduces the size of mUniforms.
+    int16_t binding;
+    int16_t bufferIndex;
+
+    int16_t offset;
+    uint16_t arraySize;
+
+    uint16_t outerArraySizeProduct;
+    uint16_t outerArrayOffset;
+
+    uint16_t parentArrayIndex;
     union
     {
         struct
         {
-            uint32_t isFragmentInOut : 1;
-            uint32_t texelFetchStaticUse : 1;
-            uint32_t isArray : 1;
-            uint32_t padding : 29;
+            uint8_t isFragmentInOut : 1;
+            uint8_t texelFetchStaticUse : 1;
+            uint8_t isArray : 1;
+            uint8_t blockIsRowMajorMatrix : 1;
+            uint8_t isBlock : 1;
+            uint8_t padding : 3;
         } flagBits;
-
-        uint32_t flagBitsAsUInt;
+        uint8_t flagBitsAsUByte;
     };
+    ShaderBitSet mActiveUseBits;
+
+    uint32_t id;
+
+    // The id of a linked variable in each shader stage.  This id originates from
+    // sh::ShaderVariable::id or sh::InterfaceBlock::id
+    ShaderMap<uint32_t> mIds;
 };
+ANGLE_DISABLE_STRUCT_PADDING_WARNINGS
 
 struct BufferVariable : public sh::ShaderVariable
 {
@@ -195,9 +208,9 @@
     {
         activeVariable.setActive(shaderType, used, _id);
     }
-    void unionReferencesWith(const ActiveVariable &other)
+    void unionReferencesWith(const LinkedUniform &otherUniform)
     {
-        activeVariable.unionReferencesWith(other);
+        activeVariable.unionReferencesWith(otherUniform);
     }
     bool isActive(ShaderType shaderType) const { return activeVariable.isActive(shaderType); }
     const ShaderMap<uint32_t> &getIds() const { return activeVariable.getIds(); }
diff --git a/src/libANGLE/queryutils.cpp b/src/libANGLE/queryutils.cpp
index 8db8bb7..3ff9535 100644
--- a/src/libANGLE/queryutils.cpp
+++ b/src/libANGLE/queryutils.cpp
@@ -1926,16 +1926,16 @@
             return (uniform.isAtomicCounter() ? -1 : uniform.getBufferIndex());
 
         case GL_OFFSET:
-            return uniform.getBlockInfo().offset;
+            return uniform.flagBits.isBlock ? uniform.blockOffset : -1;
 
         case GL_ARRAY_STRIDE:
-            return uniform.getBlockInfo().arrayStride;
+            return uniform.flagBits.isBlock ? uniform.blockArrayStride : -1;
 
         case GL_MATRIX_STRIDE:
-            return uniform.getBlockInfo().matrixStride;
+            return uniform.flagBits.isBlock ? uniform.blockMatrixStride : -1;
 
         case GL_IS_ROW_MAJOR:
-            return static_cast<GLint>(uniform.getBlockInfo().isRowMajorMatrix);
+            return uniform.flagBits.blockIsRowMajorMatrix ? 1 : 0;
 
         case GL_REFERENCED_BY_VERTEX_SHADER:
             return uniform.isActive(ShaderType::Vertex);
diff --git a/src/libANGLE/renderer/d3d/ProgramD3D.cpp b/src/libANGLE/renderer/d3d/ProgramD3D.cpp
index f76e625..60b718a 100644
--- a/src/libANGLE/renderer/d3d/ProgramD3D.cpp
+++ b/src/libANGLE/renderer/d3d/ProgramD3D.cpp
@@ -2874,7 +2874,7 @@
     std::vector<unsigned int> subscripts;
     const std::string baseName = gl::ParseResourceName(d3dUniform->name, &subscripts);
     unsigned int registerOffset =
-        mState.getUniforms()[uniformIndex].parentArrayIndex() * d3dUniform->getArraySizeProduct();
+        mState.getUniforms()[uniformIndex].parentArrayIndex * d3dUniform->getArraySizeProduct();
 
     bool hasUniform = false;
     for (gl::ShaderType shaderType : gl::AllShaderTypes())
@@ -2976,7 +2976,7 @@
     std::vector<unsigned int> subscripts;
     const std::string baseName = gl::ParseResourceName(d3dUniform->name, &subscripts);
     unsigned int registerOffset =
-        mState.getUniforms()[uniformIndex].parentArrayIndex() * d3dUniform->getArraySizeProduct();
+        mState.getUniforms()[uniformIndex].parentArrayIndex * d3dUniform->getArraySizeProduct();
 
     const gl::Shader *computeShader = mState.getAttachedShader(gl::ShaderType::Compute);
     if (computeShader)
diff --git a/src/libANGLE/renderer/renderer_utils.h b/src/libANGLE/renderer/renderer_utils.h
index 65e3c3c..c795c3c 100644
--- a/src/libANGLE/renderer/renderer_utils.h
+++ b/src/libANGLE/renderer/renderer_utils.h
@@ -494,13 +494,4 @@
 #define ANGLE_MARK_TRANSFORM_FEEDBACK_USAGE(instanced) \
     ANGLE_MARK_TRANSFORM_FEEDBACK_USAGE##instanced
 
-// Helper macro that casts to a bitfield type then verifies no bits were dropped.
-#define SetBitField(lhs, rhs)                                                         \
-    do                                                                                \
-    {                                                                                 \
-        auto ANGLE_LOCAL_VAR = rhs;                                                   \
-        lhs = static_cast<typename std::decay<decltype(lhs)>::type>(ANGLE_LOCAL_VAR); \
-        ASSERT(static_cast<decltype(ANGLE_LOCAL_VAR)>(lhs) == ANGLE_LOCAL_VAR);       \
-    } while (0)
-
 #endif  // LIBANGLE_RENDERER_RENDERER_UTILS_H_