Vulkan: Emulate instanced attrib divisor

This marks the instancedArrays[ANGLE|EXT] extensions as always
supported, regardless of the underlying Vulkan HW's max vertex attrib
divisor.
Then detect cases where the app sets a divisor that isn't supported
by hardware and emulate those cases. Emulation is accomplished by
copying the instanced attribs to a new buffer where each attrib is
present once per instance, using the attrib divisor value as a
factor to replicate the attribs, and then setting the actual divisor
value for the draw to "1".
Also, we only store 8 bits for the divisor used in the PSO, so this
code also handles emulation of the case where divisor is > 255.

This is passing all of the drawInstanced/Elements dEQP tests
where divisor has to be emulated.

Also enabled end2end InstancingTestES3 for Vulkan backend.

Bug: angleproject:2672
Change-Id: I9932f9eab49b16a19e8bbd35dacaf3b5a27a213f
Reviewed-on: https://chromium-review.googlesource.com/c/angle/angle/+/1758689
Reviewed-by: Courtney Goeltzenleuchter <courtneygo@google.com>
Commit-Queue: Tobin Ehlis <tobine@google.com>
diff --git a/src/libANGLE/renderer/vulkan/ContextVk.cpp b/src/libANGLE/renderer/vulkan/ContextVk.cpp
index ae5a5b2..9e90529 100644
--- a/src/libANGLE/renderer/vulkan/ContextVk.cpp
+++ b/src/libANGLE/renderer/vulkan/ContextVk.cpp
@@ -477,12 +477,14 @@
     }
 
     // Must be called before the command buffer is started. Can call finish.
-    if (context->getStateCache().hasAnyActiveClientAttrib())
+    if (mVertexArray->getStreamingVertexAttribsMask().any())
     {
         ASSERT(firstVertexOrInvalid != -1);
-        ANGLE_TRY(mVertexArray->updateClientAttribs(context, firstVertexOrInvalid,
-                                                    vertexOrIndexCount, instanceCount,
-                                                    indexTypeOrInvalid, indices));
+        // All client attribs & any emulated buffered attribs will be updated
+        ANGLE_TRY(mVertexArray->updateStreamedAttribs(context, firstVertexOrInvalid,
+                                                      vertexOrIndexCount, instanceCount,
+                                                      indexTypeOrInvalid, indices));
+
         mGraphicsDirtyBits.set(DIRTY_BIT_VERTEX_BUFFERS);
     }
 
diff --git a/src/libANGLE/renderer/vulkan/ContextVk.h b/src/libANGLE/renderer/vulkan/ContextVk.h
index e88fece..af50f4d 100644
--- a/src/libANGLE/renderer/vulkan/ContextVk.h
+++ b/src/libANGLE/renderer/vulkan/ContextVk.h
@@ -213,9 +213,10 @@
                                               GLuint relativeOffset)
     {
         invalidateVertexAndIndexBuffers();
-        mGraphicsPipelineDesc->updateVertexInput(&mGraphicsPipelineTransition,
-                                                 static_cast<uint32_t>(attribIndex), stride,
-                                                 divisor, format, relativeOffset);
+        // Set divisor to 1 for attribs with emulated divisor
+        mGraphicsPipelineDesc->updateVertexInput(
+            &mGraphicsPipelineTransition, static_cast<uint32_t>(attribIndex), stride,
+            divisor > mRenderer->getMaxVertexAttribDivisor() ? 1 : divisor, format, relativeOffset);
     }
 
     void invalidateDefaultAttribute(size_t attribIndex);
diff --git a/src/libANGLE/renderer/vulkan/RendererVk.cpp b/src/libANGLE/renderer/vulkan/RendererVk.cpp
index d309953..3a7b7d0 100644
--- a/src/libANGLE/renderer/vulkan/RendererVk.cpp
+++ b/src/libANGLE/renderer/vulkan/RendererVk.cpp
@@ -1056,7 +1056,11 @@
         deviceProperties.pNext = &divisorProperties;
 
         vkGetPhysicalDeviceProperties2KHR(mPhysicalDevice, &deviceProperties);
-        mMaxVertexAttribDivisor = divisorProperties.maxVertexAttribDivisor;
+        // We only store 8 bit divisor in GraphicsPipelineDesc so capping value & we emulate if
+        // exceeded
+        mMaxVertexAttribDivisor =
+            std::min(divisorProperties.maxVertexAttribDivisor,
+                     static_cast<uint32_t>(std::numeric_limits<uint8_t>::max()));
 
         createInfo.pNext = &enabledFeatures;
     }
diff --git a/src/libANGLE/renderer/vulkan/RendererVk.h b/src/libANGLE/renderer/vulkan/RendererVk.h
index 07e11b2..9340d5b 100644
--- a/src/libANGLE/renderer/vulkan/RendererVk.h
+++ b/src/libANGLE/renderer/vulkan/RendererVk.h
@@ -125,6 +125,7 @@
         ASSERT(mFeaturesInitialized);
         return mFeatures;
     }
+    uint32_t getMaxVertexAttribDivisor() const { return mMaxVertexAttribDivisor; }
 
     bool isMockICDEnabled() const { return mEnabledICD == vk::ICD::Mock; }
 
diff --git a/src/libANGLE/renderer/vulkan/VertexArrayVk.cpp b/src/libANGLE/renderer/vulkan/VertexArrayVk.cpp
index 8cf7e19..77684da 100644
--- a/src/libANGLE/renderer/vulkan/VertexArrayVk.cpp
+++ b/src/libANGLE/renderer/vulkan/VertexArrayVk.cpp
@@ -54,17 +54,37 @@
                                size_t bytesToAllocate,
                                size_t destOffset,
                                size_t vertexCount,
-                               size_t stride,
+                               size_t sourceStride,
+                               size_t destStride,
                                VertexCopyFunction vertexLoadFunction,
                                vk::BufferHelper **bufferOut,
-                               VkDeviceSize *bufferOffsetOut)
+                               VkDeviceSize *bufferOffsetOut,
+                               uint32_t replicateCount)
 {
     uint8_t *dst = nullptr;
     ANGLE_TRY(dynamicBuffer->allocate(contextVk, bytesToAllocate, &dst, nullptr, bufferOffsetOut,
                                       nullptr));
     *bufferOut = dynamicBuffer->getCurrentBuffer();
     dst += destOffset;
-    vertexLoadFunction(sourceData, stride, vertexCount, dst);
+    if (replicateCount == 1)
+    {
+        vertexLoadFunction(sourceData, sourceStride, vertexCount, dst);
+    }
+    else
+    {
+        ASSERT(replicateCount > 1);
+        uint32_t sourceRemainingCount = replicateCount - 1;
+        for (size_t dataCopied = 0; dataCopied < bytesToAllocate;
+             dataCopied += destStride, dst += destStride, sourceRemainingCount--)
+        {
+            vertexLoadFunction(sourceData, sourceStride, 1, dst);
+            if (sourceRemainingCount == 0)
+            {
+                sourceData += sourceStride;
+                sourceRemainingCount = replicateCount;
+            }
+        }
+    }
 
     ANGLE_TRY(dynamicBuffer->flush(contextVk));
     return angle::Result::Continue;
@@ -334,9 +354,9 @@
     srcBytes += binding.getOffset() + relativeOffset;
     ASSERT(GetVertexInputAlignment(vertexFormat) <= vk::kVertexBufferAlignment);
     ANGLE_TRY(StreamVertexData(contextVk, &conversion->data, srcBytes, numVertices * dstFormatSize,
-                               0, numVertices, binding.getStride(), vertexFormat.vertexLoadFunction,
-                               &mCurrentArrayBuffers[attribIndex],
-                               &conversion->lastAllocationOffset));
+                               0, numVertices, binding.getStride(), srcFormatSize,
+                               vertexFormat.vertexLoadFunction, &mCurrentArrayBuffers[attribIndex],
+                               &conversion->lastAllocationOffset, 1));
     srcBuffer->unmapImpl(contextVk);
 
     ASSERT(conversion->dirty);
@@ -460,6 +480,11 @@
         GLuint stride;
         bool anyVertexBufferConvertedOnGpu = false;
         gl::Buffer *bufferGL               = binding.getBuffer().get();
+        // Emulated and/or client-side attribs will be streamed
+        bool isStreamingVertexAttrib =
+            (binding.getDivisor() > renderer->getMaxVertexAttribDivisor()) || (bufferGL == nullptr);
+        mStreamingVertexAttribsMask.set(attribIndex, isStreamingVertexAttrib);
+
         if (bufferGL)
         {
             BufferVk *bufferVk               = vk::GetImpl(bufferGL);
@@ -563,17 +588,24 @@
     return angle::Result::Continue;
 }
 
-angle::Result VertexArrayVk::updateClientAttribs(const gl::Context *context,
-                                                 GLint firstVertex,
-                                                 GLsizei vertexOrIndexCount,
-                                                 GLsizei instanceCount,
-                                                 gl::DrawElementsType indexTypeOrInvalid,
-                                                 const void *indices)
+// Handle copying client attribs and/or expanding attrib buffer in case where attribute
+//  divisor value has to be emulated.
+angle::Result VertexArrayVk::updateStreamedAttribs(const gl::Context *context,
+                                                   GLint firstVertex,
+                                                   GLsizei vertexOrIndexCount,
+                                                   GLsizei instanceCount,
+                                                   gl::DrawElementsType indexTypeOrInvalid,
+                                                   const void *indices)
 {
     ContextVk *contextVk                    = vk::GetImpl(context);
-    const gl::AttributesMask &clientAttribs = context->getStateCache().getActiveClientAttribsMask();
+    const gl::AttributesMask activeAttribs =
+        context->getStateCache().getActiveClientAttribsMask() |
+        context->getStateCache().getActiveBufferedAttribsMask();
+    const gl::AttributesMask activeStreamedAttribs = mStreamingVertexAttribsMask & activeAttribs;
 
-    ASSERT(clientAttribs.any());
+    // Early return for corner case where emulated buffered attribs are not active
+    if (!activeStreamedAttribs.any())
+        return angle::Result::Continue;
 
     GLint startVertex;
     size_t vertexCount;
@@ -586,13 +618,13 @@
     const auto &attribs  = mState.getVertexAttributes();
     const auto &bindings = mState.getVertexBindings();
 
-    // TODO(fjhenigman): When we have a bunch of interleaved attributes, they end up
+    // TODO: When we have a bunch of interleaved attributes, they end up
     // un-interleaved, wasting space and copying time.  Consider improving on that.
-    for (size_t attribIndex : clientAttribs)
+    for (size_t attribIndex : activeStreamedAttribs)
     {
         const gl::VertexAttribute &attrib = attribs[attribIndex];
-        const gl::VertexBinding &binding  = bindings[attrib.bindingIndex];
-        ASSERT(attrib.enabled && binding.getBuffer().get() == nullptr);
+        ASSERT(attrib.enabled);
+        const gl::VertexBinding &binding = bindings[attrib.bindingIndex];
 
         const vk::Format &vertexFormat = renderer->getFormat(attrib.format->id);
         GLuint stride                  = vertexFormat.bufferFormat().pixelBytes;
@@ -600,19 +632,51 @@
         ASSERT(GetVertexInputAlignment(vertexFormat) <= vk::kVertexBufferAlignment);
 
         const uint8_t *src = static_cast<const uint8_t *>(attrib.pointer);
-        if (binding.getDivisor() > 0)
+        const uint32_t divisor = binding.getDivisor();
+        if (divisor > 0)
         {
-            // instanced attrib
-            size_t count           = UnsignedCeilDivide(instanceCount, binding.getDivisor());
-            size_t bytesToAllocate = count * stride;
+            // Instanced attrib
+            if (divisor > renderer->getMaxVertexAttribDivisor())
+            {
+                // Emulated attrib
+                BufferVk *bufferVk = nullptr;
+                if (binding.getBuffer().get() != nullptr)
+                {
+                    // Map buffer to expand attribs for divisor emulation
+                    bufferVk      = vk::GetImpl(binding.getBuffer().get());
+                    void *buffSrc = nullptr;
+                    ANGLE_TRY(bufferVk->mapImpl(contextVk, &buffSrc));
+                    src = reinterpret_cast<const uint8_t *>(buffSrc);
+                }
+                // Divisor will be set to 1 & so update buffer to have 1 attrib per instance
+                size_t bytesToAllocate = instanceCount * stride;
 
-            ANGLE_TRY(StreamVertexData(contextVk, &mDynamicVertexData, src, bytesToAllocate, 0,
-                                       count, binding.getStride(), vertexFormat.vertexLoadFunction,
-                                       &mCurrentArrayBuffers[attribIndex],
-                                       &mCurrentArrayBufferOffsets[attribIndex]));
+                ANGLE_TRY(StreamVertexData(contextVk, &mDynamicVertexData, src, bytesToAllocate, 0,
+                                           instanceCount, binding.getStride(), stride,
+                                           vertexFormat.vertexLoadFunction,
+                                           &mCurrentArrayBuffers[attribIndex],
+                                           &mCurrentArrayBufferOffsets[attribIndex], divisor));
+                if (bufferVk)
+                {
+                    bufferVk->unmapImpl(contextVk);
+                }
+            }
+            else
+            {
+                ASSERT(binding.getBuffer().get() == nullptr);
+                size_t count           = UnsignedCeilDivide(instanceCount, divisor);
+                size_t bytesToAllocate = count * stride;
+
+                ANGLE_TRY(StreamVertexData(contextVk, &mDynamicVertexData, src, bytesToAllocate, 0,
+                                           count, binding.getStride(), stride,
+                                           vertexFormat.vertexLoadFunction,
+                                           &mCurrentArrayBuffers[attribIndex],
+                                           &mCurrentArrayBufferOffsets[attribIndex], 1));
+            }
         }
         else
         {
+            ASSERT(binding.getBuffer().get() == nullptr);
             // Allocate space for startVertex + vertexCount so indexing will work.  If we don't
             // start at zero all the indices will be off.
             // Only vertexCount vertices will be used by the upcoming draw so that is all we copy.
@@ -622,8 +686,8 @@
 
             ANGLE_TRY(StreamVertexData(
                 contextVk, &mDynamicVertexData, src, bytesToAllocate, destOffset, vertexCount,
-                binding.getStride(), vertexFormat.vertexLoadFunction,
-                &mCurrentArrayBuffers[attribIndex], &mCurrentArrayBufferOffsets[attribIndex]));
+                binding.getStride(), stride, vertexFormat.vertexLoadFunction,
+                &mCurrentArrayBuffers[attribIndex], &mCurrentArrayBufferOffsets[attribIndex], 1));
         }
 
         mCurrentArrayBufferHandles[attribIndex] =
diff --git a/src/libANGLE/renderer/vulkan/VertexArrayVk.h b/src/libANGLE/renderer/vulkan/VertexArrayVk.h
index 6c26184..a6338ac 100644
--- a/src/libANGLE/renderer/vulkan/VertexArrayVk.h
+++ b/src/libANGLE/renderer/vulkan/VertexArrayVk.h
@@ -37,12 +37,12 @@
                              VkBuffer bufferHandle,
                              uint32_t offset);
 
-    angle::Result updateClientAttribs(const gl::Context *context,
-                                      GLint firstVertex,
-                                      GLsizei vertexOrIndexCount,
-                                      GLsizei instanceCount,
-                                      gl::DrawElementsType indexTypeOrInvalid,
-                                      const void *indices);
+    angle::Result updateStreamedAttribs(const gl::Context *context,
+                                        GLint firstVertex,
+                                        GLsizei vertexOrIndexCount,
+                                        GLsizei instanceCount,
+                                        gl::DrawElementsType indexTypeOrInvalid,
+                                        const void *indices);
 
     angle::Result handleLineLoop(ContextVk *contextVk,
                                  GLint firstVertex,
@@ -92,6 +92,11 @@
                                         size_t indexCount,
                                         const void *sourcePointer);
 
+    const gl::AttributesMask &getStreamingVertexAttribsMask() const
+    {
+        return mStreamingVertexAttribsMask;
+    }
+
   private:
     void setDefaultPackedInput(ContextVk *contextVk, size_t attribIndex);
 
@@ -133,6 +138,9 @@
 
     // Vulkan does not allow binding a null vertex buffer. We use a dummy as a placeholder.
     vk::BufferHelper mTheNullBuffer;
+
+    // Track client and/or emulated attribs that we have to stream their buffer contents
+    gl::AttributesMask mStreamingVertexAttribsMask;
 };
 }  // namespace rx
 
diff --git a/src/libANGLE/renderer/vulkan/vk_cache_utils.cpp b/src/libANGLE/renderer/vulkan/vk_cache_utils.cpp
index e52cb36..ca60902 100644
--- a/src/libANGLE/renderer/vulkan/vk_cache_utils.cpp
+++ b/src/libANGLE/renderer/vulkan/vk_cache_utils.cpp
@@ -16,6 +16,7 @@
 #include "libANGLE/renderer/vulkan/FramebufferVk.h"
 #include "libANGLE/renderer/vulkan/ProgramVk.h"
 #include "libANGLE/renderer/vulkan/RendererVk.h"
+#include "libANGLE/renderer/vulkan/VertexArrayVk.h"
 #include "libANGLE/renderer/vulkan/vk_format_utils.h"
 #include "libANGLE/renderer/vulkan/vk_helpers.h"
 
@@ -673,7 +674,6 @@
     VkPipelineVertexInputDivisorStateCreateInfoEXT divisorState = {};
     divisorState.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_DIVISOR_STATE_CREATE_INFO_EXT;
     divisorState.pVertexBindingDivisors = divisorDesc.data();
-
     for (size_t attribIndexSizeT : activeAttribLocationsMask)
     {
         const uint32_t attribIndex = static_cast<uint32_t>(attribIndexSizeT);
@@ -885,10 +885,6 @@
 {
     vk::PackedAttribDesc &packedAttrib = mVertexInputAttribs.attribs[attribIndex];
 
-    // TODO: Handle the case where the divisor overflows the field that holds it.
-    // http://anglebug.com/2672
-    ASSERT(divisor <= std::numeric_limits<decltype(packedAttrib.divisor)>::max());
-
     SetBitField(packedAttrib.stride, stride);
     SetBitField(packedAttrib.divisor, divisor);
 
diff --git a/src/libANGLE/renderer/vulkan/vk_cache_utils.h b/src/libANGLE/renderer/vulkan/vk_cache_utils.h
index b759e0d..1d20ed7 100644
--- a/src/libANGLE/renderer/vulkan/vk_cache_utils.h
+++ b/src/libANGLE/renderer/vulkan/vk_cache_utils.h
@@ -38,7 +38,7 @@
 // fewer bits. For example, boolean values could be represented by a single bit instead
 // of a uint8_t. However at the current time there are concerns about the portability
 // of bitfield operators, and complexity issues with using bit mask operations. This is
-// something likely we will want to investigate as the Vulkan implementation progresses.
+// something we will likely want to investigate as the Vulkan implementation progresses.
 //
 // Second implementation note: the struct packing is also a bit fragile, and some of the
 // packing requirements depend on using alignas and field ordering to get the result of
@@ -164,8 +164,6 @@
 struct PackedAttribDesc final
 {
     uint8_t format;
-
-    // TODO(http://anglebug.com/2672): Emulate divisors greater than UBYTE_MAX.
     uint8_t divisor;
 
     // Can only take 11 bits on NV.
@@ -315,9 +313,9 @@
 static_assert(kPackedInputAssemblyAndColorBlendStateSize == 56, "Size check failed");
 
 constexpr size_t kGraphicsPipelineDescSumOfSizes =
-    kVertexInputAttributesSize + kPackedInputAssemblyAndColorBlendStateSize +
-    kPackedRasterizationAndMultisampleStateSize + kPackedDepthStencilStateSize +
-    kRenderPassDescSize + sizeof(VkViewport) + sizeof(VkRect2D);
+    kVertexInputAttributesSize + kRenderPassDescSize + kPackedRasterizationAndMultisampleStateSize +
+    kPackedDepthStencilStateSize + kPackedInputAssemblyAndColorBlendStateSize + sizeof(VkViewport) +
+    sizeof(VkRect2D);
 
 // Number of dirty bits in the dirty bit set.
 constexpr size_t kGraphicsPipelineDirtyBitBytes = 4;
diff --git a/src/libANGLE/renderer/vulkan/vk_caps_utils.cpp b/src/libANGLE/renderer/vulkan/vk_caps_utils.cpp
index 76815a5..0470ffe 100644
--- a/src/libANGLE/renderer/vulkan/vk_caps_utils.cpp
+++ b/src/libANGLE/renderer/vulkan/vk_caps_utils.cpp
@@ -82,10 +82,9 @@
 
     mNativeExtensions.vertexHalfFloat = true;
 
-    // TODO: Enable this always and emulate instanced draws if any divisor exceeds the maximum
-    // supported.  http://anglebug.com/2672
-    mNativeExtensions.instancedArraysANGLE = mMaxVertexAttribDivisor > 1;
-    mNativeExtensions.instancedArraysEXT   = mMaxVertexAttribDivisor > 1;
+    // Enabled in HW if VK_EXT_vertex_attribute_divisor available, otherwise emulated
+    mNativeExtensions.instancedArraysANGLE = true;
+    mNativeExtensions.instancedArraysEXT   = true;
 
     // Only expose robust buffer access if the physical device supports it.
     mNativeExtensions.robustBufferAccessBehavior =
diff --git a/src/tests/deqp_support/deqp_gles3_test_expectations.txt b/src/tests/deqp_support/deqp_gles3_test_expectations.txt
index 5832ece..b66e9ec 100644
--- a/src/tests/deqp_support/deqp_gles3_test_expectations.txt
+++ b/src/tests/deqp_support/deqp_gles3_test_expectations.txt
@@ -555,16 +555,6 @@
 3219 VULKAN : dEQP-GLES3.functional.negative_api.shader.link_program = FAIL
 3219 VULKAN : dEQP-GLES3.functional.negative_api.shader.use_program = FAIL
 
-2672 VULKAN : dEQP-GLES3.functional.instanced.draw_elements_instanced.attribute_divisor.2*_instances = FAIL
-2672 VULKAN : dEQP-GLES3.functional.instanced.draw_elements_instanced.attribute_divisor.4_instances = FAIL
-2672 VULKAN : dEQP-GLES3.functional.instanced.draw_elements_instanced.mixed.2*_instances = FAIL
-2672 VULKAN : dEQP-GLES3.functional.instanced.draw_elements_instanced.mixed.4_instances = FAIL
-2672 VULKAN : dEQP-GLES3.functional.instanced.draw_arrays_instanced.attribute_divisor.2*_instances = FAIL
-2672 VULKAN : dEQP-GLES3.functional.instanced.draw_arrays_instanced.attribute_divisor.4_instances = FAIL
-2672 VULKAN : dEQP-GLES3.functional.instanced.draw_arrays_instanced.mixed.2*_instances = FAIL
-2672 VULKAN : dEQP-GLES3.functional.instanced.draw_arrays_instanced.mixed.4_instances = FAIL
-2672 VULKAN : dEQP-GLES3.functional.instanced.types* = FAIL
-
 // Polygon offset:
 3678 VULKAN : dEQP-GLES3.functional.polygon_offset.float32_result_depth_clamp = FAIL
 3678 VULKAN : dEQP-GLES3.functional.polygon_offset.float32_factor_1_slope = FAIL
diff --git a/src/tests/gl_tests/InstancingTest.cpp b/src/tests/gl_tests/InstancingTest.cpp
index e65a6f4..8b2d439 100644
--- a/src/tests/gl_tests/InstancingTest.cpp
+++ b/src/tests/gl_tests/InstancingTest.cpp
@@ -588,7 +588,7 @@
         << "Vertex attrib divisor read was not the same that was passed in.";
 }
 
-ANGLE_INSTANTIATE_TEST(InstancingTestES3, ES3_OPENGL(), ES3_OPENGLES(), ES3_D3D11());
+ANGLE_INSTANTIATE_TEST(InstancingTestES3, ES3_OPENGL(), ES3_OPENGLES(), ES3_D3D11(), ES3_VULKAN());
 
 ANGLE_INSTANTIATE_TEST(InstancingTestES31, ES31_OPENGL(), ES31_OPENGLES(), ES31_D3D11());