Vulkan: Remove unnecessary same-layout transitions

Previously, only read-only layouts skipped same-layout transition.  This
is extended to color attachment and depth/stencil attachment layouts as
well.  On Nvidia+Linux, this does reduce the number of barriers in the
Draw* benchmarks, though there's no visible CPU-side performance
difference.  GPU time is not measured in these tests, but it's possible
that the driver already skips such transitions (i.e. this only saves us
from recording an ineffective transition).

Additionally, transfer dst and shader write same-layout transitions are
turned into an execution barrier instead of a memory barrier to avoid an
unnecessary flush.  Currently, this particularly affects texture upload,
and shows a 10% improvement in TextureUploadSubImageBenchmark.  Note
that this optimization is temporarily disabled due to a bug in the
windows AMD driver.

Bug: angleproject:3347
Change-Id: I7dc9d0b5dd2ad87ec19ae13277b330438038519f
Reviewed-on: https://chromium-review.googlesource.com/c/angle/angle/+/1659149
Reviewed-by: Tobin Ehlis <tobine@google.com>
Reviewed-by: Jamie Madill <jmadill@chromium.org>
Commit-Queue: Shahbaz Youssefi <syoussefi@chromium.org>
diff --git a/src/libANGLE/renderer/vulkan/SecondaryCommandBuffer.cpp b/src/libANGLE/renderer/vulkan/SecondaryCommandBuffer.cpp
index 5a559f6..1092fb7 100644
--- a/src/libANGLE/renderer/vulkan/SecondaryCommandBuffer.cpp
+++ b/src/libANGLE/renderer/vulkan/SecondaryCommandBuffer.cpp
@@ -204,6 +204,14 @@
                     vkCmdEndQuery(cmdBuffer, params->queryPool, params->query);
                     break;
                 }
+                case CommandID::ExecutionBarrier:
+                {
+                    const ExecutionBarrierParams *params =
+                        getParamPtr<ExecutionBarrierParams>(currentCommand);
+                    vkCmdPipelineBarrier(cmdBuffer, params->stageMask, params->stageMask, 0, 0,
+                                         nullptr, 0, nullptr, 0, nullptr);
+                    break;
+                }
                 case CommandID::FillBuffer:
                 {
                     const FillBufferParams *params = getParamPtr<FillBufferParams>(currentCommand);
@@ -395,6 +403,9 @@
                 case CommandID::EndQuery:
                     result += "EndQuery";
                     break;
+                case CommandID::ExecutionBarrier:
+                    result += "ExecutionBarrier";
+                    break;
                 case CommandID::FillBuffer:
                     result += "FillBuffer";
                     break;
diff --git a/src/libANGLE/renderer/vulkan/SecondaryCommandBuffer.h b/src/libANGLE/renderer/vulkan/SecondaryCommandBuffer.h
index 9a02ad3..60726e2 100644
--- a/src/libANGLE/renderer/vulkan/SecondaryCommandBuffer.h
+++ b/src/libANGLE/renderer/vulkan/SecondaryCommandBuffer.h
@@ -50,6 +50,7 @@
     DrawIndexedInstanced,
     DrawInstanced,
     EndQuery,
+    ExecutionBarrier,
     FillBuffer,
     ImageBarrier,
     MemoryBarrier,
@@ -249,6 +250,12 @@
 };
 VERIFY_4_BYTE_ALIGNMENT(PipelineBarrierParams)
 
+struct ExecutionBarrierParams
+{
+    VkPipelineStageFlags stageMask;
+};
+VERIFY_4_BYTE_ALIGNMENT(ExecutionBarrierParams)
+
 struct ImageBarrierParams
 {
     VkPipelineStageFlags srcStageMask;
@@ -439,6 +446,8 @@
 
     void endQuery(VkQueryPool queryPool, uint32_t query);
 
+    void executionBarrier(VkPipelineStageFlags stageMask);
+
     void fillBuffer(const Buffer &dstBuffer,
                     VkDeviceSize dstOffset,
                     VkDeviceSize size,
@@ -880,6 +889,13 @@
     paramStruct->query          = query;
 }
 
+ANGLE_INLINE void SecondaryCommandBuffer::executionBarrier(VkPipelineStageFlags stageMask)
+{
+    ExecutionBarrierParams *paramStruct =
+        initCommand<ExecutionBarrierParams>(CommandID::ExecutionBarrier);
+    paramStruct->stageMask = stageMask;
+}
+
 ANGLE_INLINE void SecondaryCommandBuffer::fillBuffer(const Buffer &dstBuffer,
                                                      VkDeviceSize dstOffset,
                                                      VkDeviceSize size,
diff --git a/src/libANGLE/renderer/vulkan/vk_helpers.cpp b/src/libANGLE/renderer/vulkan/vk_helpers.cpp
index 8e4d831..12d5374 100644
--- a/src/libANGLE/renderer/vulkan/vk_helpers.cpp
+++ b/src/libANGLE/renderer/vulkan/vk_helpers.cpp
@@ -57,9 +57,14 @@
     VkAccessFlags srcAccessMask;
 
     // If access is read-only, the execution barrier can be skipped altogether if retransitioning to
-    // the same layout. This is because read-after-read does not need an execution or memory
-    // barrier.
-    bool isReadOnlyAccess;
+    // the same layout.  This is because read-after-read does not need an execution or memory
+    // barrier.  Vulkan additionally guarantees color attachment and depth/stencil attachment
+    // read/writes to be in execution order, so they too won't need a barrier if transitioning to
+    // the same layout.
+    //
+    // Otherwise, same-layout transitions only require an execution barrier (and not a memory
+    // barrier).
+    bool sameLayoutTransitionRequiresBarrier;
 };
 
 // clang-format off
@@ -74,7 +79,7 @@
             0,
             // Transition from: there's no data in the image to care about.
             0,
-            true,
+            false,
         },
     },
     {
@@ -100,7 +105,7 @@
             VK_ACCESS_TRANSFER_READ_BIT,
             // Transition from: RAR and WAR don't need memory barrier.
             0,
-            true,
+            false,
         },
     },
     {
@@ -113,7 +118,7 @@
             VK_ACCESS_TRANSFER_WRITE_BIT,
             // Transition from: all writes must finish before barrier.
             VK_ACCESS_TRANSFER_WRITE_BIT,
-            false,
+            true,
         },
     },
     {
@@ -126,7 +131,7 @@
             VK_ACCESS_SHADER_READ_BIT,
             // Transition from: RAR and WAR don't need memory barrier.
             0,
-            true,
+            false,
         },
     },
     {
@@ -139,7 +144,7 @@
             VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
             // Transition from: all writes must finish before barrier.
             VK_ACCESS_SHADER_WRITE_BIT,
-            false,
+            true,
         },
     },
     {
@@ -152,7 +157,7 @@
             VK_ACCESS_SHADER_READ_BIT,
             // Transition from: RAR and WAR don't need memory barrier.
             0,
-            true,
+            false,
         },
     },
     {
@@ -196,7 +201,7 @@
             0,
             // Transition from: RAR and WAR don't need memory barrier.
             0,
-            true,
+            false,
         },
     },
 };
@@ -1662,10 +1667,14 @@
 {
     const ImageMemoryBarrierData &layoutData = kImageMemoryBarrierData[mCurrentLayout];
 
-    // If transitioning to the same read-only layout (RAR), don't generate a barrier.
-    bool sameLayoutReadAfterRead = mCurrentLayout == newLayout && layoutData.isReadOnlyAccess;
+    // If transitioning to the same layout, we rarely need a barrier.  RAR (read-after-read)
+    // doesn't need a barrier, and WAW (write-after-write) is guaranteed to not require a barrier
+    // for color attachment and depth/stencil attachment writes.  Transfer dst and shader writes
+    // are basically the only cases where an execution barrier is still necessary.
+    bool sameLayoutAndNoNeedForBarrier =
+        mCurrentLayout == newLayout && !layoutData.sameLayoutTransitionRequiresBarrier;
 
-    return !sameLayoutReadAfterRead;
+    return !sameLayoutAndNoNeedForBarrier;
 }
 
 void ImageHelper::changeLayout(VkImageAspectFlags aspectMask,
@@ -1694,6 +1703,24 @@
                                             uint32_t newQueueFamilyIndex,
                                             vk::CommandBuffer *commandBuffer)
 {
+    // If transitioning to the same layout (and there is no queue transfer), an execution barrier
+    // suffices.
+    //
+    // TODO(syoussefi): AMD driver on windows has a bug where an execution barrier is not sufficient
+    // between transfer dst operations (even if the transfer is not to the same subresource!).  A
+    // workaround may be necessary.  http://anglebug.com/3554
+    if (mCurrentLayout == newLayout && mCurrentQueueFamilyIndex == newQueueFamilyIndex &&
+        mCurrentLayout != ImageLayout::TransferDst)
+    {
+        const ImageMemoryBarrierData &transition = kImageMemoryBarrierData[mCurrentLayout];
+
+        // In this case, the image is going to be used in the same way, so the src and dst stage
+        // masks must be necessarily equal.
+        ASSERT(transition.srcStageMask == transition.dstStageMask);
+
+        commandBuffer->executionBarrier(transition.dstStageMask);
+        return;
+    }
 
     const ImageMemoryBarrierData &transitionFrom = kImageMemoryBarrierData[mCurrentLayout];
     const ImageMemoryBarrierData &transitionTo   = kImageMemoryBarrierData[newLayout];
diff --git a/src/libANGLE/renderer/vulkan/vk_wrapper.h b/src/libANGLE/renderer/vulkan/vk_wrapper.h
index fb506dd..1872680 100644
--- a/src/libANGLE/renderer/vulkan/vk_wrapper.h
+++ b/src/libANGLE/renderer/vulkan/vk_wrapper.h
@@ -287,6 +287,8 @@
     void endRenderPass();
     void executeCommands(uint32_t commandBufferCount, const CommandBuffer *commandBuffers);
 
+    void executionBarrier(VkPipelineStageFlags stageMask);
+
     void fillBuffer(const Buffer &dstBuffer,
                     VkDeviceSize dstOffset,
                     VkDeviceSize size,
@@ -649,6 +651,12 @@
                          imageMemoryBarrierCount, imageMemoryBarriers);
 }
 
+ANGLE_INLINE void CommandBuffer::executionBarrier(VkPipelineStageFlags stageMask)
+{
+    ASSERT(valid());
+    vkCmdPipelineBarrier(mHandle, stageMask, stageMask, 0, 0, nullptr, 0, nullptr, 0, nullptr);
+}
+
 ANGLE_INLINE void CommandBuffer::imageBarrier(VkPipelineStageFlags srcStageMask,
                                               VkPipelineStageFlags dstStageMask,
                                               const VkImageMemoryBarrier *imageMemoryBarrier)