Metal: Optimized BufferSubData per device

Adds a staging buffer path which means there are 4 paths
for bufferSubData.

1. direct copy

   * get a pointer to the buffer
   * copy the new data to the buffer
   * if the buffer is managed, tell metal which part was updated

2. use a shadow copy

   * copy the data to a shadow copy
   * copy the entire shadow to a new buffer
   * start using the new buffer

3. use a new buffer

   * get a new (or currently unused) buffer
   * put the new data in the new buffer
   * blit any unchanged data from the old buffer to the new buffer
   * start using the new buffer

4. use a staging buffer

   * get a staging buffer
   * put the new data in the staging buffer
   * blit from the staging buffer to the existing buffer.

Further, there are 3 types of memory storage modes:
Managed, Shared, Private.

Based on the GPU type, different storage modes and different
paths are more performant in different situations.

So, add feature flags to select paths by GPU.

Bug: angleproject:7544
Change-Id: I741dd1874201043416374194bd2001ded8dbd9b4
Reviewed-on: https://chromium-review.googlesource.com/c/angle/angle/+/3842641
Reviewed-by: Kyle Piddington <kpiddington@apple.com>
Reviewed-by: Kenneth Russell <kbr@chromium.org>
Reviewed-by: Quyen Le <lehoangquyen@chromium.org>
Commit-Queue: Gregg Tavares <gman@chromium.org>
diff --git a/include/platform/FeaturesMtl_autogen.h b/include/platform/FeaturesMtl_autogen.h
index 116a0ee..fc93237 100644
--- a/include/platform/FeaturesMtl_autogen.h
+++ b/include/platform/FeaturesMtl_autogen.h
@@ -232,6 +232,31 @@
         "uploadDataToIosurfacesWithStagingBuffers", FeatureCategory::MetalWorkarounds,
         "When uploading data to IOSurface-backed textures, use a staging buffer.", &members,
         "http://anglebug.com/7573"};
+
+    FeatureInfo alwaysUseStagedBufferUpdates = {
+        "alwaysUseStagedBufferUpdates", FeatureCategory::MetalFeatures,
+        "Always update buffers by copying the data to a staging buffer and then blitting it to the "
+        "actual buffer",
+        &members, "http://anglebug.com/7544"};
+
+    FeatureInfo useShadowBuffersWhenAppropriate = {
+        "useShadowBuffersWhenAppropriate", FeatureCategory::MetalFeatures,
+        "On some architectures using a shadow buffer can be faster for certain size buffers",
+        &members, "http://anglebug.com/7544"};
+
+    FeatureInfo alwaysUseManagedStorageModeForBuffers = {
+        "alwaysUseManagedStorageModeForBuffers", FeatureCategory::MetalFeatures,
+        "Metal buffers can be managed, shared, or private. Sometimes managed is fastest", &members,
+        "http://anglebug.com/7544"};
+
+    FeatureInfo alwaysUseSharedStorageModeForBuffers = {
+        "alwaysUseSharedStorageModeForBuffers", FeatureCategory::MetalFeatures,
+        "Metal buffers can be managed, shared, or private. Sometimes shared is fastest", &members,
+        "http://anglebug.com/7544"};
+
+    FeatureInfo preferCpuForBuffersubdata = {
+        "preferCpuForBuffersubdata", FeatureCategory::MetalFeatures,
+        "Makes bufferSubData always update via CPU", &members, "http://anglebug.com/7544"};
 };
 
 inline FeaturesMtl::FeaturesMtl()  = default;
diff --git a/include/platform/mtl_features.json b/include/platform/mtl_features.json
index ccd383e..c928f8c 100644
--- a/include/platform/mtl_features.json
+++ b/include/platform/mtl_features.json
@@ -243,6 +243,46 @@
                 "When uploading data to IOSurface-backed textures, use a staging buffer."
             ],
             "issue": "http://anglebug.com/7573"
+        },
+        {
+            "name": "always_use_staged_buffer_updates",
+            "category": "Features",
+            "description": [
+                "Always update buffers by copying the data to a staging buffer and then blitting it to the actual buffer"
+            ],
+            "issue": "http://anglebug.com/7544"
+        },
+        {
+            "name": "use_shadow_buffers_when_appropriate",
+            "category": "Features",
+            "description": [
+                "On some architectures using a shadow buffer can be faster for certain size buffers"
+            ],
+            "issue": "http://anglebug.com/7544"
+        },
+        {
+            "name": "always_use_managed_storage_mode_for_buffers",
+            "category": "Features",
+            "description": [
+                "Metal buffers can be managed, shared, or private. Sometimes managed is fastest"
+            ],
+            "issue": "http://anglebug.com/7544"
+        },
+        {
+            "name": "always_use_shared_storage_mode_for_buffers",
+            "category": "Features",
+            "description": [
+                "Metal buffers can be managed, shared, or private. Sometimes shared is fastest"
+            ],
+            "issue": "http://anglebug.com/7544"
+        },
+        {
+            "name": "prefer_cpu_for_buffersubdata",
+            "category": "Features",
+            "description": [
+                "Makes bufferSubData always update via CPU"
+            ],
+            "issue": "http://anglebug.com/7544"
         }
     ]
 }
diff --git a/scripts/code_generation_hashes/ANGLE_features.json b/scripts/code_generation_hashes/ANGLE_features.json
index 66d00a0..5da2bea 100644
--- a/scripts/code_generation_hashes/ANGLE_features.json
+++ b/scripts/code_generation_hashes/ANGLE_features.json
@@ -4,7 +4,7 @@
   "include/platform/FeaturesGL_autogen.h":
     "c192145f3939d4d0bf85a39649e0c14e",
   "include/platform/FeaturesMtl_autogen.h":
-    "80c0f3379882d1f67e523a3a1530cd79",
+    "c31c1c77040ef119dfaf882b5b5e65ab",
   "include/platform/FeaturesVk_autogen.h":
     "03f5b51f08b6cb4f831764aa4848f399",
   "include/platform/FrontendFeatures_autogen.h":
@@ -18,11 +18,11 @@
   "include/platform/gl_features.json":
     "a50e9bd2fa9eb0685d9b1c118a21ad2c",
   "include/platform/mtl_features.json":
-    "9833c17145ba2223da2e607a9340afda",
+    "408385ed8fa29652e23a6338faec6d2f",
   "include/platform/vk_features.json":
     "a0dd571e23e0bd521eb42d72a0863297",
   "util/angle_features_autogen.cpp":
-    "b6a2d2cac7d30b6c08d9398fed38a14c",
+    "d43086098956bfd4374284a05cfb884d",
   "util/angle_features_autogen.h":
-    "1e4b7c6e89ee370d052fa7f0c48c11c6"
+    "80421f1223abdee293434a2c7f8ff3bc"
 }
\ No newline at end of file
diff --git a/src/libANGLE/renderer/metal/BUILD.gn b/src/libANGLE/renderer/metal/BUILD.gn
index 9f3aa91..5705faf 100644
--- a/src/libANGLE/renderer/metal/BUILD.gn
+++ b/src/libANGLE/renderer/metal/BUILD.gn
@@ -51,6 +51,8 @@
   "TransformFeedbackMtl.mm",
   "VertexArrayMtl.h",
   "VertexArrayMtl.mm",
+  "mtl_buffer_manager.h",
+  "mtl_buffer_manager.mm",
   "mtl_buffer_pool.h",
   "mtl_buffer_pool.mm",
   "mtl_command_buffer.h",
diff --git a/src/libANGLE/renderer/metal/BufferMtl.h b/src/libANGLE/renderer/metal/BufferMtl.h
index c20690c..4313088 100644
--- a/src/libANGLE/renderer/metal/BufferMtl.h
+++ b/src/libANGLE/renderer/metal/BufferMtl.h
@@ -151,7 +151,8 @@
                                       size_t count,
                                       std::pair<uint32_t, uint32_t> *outIndices);
 
-    const uint8_t *getClientShadowCopyData(ContextMtl *contextMtl);
+    const uint8_t *getBufferDataReadOnly(ContextMtl *contextMtl);
+    bool isSafeToReadFromBufferViaBlit(ContextMtl *contextMtl);
 
     ConversionBufferMtl *getVertexConversionBuffer(ContextMtl *context,
                                                    angle::FormatID formatID,
@@ -186,27 +187,36 @@
                                  size_t size,
                                  size_t offset);
 
-    angle::Result commitShadowCopy(const gl::Context *context);
-    angle::Result commitShadowCopy(const gl::Context *context, size_t size);
+    angle::Result commitShadowCopy(ContextMtl *contextMtl);
+    angle::Result commitShadowCopy(ContextMtl *contextMtl, size_t size);
 
     void markConversionBuffersDirty();
     void clearConversionBuffers();
 
+    angle::Result putDataInNewBufferAndStartUsingNewBuffer(ContextMtl *contextMtl,
+                                                           const uint8_t *srcPtr,
+                                                           size_t sizeToCopy,
+                                                           size_t offset);
+    angle::Result updateExistingBufferViaBlitFromStagingBuffer(ContextMtl *contextMtl,
+                                                               const uint8_t *srcPtr,
+                                                               size_t sizeToCopy,
+                                                               size_t offset);
+    angle::Result copyDataToExistingBufferViaCPU(ContextMtl *contextMtl,
+                                                 const uint8_t *srcPtr,
+                                                 size_t sizeToCopy,
+                                                 size_t offset);
+    angle::Result updateShadowCopyThenCopyShadowToNewBuffer(ContextMtl *contextMtl,
+                                                            const uint8_t *srcPtr,
+                                                            size_t sizeToCopy,
+                                                            size_t offset);
+
     bool clientShadowCopyDataNeedSync(ContextMtl *contextMtl);
     void ensureShadowCopySyncedFromGPU(ContextMtl *contextMtl);
     uint8_t *syncAndObtainShadowCopy(ContextMtl *contextMtl);
 
-    // Convenient method
-    const uint8_t *getClientShadowCopyData(const gl::Context *context)
-    {
-        return getClientShadowCopyData(mtl::GetImpl(context));
-    }
-    // Client side shadow buffer
+    // Optional client side shadow buffer
     angle::MemoryBuffer mShadowCopy;
 
-    // GPU side buffers pool
-    mtl::BufferPool mBufferPool;
-
     // A cache of converted vertex data.
     std::vector<VertexConversionBufferMtl> mVertexConversionBuffers;
 
@@ -224,6 +234,9 @@
     };
     std::optional<RestartRangeCache> mRestartRangeCache;
     std::vector<IndexRange> mRestartIndices;
+    size_t mGLSize        = 0;  // size GL asked for (vs size we actually allocated)
+    size_t mRevisionCount = 0;  // for generating labels only
+    gl::BufferUsage mUsage;
 };
 
 class SimpleWeakBufferHolderMtl : public BufferHolderMtl
diff --git a/src/libANGLE/renderer/metal/BufferMtl.mm b/src/libANGLE/renderer/metal/BufferMtl.mm
index c8769ef..f72f943 100644
--- a/src/libANGLE/renderer/metal/BufferMtl.mm
+++ b/src/libANGLE/renderer/metal/BufferMtl.mm
@@ -13,6 +13,7 @@
 #include "common/utilities.h"
 #include "libANGLE/renderer/metal/ContextMtl.h"
 #include "libANGLE/renderer/metal/DisplayMtl.h"
+#include "libANGLE/renderer/metal/mtl_buffer_manager.h"
 
 namespace rx
 {
@@ -39,6 +40,12 @@
     return angle::Result::Continue;
 }
 
+bool isOffsetAndSizeMetalBlitCompatible(size_t offset, size_t size)
+{
+    // Metal requires offset and size to be multiples of 4
+    return offset % 4 == 0 && size % 4 == 0;
+}
+
 }  // namespace
 
 // ConversionBufferMtl implementation.
@@ -88,9 +95,7 @@
 {}
 
 // BufferMtl implementation
-BufferMtl::BufferMtl(const gl::BufferState &state)
-    : BufferImpl(state), mBufferPool(/** alwaysAllocNewBuffer */ true)
-{}
+BufferMtl::BufferMtl(const gl::BufferState &state) : BufferImpl(state) {}
 
 BufferMtl::~BufferMtl() {}
 
@@ -98,8 +103,13 @@
 {
     ContextMtl *contextMtl = mtl::GetImpl(context);
     mShadowCopy.clear();
-    mBufferPool.destroy(contextMtl);
-    mBuffer = nullptr;
+
+    // if there's a buffer, give it back to the buffer manager
+    if (mBuffer)
+    {
+        contextMtl->getBufferManager().returnBuffer(contextMtl, mBuffer);
+        mBuffer = nullptr;
+    }
 
     clearConversionBuffers();
 }
@@ -136,19 +146,30 @@
     ContextMtl *contextMtl = mtl::GetImpl(context);
     auto srcMtl            = GetAs<BufferMtl>(source);
 
-    if (srcMtl->clientShadowCopyDataNeedSync(contextMtl) || mBuffer->isBeingUsedByGPU(contextMtl))
-    {
-        // If shadow copy requires a synchronization then use blit command instead.
-        // It might break a pending render pass, but still faster than synchronization with
-        // GPU.
-        mtl::BlitCommandEncoder *blitEncoder = contextMtl->getBlitCommandEncoder();
-        blitEncoder->copyBuffer(srcMtl->getCurrentBuffer(), sourceOffset, mBuffer, destOffset,
-                                size);
+    markConversionBuffersDirty();
 
-        return angle::Result::Continue;
+    if (mShadowCopy.size() > 0)
+    {
+        if (srcMtl->clientShadowCopyDataNeedSync(contextMtl) ||
+            mBuffer->isBeingUsedByGPU(contextMtl))
+        {
+            // If shadow copy requires a synchronization then use blit command instead.
+            // It might break a pending render pass, but still faster than synchronization with
+            // GPU.
+            mtl::BlitCommandEncoder *blitEncoder = contextMtl->getBlitCommandEncoder();
+            blitEncoder->copyBuffer(srcMtl->getCurrentBuffer(), sourceOffset, mBuffer, destOffset,
+                                    size);
+
+            return angle::Result::Continue;
+        }
+        return setSubDataImpl(context, srcMtl->getBufferDataReadOnly(contextMtl) + sourceOffset,
+                              size, destOffset);
     }
-    return setSubDataImpl(context, srcMtl->getClientShadowCopyData(contextMtl) + sourceOffset, size,
-                          destOffset);
+
+    mtl::BlitCommandEncoder *blitEncoder = contextMtl->getBlitCommandEncoder();
+    blitEncoder->copyBuffer(srcMtl->getCurrentBuffer(), sourceOffset, mBuffer, destOffset, size);
+
+    return angle::Result::Continue;
 }
 
 angle::Result BufferMtl::map(const gl::Context *context, GLenum access, void **mapPtr)
@@ -176,7 +197,7 @@
     if (mapPtr)
     {
         ContextMtl *contextMtl = mtl::GetImpl(context);
-        if (mBufferPool.getMaxBuffers() == 1)
+        if (mShadowCopy.size() == 0)
         {
             *mapPtr = mBuffer->mapWithOpt(contextMtl, (access & GL_MAP_WRITE_BIT) == 0,
                                           access & GL_MAP_UNSYNCHRONIZED_BIT) +
@@ -199,7 +220,7 @@
 
     markConversionBuffersDirty();
 
-    if (mBufferPool.getMaxBuffers() == 1)
+    if (mShadowCopy.size() == 0)
     {
         ASSERT(mBuffer);
         if (mState.getAccessFlags() & GL_MAP_WRITE_BIT)
@@ -215,8 +236,6 @@
     }
     else
     {
-        ASSERT(mShadowCopy.size());
-
         if (mState.getAccessFlags() & GL_MAP_UNSYNCHRONIZED_BIT)
         {
             // Copy the mapped region without synchronization with GPU
@@ -228,7 +247,7 @@
         else
         {
             // commit shadow copy data to GPU synchronously
-            ANGLE_TRY(commitShadowCopy(context));
+            ANGLE_TRY(commitShadowCopy(contextMtl));
         }
     }
 
@@ -247,7 +266,7 @@
                                        bool primitiveRestartEnabled,
                                        gl::IndexRange *outRange)
 {
-    const uint8_t *indices = getClientShadowCopyData(mtl::GetImpl(context)) + offset;
+    const uint8_t *indices = getBufferDataReadOnly(mtl::GetImpl(context)) + offset;
 
     *outRange = gl::ComputeIndexRange(type, indices, count, primitiveRestartEnabled);
 
@@ -260,7 +279,7 @@
                                              size_t count,
                                              std::pair<uint32_t, uint32_t> *outIndices)
 {
-    const uint8_t *indices = getClientShadowCopyData(contextMtl) + offset;
+    const uint8_t *indices = getBufferDataReadOnly(contextMtl) + offset;
 
     switch (type)
     {
@@ -283,10 +302,9 @@
     markConversionBuffersDirty();
 }
 
-/* public */
-const uint8_t *BufferMtl::getClientShadowCopyData(ContextMtl *contextMtl)
+const uint8_t *BufferMtl::getBufferDataReadOnly(ContextMtl *contextMtl)
 {
-    if (mBufferPool.getMaxBuffers() == 1)
+    if (mShadowCopy.size() == 0)
     {
         // Don't need shadow copy in this case, use the buffer directly
         return mBuffer->mapReadOnly(contextMtl);
@@ -479,13 +497,45 @@
     return restartIndices;
 }
 
+namespace
+{
+
+bool useSharedMemory(ContextMtl *contextMtl, gl::BufferUsage usage)
+{
+    const angle::FeaturesMtl &features = contextMtl->getDisplay()->getFeatures();
+    if (features.alwaysUseManagedStorageModeForBuffers.enabled)
+    {
+        return false;
+    }
+
+    if (features.alwaysUseSharedStorageModeForBuffers.enabled)
+    {
+        return true;
+    }
+
+    switch (usage)
+    {
+        case gl::BufferUsage::StaticCopy:
+        case gl::BufferUsage::StaticDraw:
+        case gl::BufferUsage::StaticRead:
+        case gl::BufferUsage::DynamicRead:
+        case gl::BufferUsage::StreamRead:
+            return true;
+        default:
+            return false;
+    }
+}
+
+}  // namespace
+
 angle::Result BufferMtl::setDataImpl(const gl::Context *context,
                                      gl::BufferBinding target,
                                      const void *data,
                                      size_t intendedSize,
                                      gl::BufferUsage usage)
 {
-    ContextMtl *contextMtl = mtl::GetImpl(context);
+    ContextMtl *contextMtl             = mtl::GetImpl(context);
+    const angle::FeaturesMtl &features = contextMtl->getDisplay()->getFeatures();
 
     // Invalidate conversion buffers
     if (mState.getSize() != static_cast<GLint64>(intendedSize))
@@ -497,80 +547,32 @@
         markConversionBuffersDirty();
     }
 
+    mUsage              = usage;
+    mGLSize             = intendedSize;
     size_t adjustedSize = std::max<size_t>(1, intendedSize);
 
     // Ensures no validation layer issues in std140 with data types like vec3 being 12 bytes vs 16
     // in MSL.
     if (target == gl::BufferBinding::Uniform)
     {
+        // This doesn't work! A buffer can be allocated on ARRAY_BUFFER and used in UNIFORM_BUFFER
+        // TODO(anglebug.com/7585)
         adjustedSize = roundUpPow2(adjustedSize, (size_t)16);
     }
 
-    size_t maxBuffers;
-    switch (usage)
-    {
-        case gl::BufferUsage::StaticCopy:
-        case gl::BufferUsage::StaticDraw:
-        case gl::BufferUsage::StaticRead:
-        case gl::BufferUsage::DynamicRead:
-        case gl::BufferUsage::StreamRead:
-            maxBuffers = 1;  // static/read buffer doesn't need high speed data update
-            mBufferPool.setAlwaysUseGPUMem();
-            break;
-        default:
-            // dynamic buffer, allow up to 10 update per frame/encoding without
-            // waiting for GPU.
-            if (adjustedSize <= mtl::kSharedMemBufferMaxBufSizeHint)
-            {
-                maxBuffers = 10;
-                mBufferPool.setAlwaysUseSharedMem();
-            }
-            else
-            {
-                maxBuffers = 1;
-                mBufferPool.setAlwaysUseGPUMem();
-            }
-            break;
-    }
-
     // Re-create the buffer
-    mBuffer = nullptr;
-    ANGLE_TRY(mBufferPool.reset(contextMtl, adjustedSize, 1, maxBuffers));
-
-    if (maxBuffers > 1)
+    mtl::BufferManager &bufferManager = contextMtl->getBufferManager();
+    if (mBuffer)
     {
-        // We use shadow copy to maintain consistent data between buffers in pool
-        ANGLE_MTL_CHECK(contextMtl, mShadowCopy.resize(adjustedSize), GL_OUT_OF_MEMORY);
-
-        if (data)
-        {
-            // Transfer data to shadow copy buffer
-            auto ptr = static_cast<const uint8_t *>(data);
-            std::copy(ptr, ptr + intendedSize, mShadowCopy.data());
-
-            // Transfer data from shadow copy buffer to GPU buffer.
-            ANGLE_TRY(commitShadowCopy(context, adjustedSize));
-        }
-        else
-        {
-            // This is needed so that first buffer pointer could be available
-            ANGLE_TRY(commitShadowCopy(context, 0));
-        }
+        // Return the current buffer to the buffer manager
+        // It will not be re-used until it's no longer in use.
+        bufferManager.returnBuffer(contextMtl, mBuffer);
+        mBuffer = nullptr;
     }
-    else
-    {
-        // We don't need shadow copy if there will be only one buffer in the pool.
-        ANGLE_MTL_CHECK(contextMtl, mShadowCopy.resize(0), GL_OUT_OF_MEMORY);
 
-        // Allocate one buffer to use
-        ANGLE_TRY(
-            mBufferPool.allocate(contextMtl, adjustedSize, nullptr, &mBuffer, nullptr, nullptr));
-
-        if (data)
-        {
-            ANGLE_TRY(setSubDataImpl(context, data, intendedSize, 0));
-        }
-    }
+    // Get a new buffer
+    bool useSharedMem = useSharedMemory(contextMtl, usage);
+    ANGLE_TRY(bufferManager.getBuffer(contextMtl, adjustedSize, useSharedMem, mBuffer));
 
 #ifndef NDEBUG
     ANGLE_MTL_OBJC_SCOPE
@@ -579,9 +581,137 @@
     }
 #endif
 
+    // We may use shadow copy to maintain consistent data between buffers in pool
+    size_t shadowSize = (!features.preferCpuForBuffersubdata.enabled &&
+                         features.useShadowBuffersWhenAppropriate.enabled &&
+                         adjustedSize <= mtl::kSharedMemBufferMaxBufSizeHint)
+                            ? adjustedSize
+                            : 0;
+    ANGLE_MTL_CHECK(contextMtl, mShadowCopy.resize(shadowSize), GL_OUT_OF_MEMORY);
+
+    if (data)
+    {
+        ANGLE_TRY(setSubDataImpl(context, data, intendedSize, 0));
+    }
+
     return angle::Result::Continue;
 }
 
+// states:
+//  * The buffer is not in use
+//
+//    safe = true
+//
+//  * The buffer has a pending blit
+//
+//    In this case, as long as we are only reading from it
+//    via blit to a new buffer our blits will happen after existing
+//    blits
+//
+//    safe = true
+//
+//  * The buffer has pending writes in a committed render encoder
+//
+//    In this case we're encoding commands that will happen after
+//    that encoder
+//
+//    safe = true
+//
+//  * The buffer has pending writes in the current render encoder
+//
+//    in this case we have to split/end the render encoder
+//    before we can use the buffer.
+//
+//    safe = false
+bool BufferMtl::isSafeToReadFromBufferViaBlit(ContextMtl *contextMtl)
+{
+    uint64_t serial   = mBuffer->getLastWritingRenderEncoderSerial();
+    bool isSameSerial = contextMtl->isCurrentRenderEncoderSerial(serial);
+    return !isSameSerial;
+}
+
+angle::Result BufferMtl::updateExistingBufferViaBlitFromStagingBuffer(ContextMtl *contextMtl,
+                                                                      const uint8_t *srcPtr,
+                                                                      size_t sizeToCopy,
+                                                                      size_t offset)
+{
+    ASSERT(isOffsetAndSizeMetalBlitCompatible(offset, sizeToCopy));
+
+    mtl::BufferManager &bufferManager = contextMtl->getBufferManager();
+    return bufferManager.queueBlitCopyDataToBuffer(contextMtl, srcPtr, sizeToCopy, offset, mBuffer);
+}
+
+// * get a new or unused buffer
+// * copy the new data to it
+// * copy any old data not overwritten by the new data to the new buffer
+// * start using the new buffer
+angle::Result BufferMtl::putDataInNewBufferAndStartUsingNewBuffer(ContextMtl *contextMtl,
+                                                                  const uint8_t *srcPtr,
+                                                                  size_t sizeToCopy,
+                                                                  size_t offset)
+{
+    ASSERT(isOffsetAndSizeMetalBlitCompatible(offset, sizeToCopy));
+
+    mtl::BufferManager &bufferManager = contextMtl->getBufferManager();
+    mtl::BufferRef oldBuffer          = mBuffer;
+    bool useSharedMem                 = useSharedMemory(contextMtl, mUsage);
+
+    ANGLE_TRY(bufferManager.getBuffer(contextMtl, mGLSize, useSharedMem, mBuffer));
+    mBuffer->get().label = [NSString stringWithFormat:@"BufferMtl=%p(%lu)", this, ++mRevisionCount];
+
+    uint8_t *ptr = mBuffer->mapWithOpt(contextMtl, false, true);
+    std::copy(srcPtr, srcPtr + sizeToCopy, ptr + offset);
+    mBuffer->unmapAndFlushSubset(contextMtl, offset, sizeToCopy);
+
+    if (offset > 0 || offset + sizeToCopy < mGLSize)
+    {
+        mtl::BlitCommandEncoder *blitEncoder =
+            contextMtl->getBlitCommandEncoderWithoutEndingRenderEncoder();
+        if (offset > 0)
+        {
+            // copy old data before updated region
+            blitEncoder->copyBuffer(oldBuffer, 0, mBuffer, 0, offset);
+        }
+        if (offset + sizeToCopy < mGLSize)
+        {
+            // copy old data after updated region
+            const size_t endOffset     = offset + sizeToCopy;
+            const size_t endSizeToCopy = mGLSize - endOffset;
+            blitEncoder->copyBuffer(oldBuffer, endOffset, mBuffer, endOffset, endSizeToCopy);
+        }
+    }
+
+    bufferManager.returnBuffer(contextMtl, oldBuffer);
+    return angle::Result::Continue;
+}
+
+angle::Result BufferMtl::copyDataToExistingBufferViaCPU(ContextMtl *contextMtl,
+                                                        const uint8_t *srcPtr,
+                                                        size_t sizeToCopy,
+                                                        size_t offset)
+{
+    uint8_t *ptr = mBuffer->map(contextMtl);
+    std::copy(srcPtr, srcPtr + sizeToCopy, ptr + offset);
+    mBuffer->unmapAndFlushSubset(contextMtl, offset, sizeToCopy);
+    return angle::Result::Continue;
+}
+
+angle::Result BufferMtl::updateShadowCopyThenCopyShadowToNewBuffer(ContextMtl *contextMtl,
+                                                                   const uint8_t *srcPtr,
+                                                                   size_t sizeToCopy,
+                                                                   size_t offset)
+{
+    // 1. Before copying data from client, we need to synchronize modified data from GPU to
+    // shadow copy first.
+    ensureShadowCopySyncedFromGPU(contextMtl);
+
+    // 2. Copy data from client to shadow copy.
+    std::copy(srcPtr, srcPtr + sizeToCopy, mShadowCopy.data() + offset);
+
+    // 3. Copy data from shadow copy to GPU.
+    return commitShadowCopy(contextMtl);
+}
+
 angle::Result BufferMtl::setSubDataImpl(const gl::Context *context,
                                         const void *data,
                                         size_t size,
@@ -594,68 +724,72 @@
 
     ASSERT(mBuffer);
 
-    ContextMtl *contextMtl = mtl::GetImpl(context);
+    ContextMtl *contextMtl             = mtl::GetImpl(context);
+    const angle::FeaturesMtl &features = contextMtl->getDisplay()->getFeatures();
 
-    ANGLE_MTL_TRY(contextMtl, offset <= mBuffer->size());
+    ANGLE_MTL_TRY(contextMtl, offset <= mGLSize);
 
     auto srcPtr     = static_cast<const uint8_t *>(data);
-    auto sizeToCopy = std::min<size_t>(size, mBuffer->size() - offset);
+    auto sizeToCopy = std::min<size_t>(size, mGLSize - offset);
 
     markConversionBuffersDirty();
 
-    if (mBufferPool.getMaxBuffers() == 1)
+    if (features.preferCpuForBuffersubdata.enabled)
     {
-        ASSERT(mBuffer);
-        uint8_t *ptr = mBuffer->map(contextMtl);
-        std::copy(srcPtr, srcPtr + sizeToCopy, ptr + offset);
-        mBuffer->unmapAndFlushSubset(contextMtl, offset, sizeToCopy);
+        return copyDataToExistingBufferViaCPU(contextMtl, srcPtr, sizeToCopy, offset);
+    }
+
+    if (mShadowCopy.size() > 0)
+    {
+        return updateShadowCopyThenCopyShadowToNewBuffer(contextMtl, srcPtr, sizeToCopy, offset);
     }
     else
     {
-        ASSERT(mShadowCopy.size());
+        bool alwaysUseStagedBufferUpdates = features.alwaysUseStagedBufferUpdates.enabled;
 
-        // 1. Before copying data from client, we need to synchronize modified data from GPU to
-        // shadow copy first.
-        ensureShadowCopySyncedFromGPU(contextMtl);
-
-        // 2. Copy data from client to shadow copy.
-        std::copy(srcPtr, srcPtr + sizeToCopy, mShadowCopy.data() + offset);
-
-        // 3. Copy data from shadow copy to GPU.
-        ANGLE_TRY(commitShadowCopy(context));
+        if (isOffsetAndSizeMetalBlitCompatible(offset, size) &&
+            (alwaysUseStagedBufferUpdates || mBuffer->isBeingUsedByGPU(contextMtl)))
+        {
+            if (alwaysUseStagedBufferUpdates || !isSafeToReadFromBufferViaBlit(contextMtl))
+            {
+                // We can't use the buffer now so copy the data
+                // to a staging buffer and blit it in
+                return updateExistingBufferViaBlitFromStagingBuffer(contextMtl, srcPtr, sizeToCopy,
+                                                                    offset);
+            }
+            else
+            {
+                return putDataInNewBufferAndStartUsingNewBuffer(contextMtl, srcPtr, sizeToCopy,
+                                                                offset);
+            }
+        }
+        else
+        {
+            return copyDataToExistingBufferViaCPU(contextMtl, srcPtr, sizeToCopy, offset);
+        }
     }
-
-    return angle::Result::Continue;
 }
 
-angle::Result BufferMtl::commitShadowCopy(const gl::Context *context)
+angle::Result BufferMtl::commitShadowCopy(ContextMtl *contextMtl)
 {
-    return commitShadowCopy(context, size());
+    return commitShadowCopy(contextMtl, mGLSize);
 }
 
-angle::Result BufferMtl::commitShadowCopy(const gl::Context *context, size_t size)
+angle::Result BufferMtl::commitShadowCopy(ContextMtl *contextMtl, size_t size)
 {
-    ContextMtl *contextMtl = mtl::GetImpl(context);
+    mtl::BufferManager &bufferManager = contextMtl->getBufferManager();
+    bool useSharedMem                 = useSharedMemory(contextMtl, mUsage);
 
-    if (!size)
-    {
-        // Skip mapping if size to commit is zero.
-        // zero size is passed to allocate buffer only.
-        ANGLE_TRY(mBufferPool.allocate(contextMtl, mShadowCopy.size(), nullptr, &mBuffer, nullptr,
-                                       nullptr));
-    }
-    else
-    {
-        uint8_t *ptr = nullptr;
-        mBufferPool.releaseInFlightBuffers(contextMtl);
-        ANGLE_TRY(
-            mBufferPool.allocate(contextMtl, mShadowCopy.size(), &ptr, &mBuffer, nullptr, nullptr));
+    bufferManager.returnBuffer(contextMtl, mBuffer);
+    ANGLE_TRY(bufferManager.getBuffer(contextMtl, mGLSize, useSharedMem, mBuffer));
 
+    if (size)
+    {
+        uint8_t *ptr = mBuffer->mapWithOpt(contextMtl, false, true);
         std::copy(mShadowCopy.data(), mShadowCopy.data() + size, ptr);
+        mBuffer->unmapAndFlushSubset(contextMtl, 0, size);
     }
 
-    ANGLE_TRY(mBufferPool.commit(contextMtl));
-
     return angle::Result::Continue;
 }
 
diff --git a/src/libANGLE/renderer/metal/ContextMtl.h b/src/libANGLE/renderer/metal/ContextMtl.h
index 3eb8aab..2de2da9 100644
--- a/src/libANGLE/renderer/metal/ContextMtl.h
+++ b/src/libANGLE/renderer/metal/ContextMtl.h
@@ -17,6 +17,7 @@
 #include "libANGLE/Context.h"
 #include "libANGLE/renderer/ContextImpl.h"
 #include "libANGLE/renderer/metal/ProvokingVertexHelper.h"
+#include "libANGLE/renderer/metal/mtl_buffer_manager.h"
 #include "libANGLE/renderer/metal/mtl_buffer_pool.h"
 #include "libANGLE/renderer/metal/mtl_command_buffer.h"
 #include "libANGLE/renderer/metal/mtl_context_device.h"
@@ -369,6 +370,7 @@
     // Will end current command encoder and start new blit command encoder. Unless a blit comamnd
     // encoder is already started.
     mtl::BlitCommandEncoder *getBlitCommandEncoder();
+
     // Will end current command encoder and start new compute command encoder. Unless a compute
     // command encoder is already started.
     mtl::ComputeCommandEncoder *getComputeCommandEncoder();
@@ -381,6 +383,8 @@
     // Get the provoking vertex command encoder.
     mtl::ComputeCommandEncoder *getIndexPreprocessingCommandEncoder();
 
+    bool isCurrentRenderEncoderSerial(uint64_t serial);
+
     const mtl::ContextDevice &getMetalDevice() const { return mContextDevice; }
 
     angle::Result copy2DTextureSlice0Level0ToWorkTexture(const mtl::TextureRef &srcTexture);
@@ -390,6 +394,7 @@
                                                     const mtl::MipmapNativeLevel &mipNativeLevel,
                                                     uint32_t layerIndex);
     const mtl::BufferRef &getWorkBuffer() const { return mWorkBuffer; }
+    mtl::BufferManager &getBufferManager() { return mBufferManager; }
 
   private:
     void ensureCommandBufferReady();
@@ -600,6 +605,8 @@
     MTLCullMode mCullMode;
     bool mCullAllPolygons = false;
 
+    mtl::BufferManager mBufferManager;
+
     // Lineloop and TriFan index buffer
     mtl::BufferPool mLineLoopIndexBuffer;
     mtl::BufferPool mLineLoopLastSegmentIndexBuffer;
diff --git a/src/libANGLE/renderer/metal/ContextMtl.mm b/src/libANGLE/renderer/metal/ContextMtl.mm
index 03396f3..6f00b61 100644
--- a/src/libANGLE/renderer/metal/ContextMtl.mm
+++ b/src/libANGLE/renderer/metal/ContextMtl.mm
@@ -1663,6 +1663,11 @@
         disableActiveOcclusionQueryInRenderPass();
     }
 
+    if (mBlitEncoder.valid())
+    {
+        mBlitEncoder.endEncoding();
+    }
+
     encoder->endEncoding();
 
     // Resolve visibility results
@@ -1755,6 +1760,16 @@
            mRenderEncoder.renderPassDesc().equalIgnoreLoadStoreOptions(desc);
 }
 
+bool ContextMtl::isCurrentRenderEncoderSerial(uint64_t serial)
+{
+    if (!mRenderEncoder.valid())
+    {
+        return false;  // No valid render encoder, so no serial can match.
+    }
+
+    return serial == mRenderEncoder.getSerial();
+}
+
 // Get current render encoder
 mtl::RenderCommandEncoder *ContextMtl::getRenderCommandEncoder()
 {
@@ -1856,6 +1871,11 @@
 
 mtl::BlitCommandEncoder *ContextMtl::getBlitCommandEncoder()
 {
+    if (mRenderEncoder.valid() || mComputeEncoder.valid())
+    {
+        endEncoding(true);
+    }
+
     if (mBlitEncoder.valid())
     {
         return &mBlitEncoder;
@@ -1882,6 +1902,11 @@
 
 mtl::ComputeCommandEncoder *ContextMtl::getComputeCommandEncoder()
 {
+    if (mRenderEncoder.valid() || mBlitEncoder.valid())
+    {
+        endEncoding(true);
+    }
+
     if (mComputeEncoder.valid())
     {
         return &mComputeEncoder;
@@ -2725,7 +2750,8 @@
     // Expand the buffer if it is not big enough.
     if (!mWorkBuffer || mWorkBuffer->size() < sizeInBytes)
     {
-        ANGLE_TRY(mtl::Buffer::MakeBuffer(this, sizeInBytes, nullptr, &mWorkBuffer));
+        ANGLE_TRY(mtl::Buffer::MakeBufferWithSharedMemOpt(this, true, sizeInBytes, nullptr,
+                                                          &mWorkBuffer));
     }
 
     gl::Rectangle region(0, 0, width, height);
diff --git a/src/libANGLE/renderer/metal/DisplayMtl.mm b/src/libANGLE/renderer/metal/DisplayMtl.mm
index 68f6b89..419acdb 100644
--- a/src/libANGLE/renderer/metal/DisplayMtl.mm
+++ b/src/libANGLE/renderer/metal/DisplayMtl.mm
@@ -1193,6 +1193,16 @@
 
     ANGLE_FEATURE_CONDITION((&mFeatures), preemptivelyStartProvokingVertexCommandBuffer, isAMD());
 
+    ANGLE_FEATURE_CONDITION((&mFeatures), alwaysUseStagedBufferUpdates, isAMD());
+    ANGLE_FEATURE_CONDITION((&mFeatures), alwaysUseManagedStorageModeForBuffers, isAMD());
+
+    ANGLE_FEATURE_CONDITION((&mFeatures), alwaysUseSharedStorageModeForBuffers, isIntel());
+    ANGLE_FEATURE_CONDITION((&mFeatures), useShadowBuffersWhenAppropriate, isIntel());
+
+    // At least one of these must not be set.
+    ASSERT(!mFeatures.alwaysUseManagedStorageModeForBuffers.enabled ||
+           !mFeatures.alwaysUseSharedStorageModeForBuffers.enabled);
+
     bool defaultDirectToMetal = true;
     ANGLE_FEATURE_CONDITION((&mFeatures), directMetalGeneration, defaultDirectToMetal);
 
diff --git a/src/libANGLE/renderer/metal/FrameBufferMtl.mm b/src/libANGLE/renderer/metal/FrameBufferMtl.mm
index 3c010fa..6f5c419 100644
--- a/src/libANGLE/renderer/metal/FrameBufferMtl.mm
+++ b/src/libANGLE/renderer/metal/FrameBufferMtl.mm
@@ -1611,6 +1611,7 @@
 
         return result;
     }
+
     if (texture->isBeingUsedByGPU(contextMtl))
     {
         contextMtl->flushCommandBuffer(mtl::WaitUntilFinished);
diff --git a/src/libANGLE/renderer/metal/ProgramMtl.mm b/src/libANGLE/renderer/metal/ProgramMtl.mm
index 150e858..65777ed 100644
--- a/src/libANGLE/renderer/metal/ProgramMtl.mm
+++ b/src/libANGLE/renderer/metal/ProgramMtl.mm
@@ -1460,7 +1460,7 @@
             // Has the content of the buffer has changed since last conversion?
             if (conversion->dirty)
             {
-                const uint8_t *srcBytes = bufferMtl->getClientShadowCopyData(context);
+                const uint8_t *srcBytes = bufferMtl->getBufferDataReadOnly(context);
                 srcBytes += offsetModulo;
                 size_t sizeToCopy      = bufferMtl->size() - offsetModulo;
                 size_t bytesToAllocate = roundUp<size_t>(sizeToCopy, 16u);
diff --git a/src/libANGLE/renderer/metal/TextureMtl.mm b/src/libANGLE/renderer/metal/TextureMtl.mm
index 7c24916..90548ad 100644
--- a/src/libANGLE/renderer/metal/TextureMtl.mm
+++ b/src/libANGLE/renderer/metal/TextureMtl.mm
@@ -1817,7 +1817,7 @@
         {
             // NOTE(hqle): packed depth & stencil texture cannot copy from buffer directly, needs
             // to split its depth & stencil data and copy separately.
-            const uint8_t *clientData = unpackBufferMtl->getClientShadowCopyData(contextMtl);
+            const uint8_t *clientData = unpackBufferMtl->getBufferDataReadOnly(contextMtl);
             clientData += offset;
             ANGLE_TRY(UploadTextureContents(context, mFormat.actualAngleFormat(), mtlArea,
                                             mtl::kZeroNativeMipLevel, slice, clientData,
@@ -1871,7 +1871,7 @@
             mFormat.intendedAngleFormat().isBlock)
         {
             // Unsupported format, use CPU path.
-            const uint8_t *clientData = unpackBufferMtl->getClientShadowCopyData(contextMtl);
+            const uint8_t *clientData = unpackBufferMtl->getBufferDataReadOnly(contextMtl);
             clientData += offset;
             ANGLE_TRY(convertAndSetPerSliceSubImage(context, slice, mtlArea, internalFormat, type,
                                                     pixelsAngleFormat, pixelsRowPitch,
diff --git a/src/libANGLE/renderer/metal/VertexArrayMtl.mm b/src/libANGLE/renderer/metal/VertexArrayMtl.mm
index d0c1627..f205dba 100644
--- a/src/libANGLE/renderer/metal/VertexArrayMtl.mm
+++ b/src/libANGLE/renderer/metal/VertexArrayMtl.mm
@@ -881,7 +881,7 @@
     {
         // We shouldn't use GPU to convert when we are in a middle of a render pass.
         ANGLE_TRY(StreamIndexData(contextMtl, &conversion->data,
-                                  idxBuffer->getClientShadowCopyData(contextMtl) + offsetModulo,
+                                  idxBuffer->getBufferDataReadOnly(contextMtl) + offsetModulo,
                                   indexType, indexCount, glState.isPrimitiveRestartEnabled(),
                                   &conversion->convertedBuffer, &conversion->convertedOffset));
     }
@@ -1061,7 +1061,7 @@
                                                      ConversionBufferMtl *conversion)
 {
 
-    const uint8_t *srcBytes = srcBuffer->getClientShadowCopyData(contextMtl);
+    const uint8_t *srcBytes = srcBuffer->getBufferDataReadOnly(contextMtl);
     ANGLE_CHECK_GL_ALLOC(contextMtl, srcBytes);
     VertexConversionBufferMtl *vertexConverison =
         static_cast<VertexConversionBufferMtl *>(conversion);
@@ -1151,4 +1151,4 @@
 
     return angle::Result::Continue;
 }
-}
+}  // namespace rx
diff --git a/src/libANGLE/renderer/metal/mtl_buffer_manager.h b/src/libANGLE/renderer/metal/mtl_buffer_manager.h
new file mode 100644
index 0000000..edcb3a1
--- /dev/null
+++ b/src/libANGLE/renderer/metal/mtl_buffer_manager.h
@@ -0,0 +1,94 @@
+//
+// Copyright 2022 The ANGLE Project Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// mtl_buffer_manager.h:
+//    BufferManager manages buffers across all contexts for a single
+//    device.
+//
+#ifndef LIBANGLE_RENDERER_METAL_MTL_BUFFER_MANAGER_H_
+#define LIBANGLE_RENDERER_METAL_MTL_BUFFER_MANAGER_H_
+
+#include "common/FixedVector.h"
+#include "libANGLE/renderer/metal/mtl_resources.h"
+
+#include <vector>
+
+namespace rx
+{
+class ContextMtl;
+
+namespace mtl
+{
+
+// GL buffers are backed by Metal buffers. Which metal
+// buffer is backing a particular GL buffer is fluid.
+// The case being optimized is a loop of something like
+//
+//    for 1..4
+//      glBufferSubData
+//      glDrawXXX
+//
+// You can't update a buffer in the middle of a render pass
+// in metal so instead we'd end up using multiple buffers.
+//
+// Simple case, the call to `glBufferSubData` updates the
+// entire buffer. In this case we'd end up with each call
+// to `glBufferSubData` getting a new buffer from this
+// BufferManager and copying the new data to it. We'd
+// end up submitting this renderpass
+//
+//    draw with buf1
+//    draw with buf2
+//    draw with buf3
+//    draw with buf4
+//
+// The GL buffer now references buf4. And buf1, buf2, buf3 and
+// buf0 (the buffer that was previously referenced by the GL buffer)
+// are all added to the inuse-list
+//
+
+// This macro enables showing the running totals of the various
+// buckets of unused buffers.
+// #define ANGLE_MTL_TRACK_BUFFER_MEM
+
+class BufferManager
+{
+  public:
+    BufferManager();
+
+    static constexpr size_t kMaxStagingBufferSize = 1024 * 1024;  // Max bytes per staging chunk.
+    static constexpr size_t kMaxSizePowerOf2      = 64;  // Number of power-of-two size buckets.
+    // Copies sizeToCopy bytes from srcPtr through staging buffers and queues blits to the dest.
+    angle::Result queueBlitCopyDataToBuffer(ContextMtl *contextMtl,
+                                            const void *srcPtr,
+                                            size_t sizeToCopy,
+                                            size_t offset,
+                                            mtl::BufferRef &dstMetalBuffer);
+    // Hands out a free buffer from the bucket matching `size`, or allocates a new one.
+    angle::Result getBuffer(ContextMtl *contextMtl,
+                            size_t size,
+                            bool useSharedMem,
+                            mtl::BufferRef &bufferRef);
+    void returnBuffer(ContextMtl *contextMtl, mtl::BufferRef &bufferRef);
+
+  private:
+    typedef std::vector<mtl::BufferRef> BufferList;
+
+    void freeUnusedBuffers(ContextMtl *contextMtl);
+    void addBufferRefToFreeLists(mtl::BufferRef &bufferRef);
+
+    BufferList mInUseBuffers;  // Buffers returned while the GPU was still using them.
+
+    angle::FixedVector<BufferList, kMaxSizePowerOf2> mFreeBuffers[2];  // [isShared][log2 bucket]
+#ifdef ANGLE_MTL_TRACK_BUFFER_MEM
+    angle::FixedVector<size_t, kMaxSizePowerOf2> mAllocations;
+    size_t mTotalMem = 0;
+#endif
+};
+
+}  // namespace mtl
+}  // namespace rx
+
+#endif /* LIBANGLE_RENDERER_METAL_MTL_BUFFER_MANAGER_H_ */
diff --git a/src/libANGLE/renderer/metal/mtl_buffer_manager.mm b/src/libANGLE/renderer/metal/mtl_buffer_manager.mm
new file mode 100644
index 0000000..255f425
--- /dev/null
+++ b/src/libANGLE/renderer/metal/mtl_buffer_manager.mm
@@ -0,0 +1,202 @@
+//
+// Copyright 2022 The ANGLE Project Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// mtl_buffer_manager.mm:
+//    Implements the class methods for BufferManager.
+//
+
+#include "libANGLE/renderer/metal/mtl_buffer_manager.h"
+
+#include "libANGLE/renderer/metal/ContextMtl.h"
+#include "libANGLE/renderer/metal/DisplayMtl.h"
+
+namespace rx
+{
+
+namespace mtl
+{
+
+namespace
+{
+
+constexpr size_t Log2(size_t num)
+{
+    return num <= 1 ? 0 : (1 + Log2(num / 2));  // floor(log2(num)); yields 0 for num <= 1.
+}
+
+constexpr size_t Log2Ceil(size_t num)
+{
+    size_t l    = Log2(num);
+    size_t size = size_t(1) << l;
+    return num == size ? l : l + 1;  // Round up unless num is exactly 2^l (num == 0 yields 1).
+}
+
+#ifdef ANGLE_MTL_TRACK_BUFFER_MEM
+const char *memUnitSuffix(size_t powerOf2)
+{
+    if (powerOf2 < 10)
+    {
+        return "b";
+    }
+    if (powerOf2 < 20)
+    {
+        return "k";
+    }
+    if (powerOf2 < 30)
+    {
+        return "M";
+    }
+    return "G";
+}
+
+size_t memUnitValue(size_t powerOf2)
+{
+    if (powerOf2 < 10)
+    {
+        return size_t(1) << powerOf2;  // Plain bytes.
+    }
+    if (powerOf2 < 20)
+    {
+        return size_t(1) << (powerOf2 - 10);  // Scaled to the "k" unit.
+    }
+    if (powerOf2 < 30)
+    {
+        return size_t(1) << (powerOf2 - 20);  // Scaled to the "M" unit.
+    }
+    return size_t(1) << (powerOf2 - 30);  // "G" unit; size_t keeps shifts up to 33 well defined.
+}
+#endif  // ANGLE_MTL_TRACK_BUFFER_MEM
+
+int sharedMemToIndex(bool useSharedMem)
+{
+    return useSharedMem ? 1 : 0;
+}
+
+}  // namespace
+
+BufferManager::BufferManager()
+#ifdef ANGLE_MTL_TRACK_BUFFER_MEM
+    : mAllocations(kMaxSizePowerOf2, 0)
+#endif
+{}
+
+void BufferManager::freeUnusedBuffers(ContextMtl *contextMtl)
+{
+    // Scan for the first buffer still in use; each ref is taken by const& rather than by value.
+    BufferList::iterator firstInUseIter =
+        std::find_if(mInUseBuffers.begin(), mInUseBuffers.end(),
+                     [contextMtl](const auto &ref) { return ref->isBeingUsedByGPU(contextMtl); });
+
+    // Move the leading run of unused buffers to the free lists; later entries stay queued.
+    for (BufferList::iterator it = mInUseBuffers.begin(); it != firstInUseIter; ++it)
+    {
+        addBufferRefToFreeLists(*it);
+    }
+    mInUseBuffers.erase(mInUseBuffers.begin(), firstInUseIter);
+}
+
+void BufferManager::addBufferRefToFreeLists(mtl::BufferRef &bufferRef)
+{
+    const size_t bucketNdx = Log2Ceil(bufferRef->size());  // Bucket by size rounded up to 2^n.
+    ASSERT(bucketNdx < kMaxSizePowerOf2);
+    int sharedNdx = sharedMemToIndex(bufferRef->get().storageMode == MTLStorageModeShared);
+    mFreeBuffers[sharedNdx][bucketNdx].push_back(bufferRef);
+}
+
+void BufferManager::returnBuffer(ContextMtl *contextMtl, BufferRef &bufferRef)
+{
+    if (bufferRef->isBeingUsedByGPU(contextMtl))  // NOTE(review): assumes bufferRef is non-null.
+    {
+        mInUseBuffers.push_back(bufferRef);  // Moved to a free list later by freeUnusedBuffers().
+    }
+    else
+    {
+        addBufferRefToFreeLists(bufferRef);  // Idle already, so it can be reused right away.
+    }
+}
+
+angle::Result BufferManager::getBuffer(ContextMtl *contextMtl,
+                                       size_t size,
+                                       bool useSharedMem,
+                                       BufferRef &bufferRef)
+{
+    freeUnusedBuffers(contextMtl);  // First recycle returned buffers that are no longer in use.
+    // NOTE(review): bucketNdx is not range-checked here; addBufferRefToFreeLists asserts it.
+    const size_t bucketNdx  = Log2Ceil(size);
+    const int sharedNdx     = sharedMemToIndex(useSharedMem);
+    BufferList &freeBuffers = mFreeBuffers[sharedNdx][bucketNdx];
+
+    // Reuse the most recently freed buffer from this bucket, if there is one.
+    if (!freeBuffers.empty())
+    {
+        bufferRef = freeBuffers.back();
+        freeBuffers.pop_back();
+        return angle::Result::Continue;
+    }
+
+    // Otherwise create a new buffer whose size is the bucket's power of two.
+    mtl::BufferRef newBufferRef;
+
+    size_t allocSize = size_t(1) << bucketNdx;
+    ASSERT(allocSize >= size);
+    ANGLE_TRY(mtl::Buffer::MakeBufferWithSharedMemOpt(contextMtl, useSharedMem, allocSize, nullptr,
+                                                      &newBufferRef));
+    // Optional debug accounting: print the running totals of allocations per bucket.
+#ifdef ANGLE_MTL_TRACK_BUFFER_MEM
+    {
+        mTotalMem += allocSize;
+        mAllocations[bucketNdx]++;
+        fprintf(stderr, "totalMem: %zu, ", mTotalMem);
+        size_t numBuffers = 0;
+        for (size_t i = 0; i < kMaxSizePowerOf2; ++i)
+        {
+            if (mAllocations[i])
+            {
+                numBuffers += mAllocations[i];
+                fprintf(stderr, "%zu%s: %zu, ", memUnitValue(i), memUnitSuffix(i), mAllocations[i]);
+            }
+        }
+        fprintf(stderr, " total: %zu\n", numBuffers);
+    }
+#endif
+
+    bufferRef = newBufferRef;
+
+    return angle::Result::Continue;
+}
+
+angle::Result BufferManager::queueBlitCopyDataToBuffer(ContextMtl *contextMtl,
+                                                       const void *srcPtr,
+                                                       size_t sizeToCopy,
+                                                       size_t offset,
+                                                       mtl::BufferRef &dstMetalBuffer)
+{
+    const uint8_t *src = static_cast<const uint8_t *>(srcPtr);
+    bool useShared =
+        !contextMtl->getDisplay()->getFeatures().alwaysUseManagedStorageModeForBuffers.enabled;
+    // Upload in chunks of at most kMaxStagingBufferSize bytes, one staging buffer per chunk.
+    for (size_t srcOffset = 0; srcOffset < sizeToCopy; srcOffset += kMaxStagingBufferSize)
+    {
+        size_t subSizeToCopy = std::min(kMaxStagingBufferSize, sizeToCopy - srcOffset);
+
+        mtl::BufferRef bufferRef;
+        ANGLE_TRY(getBuffer(contextMtl, subSizeToCopy, useShared, bufferRef));
+
+        // Copy this chunk of the source data into the staging buffer.
+        uint8_t *ptr = bufferRef->mapWithOpt(contextMtl, false, true);
+        std::copy(src + srcOffset, src + srcOffset + subSizeToCopy, ptr);
+        bufferRef->unmapAndFlushSubset(contextMtl, 0, subSizeToCopy);
+
+        // Queue a blit from the staging buffer into the destination at the matching offset.
+        mtl::BlitCommandEncoder *blitEncoder = contextMtl->getBlitCommandEncoder();
+        blitEncoder->copyBuffer(bufferRef, 0, dstMetalBuffer, offset + srcOffset, subSizeToCopy);
+        // Hand the staging buffer back to the manager so it can be reused later.
+        returnBuffer(contextMtl, bufferRef);
+    }
+    return angle::Result::Continue;
+}
+
+}  // namespace mtl
+}  // namespace rx
diff --git a/src/libANGLE/renderer/metal/mtl_command_buffer.h b/src/libANGLE/renderer/metal/mtl_command_buffer.h
index 68a93c2..b1f2c5a 100644
--- a/src/libANGLE/renderer/metal/mtl_command_buffer.h
+++ b/src/libANGLE/renderer/metal/mtl_command_buffer.h
@@ -80,6 +80,8 @@
     AutoObjCPtr<id<MTLCommandBuffer>> makeMetalCommandBuffer(uint64_t *queueSerialOut);
     void onCommandBufferCommitted(id<MTLCommandBuffer> buf, uint64_t serial);
 
+    uint64_t getNextRenderEncoderSerial();
+
   private:
     void onCommandBufferCompleted(id<MTLCommandBuffer> buf, uint64_t serial);
     using ParentClass = WrappedObject<id<MTLCommandQueue>>;
@@ -94,6 +96,7 @@
     uint64_t mQueueSerialCounter = 1;
     std::atomic<uint64_t> mCommittedBufferSerial{0};
     std::atomic<uint64_t> mCompletedBufferSerial{0};
+    uint64_t mRenderEncoderCounter = 1;
 
     mutable std::mutex mLock;
 };
@@ -497,6 +500,8 @@
     const RenderPassDesc &renderPassDesc() const { return mRenderPassDesc; }
     bool hasDrawCalls() const { return mHasDrawCalls; }
 
+    uint64_t getSerial() const { return mSerial; }
+
   private:
     // Override CommandEncoder
     id<MTLRenderCommandEncoder> get()
@@ -541,6 +546,7 @@
     RenderCommandEncoderStates mStateCache = {};
 
     bool mPipelineStateSet = false;
+    const uint64_t mSerial = 0;
 };
 
 class BlitCommandEncoder final : public CommandEncoder
diff --git a/src/libANGLE/renderer/metal/mtl_command_buffer.mm b/src/libANGLE/renderer/metal/mtl_command_buffer.mm
index 74f2d9a..2f226f6 100644
--- a/src/libANGLE/renderer/metal/mtl_command_buffer.mm
+++ b/src/libANGLE/renderer/metal/mtl_command_buffer.mm
@@ -11,6 +11,7 @@
 #include "libANGLE/renderer/metal/mtl_command_buffer.h"
 
 #include <cassert>
+#include <cstdint>
 #if ANGLE_MTL_SIMULATE_DISCARD_FRAMEBUFFER
 #    include <random>
 #endif
@@ -579,6 +580,11 @@
         std::memory_order_relaxed);
 }
 
+uint64_t CommandQueue::getNextRenderEncoderSerial()
+{
+    return ++mRenderEncoderCounter;  // NOTE(review): plain counter, no lock — confirm threading.
+}
+
 // CommandBuffer implementation
 CommandBuffer::CommandBuffer(CommandQueue *cmdQueue) : mCmdQueue(*cmdQueue) {}
 
@@ -1065,7 +1071,9 @@
 // RenderCommandEncoder implemtation
 RenderCommandEncoder::RenderCommandEncoder(CommandBuffer *cmdBuffer,
                                            const OcclusionQueryPool &queryPool)
-    : CommandEncoder(cmdBuffer, RENDER), mOcclusionQueryPool(queryPool)
+    : CommandEncoder(cmdBuffer, RENDER),
+      mOcclusionQueryPool(queryPool),
+      mSerial(cmdBuffer->cmdQueue().getNextRenderEncoderSerial())
 {
     ANGLE_MTL_OBJC_SCOPE
     {
@@ -1556,6 +1564,7 @@
         return *this;
     }
 
+    buffer->setLastWritingRenderEncoderSerial(mSerial);
     cmdBuffer().setWriteDependency(buffer);
 
     id<MTLBuffer> mtlBuffer = (buffer ? buffer->get() : nil);
@@ -2183,10 +2192,14 @@
     }
 
 #if TARGET_OS_OSX || TARGET_OS_MACCATALYST
-    // Only MacOS has separated storage for resource on CPU and GPU and needs explicit
-    // synchronization
-    cmdBuffer().setReadDependency(buffer);
-    [get() synchronizeResource:buffer->get()];
+    if (buffer->get().storageMode == MTLStorageModeManaged)
+    {
+        // Only MacOS has separated storage for resource on CPU and GPU and needs explicit
+        // synchronization
+        cmdBuffer().setReadDependency(buffer);
+
+        [get() synchronizeResource:buffer->get()];
+    }
 #endif
     return *this;
 }
diff --git a/src/libANGLE/renderer/metal/mtl_common.h b/src/libANGLE/renderer/metal/mtl_common.h
index 0d6df36..25d075b 100644
--- a/src/libANGLE/renderer/metal/mtl_common.h
+++ b/src/libANGLE/renderer/metal/mtl_common.h
@@ -106,7 +106,7 @@
 // The max size of a buffer that will be allocated in shared memory.
 // NOTE(hqle): This is just a hint. There is no official document on what is the max allowed size
 // for shared memory.
-constexpr size_t kSharedMemBufferMaxBufSizeHint = 128 * 1024;
+constexpr size_t kSharedMemBufferMaxBufSizeHint = 256 * 1024;
 
 constexpr size_t kDefaultAttributeSize = 4 * sizeof(float);
 
diff --git a/src/libANGLE/renderer/metal/mtl_render_utils.mm b/src/libANGLE/renderer/metal/mtl_render_utils.mm
index 4b854d5..8fceb87 100644
--- a/src/libANGLE/renderer/metal/mtl_render_utils.mm
+++ b/src/libANGLE/renderer/metal/mtl_render_utils.mm
@@ -2094,8 +2094,7 @@
              contextMtl->getRenderCommandEncoder()))
         {
             IndexGenerationParams cpuPathParams = params;
-            cpuPathParams.indices =
-                elementBufferMtl->getClientShadowCopyData(contextMtl) + srcOffset;
+            cpuPathParams.indices = elementBufferMtl->getBufferDataReadOnly(contextMtl) + srcOffset;
             return generateTriFanBufferFromElementsArrayCPU(contextMtl, cpuPathParams,
                                                             indicesGenerated);
         }
@@ -2223,8 +2222,7 @@
              contextMtl->getRenderCommandEncoder()))
         {
             IndexGenerationParams cpuPathParams = params;
-            cpuPathParams.indices =
-                elementBufferMtl->getClientShadowCopyData(contextMtl) + srcOffset;
+            cpuPathParams.indices = elementBufferMtl->getBufferDataReadOnly(contextMtl) + srcOffset;
             return generateLineLoopBufferFromElementsArrayCPU(contextMtl, cpuPathParams,
                                                               indicesGenerated);
         }
diff --git a/src/libANGLE/renderer/metal/mtl_resources.h b/src/libANGLE/renderer/metal/mtl_resources.h
index afbc985..c6ae440 100644
--- a/src/libANGLE/renderer/metal/mtl_resources.h
+++ b/src/libANGLE/renderer/metal/mtl_resources.h
@@ -56,6 +56,7 @@
     bool hasPendingWorks(Context *context) const;
 
     void setUsedByCommandBufferWithQueueSerial(uint64_t serial, bool writing);
+    void setWrittenToByRenderEncoder(uint64_t serial);
 
     uint64_t getCommandBufferQueueSerial() const { return mUsageRef->cmdBufferQueueSerial; }
 
@@ -71,6 +72,15 @@
     bool isCPUReadMemDirty() const { return mUsageRef->cpuReadMemDirty; }
     void resetCPUReadMemDirty() { mUsageRef->cpuReadMemDirty = false; }
 
+    uint64_t getLastWritingRenderEncoderSerial() const  // Full 64-bit serial, not a flag.
+    {
+        return mUsageRef->lastWritingRenderEncoderSerial;
+    }
+    void setLastWritingRenderEncoderSerial(uint64_t serial) const
+    {
+        mUsageRef->lastWritingRenderEncoderSerial = serial;
+    }
+
     virtual size_t estimatedByteSize() const = 0;
     virtual id getID() const                 = 0;
 
@@ -98,6 +108,9 @@
 
         // This flag is useful for BufferMtl to know whether it should update the shadow copy
         bool cpuReadMemDirty = false;
+
+        // The id of the last render encoder to write to this resource
+        uint64_t lastWritingRenderEncoderSerial = 0;
     };
 
     // One resource object might just be a view of another resource. For example, a texture 2d
diff --git a/src/libANGLE/renderer/metal/mtl_resources.mm b/src/libANGLE/renderer/metal/mtl_resources.mm
index 976eb1c..6eff044 100644
--- a/src/libANGLE/renderer/metal/mtl_resources.mm
+++ b/src/libANGLE/renderer/metal/mtl_resources.mm
@@ -1054,8 +1054,9 @@
     {
         if (get().storageMode == MTLStorageModeManaged)
         {
-            size_t startOffset = std::min(offsetWritten, size());
-            size_t endOffset   = std::min(offsetWritten + sizeWritten, size());
+            size_t bufferSize  = size();
+            size_t startOffset = std::min(offsetWritten, bufferSize);
+            size_t endOffset   = std::min(offsetWritten + sizeWritten, bufferSize);
             size_t clampedSize = endOffset - startOffset;
             if (clampedSize > 0)
             {
diff --git a/src/tests/gl_tests/BufferDataTest.cpp b/src/tests/gl_tests/BufferDataTest.cpp
index 47bd7ec..9240719 100644
--- a/src/tests/gl_tests/BufferDataTest.cpp
+++ b/src/tests/gl_tests/BufferDataTest.cpp
@@ -1055,6 +1055,149 @@
     ASSERT_GL_NO_ERROR();
 }
 
+// This is a shortened version of dEQP functional.buffer.copy.basic.array_copy_read. It provoked
+// a bug in copyBufferSubData. The bug appeared to be that conversion buffers were not marked
+// as dirty and therefore after copyBufferSubData the next draw call using the buffer that
+// just had data copied to it was not re-converted. It's not clear to me how this ever worked
+// or why changes to bufferSubData from
+// https://chromium-review.googlesource.com/c/angle/angle/+/3842641 made this issue appear and
+// why it wasn't already broken.
+TEST_P(BufferDataTestES3, CopyBufferSubDataDraw)
+{
+    const char simpleVertex[]   = R"(attribute vec2 position;
+attribute vec4 color;
+varying vec4 vColor;
+void main()
+{
+    gl_Position = vec4(position, 0, 1);
+    vColor = color;
+}
+)";
+    const char simpleFragment[] = R"(precision mediump float;
+varying vec4 vColor;
+void main()
+{
+    gl_FragColor = vColor;
+}
+)";
+
+    ANGLE_GL_PROGRAM(program, simpleVertex, simpleFragment);
+    glUseProgram(program);
+
+    GLint colorLoc = glGetAttribLocation(program, "color");
+    ASSERT_NE(-1, colorLoc);
+    GLint posLoc = glGetAttribLocation(program, "position");
+    ASSERT_NE(-1, posLoc);
+
+    glClearColor(0, 0, 0, 0);
+
+    GLBuffer srcBuffer;  // green
+    GLBuffer dstBuffer;  // red
+
+    constexpr size_t numElements = 399;
+    std::vector<GLColorRGB> reds(numElements, GLColorRGB::red);
+    std::vector<GLColorRGB> greens(numElements, GLColorRGB::green);
+    constexpr size_t sizeOfElem  = sizeof(decltype(greens)::value_type);
+    constexpr size_t sizeInBytes = numElements * sizeOfElem;
+
+    glBindBuffer(GL_ARRAY_BUFFER, srcBuffer);
+    glBufferData(GL_ARRAY_BUFFER, sizeInBytes, greens.data(), GL_STREAM_DRAW);
+
+    glBindBuffer(GL_COPY_READ_BUFFER, dstBuffer);
+    glBufferData(GL_COPY_READ_BUFFER, sizeInBytes, reds.data(), GL_STREAM_DRAW);
+    ASSERT_GL_NO_ERROR();
+
+    constexpr size_t numQuads = numElements / 4;
+
+    // Generate quads that fill clip space to use all the vertex colors
+    std::vector<float> positions(numQuads * 4 * 2);
+    for (size_t quad = 0; quad < numQuads; ++quad)
+    {
+        size_t offset = quad * 4 * 2;
+        float x0      = float(quad + 0) / numQuads * 2.0f - 1.0f;
+        float x1      = float(quad + 1) / numQuads * 2.0f - 1.0f;
+
+        /*
+           2--3
+           |  |
+           0--1
+        */
+        positions[offset + 0] = x0;
+        positions[offset + 1] = -1;
+        positions[offset + 2] = x1;
+        positions[offset + 3] = -1;
+        positions[offset + 4] = x0;
+        positions[offset + 5] = 1;
+        positions[offset + 6] = x1;
+        positions[offset + 7] = 1;
+    }
+    glBindBuffer(GL_ARRAY_BUFFER, 0);
+    glEnableVertexAttribArray(posLoc);
+    glVertexAttribPointer(posLoc, 2, GL_FLOAT, GL_FALSE, 0, positions.data());
+    ASSERT_GL_NO_ERROR();
+
+    glBindBuffer(GL_ARRAY_BUFFER, srcBuffer);
+    glEnableVertexAttribArray(colorLoc);
+    glVertexAttribPointer(colorLoc, 3, GL_UNSIGNED_BYTE, GL_TRUE, 0, nullptr);
+    ASSERT_GL_NO_ERROR();
+
+    glClear(GL_COLOR_BUFFER_BIT);
+
+    std::vector<GLushort> indices(numQuads * 6);
+    for (size_t quad = 0; quad < numQuads; ++quad)
+    {
+        size_t ndx          = quad * 4;
+        size_t offset       = quad * 6;
+        indices[offset + 0] = ndx;
+        indices[offset + 1] = ndx + 1;
+        indices[offset + 2] = ndx + 2;
+        indices[offset + 3] = ndx + 2;
+        indices[offset + 4] = ndx + 1;
+        indices[offset + 5] = ndx + 3;
+    }
+    GLBuffer indexBuffer;
+    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, indexBuffer);
+    glBufferData(GL_ELEMENT_ARRAY_BUFFER, indices.size() * sizeof(decltype(indices)::value_type),
+                 indices.data(), GL_STATIC_DRAW);
+
+    // Draw with srcBuffer (green)
+    glDrawElements(GL_TRIANGLES, numQuads * 6, GL_UNSIGNED_SHORT, 0);
+    EXPECT_PIXEL_RECT_EQ(0, 0, 16, 16, GLColor::green);
+    ASSERT_GL_NO_ERROR();
+
+    // Draw with dstBuffer (red)
+    glBindBuffer(GL_ARRAY_BUFFER, dstBuffer);
+    glEnableVertexAttribArray(colorLoc);
+    glVertexAttribPointer(colorLoc, 3, GL_UNSIGNED_BYTE, GL_TRUE, 0, nullptr);
+    glDrawElements(GL_TRIANGLES, numQuads * 6, GL_UNSIGNED_SHORT, 0);
+    EXPECT_PIXEL_RECT_EQ(0, 0, 16, 16, GLColor::red);
+    ASSERT_GL_NO_ERROR();
+
+    // Copy src to dst. Yes, we're using GL_COPY_READ_BUFFER as dest because that's what the dEQP
+    // test was testing.
+    glBindBuffer(GL_ARRAY_BUFFER, srcBuffer);
+    glBindBuffer(GL_COPY_READ_BUFFER, dstBuffer);
+    glCopyBufferSubData(GL_ARRAY_BUFFER, GL_COPY_READ_BUFFER, 0, 0, sizeInBytes);
+    ASSERT_GL_NO_ERROR();
+
+    // Draw with srcBuffer. It should still be green.
+    glBindBuffer(GL_ARRAY_BUFFER, srcBuffer);
+    glEnableVertexAttribArray(colorLoc);
+    glVertexAttribPointer(colorLoc, 3, GL_UNSIGNED_BYTE, GL_TRUE, 0, nullptr);
+    glDrawElements(GL_TRIANGLES, numQuads * 6, GL_UNSIGNED_SHORT, 0);
+    EXPECT_PIXEL_RECT_EQ(0, 0, 16, 16, GLColor::green);
+    ASSERT_GL_NO_ERROR();
+
+    // Draw with dstBuffer. It should now be green too.
+    glBindBuffer(GL_ARRAY_BUFFER, dstBuffer);
+    glEnableVertexAttribArray(colorLoc);
+    glVertexAttribPointer(colorLoc, 3, GL_UNSIGNED_BYTE, GL_TRUE, 0, nullptr);
+    glDrawElements(GL_TRIANGLES, numQuads * 6, GL_UNSIGNED_SHORT, 0);
+    EXPECT_PIXEL_RECT_EQ(0, 0, 16, 16, GLColor::green);
+
+    ASSERT_GL_NO_ERROR();
+}
+
 // Ensures that calling glBufferData on a mapped buffer results in an unmapped buffer
 TEST_P(BufferDataTestES3, BufferDataUnmap)
 {
diff --git a/util/angle_features_autogen.cpp b/util/angle_features_autogen.cpp
index 9e59ca3..3859179 100644
--- a/util/angle_features_autogen.cpp
+++ b/util/angle_features_autogen.cpp
@@ -38,6 +38,9 @@
      "allowTranslateUniformBlockToStructuredBuffer"},
     {Feature::AlwaysCallUseProgramAfterLink, "alwaysCallUseProgramAfterLink"},
     {Feature::AlwaysUnbindFramebufferTexture2D, "alwaysUnbindFramebufferTexture2D"},
+    {Feature::AlwaysUseManagedStorageModeForBuffers, "alwaysUseManagedStorageModeForBuffers"},
+    {Feature::AlwaysUseSharedStorageModeForBuffers, "alwaysUseSharedStorageModeForBuffers"},
+    {Feature::AlwaysUseStagedBufferUpdates, "alwaysUseStagedBufferUpdates"},
     {Feature::AsyncCommandQueue, "asyncCommandQueue"},
     {Feature::Avoid1BitAlphaTextureFormats, "avoid1BitAlphaTextureFormats"},
     {Feature::BindTransformFeedbackBufferBeforeBindBufferRange,
@@ -187,6 +190,7 @@
     {Feature::PreemptivelyStartProvokingVertexCommandBuffer,
      "preemptivelyStartProvokingVertexCommandBuffer"},
     {Feature::PreferAggregateBarrierCalls, "preferAggregateBarrierCalls"},
+    {Feature::PreferCpuForBuffersubdata, "preferCpuForBuffersubdata"},
     {Feature::PreferCPUForBufferSubData, "preferCPUForBufferSubData"},
     {Feature::PreferDeviceLocalMemoryHostVisible, "preferDeviceLocalMemoryHostVisible"},
     {Feature::PreferDrawClearOverVkCmdClearAttachments, "preferDrawClearOverVkCmdClearAttachments"},
@@ -317,6 +321,7 @@
     {Feature::UseInstancedPointSpriteEmulation, "useInstancedPointSpriteEmulation"},
     {Feature::UseMultipleDescriptorsForExternalFormats, "useMultipleDescriptorsForExternalFormats"},
     {Feature::UseNonZeroStencilWriteMaskStaticState, "useNonZeroStencilWriteMaskStaticState"},
+    {Feature::UseShadowBuffersWhenAppropriate, "useShadowBuffersWhenAppropriate"},
     {Feature::UseSystemMemoryForConstantBuffers, "useSystemMemoryForConstantBuffers"},
     {Feature::UseUnusedBlocksWithStandardOrSharedLayout,
      "useUnusedBlocksWithStandardOrSharedLayout"},
diff --git a/util/angle_features_autogen.h b/util/angle_features_autogen.h
index 99ce6b5..7c2eac6 100644
--- a/util/angle_features_autogen.h
+++ b/util/angle_features_autogen.h
@@ -37,6 +37,9 @@
     AllowTranslateUniformBlockToStructuredBuffer,
     AlwaysCallUseProgramAfterLink,
     AlwaysUnbindFramebufferTexture2D,
+    AlwaysUseManagedStorageModeForBuffers,
+    AlwaysUseSharedStorageModeForBuffers,
+    AlwaysUseStagedBufferUpdates,
     AsyncCommandQueue,
     Avoid1BitAlphaTextureFormats,
     BindTransformFeedbackBufferBeforeBindBufferRange,
@@ -176,6 +179,7 @@
     PrecisionSafeDivision,
     PreemptivelyStartProvokingVertexCommandBuffer,
     PreferAggregateBarrierCalls,
+    PreferCpuForBuffersubdata,
     PreferCPUForBufferSubData,
     PreferDeviceLocalMemoryHostVisible,
     PreferDrawClearOverVkCmdClearAttachments,
@@ -293,6 +297,7 @@
     UseInstancedPointSpriteEmulation,
     UseMultipleDescriptorsForExternalFormats,
     UseNonZeroStencilWriteMaskStaticState,
+    UseShadowBuffersWhenAppropriate,
     UseSystemMemoryForConstantBuffers,
     UseUnusedBlocksWithStandardOrSharedLayout,
     VertexIDDoesNotIncludeBaseVertex,