Don't glMapBufferRange on the host if possible

It is expensive to

- let glMapBufferRange go through to the host at all
- read from the GPU mapped pointer when on the host

The second item will introduce a stall similar to
glClientWaitSync/glFinish on the host GPU.

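Thus, track when a buffer is possibly dirty (i.e. may have been written
on the host by a draw, glReadPixels into a pixel pack buffer, or a
compute dispatch) and only read it back from the host during those
times. At other times, mapped buffers that are only written to can be
implemented in a completely feed-forward way.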

Change-Id: Ife9393e4eb5238411e6eae0abb49272523a8d935
diff --git a/shared/OpenglCodecCommon/GLClientState.cpp b/shared/OpenglCodecCommon/GLClientState.cpp
index a31f697..f61f133 100644
--- a/shared/OpenglCodecCommon/GLClientState.cpp
+++ b/shared/OpenglCodecCommon/GLClientState.cpp
@@ -421,6 +421,33 @@
     }
 }
 
+#ifdef GFXSTREAM
+
+void GLClientState::addBuffer(GLuint id) {
+    mBufferIds.add(id);
+    mBufferIds.set(id, true);
+    mHostMappedBufferDirty.add(id);
+}
+
+void GLClientState::removeBuffer(GLuint id) {
+    mHostMappedBufferDirty.remove(id);
+    mBufferIds.remove(id);
+}
+
+bool GLClientState::bufferIdExists(GLuint id) const {
+    return mBufferIds.get(id);
+}
+
+void GLClientState::setBufferHostMapDirty(GLuint id, bool dirty) {
+    mHostMappedBufferDirty.set(id, dirty);
+}
+
+bool GLClientState::isBufferHostMapDirty(GLuint id) const {
+    return mHostMappedBufferDirty.get(id);
+}
+
+#else // GFXSTREAM
+
 void GLClientState::addBuffer(GLuint id) {
     mBufferIds.insert(id);
 }
@@ -433,6 +460,66 @@
     return mBufferIds.find(id) != mBufferIds.end();
 }
 
+void GLClientState::setBufferHostMapDirty(GLuint id, bool dirty) {
+    (void)id;     // Non-GFXSTREAM build: per-buffer dirty state is not tracked,
+    (void)dirty;  // so this member is intentionally a no-op.
+}
+
+bool GLClientState::isBufferHostMapDirty(GLuint id) const {
+    (void)id;     // Non-GFXSTREAM build: per-buffer dirty state is not tracked.
+    return true;  // Always report dirty so shouldSkipHostMapBuffer() never skips the host map.
+}
+
+#endif // !GFXSTREAM
+
+void GLClientState::setBoundPixelPackBufferDirtyForHostMap() {
+    if (m_pixelPackBuffer)
+        setBufferHostMapDirty(m_pixelPackBuffer, true /* dirty */);
+}
+
+void GLClientState::setBoundTransformFeedbackBuffersDirtyForHostMap() {
+    if (m_transformFeedbackBuffer)
+        setBufferHostMapDirty(
+            m_transformFeedbackBuffer,
+            true /* dirty */);
+
+    for (size_t i = 0; i < m_indexedTransformFeedbackBuffers.size(); ++i)
+        if (m_indexedTransformFeedbackBuffers[i].buffer)
+            setBufferHostMapDirty(
+                m_indexedTransformFeedbackBuffers[i].buffer,
+                true /* dirty */);
+}
+
+void GLClientState::setBoundShaderStorageBuffersDirtyForHostMap() {
+    if (m_glesMajorVersion == 3 && m_glesMinorVersion == 0) return;
+
+    if (m_shaderStorageBuffer)
+        setBufferHostMapDirty(
+            m_shaderStorageBuffer,
+            true /* dirty */);
+
+    for (size_t i = 0; i < m_indexedShaderStorageBuffers.size(); ++i)
+        if (m_indexedShaderStorageBuffers[i].buffer)
+            setBufferHostMapDirty(
+                m_indexedShaderStorageBuffers[i].buffer,
+                true /* dirty */);
+}
+
+void GLClientState::setBoundAtomicCounterBuffersDirtyForHostMap() {
+    if (m_glesMajorVersion == 3 && m_glesMinorVersion == 0) return;
+
+    if (m_atomicCounterBuffer)
+        setBufferHostMapDirty(
+            m_atomicCounterBuffer,
+            true /* dirty */);
+
+    for (size_t i = 0; i < m_indexedAtomicCounterBuffers.size(); ++i)
+        if (m_indexedAtomicCounterBuffers[i].buffer)
+            setBufferHostMapDirty(
+                m_indexedAtomicCounterBuffers[i].buffer,
+                true /* dirty */);
+}
+
 void GLClientState::unBindBuffer(GLuint id) {
     if (m_arrayBuffer == id) {
         m_arrayBuffer = 0;
@@ -619,6 +706,31 @@
     }
 }
 
+void GLClientState::postDraw() {
+    setBoundTransformFeedbackBuffersDirtyForHostMap();
+    setBoundShaderStorageBuffersDirtyForHostMap();
+    setBoundAtomicCounterBuffersDirtyForHostMap();
+}
+
+void GLClientState::postReadPixels() {
+    setBoundPixelPackBufferDirtyForHostMap();
+}
+
+void GLClientState::postDispatchCompute() {
+    setBoundShaderStorageBuffersDirtyForHostMap();
+    setBoundAtomicCounterBuffersDirtyForHostMap();
+}
+
+bool GLClientState::shouldSkipHostMapBuffer(GLenum target) {
+    GLuint id = getBuffer(target);
+    return !isBufferHostMapDirty(id);
+}
+
+void GLClientState::onHostMappedBuffer(GLenum target) {
+    GLuint id = getBuffer(target);
+    setBufferHostMapDirty(id, false /* not dirty */);
+}
+
 int GLClientState::getBuffer(GLenum target) {
     int ret=0;
     switch (target) {
diff --git a/shared/OpenglCodecCommon/GLClientState.h b/shared/OpenglCodecCommon/GLClientState.h
index b7f5655..1137041 100644
--- a/shared/OpenglCodecCommon/GLClientState.h
+++ b/shared/OpenglCodecCommon/GLClientState.h
@@ -22,6 +22,10 @@
 #define GL_APIENTRYP
 #endif
 
+#ifdef GFXSTREAM
+#include "StateTrackingSupport.h"
+#endif
+
 #include "TextureSharedData.h"
 
 #include <GLES/gl.h>
@@ -240,12 +244,27 @@
     bool bufferIdExists(GLuint id) const;
     void unBindBuffer(GLuint id);
 
+    void setBufferHostMapDirty(GLuint id, bool dirty);
+    bool isBufferHostMapDirty(GLuint id) const;
+
+    void setBoundPixelPackBufferDirtyForHostMap();
+    void setBoundTransformFeedbackBuffersDirtyForHostMap();
+    void setBoundShaderStorageBuffersDirtyForHostMap();
+    void setBoundAtomicCounterBuffersDirtyForHostMap();
+
     int bindBuffer(GLenum target, GLuint id);
     void bindIndexedBuffer(GLenum target, GLuint index, GLuint buffer, GLintptr offset, GLsizeiptr size, GLintptr stride, GLintptr effectiveStride);
     int getMaxIndexedBufferBindings(GLenum target) const;
     bool isNonIndexedBindNoOp(GLenum target, GLuint buffer);
     bool isIndexedBindNoOp(GLenum target, GLuint index, GLuint buffer, GLintptr offset, GLsizeiptr size, GLintptr stride, GLintptr effectiveStride);
 
+    void postDraw();
+    void postReadPixels();
+    void postDispatchCompute();
+
+    bool shouldSkipHostMapBuffer(GLenum target);
+    void onHostMappedBuffer(GLenum target);
+
     int getBuffer(GLenum target);
     GLuint getLastEncodedBufferBind(GLenum target);
     void setLastEncodedBufferBind(GLenum target, GLuint id);
@@ -446,7 +465,12 @@
     bool m_initialized;
     PixelStoreState m_pixelStore;
 
+#ifdef GFXSTREAM
+    PredicateMap<false> mBufferIds;
+    PredicateMap<true> mHostMappedBufferDirty;
+#else
     std::set<GLuint> mBufferIds;
+#endif
 
     // GL_ARRAY_BUFFER_BINDING is separate from VAO state
     GLuint m_arrayBuffer;
diff --git a/system/GLESv2_enc/GL2Encoder.cpp b/system/GLESv2_enc/GL2Encoder.cpp
index bafd1b6..a770979 100755
--- a/system/GLESv2_enc/GL2Encoder.cpp
+++ b/system/GLESv2_enc/GL2Encoder.cpp
@@ -375,6 +375,9 @@
 
     OVERRIDE(glInvalidateFramebuffer);
     OVERRIDE(glInvalidateSubFramebuffer);
+
+    OVERRIDE(glDispatchCompute);
+    OVERRIDE(glDispatchComputeIndirect);
 }
 
 GL2Encoder::~GL2Encoder()
@@ -1338,6 +1341,8 @@
     } else {
         ctx->m_glDrawArrays_enc(ctx, mode, first, count);
     }
+
+    ctx->m_state->postDraw();
 }
 
 
@@ -1428,6 +1433,8 @@
             ALOGE("glDrawElements: direct index & direct buffer data - will be implemented in later versions;\n");
         }
     }
+
+    ctx->m_state->postDraw();
 }
 
 void GL2Encoder::s_glDrawArraysNullAEMU(void *self, GLenum mode, GLint first, GLsizei count)
@@ -1451,6 +1458,7 @@
         ctx->m_glDrawArraysNullAEMU_enc(ctx, mode, first, count);
     }
     ctx->flushDrawCall();
+    ctx->m_state->postDraw();
 }
 
 void GL2Encoder::s_glDrawElementsNullAEMU(void *self, GLenum mode, GLsizei count, GLenum type, const void *indices)
@@ -1544,6 +1552,7 @@
             ALOGE("glDrawElementsNullAEMU: direct index & direct buffer data - will be implemented in later versions;\n");
         }
     }
+    ctx->m_state->postDraw();
 }
 
 GLint * GL2Encoder::getCompressedTextureFormats()
@@ -2950,11 +2959,17 @@
         ((access & GL_MAP_WRITE_BIT) &&
         (!(access & GL_MAP_INVALIDATE_RANGE_BIT) &&
          !(access & GL_MAP_INVALIDATE_BUFFER_BIT)))) {
+
+        if (ctx->m_state->shouldSkipHostMapBuffer(target))
+            return bits;
+
         ctx->glMapBufferRangeAEMU(
                 ctx, target,
                 offset, length,
                 access,
                 bits);
+
+        ctx->m_state->onHostMappedBuffer(target);
     }
 
     return bits;
@@ -4146,6 +4161,7 @@
         ctx->m_glDrawArraysInstanced_enc(ctx, mode, first, count, primcount);
     }
     ctx->m_stream->flush();
+    ctx->m_state->postDraw();
 }
 
 void GL2Encoder::s_glDrawElementsInstanced(void* self, GLenum mode, GLsizei count, GLenum type, const void* indices, GLsizei primcount)
@@ -4237,6 +4253,7 @@
             ALOGE("glDrawElements: direct index & direct buffer data - will be implemented in later versions;\n");
         }
     }
+    ctx->m_state->postDraw();
 }
 
 void GL2Encoder::s_glDrawRangeElements(void* self, GLenum mode, GLuint start, GLuint end, GLsizei count, GLenum type, const void* indices)
@@ -4334,6 +4351,7 @@
             ALOGE("glDrawElements: direct index & direct buffer data - will be implemented in later versions;\n");
         }
     }
+    ctx->m_state->postDraw();
 }
 
 const GLubyte* GL2Encoder::s_glGetStringi(void* self, GLenum name, GLuint index) {
@@ -4436,6 +4454,7 @@
                 ctx, x, y, width, height,
                 format, type, pixels);
     }
+    ctx->m_state->postReadPixels();
 }
 
 // Track enabled state for some things like:
@@ -5362,6 +5381,7 @@
         // This is purely for debug/dev purposes.
         ctx->glDrawArraysIndirectDataAEMU(ctx, mode, indirect, indirectStructSize);
     }
+    ctx->m_state->postDraw();
 }
 
 void GL2Encoder::s_glDrawElementsIndirect(void* self, GLenum mode, GLenum type, const void* indirect) {
@@ -5391,7 +5411,7 @@
         // This is purely for debug/dev purposes.
         ctx->glDrawElementsIndirectDataAEMU(ctx, mode, type, indirect, indirectStructSize);
     }
-
+    ctx->m_state->postDraw();
 }
 
 void GL2Encoder::s_glTexStorage2DMultisample(void* self, GLenum target, GLsizei samples, GLenum internalformat, GLsizei width, GLsizei height, GLboolean fixedsamplelocations) {
@@ -5427,6 +5447,7 @@
     SET_ERROR_IF(bufSize < glesv2_enc::pixelDataSize(self, width, height, format,
         type, 1), GL_INVALID_OPERATION);
     s_glReadPixels(self, x, y, width, height, format, type, pixels);
+    ctx->m_state->postReadPixels();
 }
 
 void GL2Encoder::s_glGetnUniformfvEXT(void *self, GLuint program, GLint location,
@@ -5458,3 +5479,15 @@
     SET_ERROR_IF(height < 0, GL_INVALID_VALUE);
     ctx->m_glInvalidateSubFramebuffer_enc(ctx, target, numAttachments, attachments, x, y, width, height);
 }
+
+void GL2Encoder::s_glDispatchCompute(void* self, GLuint num_groups_x, GLuint num_groups_y, GLuint num_groups_z) {
+    GL2Encoder *ctx = (GL2Encoder*)self;
+    ctx->m_glDispatchCompute_enc(ctx, num_groups_x, num_groups_y, num_groups_z);
+    ctx->m_state->postDispatchCompute();
+}
+
+void GL2Encoder::s_glDispatchComputeIndirect(void* self, GLintptr indirect) {
+    GL2Encoder *ctx = (GL2Encoder*)self;
+    ctx->m_glDispatchComputeIndirect_enc(ctx, indirect);
+    ctx->m_state->postDispatchCompute();
+}
diff --git a/system/GLESv2_enc/GL2Encoder.h b/system/GLESv2_enc/GL2Encoder.h
index 0ceb9de..0b42cb7 100644
--- a/system/GLESv2_enc/GL2Encoder.h
+++ b/system/GLESv2_enc/GL2Encoder.h
@@ -769,6 +769,13 @@
     glInvalidateFramebuffer_client_proc_t m_glInvalidateFramebuffer_enc;
     glInvalidateSubFramebuffer_client_proc_t m_glInvalidateSubFramebuffer_enc;;
 
+    // Dispatch compute
+    static void s_glDispatchCompute(void* self, GLuint num_groups_x, GLuint num_groups_y, GLuint num_groups_z);
+    static void s_glDispatchComputeIndirect(void* self, GLintptr indirect);
+
+    glDispatchCompute_client_proc_t m_glDispatchCompute_enc;
+    glDispatchComputeIndirect_client_proc_t m_glDispatchComputeIndirect_enc;
+
 public:
     glEGLImageTargetTexture2DOES_client_proc_t m_glEGLImageTargetTexture2DOES_enc;