i965/blorp: Align vertex buffers to 64B

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Cc: "13.0 17.0" <mesa-stable@lists.freedesktop.org>
(cherry picked from commit f938354362655a378d474c5f79c52cea9852ab91)
[Emil Velikov: brw_state_batch has different signature]
Signed-off-by: Emil Velikov <emil.velikov@collabora.com>

Conflicts:
	src/mesa/drivers/dri/i965/genX_blorp_exec.c
diff --git a/src/mesa/drivers/dri/i965/genX_blorp_exec.c b/src/mesa/drivers/dri/i965/genX_blorp_exec.c
index 8e011e9..adcce92 100644
--- a/src/mesa/drivers/dri/i965/genX_blorp_exec.c
+++ b/src/mesa/drivers/dri/i965/genX_blorp_exec.c
@@ -130,9 +130,21 @@
    assert(batch->blorp->driver_ctx == batch->driver_batch);
    struct brw_context *brw = batch->driver_batch;
 
+   /* From the Skylake PRM, 3DSTATE_VERTEX_BUFFERS:
+    *
+    *    "The VF cache needs to be invalidated before binding and then using
+    *    Vertex Buffers that overlap with any previously bound Vertex Buffer
+    *    (at a 64B granularity) since the last invalidation.  A VF cache
+    *    invalidate is performed by setting the "VF Cache Invalidation Enable"
+    *    bit in PIPE_CONTROL."
+    *
+    * This restriction first appears in the Skylake PRM but the internal docs
+    * also list it as being an issue on Broadwell.  In order to avoid this
+    * problem, we align all vertex buffer allocations to 64 bytes.
+    */
    uint32_t offset;
    void *data = brw_state_batch(brw, AUB_TRACE_VERTEX_BUFFER,
-                                size, 32, &offset);
+                                size, 64, &offset);
 
    *addr = (struct blorp_address) {
       .buffer = brw->batch.bo,