v3dv/pipeline_cache: cache v3dv_shader_variants

This also includes being able to serialize them as part of
GetPipelineCacheData and to deserialize them as part of
CreatePipelineCache.

So now we can also upload the assembly of the variant as part of the
PipelineCache creation.
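
For reference, the per-variant layout written by
shader_variant_write_to_blob (and read back by
shader_variant_create_from_blob) is roughly:

   stage (u32) | is_coord (u8) | variant_sha1 (20 bytes) |
   prog_data_size (u32) | prog_data bytes |
   uniform list count (u32) | uniform contents | uniform data |
   qpu_insts_size (u32) | assembly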

Note that of all this, the tricky part was the prog_data
serialization. v3d_compile allocates and fills a new prog_data with
rzalloc, among other things because it also allocates the uniform
list internally. So we needed to replicate that when deserializing
the prog_data. Ideally we would like to avoid that and allocate as
many resources as possible using vk_alloc, but that would mean a
somewhat deep change to the v3d_compiler, which we want to avoid as
much as possible for now.
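
To illustrate, this is roughly what the deserialization ends up doing
(a sketch of shader_variant_create_from_blob from the patch below,
with error handling omitted; the *_data/*_size values are the ones
just read from the blob):

   /* Replicate the layout that v3d_compile produces: a prog_data
    * rzalloc-ed on its own context, with the uniform list allocated
    * as children of it.
    */
   struct v3d_prog_data *new_prog_data = rzalloc_size(NULL, prog_data_size);
   memcpy(new_prog_data, prog_data, prog_data_size);

   struct v3d_uniform_list *ulist = &new_prog_data->uniforms;
   ulist->count = ulist_count;
   ulist->contents = ralloc_array(new_prog_data, enum quniform_contents,
                                  ulist->count);
   memcpy(ulist->contents, contents_data, contents_size);
   ulist->data = ralloc_array(new_prog_data, uint32_t, ulist->count);
   memcpy(ulist->data, ulist_data_data, ulist_data_size);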

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6766>
diff --git a/src/broadcom/vulkan/v3dv_cmd_buffer.c b/src/broadcom/vulkan/v3dv_cmd_buffer.c
index fb0747f..5c2e83d 100644
--- a/src/broadcom/vulkan/v3dv_cmd_buffer.c
+++ b/src/broadcom/vulkan/v3dv_cmd_buffer.c
@@ -2730,7 +2730,7 @@
                                VK_PIPELINE_BIND_POINT_GRAPHICS);
 
    VkResult vk_result;
-   variant = v3dv_get_shader_variant(p_stage, &local_key.base,
+   variant = v3dv_get_shader_variant(p_stage, NULL, &local_key.base,
                                      sizeof(struct v3d_fs_key),
                                      &cmd_buffer->device->alloc,
                                      &vk_result);
@@ -2761,7 +2761,7 @@
    cmd_buffer_populate_v3d_key(&local_key.base, cmd_buffer,
                                VK_PIPELINE_BIND_POINT_GRAPHICS);
 
-   variant = v3dv_get_shader_variant(p_stage, &local_key.base,
+   variant = v3dv_get_shader_variant(p_stage, NULL, &local_key.base,
                                      sizeof(struct v3d_vs_key),
                                      &cmd_buffer->device->alloc,
                                      &vk_result);
@@ -2782,7 +2782,7 @@
 
    cmd_buffer_populate_v3d_key(&local_key.base, cmd_buffer,
                                VK_PIPELINE_BIND_POINT_GRAPHICS);
-   variant = v3dv_get_shader_variant(p_stage, &local_key.base,
+   variant = v3dv_get_shader_variant(p_stage, NULL, &local_key.base,
                                      sizeof(struct v3d_vs_key),
                                      &cmd_buffer->device->alloc,
                                      &vk_result);
@@ -2813,7 +2813,7 @@
                                VK_PIPELINE_BIND_POINT_COMPUTE);
 
    VkResult result;
-   variant = v3dv_get_shader_variant(p_stage, &local_key,
+   variant = v3dv_get_shader_variant(p_stage, NULL, &local_key,
                                      sizeof(struct v3d_key),
                                      &cmd_buffer->device->alloc,
                                      &result);
diff --git a/src/broadcom/vulkan/v3dv_pipeline.c b/src/broadcom/vulkan/v3dv_pipeline.c
index 9e77656..0d4dacf 100644
--- a/src/broadcom/vulkan/v3dv_pipeline.c
+++ b/src/broadcom/vulkan/v3dv_pipeline.c
@@ -1288,18 +1288,22 @@
 }
 
 /* FIXME: right now this just asks for an bo for the exact size of the qpu
- * assembly. It would be good to be slighly smarter and having one "all
- * shaders" bo per pipeline, so each p_stage/variant would save their offset
- * on such. That is really relevant due the fact that bo are always aligned to
- * 4096, so that would allow to use less memory.
+ * assembly. It would be good to be able to reuse bos to avoid bo
+ * fragmentation. This could be tricky though, as right now we upload the
+ * assembly from two paths: when compiling a shader, and when deserializing
+ * from the pipeline cache. This also means that the same variant can be
+ * shared by different objects. So with the current approach it is clear who
+ * owns the assembly bo, but if it were shared, who would own the shared bo?
  *
  * For now one-bo per-assembly would work.
  *
  * Returns false if it was not able to allocate or map the assembly bo memory.
  */
 static bool
-upload_assembly(struct v3dv_pipeline_stage *p_stage,
+upload_assembly(struct v3dv_device *device,
                 struct v3dv_shader_variant *variant,
+                gl_shader_stage stage,
+                bool is_coord,
                 const void *data,
                 uint32_t size)
 {
@@ -1308,11 +1312,10 @@
     * have any bo
     */
    assert(variant->assembly_bo == NULL);
-   struct v3dv_device *device = p_stage->pipeline->device;
 
-   switch (p_stage->stage) {
+   switch (stage) {
    case MESA_SHADER_VERTEX:
-      name = (p_stage->is_coord == true) ? "coord_shader_assembly" :
+      name = is_coord ? "coord_shader_assembly" :
          "vertex_shader_assembly";
       break;
    case MESA_SHADER_FRAGMENT:
@@ -1340,92 +1343,30 @@
 
    memcpy(bo->map, data, size);
 
-   v3dv_bo_unmap(device, bo);
-
+   /* We don't unmap the assembly bo, as we will use the mapping to gather
+    * the assembly when serializing the variant.
+    */
    variant->assembly_bo = bo;
 
    return true;
 }
 
-/* For a given key, it returns the compiled version of the shader. If it was
- * already compiled, it gets it from the p_stage cache, if not it compiles is
- * through the v3d compiler
+/*
+ * Adds a shader variant to the pipeline shader variant cache, and updates
+ * the pipeline spill structures if needed.
  *
- * If the method returns NULL it means that it was not able to allocate the
- * resources for the variant. out_vk_result would return which OOM applies.
- *
- * Returns a new reference of the shader_variant to the caller.
+ * Assumes that the caller already checked that the variant is not in
+ * that cache.
  */
-struct v3dv_shader_variant*
-v3dv_get_shader_variant(struct v3dv_pipeline_stage *p_stage,
-                        struct v3d_key *key,
-                        size_t key_size,
-                        const VkAllocationCallbacks *pAllocator,
-                        VkResult *out_vk_result)
+static void
+pipeline_add_variant_to_cache(struct v3dv_pipeline_stage *p_stage,
+                              struct v3d_key *key,
+                              size_t key_size,
+                              struct v3dv_shader_variant *variant)
 {
    struct hash_table *ht = p_stage->cache;
-   struct hash_entry *entry = _mesa_hash_table_search(ht, key);
-
-   if (entry) {
-      *out_vk_result = VK_SUCCESS;
-      v3dv_shader_variant_ref(entry->data);
-      return entry->data;
-   }
-
    struct v3dv_pipeline *pipeline = p_stage->pipeline;
    struct v3dv_device *device = pipeline->device;
-   struct v3dv_shader_variant *variant =
-      vk_zalloc(&device->alloc, sizeof(*variant), 8,
-                VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
-
-   if (variant == NULL) {
-      *out_vk_result = VK_ERROR_OUT_OF_HOST_MEMORY;
-      return NULL;
-   }
-   variant->ref_cnt = 1;
-
-   struct v3dv_physical_device *physical_device =
-      &pipeline->device->instance->physicalDevice;
-   const struct v3d_compiler *compiler = physical_device->compiler;
-
-   uint32_t variant_id = p_atomic_inc_return(&p_stage->compiled_variant_count);
-
-   if (V3D_DEBUG & (V3D_DEBUG_NIR |
-                    v3d_debug_flag_for_shader_stage(p_stage->stage))) {
-      fprintf(stderr, "Just before v3d_compile: %s prog %d variant %d NIR:\n",
-              gl_shader_stage_name(p_stage->stage),
-              p_stage->program_id,
-              variant_id);
-      nir_print_shader(p_stage->nir, stderr);
-      fprintf(stderr, "\n");
-   }
-
-   uint64_t *qpu_insts;
-   uint32_t qpu_insts_size;
-
-   qpu_insts = v3d_compile(compiler,
-                           key, &variant->prog_data.base,
-                           p_stage->nir,
-                           shader_debug_output, NULL,
-                           p_stage->program_id,
-                           variant_id,
-                           &qpu_insts_size);
-
-   if (!qpu_insts) {
-      fprintf(stderr, "Failed to compile %s prog %d NIR to VIR\n",
-              gl_shader_stage_name(p_stage->stage),
-              p_stage->program_id);
-   } else {
-      if (!upload_assembly(p_stage, variant, qpu_insts, qpu_insts_size)) {
-         free(qpu_insts);
-         v3dv_shader_variant_unref(device, variant);
-
-         *out_vk_result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
-         return NULL;
-      }
-   }
-
-   free(qpu_insts);
 
    if (ht) {
       struct v3d_key *dup_key;
@@ -1450,8 +1391,183 @@
          v3dv_bo_alloc(device, total_spill_size, "spill", true);
       pipeline->spill.size_per_thread = variant->prog_data.base->spill_size;
    }
+}
+
+static void
+pipeline_hash_variant(const struct v3dv_pipeline_stage *p_stage,
+                      struct v3d_key *key,
+                      size_t key_size,
+                      unsigned char *sha1_out)
+{
+   struct mesa_sha1 ctx;
+   struct v3dv_pipeline *pipeline = p_stage->pipeline;
+   _mesa_sha1_init(&ctx);
+
+   if (p_stage->stage == MESA_SHADER_COMPUTE) {
+      _mesa_sha1_update(&ctx, p_stage->shader_sha1, sizeof(p_stage->shader_sha1));
+   } else {
+      /* We need to include both in the sha1 key, as one could affect the
+       * other during linking (like if the vertex outputs are constants,
+       * then the fragment shader would load_const instead of load_input).
+       * An alternative would be to use the serialized nir, but that seems
+       * like overkill.
+       */
+      _mesa_sha1_update(&ctx, pipeline->vs->shader_sha1,
+                        sizeof(pipeline->vs->shader_sha1));
+      _mesa_sha1_update(&ctx, pipeline->fs->shader_sha1,
+                        sizeof(pipeline->fs->shader_sha1));
+   }
+   _mesa_sha1_update(&ctx, key, key_size);
+
+   _mesa_sha1_final(&ctx, sha1_out);
+}
+
+/*
+ * Creates a new v3dv_shader_variant, taking ownership of the given
+ * prog_data (freshly allocated, as it is what the v3d compiler returns).
+ *
+ * Creation includes allocating an assembly bo, and filling it up.
+ */
+struct v3dv_shader_variant *
+v3dv_shader_variant_create(struct v3dv_device *device,
+                           gl_shader_stage stage,
+                           bool is_coord,
+                           const unsigned char *variant_sha1,
+                           struct v3d_prog_data *prog_data,
+                           uint32_t prog_data_size,
+                           const uint64_t *qpu_insts,
+                           uint32_t qpu_insts_size,
+                           VkResult *out_vk_result)
+{
+   struct v3dv_shader_variant *variant =
+      vk_zalloc(&device->alloc, sizeof(*variant), 8,
+                VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+
+   if (variant == NULL) {
+      *out_vk_result = VK_ERROR_OUT_OF_HOST_MEMORY;
+      return NULL;
+   }
+
+   variant->ref_cnt = 1;
+   variant->stage = stage;
+   variant->is_coord = is_coord;
+   memcpy(variant->variant_sha1, variant_sha1, sizeof(variant->variant_sha1));
+   variant->prog_data_size = prog_data_size;
+   variant->prog_data.base = prog_data;
+
+   if (qpu_insts) {
+      if (!upload_assembly(device, variant, stage, is_coord,
+                           qpu_insts, qpu_insts_size)) {
+         ralloc_free(variant->prog_data.base);
+         vk_free(&device->alloc, variant);
+
+         *out_vk_result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
+         return NULL;
+      }
+      variant->qpu_insts_size = qpu_insts_size;
+   }
 
    *out_vk_result = VK_SUCCESS;
+
+   return variant;
+}
+
+/* For a given key, it returns the compiled version of the shader. If it was
+ * already compiled, it gets it from the p_stage cache; if not, it compiles
+ * it through the v3d compiler.
+ *
+ * If the method returns NULL it means that it was not able to allocate the
+ * resources for the variant. out_vk_result returns which OOM applies.
+ *
+ * Returns a new reference of the shader_variant to the caller.
+ */
+struct v3dv_shader_variant*
+v3dv_get_shader_variant(struct v3dv_pipeline_stage *p_stage,
+                        struct v3dv_pipeline_cache *cache,
+                        struct v3d_key *key,
+                        size_t key_size,
+                        const VkAllocationCallbacks *pAllocator,
+                        VkResult *out_vk_result)
+{
+   /* We first try to get the variant from the internal p_stage variant
+    * cache
+    */
+   struct hash_table *ht = p_stage->cache;
+   struct hash_entry *entry = _mesa_hash_table_search(ht, key);
+
+   if (entry) {
+      *out_vk_result = VK_SUCCESS;
+      v3dv_shader_variant_ref(entry->data);
+      return entry->data;
+   }
+
+   /* Now we search in the pipeline cache, if available */
+   struct v3dv_pipeline *pipeline = p_stage->pipeline;
+   unsigned char variant_sha1[20];
+   pipeline_hash_variant(p_stage, key, key_size, variant_sha1);
+
+   struct v3dv_shader_variant *variant =
+      v3dv_pipeline_cache_search_for_variant(pipeline,
+                                             cache,
+                                             variant_sha1);
+
+   if (variant) {
+      pipeline_add_variant_to_cache(p_stage, key, key_size, variant);
+      *out_vk_result = VK_SUCCESS;
+      return variant;
+   }
+
+   /* If we don't find the variant in any cache, we compile one and add it
+    * to the caches
+    */
+   struct v3dv_device *device = pipeline->device;
+   struct v3dv_physical_device *physical_device =
+      &pipeline->device->instance->physicalDevice;
+   const struct v3d_compiler *compiler = physical_device->compiler;
+
+   uint32_t variant_id = p_atomic_inc_return(&p_stage->compiled_variant_count);
+
+   if (V3D_DEBUG & (V3D_DEBUG_NIR |
+                    v3d_debug_flag_for_shader_stage(p_stage->stage))) {
+      fprintf(stderr, "Just before v3d_compile: %s prog %d variant %d NIR:\n",
+              gl_shader_stage_name(p_stage->stage),
+              p_stage->program_id,
+              variant_id);
+      nir_print_shader(p_stage->nir, stderr);
+      fprintf(stderr, "\n");
+   }
+
+   uint64_t *qpu_insts;
+   uint32_t qpu_insts_size;
+   struct v3d_prog_data *prog_data;
+
+   qpu_insts = v3d_compile(compiler,
+                           key, &prog_data,
+                           p_stage->nir,
+                           shader_debug_output, NULL,
+                           p_stage->program_id,
+                           variant_id,
+                           &qpu_insts_size);
+
+   if (!qpu_insts) {
+      fprintf(stderr, "Failed to compile %s prog %d NIR to VIR\n",
+              gl_shader_stage_name(p_stage->stage),
+              p_stage->program_id);
+   }
+
+   variant = v3dv_shader_variant_create(device, p_stage->stage, p_stage->is_coord,
+                                        variant_sha1,
+                                        prog_data, v3d_prog_data_size(p_stage->stage),
+                                        qpu_insts, qpu_insts_size,
+                                        out_vk_result);
+   if (qpu_insts)
+      free(qpu_insts);
+
+   if (*out_vk_result == VK_SUCCESS) {
+      pipeline_add_variant_to_cache(p_stage, key, key_size, variant);
+      v3dv_pipeline_cache_upload_variant(pipeline, cache, variant);
+   }
+
    return variant;
 }
 
@@ -1731,6 +1848,12 @@
       next_stage = stages[stage];
    }
 
+   /* Assign p_stage to the pipeline. We need to do this before we start
+    * compiling, because the p_stage sha1 is computed using all the stages.
+    */
+   pipeline->vs = stages[MESA_SHADER_VERTEX];
+   pipeline->fs = stages[MESA_SHADER_FRAGMENT];
+
    /* Compiling to vir. Note that at this point we are compiling a default
     * variant. Binding to textures, and other stuff (that would need a
     * cmd_buffer) would need a recompile
@@ -1757,7 +1880,6 @@
           */
          lower_vs_io(p_stage->nir);
 
-         pipeline->vs = p_stage;
          pipeline->vs_bin = pipeline_stage_create_vs_bin(pipeline->vs, pAllocator);
 
          /* FIXME: likely this to be moved to a gather info method to a full
@@ -1776,7 +1898,7 @@
          pipeline_populate_v3d_vs_key(key, pCreateInfo, pipeline->vs);
          VkResult vk_result;
          pipeline->vs->current_variant =
-            v3dv_get_shader_variant(pipeline->vs, &key->base, sizeof(*key),
+            v3dv_get_shader_variant(pipeline->vs, cache, &key->base, sizeof(*key),
                                     pAllocator, &vk_result);
          if (vk_result != VK_SUCCESS)
             return vk_result;
@@ -1784,7 +1906,7 @@
          key = &pipeline->vs_bin->key.vs;
          pipeline_populate_v3d_vs_key(key, pCreateInfo, pipeline->vs_bin);
          pipeline->vs_bin->current_variant =
-            v3dv_get_shader_variant(pipeline->vs_bin, &key->base, sizeof(*key),
+            v3dv_get_shader_variant(pipeline->vs_bin, cache, &key->base, sizeof(*key),
                                     pAllocator, &vk_result);
          if (vk_result != VK_SUCCESS)
             return vk_result;
@@ -1794,8 +1916,6 @@
       case MESA_SHADER_FRAGMENT: {
          struct v3d_fs_key *key = &p_stage->key.fs;
 
-         pipeline->fs = p_stage;
-
          pipeline_populate_v3d_fs_key(key, pCreateInfo, p_stage,
                                       get_ucp_enable_mask(stages));
 
@@ -1803,7 +1923,7 @@
 
          VkResult vk_result;
          p_stage->current_variant =
-            v3dv_get_shader_variant(p_stage, &key->base, sizeof(*key),
+            v3dv_get_shader_variant(p_stage, cache, &key->base, sizeof(*key),
                                     pAllocator, &vk_result);
          if (vk_result != VK_SUCCESS)
             return vk_result;
@@ -2821,7 +2941,7 @@
 
     VkResult result;
     p_stage->current_variant =
-      v3dv_get_shader_variant(p_stage, key, sizeof(*key), alloc, &result);
+      v3dv_get_shader_variant(p_stage, cache, key, sizeof(*key), alloc, &result);
    return result;
 }
 
diff --git a/src/broadcom/vulkan/v3dv_pipeline_cache.c b/src/broadcom/vulkan/v3dv_pipeline_cache.c
index d0874d1..7d290a5 100644
--- a/src/broadcom/vulkan/v3dv_pipeline_cache.c
+++ b/src/broadcom/vulkan/v3dv_pipeline_cache.c
@@ -56,6 +56,10 @@
    fprintf(stderr, "  NIR cache entries:      %d\n", cache->nir_stats.count);
    fprintf(stderr, "  NIR cache miss count:   %d\n", cache->nir_stats.miss);
    fprintf(stderr, "  NIR cache hit  count:   %d\n", cache->nir_stats.hit);
+
+   fprintf(stderr, "  variant cache entries:      %d\n", cache->variant_stats.count);
+   fprintf(stderr, "  variant cache miss count:   %d\n", cache->variant_stats.miss);
+   fprintf(stderr, "  variant cache hit  count:   %d\n", cache->variant_stats.hit);
 }
 
 void
@@ -186,12 +190,154 @@
       cache->nir_stats.miss = 0;
       cache->nir_stats.hit = 0;
       cache->nir_stats.count = 0;
+
+      cache->variant_cache = _mesa_hash_table_create(NULL, sha1_hash_func,
+                                                     sha1_compare_func);
+      cache->variant_stats.miss = 0;
+      cache->variant_stats.hit = 0;
+      cache->variant_stats.count = 0;
    } else {
       cache->nir_cache = NULL;
+      cache->variant_cache = NULL;
    }
 
 }
 
+struct v3dv_shader_variant *
+v3dv_pipeline_cache_search_for_variant(struct v3dv_pipeline *pipeline,
+                                       struct v3dv_pipeline_cache *cache,
+                                       unsigned char sha1_key[20])
+{
+   if (!cache || !cache->variant_cache)
+      return NULL;
+
+   if (unlikely(dump_stats)) {
+      char sha1buf[41];
+      _mesa_sha1_format(sha1buf, sha1_key);
+
+      fprintf(stderr, "pipeline cache %p, search variant with key %s\n", cache, sha1buf);
+   }
+
+   pthread_mutex_lock(&cache->mutex);
+
+   struct hash_entry *entry =
+      _mesa_hash_table_search(cache->variant_cache, sha1_key);
+
+   if (entry) {
+      struct v3dv_shader_variant *variant =
+         (struct v3dv_shader_variant *) entry->data;
+
+      if (unlikely(dump_stats)) {
+         fprintf(stderr, "\tcache hit: %p\n", variant);
+         cache->variant_stats.hit++;
+         cache_dump_stats(cache);
+      }
+
+      if (variant)
+         v3dv_shader_variant_ref(variant);
+
+      pthread_mutex_unlock(&cache->mutex);
+      return variant;
+   }
+
+   if (unlikely(dump_stats)) {
+      fprintf(stderr, "\tcache miss\n");
+      cache->variant_stats.miss++;
+      cache_dump_stats(cache);
+   }
+
+   pthread_mutex_unlock(&cache->mutex);
+   return NULL;
+}
+
+void
+v3dv_pipeline_cache_upload_variant(struct v3dv_pipeline *pipeline,
+                                   struct v3dv_pipeline_cache *cache,
+                                   struct v3dv_shader_variant *variant)
+{
+   if (!cache || !cache->variant_cache)
+      return;
+
+   pthread_mutex_lock(&cache->mutex);
+   struct hash_entry *entry =
+      _mesa_hash_table_search(cache->variant_cache, variant->variant_sha1);
+
+   if (entry) {
+      pthread_mutex_unlock(&cache->mutex);
+      return;
+   }
+
+   v3dv_shader_variant_ref(variant);
+   _mesa_hash_table_insert(cache->variant_cache, variant->variant_sha1, variant);
+   if (unlikely(dump_stats)) {
+      char sha1buf[41];
+      _mesa_sha1_format(sha1buf, variant->variant_sha1);
+
+      fprintf(stderr, "pipeline cache %p, new variant entry with key %s\n\t%p\n",
+              cache, sha1buf, variant);
+      cache->variant_stats.count++;
+      cache_dump_stats(cache);
+   }
+
+   pthread_mutex_unlock(&cache->mutex);
+}
+
+static struct v3dv_shader_variant *
+shader_variant_create_from_blob(struct v3dv_device *device,
+                                struct blob_reader *blob)
+{
+   VkResult result;
+
+   gl_shader_stage stage = blob_read_uint32(blob);
+   bool is_coord = blob_read_uint8(blob);
+
+   const unsigned char *variant_sha1 = blob_read_bytes(blob, 20);
+
+   uint32_t prog_data_size = blob_read_uint32(blob);
+   /* FIXME: as we include the stage perhaps we can avoid prog_data_size? */
+   assert(prog_data_size == v3d_prog_data_size(stage));
+
+   const void *prog_data = blob_read_bytes(blob, prog_data_size);
+   if (blob->overrun)
+      return NULL;
+
+   uint32_t ulist_count = blob_read_uint32(blob);
+   uint32_t contents_size = sizeof(enum quniform_contents) * ulist_count;
+   const void *contents_data = blob_read_bytes(blob, contents_size);
+   if (blob->overrun)
+      return NULL;
+
+   uint32_t ulist_data_size = sizeof(uint32_t) * ulist_count;
+   const void *ulist_data_data = blob_read_bytes(blob, ulist_data_size);
+   if (blob->overrun)
+      return NULL;
+
+   uint32_t qpu_insts_size = blob_read_uint32(blob);
+   const uint64_t *qpu_insts = blob_read_bytes(blob, qpu_insts_size);
+   if (blob->overrun)
+      return NULL;
+
+   /* v3dv_shader_variant_create expects a newly created prog_data that it
+    * can own, as that is what the v3d compiler returns. So we also allocate
+    * one (including the uniform list) and fill it up with the data that we
+    * read from the blob.
+    */
+   struct v3d_prog_data *new_prog_data = rzalloc_size(NULL, prog_data_size);
+   memcpy(new_prog_data, prog_data, prog_data_size);
+   struct v3d_uniform_list *ulist = &new_prog_data->uniforms;
+   ulist->count = ulist_count;
+   ulist->contents = ralloc_array(new_prog_data, enum quniform_contents, ulist->count);
+   memcpy(ulist->contents, contents_data, contents_size);
+   ulist->data = ralloc_array(new_prog_data, uint32_t, ulist->count);
+   memcpy(ulist->data, ulist_data_data, ulist_data_size);
+
+   return v3dv_shader_variant_create(device, stage, is_coord,
+                                     variant_sha1,
+                                     new_prog_data, prog_data_size,
+                                     qpu_insts, qpu_insts_size,
+                                     &result);
+}
+
 static void
 pipeline_cache_load(struct v3dv_pipeline_cache *cache,
                     size_t size,
@@ -201,6 +347,21 @@
    struct v3dv_physical_device *pdevice = &device->instance->physicalDevice;
    struct vk_pipeline_cache_header header;
 
+   if (cache->variant_cache == NULL)
+      return;
+
+   struct blob_reader blob;
+   blob_reader_init(&blob, data, size);
+
+   blob_copy_bytes(&blob, &header, sizeof(header));
+   uint32_t count = blob_read_uint32(&blob);
+   if (blob.overrun)
+      return;
+
+   if (unlikely(dump_stats)) {
+      fprintf(stderr, "pipeline cache %p, loading %i variant entries\n", cache, count);
+   }
+
    if (size < sizeof(header))
       return;
    memcpy(&header, data, sizeof(header));
@@ -215,9 +376,15 @@
    if (memcmp(header.uuid, pdevice->pipeline_cache_uuid, VK_UUID_SIZE) != 0)
       return;
 
-   /* FIXME: at this point we only verify the header but we dont really load
-    * any data. pending to implement serialize/deserialize among other things.
-    */
+   for (uint32_t i = 0; i < count; i++) {
+      struct v3dv_shader_variant *variant =
+         shader_variant_create_from_blob(device, &blob);
+      if (!variant)
+         break;
+      _mesa_hash_table_insert(cache->variant_cache, variant->variant_sha1, variant);
+      if (unlikely(dump_stats))
+         cache->variant_stats.count++;
+   }
 }
 
 VkResult
@@ -271,6 +439,14 @@
          ralloc_free(entry->data);
 
       _mesa_hash_table_destroy(cache->nir_cache, NULL);
+
+      hash_table_foreach(cache->variant_cache, entry) {
+         struct v3dv_shader_variant *variant = entry->data;
+         if (variant)
+            v3dv_shader_variant_unref(device, variant);
+      }
+
+      _mesa_hash_table_destroy(cache->variant_cache, NULL);
    }
 
    vk_free2(&device->alloc, pAllocator, cache);
@@ -288,6 +465,30 @@
    return VK_SUCCESS;
 }
 
+static bool
+shader_variant_write_to_blob(const struct v3dv_shader_variant *variant,
+                             struct blob *blob)
+{
+   blob_write_uint32(blob, variant->stage);
+   blob_write_uint8(blob, variant->is_coord);
+
+   blob_write_bytes(blob, variant->variant_sha1, sizeof(variant->variant_sha1));
+
+   blob_write_uint32(blob, variant->prog_data_size);
+   blob_write_bytes(blob, variant->prog_data.base, variant->prog_data_size);
+
+   struct v3d_uniform_list *ulist = &variant->prog_data.base->uniforms;
+   blob_write_uint32(blob, ulist->count);
+   blob_write_bytes(blob, ulist->contents, sizeof(enum quniform_contents) * ulist->count);
+   blob_write_bytes(blob, ulist->data, sizeof(uint32_t) * ulist->count);
+
+   blob_write_uint32(blob, variant->qpu_insts_size);
+   assert(variant->assembly_bo->map);
+   blob_write_bytes(blob, variant->assembly_bo->map, variant->qpu_insts_size);
+
+   return !blob->out_of_memory;
+}
+
 VkResult
 v3dv_GetPipelineCacheData(VkDevice _device,
                           VkPipelineCache _cache,
@@ -296,32 +497,68 @@
 {
    V3DV_FROM_HANDLE(v3dv_device, device, _device);
    V3DV_FROM_HANDLE(v3dv_pipeline_cache, cache, _cache);
+
+   struct blob blob;
+   if (pData) {
+      blob_init_fixed(&blob, pData, *pDataSize);
+   } else {
+      blob_init_fixed(&blob, NULL, SIZE_MAX);
+   }
+
    struct v3dv_physical_device *pdevice = &device->instance->physicalDevice;
-   struct vk_pipeline_cache_header *header;
    VkResult result = VK_SUCCESS;
 
    pthread_mutex_lock(&cache->mutex);
 
-   /* FIXME: at this point the cache data is just the header */
-   const size_t size = sizeof(*header);
-   if (pData == NULL) {
-      pthread_mutex_unlock(&cache->mutex);
-      *pDataSize = size;
-      return VK_SUCCESS;
-   }
-   if (*pDataSize < sizeof(*header)) {
-      pthread_mutex_unlock(&cache->mutex);
+   struct vk_pipeline_cache_header header = {
+      .header_size = sizeof(struct vk_pipeline_cache_header),
+      .header_version = VK_PIPELINE_CACHE_HEADER_VERSION_ONE,
+      .vendor_id = v3dv_physical_device_vendor_id(pdevice),
+      .device_id = v3dv_physical_device_device_id(pdevice),
+   };
+   memcpy(header.uuid, pdevice->pipeline_cache_uuid, VK_UUID_SIZE);
+   blob_write_bytes(&blob, &header, sizeof(header));
+
+   uint32_t count = 0;
+   intptr_t count_offset = blob_reserve_uint32(&blob);
+   if (count_offset < 0) {
       *pDataSize = 0;
+      blob_finish(&blob);
+      pthread_mutex_unlock(&cache->mutex);
       return VK_INCOMPLETE;
    }
 
-   header = pData;
-   header->header_size = sizeof(*header);
-   header->header_version = VK_PIPELINE_CACHE_HEADER_VERSION_ONE;
-   header->vendor_id = v3dv_physical_device_vendor_id(pdevice);
-   header->device_id = v3dv_physical_device_device_id(pdevice);
-   memcpy(header->uuid, pdevice->pipeline_cache_uuid, VK_UUID_SIZE);
+   if (cache->variant_cache) {
+      hash_table_foreach(cache->variant_cache, entry) {
+         struct v3dv_shader_variant *variant = entry->data;
+
+         size_t save_size = blob.size;
+         if (!shader_variant_write_to_blob(variant, &blob)) {
+            /* If it fails, reset to the previous size and bail; the
+             * mutex is released by the common unlock at the end. */
+            blob.size = save_size;
+            result = VK_INCOMPLETE;
+            break;
+         }
+
+         count++;
+      }
+   }
+
+   blob_overwrite_uint32(&blob, count_offset, count);
+
+   *pDataSize = blob.size;
+
+   blob_finish(&blob);
+
+   if (unlikely(dump_stats)) {
+      assert(count <= cache->variant_stats.count);
+      fprintf(stderr, "GetPipelineCacheData: serializing cache %p, "
+              "%i variant entries, %u DataSize\n",
+              cache, count, (uint32_t) *pDataSize);
+   }
 
    pthread_mutex_unlock(&cache->mutex);
+
    return result;
 }
diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h
index a58063e..115840a 100644
--- a/src/broadcom/vulkan/v3dv_private.h
+++ b/src/broadcom/vulkan/v3dv_private.h
@@ -278,6 +278,9 @@
 
    struct hash_table *nir_cache;
    struct v3dv_pipeline_cache_stats nir_stats;
+
+   struct hash_table *variant_cache;
+   struct v3dv_pipeline_cache_stats variant_stats;
 };
 
 struct v3dv_device {
@@ -1221,6 +1224,14 @@
 struct v3dv_shader_variant {
    uint32_t ref_cnt;
 
+   gl_shader_stage stage;
+   bool is_coord;
+
+   /* Key for the pipeline cache: a sha1 of the p_stage shader_sha1(s)
+    * combined with the v3d_key
+    */
+   unsigned char variant_sha1[20];
+
    union {
       struct v3d_prog_data *base;
       struct v3d_vs_prog_data *vs;
@@ -1228,11 +1239,16 @@
       struct v3d_compute_prog_data *cs;
    } prog_data;
 
+   /* We explicitly save the prog_data_size as it makes it easier to
+    * serialize the variant
+    */
+   uint32_t prog_data_size;
    /* FIXME: using one bo per shader. Eventually we would be interested on
     * reusing the same bo for all the shaders, like a bo per v3dv_pipeline for
     * shaders.
     */
    struct v3dv_bo *assembly_bo;
+   uint32_t qpu_insts_size;
 };
 
 /*
@@ -1278,11 +1294,13 @@
       struct v3d_fs_key fs;
    } key;
 
-   /* Cache with all the shader variant.
+   /* Cache with all the shader variants built for this pipeline. This one is
+    * required in addition to the pipeline cache because we still allow
+    * creating new shader variants after pipeline creation.
     */
    struct hash_table *cache;
 
   struct v3dv_shader_variant *current_variant;
 
    /* FIXME: only make sense on vs, so perhaps a v3dv key like radv? or a kind
     * of pipe_draw_info
@@ -1712,11 +1730,23 @@
 
 struct v3dv_shader_variant *
 v3dv_get_shader_variant(struct v3dv_pipeline_stage *p_stage,
+                        struct v3dv_pipeline_cache *cache,
                         struct v3d_key *key,
                         size_t key_size,
                         const VkAllocationCallbacks *pAllocator,
                         VkResult *out_vk_result);
 
+struct v3dv_shader_variant *
+v3dv_shader_variant_create(struct v3dv_device *device,
+                           gl_shader_stage stage,
+                           bool is_coord,
+                           const unsigned char *variant_sha1,
+                           struct v3d_prog_data *prog_data,
+                           uint32_t prog_data_size,
+                           const uint64_t *qpu_insts,
+                           uint32_t qpu_insts_size,
+                           VkResult *out_vk_result);
+
 void
 v3dv_shader_variant_destroy(struct v3dv_device *device,
                             struct v3dv_shader_variant *variant);
@@ -1786,6 +1816,15 @@
                                                const nir_shader_compiler_options *nir_options,
                                                unsigned char sha1_key[20]);
 
+struct v3dv_shader_variant *
+v3dv_pipeline_cache_search_for_variant(struct v3dv_pipeline *pipeline,
+                                       struct v3dv_pipeline_cache *cache,
+                                       unsigned char sha1_key[20]);
+
+void
+v3dv_pipeline_cache_upload_variant(struct v3dv_pipeline *pipeline,
+                                   struct v3dv_pipeline_cache *cache,
+                                   struct v3dv_shader_variant *variant);
 
 #define V3DV_DEFINE_HANDLE_CASTS(__v3dv_type, __VkType)   \
                                                         \