src/broadcom/vulkan/v3dv_pipeline_cache.c - platform/external/mesa3d - Git at Google

 /*
  * Copyright © 2019 Raspberry Pi
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
  * to deal in the Software without restriction, including without limitation
  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  * and/or sell copies of the Software, and to permit persons to whom the
  * Software is furnished to do so, subject to the following conditions:
  *
  * The above copyright notice and this permission notice (including the next
  * paragraph) shall be included in all copies or substantial portions of the
  * Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  * IN THE SOFTWARE.
  */

 #include "v3dv_private.h"
 #include "vulkan/util/vk_util.h"
 #include "util/blob.h"
 #include "nir/nir_serialize.h"

 /* Debug switch: when true, the cache functions in this file print trace
  * lines to stderr and update the hit/miss/count statistics.
  */
 static const bool dump_stats = false;
 /* When true, cache_dump_stats() prints the statistics counters; that
  * function is only called from code paths guarded by dump_stats.
  */
 static const bool dump_stats_verbose = false;

 /* Hash callback for the cache hash tables: hashes the 20 bytes of the SHA-1
  * key that `sha1` points to.
  */
 static uint32_t
 sha1_hash_func(const void *sha1)
 {
    enum { SHA1_KEY_BYTES = 20 };

    const uint32_t hash = _mesa_hash_data(sha1, SHA1_KEY_BYTES);
    return hash;
 }

 /* Key-equality callback for the cache hash tables: two keys match when
  * their 20 bytes are identical.
  */
 static bool
 sha1_compare_func(const void *sha1_a, const void *sha1_b)
 {
    const int diff = memcmp(sha1_a, sha1_b, 20);
    return !diff;
 }

 /* One NIR cache entry: the SHA-1 key, the byte size of the serialized
  * shader, and the serialized bytes themselves stored inline after the
  * header (entries are allocated as sizeof(struct) + size bytes).
  *
  * `data` is a C99 flexible array member rather than the GNU zero-length
  * array extension; sizeof(struct serialized_nir) is unaffected.
  */
 struct serialized_nir {
    unsigned char sha1_key[20];
    size_t size;
    char data[];
 };

 /* Prints the NIR and variant cache counters (entries, misses, hits) to
  * stderr.  Does nothing unless dump_stats_verbose is set.
  */
 static void
 cache_dump_stats(struct v3dv_pipeline_cache *cache)
 {
    if (dump_stats_verbose) {
       FILE *out = stderr;

       fprintf(out, "  NIR cache entries:      %d\n", cache->nir_stats.count);
       fprintf(out, "  NIR cache miss count:   %d\n", cache->nir_stats.miss);
       fprintf(out, "  NIR cache hit  count:   %d\n", cache->nir_stats.hit);

       fprintf(out, "  variant cache entries:      %d\n", cache->variant_stats.count);
       fprintf(out, "  variant cache miss count:   %d\n", cache->variant_stats.miss);
       fprintf(out, "  variant cache hit  count:   %d\n", cache->variant_stats.hit);
    }
 }

 /* Stores a serialized copy of `nir` in the NIR cache under `sha1_key`.
  *
  * Does nothing when `cache` is NULL, when the NIR cache is disabled, when
  * an entry with the same key already exists, or when serialization or the
  * entry allocation runs out of memory.  `pipeline` is not used.
  */
 void
 v3dv_pipeline_cache_upload_nir(struct v3dv_pipeline *pipeline,
                                struct v3dv_pipeline_cache *cache,
                                nir_shader *nir,
                                unsigned char sha1_key[20])
 {
    if (!cache || !cache->nir_cache)
       return;

    /* Early check so that an already cached shader is not serialized again. */
    pthread_mutex_lock(&cache->mutex);
    struct hash_entry *entry =
       _mesa_hash_table_search(cache->nir_cache, sha1_key);
    pthread_mutex_unlock(&cache->mutex);
    if (entry)
       return;

    /* Serialization happens outside of the lock. */
    struct blob blob;
    blob_init(&blob);

    nir_serialize(&blob, nir, false);
    if (blob.out_of_memory) {
       blob_finish(&blob);
       return;
    }

    struct serialized_nir *snir = NULL;

    pthread_mutex_lock(&cache->mutex);
    /* Because ralloc isn't thread-safe, we have to do all this inside the
     * lock.  We could unlock for the big memcpy but it's probably not worth
     * the hassle.
     */

    /* The lock was dropped while serializing, so look the key up again. */
    entry = _mesa_hash_table_search(cache->nir_cache, sha1_key);
    if (entry)
       goto out;

    snir = ralloc_size(cache->nir_cache, sizeof(*snir) + blob.size);
    if (!snir)
       goto out; /* Out of host memory: skip caching this shader. */

    memcpy(snir->sha1_key, sha1_key, 20);
    snir->size = blob.size;
    memcpy(snir->data, blob.data, blob.size);

    if (unlikely(dump_stats)) {
       char sha1buf[41];
       _mesa_sha1_format(sha1buf, snir->sha1_key);
       fprintf(stderr, "pipeline cache %p, new nir entry %s\n", cache, sha1buf);

       cache->nir_stats.count++;
       cache_dump_stats(cache);
    }

    /* The key stored in the table is the copy owned by the entry itself. */
    _mesa_hash_table_insert(cache->nir_cache, snir->sha1_key, snir);

 out:
    blob_finish(&blob);
    pthread_mutex_unlock(&cache->mutex);
 }

 nir_shader*
 v3dv_pipeline_cache_search_for_nir(struct v3dv_pipeline *pipeline,
                                    struct v3dv_pipeline_cache *cache,
                                    const nir_shader_compiler_options *nir_options,
                                    unsigned char sha1_key[20])
 {
    if (!cache || !cache->nir_cache)
       return NULL;

    if (unlikely(dump_stats)) {
       char sha1buf[41];
       _mesa_sha1_format(sha1buf, sha1_key);

       fprintf(stderr, "pipeline cache %p, search for nir %s\n", cache, sha1buf);
    }

    const struct serialized_nir *snir = NULL;

    pthread_mutex_lock(&cache->mutex);
    struct hash_entry *entry =
       _mesa_hash_table_search(cache->nir_cache, sha1_key);
    if (entry)
       snir = entry->data;
    pthread_mutex_unlock(&cache->mutex);

    if (snir) {
       struct blob_reader blob;
       blob_reader_init(&blob, snir->data, snir->size);

       /* We use context NULL as we want the p_stage to keep the reference to
        * nir, as we keep open the possibility of provide a shader variant
        * after cache creation
        */
       nir_shader *nir = nir_deserialize(NULL, nir_options, &blob);
       if (blob.overrun) {
          ralloc_free(nir);
       } else {
          if (unlikely(dump_stats)) {
             cache->nir_stats.hit++;
             cache_dump_stats(cache);
          }
          return nir;
       }
    }

    if (unlikely(dump_stats)) {
       cache->nir_stats.miss++;
       cache_dump_stats(cache);
    }

    return NULL;
 }

 /* Initializes a newly allocated pipeline cache object: loader magic, owning
  * device and mutex.  When `cache_enabled` is true the NIR and variant hash
  * tables are created and their statistics zeroed; otherwise both table
  * pointers are set to NULL.
  */
 static void
 pipeline_cache_init(struct v3dv_pipeline_cache *cache,
                     struct v3dv_device *device,
                     bool cache_enabled)
 {
    cache->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
    cache->device = device;
    pthread_mutex_init(&cache->mutex, NULL);

    if (!cache_enabled) {
       cache->nir_cache = NULL;
       cache->variant_cache = NULL;
       return;
    }

    /* Both tables are keyed by 20-byte SHA-1 digests. */
    cache->nir_cache =
       _mesa_hash_table_create(NULL, sha1_hash_func, sha1_compare_func);
    cache->nir_stats.miss = 0;
    cache->nir_stats.hit = 0;
    cache->nir_stats.count = 0;

    cache->variant_cache =
       _mesa_hash_table_create(NULL, sha1_hash_func, sha1_compare_func);
    cache->variant_stats.miss = 0;
    cache->variant_stats.hit = 0;
    cache->variant_stats.count = 0;
 }

 /* Looks up a compiled shader variant by `sha1_key` in the variant cache.
  *
  * On a hit with a non-NULL variant, a reference is taken on it before it is
  * returned.  Returns NULL when `cache` is NULL, the variant cache is
  * disabled, or the key is not present.  `pipeline` is not used.
  */
 struct v3dv_shader_variant*
 v3dv_pipeline_cache_search_for_variant(struct v3dv_pipeline *pipeline,
                                        struct v3dv_pipeline_cache *cache,
                                        unsigned char sha1_key[20])
 {
    /* Guard on the table that is actually searched below. */
    if (!cache || !cache->variant_cache)
       return NULL;

    if (unlikely(dump_stats)) {
       char sha1buf[41];
       _mesa_sha1_format(sha1buf, sha1_key);

       fprintf(stderr, "pipeline cache %p, search variant with key %s\n", cache, sha1buf);
    }

    pthread_mutex_lock(&cache->mutex);

    struct hash_entry *entry =
       _mesa_hash_table_search(cache->variant_cache, sha1_key);

    if (entry) {
       struct v3dv_shader_variant *variant =
          (struct v3dv_shader_variant *) entry->data;

       if (unlikely(dump_stats)) {
          fprintf(stderr, "\tcache hit: %p\n", variant);
          cache->variant_stats.hit++;
          cache_dump_stats(cache);
       }

       /* Take the reference while still holding the cache lock. */
       if (variant)
          v3dv_shader_variant_ref(variant);

       pthread_mutex_unlock(&cache->mutex);
       return variant;
    }

    if (unlikely(dump_stats)) {
       fprintf(stderr, "\tcache miss\n");
       cache->variant_stats.miss++;
       cache_dump_stats(cache);
    }

    pthread_mutex_unlock(&cache->mutex);
    return NULL;
 }

 /* Adds `variant` to the variant cache, keyed by its variant_sha1, taking a
  * reference on it.  Does nothing when `cache` is NULL, the variant cache is
  * disabled, or the key is already present.  `pipeline` is not used.
  */
 void
 v3dv_pipeline_cache_upload_variant(struct v3dv_pipeline *pipeline,
                                    struct v3dv_pipeline_cache *cache,
                                    struct v3dv_shader_variant  *variant)
 {
    if (!cache || !cache->variant_cache)
       return;

    pthread_mutex_lock(&cache->mutex);

    const bool already_cached =
       _mesa_hash_table_search(cache->variant_cache,
                               variant->variant_sha1) != NULL;

    if (!already_cached) {
       /* The cache keeps its own reference to the variant. */
       v3dv_shader_variant_ref(variant);
       _mesa_hash_table_insert(cache->variant_cache,
                               variant->variant_sha1, variant);

       if (unlikely(dump_stats)) {
          char sha1buf[41];
          _mesa_sha1_format(sha1buf, variant->variant_sha1);

          fprintf(stderr, "pipeline cache %p, new variant entry with key %s\n\t%p\n",
                  cache, sha1buf, variant);
          cache->variant_stats.count++;
          cache_dump_stats(cache);
       }
    }

    pthread_mutex_unlock(&cache->mutex);
 }

 static struct v3dv_shader_variant*
 shader_variant_create_from_blob(struct v3dv_device *device,
                                 struct blob_reader *blob)
 {
    VkResult result;

    gl_shader_stage stage = blob_read_uint32(blob);
    bool is_coord = blob_read_uint8(blob);

    const unsigned char *variant_sha1 = blob_read_bytes(blob, 20);

    uint32_t prog_data_size = blob_read_uint32(blob);
    /* FIXME: as we include the stage perhaps we can avoid prog_data_size? */
    assert(prog_data_size == v3d_prog_data_size(stage));

    const void *prog_data = blob_read_bytes(blob, prog_data_size);
    if (blob->overrun)
       return NULL;

    uint32_t ulist_count = blob_read_uint32(blob);
    uint32_t contents_size = sizeof(enum quniform_contents) * ulist_count;
    const void *contents_data = blob_read_bytes(blob, contents_size);
    if (blob->overrun)
       return NULL;

    uint ulist_data_size = sizeof(uint32_t) * ulist_count;
    const void *ulist_data_data = blob_read_bytes(blob, ulist_data_size);
    if (blob->overrun)
       return NULL;

    uint32_t qpu_insts_size = blob_read_uint32(blob);
    const uint64_t *qpu_insts = blob_read_bytes(blob, qpu_insts_size);
    if (blob->overrun)
       return NULL;

    /* shader_variant_create expects a newly created prog_data for their own,
     * as it is what the v3d compiler returns. So we are also allocating one
     * (including the uniform list) and filled it up with the data that we read
     * from the blob
     */
    struct v3d_prog_data *new_prog_data = rzalloc_size(NULL, prog_data_size);
    memcpy(new_prog_data, prog_data, prog_data_size);
    struct v3d_uniform_list *ulist = &new_prog_data->uniforms;
    ulist->count = ulist_count;
    ulist->contents = ralloc_array(new_prog_data, enum quniform_contents, ulist->count);
    memcpy(ulist->contents, contents_data, contents_size);
    ulist->data = ralloc_array(new_prog_data, uint32_t, ulist->count);
    memcpy(ulist->data, ulist_data_data, ulist_data_size);

    return v3dv_shader_variant_create(device, stage, is_coord,
                                      variant_sha1,
                                      new_prog_data, prog_data_size,
                                      qpu_insts, qpu_insts_size,
                                      &result);
 }

 /* Populates the variant cache from application-provided initial data.
  *
  * `data`/`size` must start with a vk_pipeline_cache_header followed by a
  * 32-bit entry count and that many serialized variants.  The data is
  * ignored when the variant cache is disabled, when it is too short, or when
  * the header does not match this device (version, vendor id, device id,
  * pipeline cache UUID).  Loading stops at the first entry that cannot be
  * rebuilt.
  */
 static void
 pipeline_cache_load(struct v3dv_pipeline_cache *cache,
                     size_t size,
                     const void *data)
 {
    struct v3dv_device *device = cache->device;
    struct v3dv_physical_device *pdevice = &device->instance->physicalDevice;
    struct vk_pipeline_cache_header header;

    if (cache->variant_cache == NULL)
       return;

    struct blob_reader blob;
    blob_reader_init(&blob, data, size);

    /* A short buffer sets blob.overrun, which also covers the case of
     * `size` being smaller than the header.
     */
    blob_copy_bytes(&blob, &header, sizeof(header));
    uint32_t count = blob_read_uint32(&blob);
    if (blob.overrun)
       return;

    /* Validate the header before trusting anything else in the data. */
    if (header.header_size < sizeof(header))
       return;
    if (header.header_version != VK_PIPELINE_CACHE_HEADER_VERSION_ONE)
       return;
    if (header.vendor_id != v3dv_physical_device_vendor_id(pdevice))
       return;
    if (header.device_id != v3dv_physical_device_device_id(pdevice))
       return;
    if (memcmp(header.uuid, pdevice->pipeline_cache_uuid, VK_UUID_SIZE) != 0)
       return;

    if (unlikely(dump_stats)) {
       fprintf(stderr, "pipeline cache %p, loading %i variant entries\n", cache, count);
    }

    for (uint32_t i = 0; i < count; i++) {
       struct v3dv_shader_variant *variant =
          shader_variant_create_from_blob(device, &blob);
       if (!variant)
          break;

       /* Inserting a duplicate key would overwrite the existing entry and
        * leak the variant it pointed to, so drop the newly created one.
        */
       if (_mesa_hash_table_search(cache->variant_cache,
                                   variant->variant_sha1)) {
          v3dv_shader_variant_unref(device, variant);
          continue;
       }

       /* The cache takes over the reference held by the new variant. */
       _mesa_hash_table_insert(cache->variant_cache, variant->variant_sha1, variant);
       if (unlikely(dump_stats))
          cache->variant_stats.count++;
    }

 }

 /* Vulkan entry point: allocates and initializes a pipeline cache and, when
  * the application supplies initial data, loads it.
  *
  * Returns VK_ERROR_OUT_OF_HOST_MEMORY if the cache object cannot be
  * allocated, VK_SUCCESS otherwise (initial data that cannot be used is
  * ignored).
  */
 VkResult
 v3dv_CreatePipelineCache(VkDevice _device,
                          const VkPipelineCacheCreateInfo *pCreateInfo,
                          const VkAllocationCallbacks *pAllocator,
                          VkPipelineCache *pPipelineCache)
 {
    V3DV_FROM_HANDLE(v3dv_device, device, _device);

    assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO);
    assert(pCreateInfo->flags == 0);

    struct v3dv_pipeline_cache *cache =
       vk_alloc2(&device->alloc, pAllocator, sizeof(*cache), 8,
                 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
    if (!cache)
       return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

    /* Whether caching is enabled is an instance-wide setting. */
    pipeline_cache_init(cache, device,
                        device->instance->pipeline_cache_enabled);

    const size_t initial_size = pCreateInfo->initialDataSize;
    if (initial_size != 0)
       pipeline_cache_load(cache, initial_size, pCreateInfo->pInitialData);

    *pPipelineCache = v3dv_pipeline_cache_to_handle(cache);
    return VK_SUCCESS;
 }

 /* Vulkan entry point: destroys a pipeline cache.
  *
  * Frees every serialized NIR entry, drops the cache's reference on every
  * stored variant, destroys both hash tables and finally frees the cache
  * object.  A NULL cache handle is ignored.
  */
 void
 v3dv_DestroyPipelineCache(VkDevice _device,
                           VkPipelineCache _cache,
                           const VkAllocationCallbacks *pAllocator)
 {
    V3DV_FROM_HANDLE(v3dv_device, device, _device);
    V3DV_FROM_HANDLE(v3dv_pipeline_cache, cache, _cache);

    if (!cache)
       return;

    pthread_mutex_destroy(&cache->mutex);

    /* Each table is guarded by its own pointer so that tearing down one
     * never dereferences the other when it is NULL.
     */
    if (cache->nir_cache) {
       hash_table_foreach(cache->nir_cache, entry) {
          ralloc_free(entry->data);
       }

       _mesa_hash_table_destroy(cache->nir_cache, NULL);
    }

    if (cache->variant_cache) {
       hash_table_foreach(cache->variant_cache, entry) {
          struct v3dv_shader_variant *variant = entry->data;
          if (variant)
             v3dv_shader_variant_unref(device, variant);
       }

       _mesa_hash_table_destroy(cache->variant_cache, NULL);
    }

    vk_free2(&device->alloc, pAllocator, cache);
 }

 /* Vulkan entry point: merge the source caches into dstCache.
  *
  * FIXME: merging is not implemented. Nothing is copied from the source
  * caches into the destination and success is always reported.
  */
 VkResult
 v3dv_MergePipelineCaches(VkDevice device,
                          VkPipelineCache dstCache,
                          uint32_t srcCacheCount,
                          const VkPipelineCache *pSrcCaches)
 {
    (void) device;
    (void) dstCache;
    (void) srcCacheCount;
    (void) pSrcCaches;

    return VK_SUCCESS;
 }

 /* Serializes `variant` into `blob` in this order: stage, is_coord, SHA-1,
  * prog_data (size + bytes), uniform list (count, contents, data) and QPU
  * instructions (size + bytes read from the mapped assembly BO).
  *
  * Returns false if the blob reports out_of_memory after the writes.
  */
 static bool
 shader_variant_write_to_blob(const struct v3dv_shader_variant *variant,
                              struct blob *blob)
 {
    /* Identification. */
    blob_write_uint32(blob, variant->stage);
    blob_write_uint8(blob, variant->is_coord);
    blob_write_bytes(blob, variant->variant_sha1, sizeof(variant->variant_sha1));

    /* Compiler prog_data. */
    blob_write_uint32(blob, variant->prog_data_size);
    blob_write_bytes(blob, variant->prog_data.base, variant->prog_data_size);

    /* Uniform list: count first, then the two parallel arrays. */
    struct v3d_uniform_list *uniforms = &variant->prog_data.base->uniforms;
    blob_write_uint32(blob, uniforms->count);
    blob_write_bytes(blob, uniforms->contents,
                     sizeof(enum quniform_contents) * uniforms->count);
    blob_write_bytes(blob, uniforms->data,
                     sizeof(uint32_t) * uniforms->count);

    /* QPU instructions. */
    blob_write_uint32(blob, variant->qpu_insts_size);
    assert(variant->assembly_bo->map);
    blob_write_bytes(blob, variant->assembly_bo->map, variant->qpu_insts_size);

    if (blob->out_of_memory)
       return false;
    return true;
 }

 /* Vulkan entry point: serializes the cache as a vk_pipeline_cache_header,
  * a 32-bit entry count and that many shader variants.
  *
  * When pData is non-NULL the data is written into it, limited to the
  * incoming *pDataSize bytes; when pData is NULL a fixed blob with no
  * backing storage is used (presumably so that only the size is computed --
  * confirm against the blob implementation).  On return *pDataSize holds
  * the blob size.
  *
  * Returns VK_INCOMPLETE when the entry count cannot be reserved or when a
  * variant does not fit; in the latter case the variants that did fit are
  * kept.  Returns VK_SUCCESS otherwise.
  */
 VkResult
 v3dv_GetPipelineCacheData(VkDevice _device,
                           VkPipelineCache _cache,
                           size_t *pDataSize,
                           void *pData)
 {
    V3DV_FROM_HANDLE(v3dv_device, device, _device);
    V3DV_FROM_HANDLE(v3dv_pipeline_cache, cache, _cache);

    struct blob blob;
    if (pData) {
       blob_init_fixed(&blob, pData, *pDataSize);
    } else {
       blob_init_fixed(&blob, NULL, SIZE_MAX);
    }

    struct v3dv_physical_device *pdevice = &device->instance->physicalDevice;
    VkResult result = VK_SUCCESS;

    pthread_mutex_lock(&cache->mutex);

    struct vk_pipeline_cache_header header = {
       .header_size = sizeof(struct vk_pipeline_cache_header),
       .header_version = VK_PIPELINE_CACHE_HEADER_VERSION_ONE,
       .vendor_id = v3dv_physical_device_vendor_id(pdevice),
       .device_id = v3dv_physical_device_device_id(pdevice),
    };
    memcpy(header.uuid, pdevice->pipeline_cache_uuid, VK_UUID_SIZE);
    blob_write_bytes(&blob, &header, sizeof(header));

    /* The number of entries is only known after the loop, so reserve its
     * slot now and fill it in afterwards.
     */
    uint32_t count = 0;
    intptr_t count_offset = blob_reserve_uint32(&blob);
    if (count_offset < 0) {
       *pDataSize = 0;
       blob_finish(&blob);
       pthread_mutex_unlock(&cache->mutex);
       return VK_INCOMPLETE;
    }

    if (cache->variant_cache) {
       hash_table_foreach(cache->variant_cache, entry) {
          struct v3dv_shader_variant *variant = entry->data;

          size_t save_size = blob.size;
          if (!shader_variant_write_to_blob(variant, &blob)) {
             /* If it fails reset to the previous size and bail.  The mutex
              * stays locked here: it is released exactly once at the end of
              * the function.
              */
             blob.size = save_size;
             result = VK_INCOMPLETE;
             break;
          }

          count++;
       }
    }

    blob_overwrite_uint32(&blob, count_offset, count);

    *pDataSize = blob.size;

    blob_finish(&blob);

    if (unlikely(dump_stats)) {
       assert(count <= cache->variant_stats.count);
       fprintf(stderr, "GetPipelineCacheData: serializing cache %p, "
               "%i variant entries, %u DataSize\n",
               cache, count, (uint32_t) *pDataSize);
    }

    pthread_mutex_unlock(&cache->mutex);

    return result;
 }
	/*
	* Copyright © 2019 Raspberry Pi
	*
	* Permission is hereby granted, free of charge, to any person obtaining a
	* copy of this software and associated documentation files (the "Software"),
	* to deal in the Software without restriction, including without limitation
	* the rights to use, copy, modify, merge, publish, distribute, sublicense,
	* and/or sell copies of the Software, and to permit persons to whom the
	* Software is furnished to do so, subject to the following conditions:
	*
	* The above copyright notice and this permission notice (including the next
	* paragraph) shall be included in all copies or substantial portions of the
	* Software.
	*
	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
	* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
	* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
	* IN THE SOFTWARE.
	*/

	#include "v3dv_private.h"
	#include "vulkan/util/vk_util.h"
	#include "util/blob.h"
	#include "nir/nir_serialize.h"

	/* Debug switch: when true, the cache functions in this file print trace
	 * lines to stderr and update the hit/miss/count statistics.
	 */
	static const bool dump_stats = false;
	/* When true, cache_dump_stats() prints the statistics counters; that
	 * function is only called from code paths guarded by dump_stats.
	 */
	static const bool dump_stats_verbose = false;

	/* Hash callback for the cache hash tables: hashes the 20 bytes of the SHA-1
	 * key that `sha1` points to.
	 */
	static uint32_t
	sha1_hash_func(const void *sha1)
	{
	   enum { SHA1_KEY_BYTES = 20 };

	   const uint32_t hash = _mesa_hash_data(sha1, SHA1_KEY_BYTES);
	   return hash;
	}

	/* Key-equality callback for the cache hash tables: two keys match when
	 * their 20 bytes are identical.
	 *
	 * Both parameters are pointers to the 20-byte keys; `const void` by value
	 * is not a valid parameter type.
	 */
	static bool
	sha1_compare_func(const void *sha1_a, const void *sha1_b)
	{
	   return memcmp(sha1_a, sha1_b, 20) == 0;
	}

	/* One NIR cache entry: the SHA-1 key, the byte size of the serialized
	 * shader, and the serialized bytes themselves stored inline after the
	 * header (entries are allocated as sizeof(struct) + size bytes).
	 *
	 * `data` is a C99 flexible array member rather than the GNU zero-length
	 * array extension; sizeof(struct serialized_nir) is unaffected.
	 */
	struct serialized_nir {
	   unsigned char sha1_key[20];
	   size_t size;
	   char data[];
	};

	/* Prints the NIR and variant cache counters (entries, misses, hits) to
	 * stderr.  Does nothing unless dump_stats_verbose is set.
	 */
	static void
	cache_dump_stats(struct v3dv_pipeline_cache *cache)
	{
	   if (dump_stats_verbose) {
	      FILE *out = stderr;

	      fprintf(out, " NIR cache entries: %d\n", cache->nir_stats.count);
	      fprintf(out, " NIR cache miss count: %d\n", cache->nir_stats.miss);
	      fprintf(out, " NIR cache hit count: %d\n", cache->nir_stats.hit);

	      fprintf(out, " variant cache entries: %d\n", cache->variant_stats.count);
	      fprintf(out, " variant cache miss count: %d\n", cache->variant_stats.miss);
	      fprintf(out, " variant cache hit count: %d\n", cache->variant_stats.hit);
	   }
	}

	/* Stores a serialized copy of `nir` in the NIR cache under `sha1_key`.
	 *
	 * Does nothing when `cache` is NULL, when the NIR cache is disabled, when
	 * an entry with the same key already exists, or when serialization or the
	 * entry allocation runs out of memory.  `pipeline` is not used.
	 */
	void
	v3dv_pipeline_cache_upload_nir(struct v3dv_pipeline *pipeline,
	                               struct v3dv_pipeline_cache *cache,
	                               nir_shader *nir,
	                               unsigned char sha1_key[20])
	{
	   if (!cache || !cache->nir_cache)
	      return;

	   /* Early check so that an already cached shader is not serialized again. */
	   pthread_mutex_lock(&cache->mutex);
	   struct hash_entry *entry =
	      _mesa_hash_table_search(cache->nir_cache, sha1_key);
	   pthread_mutex_unlock(&cache->mutex);
	   if (entry)
	      return;

	   /* Serialization happens outside of the lock. */
	   struct blob blob;
	   blob_init(&blob);

	   nir_serialize(&blob, nir, false);
	   if (blob.out_of_memory) {
	      blob_finish(&blob);
	      return;
	   }

	   struct serialized_nir *snir = NULL;

	   pthread_mutex_lock(&cache->mutex);
	   /* Because ralloc isn't thread-safe, we have to do all this inside the
	    * lock. We could unlock for the big memcpy but it's probably not worth
	    * the hassle.
	    */

	   /* The lock was dropped while serializing, so look the key up again. */
	   entry = _mesa_hash_table_search(cache->nir_cache, sha1_key);
	   if (entry)
	      goto out;

	   snir = ralloc_size(cache->nir_cache, sizeof(*snir) + blob.size);
	   if (!snir)
	      goto out; /* Out of host memory: skip caching this shader. */

	   memcpy(snir->sha1_key, sha1_key, 20);
	   snir->size = blob.size;
	   memcpy(snir->data, blob.data, blob.size);

	   if (unlikely(dump_stats)) {
	      char sha1buf[41];
	      _mesa_sha1_format(sha1buf, snir->sha1_key);
	      fprintf(stderr, "pipeline cache %p, new nir entry %s\n", cache, sha1buf);

	      cache->nir_stats.count++;
	      cache_dump_stats(cache);
	   }

	   /* The key stored in the table is the copy owned by the entry itself. */
	   _mesa_hash_table_insert(cache->nir_cache, snir->sha1_key, snir);

	out:
	   blob_finish(&blob);
	   pthread_mutex_unlock(&cache->mutex);
	}

	/* Looks up `sha1_key` in the NIR cache and, on a hit, returns a freshly
	 * deserialized nir_shader built with `nir_options`.
	 *
	 * Returns NULL when `cache` is NULL, the NIR cache is disabled, the key is
	 * not present, or the stored blob overruns while being deserialized.
	 * `pipeline` is not used.
	 */
	nir_shader*
	v3dv_pipeline_cache_search_for_nir(struct v3dv_pipeline *pipeline,
	                                   struct v3dv_pipeline_cache *cache,
	                                   const nir_shader_compiler_options *nir_options,
	                                   unsigned char sha1_key[20])
	{
	   if (!cache || !cache->nir_cache)
	      return NULL;

	   if (unlikely(dump_stats)) {
	      char sha1buf[41];
	      _mesa_sha1_format(sha1buf, sha1_key);

	      fprintf(stderr, "pipeline cache %p, search for nir %s\n", cache, sha1buf);
	   }

	   const struct serialized_nir *snir = NULL;

	   /* Only the table lookup is done under the lock. */
	   pthread_mutex_lock(&cache->mutex);
	   struct hash_entry *entry =
	      _mesa_hash_table_search(cache->nir_cache, sha1_key);
	   if (entry)
	      snir = entry->data;
	   pthread_mutex_unlock(&cache->mutex);

	   if (snir) {
	      struct blob_reader blob;
	      blob_reader_init(&blob, snir->data, snir->size);

	      /* We use context NULL as we want the p_stage to keep the reference to
	       * nir, as we keep open the possibility of provide a shader variant
	       * after cache creation
	       */
	      nir_shader *nir = nir_deserialize(NULL, nir_options, &blob);
	      if (blob.overrun) {
	         /* Corrupt entry: discard the partial shader and report a miss. */
	         ralloc_free(nir);
	      } else {
	         if (unlikely(dump_stats)) {
	            cache->nir_stats.hit++;
	            cache_dump_stats(cache);
	         }
	         return nir;
	      }
	   }

	   if (unlikely(dump_stats)) {
	      cache->nir_stats.miss++;
	      cache_dump_stats(cache);
	   }

	   return NULL;
	}

	/* Initializes a newly allocated pipeline cache object: loader magic, owning
	 * device and mutex.  When `cache_enabled` is true the NIR and variant hash
	 * tables are created and their statistics zeroed; otherwise both table
	 * pointers are set to NULL.
	 */
	static void
	pipeline_cache_init(struct v3dv_pipeline_cache *cache,
	                    struct v3dv_device *device,
	                    bool cache_enabled)
	{
	   cache->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
	   cache->device = device;
	   pthread_mutex_init(&cache->mutex, NULL);

	   if (!cache_enabled) {
	      cache->nir_cache = NULL;
	      cache->variant_cache = NULL;
	      return;
	   }

	   /* Both tables are keyed by 20-byte SHA-1 digests. */
	   cache->nir_cache =
	      _mesa_hash_table_create(NULL, sha1_hash_func, sha1_compare_func);
	   cache->nir_stats.miss = 0;
	   cache->nir_stats.hit = 0;
	   cache->nir_stats.count = 0;

	   cache->variant_cache =
	      _mesa_hash_table_create(NULL, sha1_hash_func, sha1_compare_func);
	   cache->variant_stats.miss = 0;
	   cache->variant_stats.hit = 0;
	   cache->variant_stats.count = 0;
	}

	/* Looks up a compiled shader variant by `sha1_key` in the variant cache.
	 *
	 * On a hit with a non-NULL variant, a reference is taken on it before it is
	 * returned.  Returns NULL when `cache` is NULL, the variant cache is
	 * disabled, or the key is not present.  `pipeline` is not used.
	 */
	struct v3dv_shader_variant*
	v3dv_pipeline_cache_search_for_variant(struct v3dv_pipeline *pipeline,
	                                       struct v3dv_pipeline_cache *cache,
	                                       unsigned char sha1_key[20])
	{
	   /* Guard on the table that is actually searched below. */
	   if (!cache || !cache->variant_cache)
	      return NULL;

	   if (unlikely(dump_stats)) {
	      char sha1buf[41];
	      _mesa_sha1_format(sha1buf, sha1_key);

	      fprintf(stderr, "pipeline cache %p, search variant with key %s\n", cache, sha1buf);
	   }

	   pthread_mutex_lock(&cache->mutex);

	   struct hash_entry *entry =
	      _mesa_hash_table_search(cache->variant_cache, sha1_key);

	   if (entry) {
	      struct v3dv_shader_variant *variant =
	         (struct v3dv_shader_variant *) entry->data;

	      if (unlikely(dump_stats)) {
	         fprintf(stderr, "\tcache hit: %p\n", variant);
	         cache->variant_stats.hit++;
	         cache_dump_stats(cache);
	      }

	      /* Take the reference while still holding the cache lock. */
	      if (variant)
	         v3dv_shader_variant_ref(variant);

	      pthread_mutex_unlock(&cache->mutex);
	      return variant;
	   }

	   if (unlikely(dump_stats)) {
	      fprintf(stderr, "\tcache miss\n");
	      cache->variant_stats.miss++;
	      cache_dump_stats(cache);
	   }

	   pthread_mutex_unlock(&cache->mutex);
	   return NULL;
	}

	/* Adds `variant` to the variant cache, keyed by its variant_sha1, taking a
	 * reference on it.  Does nothing when `cache` is NULL, the variant cache is
	 * disabled, or the key is already present.  `pipeline` is not used.
	 */
	void
	v3dv_pipeline_cache_upload_variant(struct v3dv_pipeline *pipeline,
	                                   struct v3dv_pipeline_cache *cache,
	                                   struct v3dv_shader_variant *variant)
	{
	   if (!cache || !cache->variant_cache)
	      return;

	   pthread_mutex_lock(&cache->mutex);
	   struct hash_entry *entry =
	      _mesa_hash_table_search(cache->variant_cache, variant->variant_sha1);

	   if (entry) {
	      pthread_mutex_unlock(&cache->mutex);
	      return;
	   }

	   /* The cache keeps its own reference to the variant. */
	   v3dv_shader_variant_ref(variant);
	   _mesa_hash_table_insert(cache->variant_cache, variant->variant_sha1, variant);
	   if (unlikely(dump_stats)) {
	      char sha1buf[41];
	      _mesa_sha1_format(sha1buf, variant->variant_sha1);

	      fprintf(stderr, "pipeline cache %p, new variant entry with key %s\n\t%p\n",
	              cache, sha1buf, variant);
	      cache->variant_stats.count++;
	      cache_dump_stats(cache);
	   }

	   pthread_mutex_unlock(&cache->mutex);
	}

	static struct v3dv_shader_variant*
	shader_variant_create_from_blob(struct v3dv_device *device,
	struct blob_reader *blob)
	{
	VkResult result;

	gl_shader_stage stage = blob_read_uint32(blob);
	bool is_coord = blob_read_uint8(blob);

	const unsigned char *variant_sha1 = blob_read_bytes(blob, 20);

	uint32_t prog_data_size = blob_read_uint32(blob);
	/* FIXME: as we include the stage perhaps we can avoid prog_data_size? */
	assert(prog_data_size == v3d_prog_data_size(stage));

	const void *prog_data = blob_read_bytes(blob, prog_data_size);
	if (blob->overrun)
	return NULL;

	uint32_t ulist_count = blob_read_uint32(blob);
	uint32_t contents_size = sizeof(enum quniform_contents) * ulist_count;
	const void *contents_data = blob_read_bytes(blob, contents_size);
	if (blob->overrun)
	return NULL;

	uint ulist_data_size = sizeof(uint32_t) * ulist_count;
	const void *ulist_data_data = blob_read_bytes(blob, ulist_data_size);
	if (blob->overrun)
	return NULL;

	uint32_t qpu_insts_size = blob_read_uint32(blob);
	const uint64_t *qpu_insts = blob_read_bytes(blob, qpu_insts_size);
	if (blob->overrun)
	return NULL;

	/* shader_variant_create expects a newly created prog_data for their own,
	* as it is what the v3d compiler returns. So we are also allocating one
	* (including the uniform list) and filled it up with the data that we read
	* from the blob
	*/
	struct v3d_prog_data *new_prog_data = rzalloc_size(NULL, prog_data_size);
	memcpy(new_prog_data, prog_data, prog_data_size);
	struct v3d_uniform_list *ulist = &new_prog_data->uniforms;
	ulist->count = ulist_count;
	ulist->contents = ralloc_array(new_prog_data, enum quniform_contents, ulist->count);
	memcpy(ulist->contents, contents_data, contents_size);
	ulist->data = ralloc_array(new_prog_data, uint32_t, ulist->count);
	memcpy(ulist->data, ulist_data_data, ulist_data_size);

	return v3dv_shader_variant_create(device, stage, is_coord,
	variant_sha1,
	new_prog_data, prog_data_size,
	qpu_insts, qpu_insts_size,
	&result);
	}

	/* Populates the variant cache from application-provided initial data.
	 *
	 * `data`/`size` must start with a vk_pipeline_cache_header followed by a
	 * 32-bit entry count and that many serialized variants.  The data is
	 * ignored when the variant cache is disabled, when it is too short, or when
	 * the header does not match this device (version, vendor id, device id,
	 * pipeline cache UUID).  Loading stops at the first entry that cannot be
	 * rebuilt.
	 */
	static void
	pipeline_cache_load(struct v3dv_pipeline_cache *cache,
	                    size_t size,
	                    const void *data)
	{
	   struct v3dv_device *device = cache->device;
	   struct v3dv_physical_device *pdevice = &device->instance->physicalDevice;
	   struct vk_pipeline_cache_header header;

	   if (cache->variant_cache == NULL)
	      return;

	   struct blob_reader blob;
	   blob_reader_init(&blob, data, size);

	   /* A short buffer sets blob.overrun, which also covers the case of
	    * `size` being smaller than the header.
	    */
	   blob_copy_bytes(&blob, &header, sizeof(header));
	   uint32_t count = blob_read_uint32(&blob);
	   if (blob.overrun)
	      return;

	   /* Validate the header before trusting anything else in the data. */
	   if (header.header_size < sizeof(header))
	      return;
	   if (header.header_version != VK_PIPELINE_CACHE_HEADER_VERSION_ONE)
	      return;
	   if (header.vendor_id != v3dv_physical_device_vendor_id(pdevice))
	      return;
	   if (header.device_id != v3dv_physical_device_device_id(pdevice))
	      return;
	   if (memcmp(header.uuid, pdevice->pipeline_cache_uuid, VK_UUID_SIZE) != 0)
	      return;

	   if (unlikely(dump_stats)) {
	      fprintf(stderr, "pipeline cache %p, loading %i variant entries\n", cache, count);
	   }

	   for (uint32_t i = 0; i < count; i++) {
	      struct v3dv_shader_variant *variant =
	         shader_variant_create_from_blob(device, &blob);
	      if (!variant)
	         break;

	      /* Inserting a duplicate key would overwrite the existing entry and
	       * leak the variant it pointed to, so drop the newly created one.
	       */
	      if (_mesa_hash_table_search(cache->variant_cache,
	                                  variant->variant_sha1)) {
	         v3dv_shader_variant_unref(device, variant);
	         continue;
	      }

	      /* The cache takes over the reference held by the new variant. */
	      _mesa_hash_table_insert(cache->variant_cache, variant->variant_sha1, variant);
	      if (unlikely(dump_stats))
	         cache->variant_stats.count++;
	   }

	}

	/* Vulkan entry point: allocates and initializes a pipeline cache and, when
	 * the application supplies initial data, loads it.
	 *
	 * Returns VK_ERROR_OUT_OF_HOST_MEMORY if the cache object cannot be
	 * allocated, VK_SUCCESS otherwise (initial data that cannot be used is
	 * ignored).
	 */
	VkResult
	v3dv_CreatePipelineCache(VkDevice _device,
	                         const VkPipelineCacheCreateInfo *pCreateInfo,
	                         const VkAllocationCallbacks *pAllocator,
	                         VkPipelineCache *pPipelineCache)
	{
	   V3DV_FROM_HANDLE(v3dv_device, device, _device);

	   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO);
	   assert(pCreateInfo->flags == 0);

	   struct v3dv_pipeline_cache *cache =
	      vk_alloc2(&device->alloc, pAllocator, sizeof(*cache), 8,
	                VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
	   if (!cache)
	      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

	   /* Whether caching is enabled is an instance-wide setting. */
	   pipeline_cache_init(cache, device,
	                       device->instance->pipeline_cache_enabled);

	   const size_t initial_size = pCreateInfo->initialDataSize;
	   if (initial_size != 0)
	      pipeline_cache_load(cache, initial_size, pCreateInfo->pInitialData);

	   *pPipelineCache = v3dv_pipeline_cache_to_handle(cache);
	   return VK_SUCCESS;
	}

	/* Vulkan entry point: destroys a pipeline cache.
	 *
	 * Frees every serialized NIR entry, drops the cache's reference on every
	 * stored variant, destroys both hash tables and finally frees the cache
	 * object.  A NULL cache handle is ignored.
	 */
	void
	v3dv_DestroyPipelineCache(VkDevice _device,
	                          VkPipelineCache _cache,
	                          const VkAllocationCallbacks *pAllocator)
	{
	   V3DV_FROM_HANDLE(v3dv_device, device, _device);
	   V3DV_FROM_HANDLE(v3dv_pipeline_cache, cache, _cache);

	   if (!cache)
	      return;

	   pthread_mutex_destroy(&cache->mutex);

	   /* Each table is guarded by its own pointer so that tearing down one
	    * never dereferences the other when it is NULL.
	    */
	   if (cache->nir_cache) {
	      hash_table_foreach(cache->nir_cache, entry) {
	         ralloc_free(entry->data);
	      }

	      _mesa_hash_table_destroy(cache->nir_cache, NULL);
	   }

	   if (cache->variant_cache) {
	      hash_table_foreach(cache->variant_cache, entry) {
	         struct v3dv_shader_variant *variant = entry->data;
	         if (variant)
	            v3dv_shader_variant_unref(device, variant);
	      }

	      _mesa_hash_table_destroy(cache->variant_cache, NULL);
	   }

	   vk_free2(&device->alloc, pAllocator, cache);
	}

	/* Vulkan entry point: merge the source caches into dstCache.
	 *
	 * FIXME: merging is not implemented. Nothing is copied from the source
	 * caches into the destination and success is always reported.
	 */
	VkResult
	v3dv_MergePipelineCaches(VkDevice device,
	                         VkPipelineCache dstCache,
	                         uint32_t srcCacheCount,
	                         const VkPipelineCache *pSrcCaches)
	{
	   (void) device;
	   (void) dstCache;
	   (void) srcCacheCount;
	   (void) pSrcCaches;

	   return VK_SUCCESS;
	}

	/* Serializes `variant` into `blob` in this order: stage, is_coord, SHA-1,
	 * prog_data (size + bytes), uniform list (count, contents, data) and QPU
	 * instructions (size + bytes read from the mapped assembly BO).
	 *
	 * Returns false if the blob reports out_of_memory after the writes.
	 */
	static bool
	shader_variant_write_to_blob(const struct v3dv_shader_variant *variant,
	                             struct blob *blob)
	{
	   /* Identification. */
	   blob_write_uint32(blob, variant->stage);
	   blob_write_uint8(blob, variant->is_coord);
	   blob_write_bytes(blob, variant->variant_sha1, sizeof(variant->variant_sha1));

	   /* Compiler prog_data. */
	   blob_write_uint32(blob, variant->prog_data_size);
	   blob_write_bytes(blob, variant->prog_data.base, variant->prog_data_size);

	   /* Uniform list: count first, then the two parallel arrays. */
	   struct v3d_uniform_list *uniforms = &variant->prog_data.base->uniforms;
	   blob_write_uint32(blob, uniforms->count);
	   blob_write_bytes(blob, uniforms->contents,
	                    sizeof(enum quniform_contents) * uniforms->count);
	   blob_write_bytes(blob, uniforms->data,
	                    sizeof(uint32_t) * uniforms->count);

	   /* QPU instructions. */
	   blob_write_uint32(blob, variant->qpu_insts_size);
	   assert(variant->assembly_bo->map);
	   blob_write_bytes(blob, variant->assembly_bo->map, variant->qpu_insts_size);

	   if (blob->out_of_memory)
	      return false;
	   return true;
	}

	/* Vulkan entry point: serializes the cache as a vk_pipeline_cache_header,
	 * a 32-bit entry count and that many shader variants.
	 *
	 * When pData is non-NULL the data is written into it, limited to the
	 * incoming *pDataSize bytes; when pData is NULL a fixed blob with no
	 * backing storage is used (presumably so that only the size is computed --
	 * confirm against the blob implementation).  On return *pDataSize holds
	 * the blob size.
	 *
	 * Returns VK_INCOMPLETE when the entry count cannot be reserved or when a
	 * variant does not fit; in the latter case the variants that did fit are
	 * kept.  Returns VK_SUCCESS otherwise.
	 */
	VkResult
	v3dv_GetPipelineCacheData(VkDevice _device,
	                          VkPipelineCache _cache,
	                          size_t *pDataSize,
	                          void *pData)
	{
	   V3DV_FROM_HANDLE(v3dv_device, device, _device);
	   V3DV_FROM_HANDLE(v3dv_pipeline_cache, cache, _cache);

	   struct blob blob;
	   if (pData) {
	      blob_init_fixed(&blob, pData, *pDataSize);
	   } else {
	      blob_init_fixed(&blob, NULL, SIZE_MAX);
	   }

	   struct v3dv_physical_device *pdevice = &device->instance->physicalDevice;
	   VkResult result = VK_SUCCESS;

	   pthread_mutex_lock(&cache->mutex);

	   struct vk_pipeline_cache_header header = {
	      .header_size = sizeof(struct vk_pipeline_cache_header),
	      .header_version = VK_PIPELINE_CACHE_HEADER_VERSION_ONE,
	      .vendor_id = v3dv_physical_device_vendor_id(pdevice),
	      .device_id = v3dv_physical_device_device_id(pdevice),
	   };
	   memcpy(header.uuid, pdevice->pipeline_cache_uuid, VK_UUID_SIZE);
	   blob_write_bytes(&blob, &header, sizeof(header));

	   /* The number of entries is only known after the loop, so reserve its
	    * slot now and fill it in afterwards.
	    */
	   uint32_t count = 0;
	   intptr_t count_offset = blob_reserve_uint32(&blob);
	   if (count_offset < 0) {
	      *pDataSize = 0;
	      blob_finish(&blob);
	      pthread_mutex_unlock(&cache->mutex);
	      return VK_INCOMPLETE;
	   }

	   if (cache->variant_cache) {
	      hash_table_foreach(cache->variant_cache, entry) {
	         struct v3dv_shader_variant *variant = entry->data;

	         size_t save_size = blob.size;
	         if (!shader_variant_write_to_blob(variant, &blob)) {
	            /* If it fails reset to the previous size and bail.  The mutex
	             * stays locked here: it is released exactly once at the end of
	             * the function.
	             */
	            blob.size = save_size;
	            result = VK_INCOMPLETE;
	            break;
	         }

	         count++;
	      }
	   }

	   blob_overwrite_uint32(&blob, count_offset, count);

	   *pDataSize = blob.size;

	   blob_finish(&blob);

	   if (unlikely(dump_stats)) {
	      assert(count <= cache->variant_stats.count);
	      fprintf(stderr, "GetPipelineCacheData: serializing cache %p, "
	              "%i variant entries, %u DataSize\n",
	              cache, count, (uint32_t) *pDataSize);
	   }

	   pthread_mutex_unlock(&cache->mutex);

	   return result;
	}