| /* |
| * Copyright © 2018 Intel Corporation |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a |
| * copy of this software and associated documentation files (the "Software"), |
| * to deal in the Software without restriction, including without limitation |
| * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| * and/or sell copies of the Software, and to permit persons to whom the |
| * Software is furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice (including the next |
| * paragraph) shall be included in all copies or substantial portions of the |
| * Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
| * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS |
| * IN THE SOFTWARE. |
| */ |
| |
| #include "brw_nir.h" |
| #include "compiler/nir/nir_builder.h" |
| #include "util/u_math.h" |
| #include "util/bitscan.h" |
| |
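| /* Emit a copy of the given load/store intrinsic with a new store source (for |
| * stores), a constant byte offset added to its offset source, and the given |
| * number of components, bit size, and alignment. Returns the new destination |
| * for loads and NULL for stores. |
| */ |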
| static nir_ssa_def * |
| dup_mem_intrinsic(nir_builder *b, nir_intrinsic_instr *intrin, |
| nir_ssa_def *store_src, int offset, |
| unsigned num_components, unsigned bit_size, |
| unsigned align) |
| { |
| const nir_intrinsic_info *info = &nir_intrinsic_infos[intrin->intrinsic]; |
| |
| nir_intrinsic_instr *dup = |
| nir_intrinsic_instr_create(b->shader, intrin->intrinsic); |
| |
| nir_src *intrin_offset_src = nir_get_io_offset_src(intrin); |
| for (unsigned i = 0; i < info->num_srcs; i++) { |
| assert(intrin->src[i].is_ssa); |
| if (i == 0 && store_src) { |
| assert(!info->has_dest); |
| assert(&intrin->src[i] != intrin_offset_src); |
| dup->src[i] = nir_src_for_ssa(store_src); |
| } else if (&intrin->src[i] == intrin_offset_src) { |
| dup->src[i] = nir_src_for_ssa(nir_iadd_imm(b, intrin->src[i].ssa, |
| offset)); |
| } else { |
| dup->src[i] = nir_src_for_ssa(intrin->src[i].ssa); |
| } |
| } |
| |
| dup->num_components = num_components; |
| |
| for (unsigned i = 0; i < info->num_indices; i++) |
| dup->const_index[i] = intrin->const_index[i]; |
| |
| nir_intrinsic_set_align(dup, align, 0); |
| |
| if (info->has_dest) { |
| assert(intrin->dest.is_ssa); |
| nir_ssa_dest_init(&dup->instr, &dup->dest, |
| num_components, bit_size, |
| intrin->dest.ssa.name); |
| } else { |
| nir_intrinsic_set_write_mask(dup, (1 << num_components) - 1); |
| } |
| |
| nir_builder_instr_insert(b, &dup->instr); |
| |
| return info->has_dest ? &dup->dest.ssa : NULL; |
| } |
| |
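| /* Lower a single load intrinsic whose bit size or alignment the hardware |
| * messages can't handle directly. The load is replaced with one or more |
| * dword (or smaller) loads and the requested value is reassembled with |
| * nir_extract_bits. Returns true if the load was lowered. |
| */ |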
| static bool |
| lower_mem_load_bit_size(nir_builder *b, nir_intrinsic_instr *intrin, |
| const struct gen_device_info *devinfo) |
| { |
| const bool needs_scalar = |
| intrin->intrinsic == nir_intrinsic_load_scratch; |
| |
| assert(intrin->dest.is_ssa); |
| const unsigned bit_size = intrin->dest.ssa.bit_size; |
| const unsigned num_components = intrin->dest.ssa.num_components; |
| const unsigned bytes_read = num_components * (bit_size / 8); |
| const unsigned align = nir_intrinsic_align(intrin); |
| |
| if (bit_size == 32 && align >= 32 && |
| (!needs_scalar || intrin->num_components == 1)) |
| return false; |
| |
| nir_ssa_def *result; |
| nir_src *offset_src = nir_get_io_offset_src(intrin); |
| if (bit_size < 32 && nir_src_is_const(*offset_src)) { |
| /* The offset is constant so we can use a 32-bit load and just shift it |
| * around as needed. |
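| * |
| * For example, a 16-bit vec2 load at a constant offset of 6 has a |
| * load_offset of 2, so we emit a single 32-bit vec2 load at offset 4 and |
| * use nir_extract_bits to pull the two 16-bit components back out starting |
| * at bit 16. |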
| */ |
| const int load_offset = nir_src_as_uint(*offset_src) % 4; |
| assert(load_offset % (bit_size / 8) == 0); |
| const unsigned load_comps32 = DIV_ROUND_UP(bytes_read + load_offset, 4); |
| /* A 16-bit vec4 is a 32-bit vec2. We add an extra component in case |
| * we offset into a component with load_offset. |
| */ |
| assert(load_comps32 <= 3); |
| |
| nir_ssa_def *load = dup_mem_intrinsic(b, intrin, NULL, -load_offset, |
| load_comps32, 32, 4); |
| result = nir_extract_bits(b, &load, 1, load_offset * 8, |
| num_components, bit_size); |
| } else { |
| /* Otherwise, we have to break it into smaller loads. We could end up |
| * with as many as 32 loads if we're loading a u64vec16 from scratch. |
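| * (A u64vec16 is 16 * 8 = 128 bytes and scratch accesses have to be |
| * scalar, so 32-bit scalar loads give us 128 / 4 = 32 of them.) |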
| */ |
| nir_ssa_def *loads[32]; |
| unsigned num_loads = 0; |
| int load_offset = 0; |
| while (load_offset < bytes_read) { |
| const unsigned bytes_left = bytes_read - load_offset; |
| unsigned load_bit_size, load_comps; |
| if (align < 4) { |
| load_comps = 1; |
| /* Choose a byte, word, or dword */ |
| load_bit_size = util_next_power_of_two(MIN2(bytes_left, 4)) * 8; |
| } else { |
| assert(load_offset % 4 == 0); |
| load_bit_size = 32; |
| load_comps = needs_scalar ? 1 : |
| DIV_ROUND_UP(MIN2(bytes_left, 16), 4); |
| } |
| |
| loads[num_loads++] = dup_mem_intrinsic(b, intrin, NULL, load_offset, |
| load_comps, load_bit_size, |
| align); |
| |
| load_offset += load_comps * (load_bit_size / 8); |
| } |
| assert(num_loads <= ARRAY_SIZE(loads)); |
| result = nir_extract_bits(b, loads, num_loads, 0, |
| num_components, bit_size); |
| } |
| |
| nir_ssa_def_rewrite_uses(&intrin->dest.ssa, |
| nir_src_for_ssa(result)); |
| nir_instr_remove(&intrin->instr); |
| |
| return true; |
| } |
| |
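| /* Lower a single store intrinsic whose bit size, write mask, or alignment |
| * the hardware messages can't handle directly. The written bytes are |
| * gathered into contiguous runs and each run is written out with the widest |
| * stores its size and alignment allow. Returns true if the store was |
| * lowered. |
| */ |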
| static bool |
| lower_mem_store_bit_size(nir_builder *b, nir_intrinsic_instr *intrin, |
| const struct gen_device_info *devinfo) |
| { |
| const bool needs_scalar = |
| intrin->intrinsic == nir_intrinsic_store_scratch; |
| |
| assert(intrin->src[0].is_ssa); |
| nir_ssa_def *value = intrin->src[0].ssa; |
| |
| assert(intrin->num_components == value->num_components); |
| const unsigned bit_size = value->bit_size; |
| const unsigned num_components = intrin->num_components; |
| const unsigned bytes_written = num_components * (bit_size / 8); |
| const unsigned align_mul = nir_intrinsic_align_mul(intrin); |
| const unsigned align_offset = nir_intrinsic_align_offset(intrin); |
| const unsigned align = nir_intrinsic_align(intrin); |
| |
| nir_component_mask_t writemask = nir_intrinsic_write_mask(intrin); |
| assert(writemask < (1 << num_components)); |
| |
| if ((value->bit_size <= 32 && num_components == 1) || |
| (value->bit_size == 32 && align >= 32 && |
| writemask == (1 << num_components) - 1 && |
| !needs_scalar)) |
| return false; |
| |
| nir_src *offset_src = nir_get_io_offset_src(intrin); |
| const bool offset_is_const = nir_src_is_const(*offset_src); |
| const unsigned const_offset = |
| offset_is_const ? nir_src_as_uint(*offset_src) : 0; |
| |
| const unsigned byte_size = bit_size / 8; |
| assert(byte_size <= sizeof(uint64_t)); |
| |
| BITSET_DECLARE(mask, NIR_MAX_VEC_COMPONENTS * sizeof(uint64_t)); |
| BITSET_ZERO(mask); |
| |
| for (unsigned i = 0; i < num_components; i++) { |
| if (writemask & (1u << i)) |
| BITSET_SET_RANGE(mask, i * byte_size, ((i + 1) * byte_size) - 1); |
| } |
| |
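| /* Walk the mask of written bytes, peeling off one contiguous run per |
| * iteration and emitting the widest store (at most four dwords) that the |
| * run's size and alignment allow; any remainder of the run is handled on |
| * the next iteration. |
| */ |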
| while (BITSET_FFS(mask) != 0) { |
| const int start = BITSET_FFS(mask) - 1; |
| |
| int end; |
| for (end = start + 1; end < bytes_written; end++) { |
| if (!(BITSET_TEST(mask, end))) |
| break; |
| } |
| /* The size of the current contiguous chunk in bytes */ |
| const unsigned chunk_bytes = end - start; |
| |
| const bool is_dword_aligned = |
| (align_mul >= 4 && (align_offset + start) % 4 == 0) || |
| (offset_is_const && (start + const_offset) % 4 == 0); |
| |
| unsigned store_comps, store_bit_size, store_align; |
| if (chunk_bytes >= 4 && is_dword_aligned) { |
| store_align = MAX2(align, 4); |
| store_bit_size = 32; |
| store_comps = needs_scalar ? 1 : MIN2(chunk_bytes, 16) / 4; |
| } else { |
| store_align = align; |
| store_comps = 1; |
| store_bit_size = MIN2(chunk_bytes, 4) * 8; |
| /* The bit size must be a power of two */ |
| if (store_bit_size == 24) |
| store_bit_size = 16; |
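| /* A 3-byte chunk thus becomes a 16-bit store now and an 8-bit store on |
| * the next trip through the loop. |
| */ |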
| } |
| const unsigned store_bytes = store_comps * (store_bit_size / 8); |
| |
| nir_ssa_def *packed = nir_extract_bits(b, &value, 1, start * 8, |
| store_comps, store_bit_size); |
| |
| dup_mem_intrinsic(b, intrin, packed, start, |
| store_comps, store_bit_size, store_align); |
| |
| BITSET_CLEAR_RANGE(mask, start, (start + store_bytes - 1)); |
| } |
| |
| nir_instr_remove(&intrin->instr); |
| |
| return true; |
| } |
| |
| static bool |
| lower_mem_access_bit_sizes_impl(nir_function_impl *impl, |
| const struct gen_device_info *devinfo) |
| { |
| bool progress = false; |
| |
| nir_builder b; |
| nir_builder_init(&b, impl); |
| |
| nir_foreach_block(block, impl) { |
| nir_foreach_instr_safe(instr, block) { |
| if (instr->type != nir_instr_type_intrinsic) |
| continue; |
| |
| b.cursor = nir_after_instr(instr); |
| |
| nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); |
| switch (intrin->intrinsic) { |
| case nir_intrinsic_load_global: |
| case nir_intrinsic_load_ssbo: |
| case nir_intrinsic_load_shared: |
| case nir_intrinsic_load_scratch: |
| if (lower_mem_load_bit_size(&b, intrin, devinfo)) |
| progress = true; |
| break; |
| |
| case nir_intrinsic_store_global: |
| case nir_intrinsic_store_ssbo: |
| case nir_intrinsic_store_shared: |
| case nir_intrinsic_store_scratch: |
| if (lower_mem_store_bit_size(&b, intrin, devinfo)) |
| progress = true; |
| break; |
| |
| default: |
| break; |
| } |
| } |
| } |
| |
| if (progress) { |
| nir_metadata_preserve(impl, nir_metadata_block_index | |
| nir_metadata_dominance); |
| } else { |
| nir_metadata_preserve(impl, nir_metadata_all); |
| } |
| |
| return progress; |
| } |
| |
| /** |
| * This pass lowers arbitrary SSBO and shared memory load/store operations to |
| * intrinsics which are natively handleable by GEN hardware. In particular, |
| * we have two general types of memory load/store messages: |
| * |
| * - Untyped surface read/write: These can load/store between one and four |
| * dword components to/from a dword-aligned offset. |
| * |
| * - Byte scattered read/write: These can load/store a single byte, word, or |
| * dword scalar to/from an unaligned byte offset. |
| * |
| * Neither type of message can do a write-masked store. This pass converts |
| * all NIR load/store intrinsics into a series of 8, 16, or 32-bit |
| * load/store intrinsics with a number of components that we can directly |
| * handle in hardware and with a trivial write-mask. |
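| * |
| * For example, a store of a 16-bit vec3 to an SSBO with only 2-byte |
| * alignment becomes a 32-bit scalar store of the first two components |
| * (which the backend can emit as a byte-scattered write) followed by a |
| * 16-bit scalar store of the third. |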
| * |
| * For scratch access, additional care has to be taken due to the way |
| * that we swizzle the memory addresses to achieve decent cache locality. In |
| * particular, even though untyped surface read/write messages exist and work, |
| * we can't use them to load multiple components in a single SEND. For more |
| * detail on the scratch swizzle, see fs_visitor::swizzle_nir_scratch_addr. |
| */ |
| bool |
| brw_nir_lower_mem_access_bit_sizes(nir_shader *shader, |
| const struct gen_device_info *devinfo) |
| { |
| bool progress = false; |
| |
| nir_foreach_function(func, shader) { |
| if (func->impl && lower_mem_access_bit_sizes_impl(func->impl, devinfo)) |
| progress = true; |
| } |
| |
| return progress; |
| } |