| /* |
| * Copyright © 2021 Advanced Micro Devices, Inc. |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a |
| * copy of this software and associated documentation files (the "Software"), |
| * to deal in the Software without restriction, including without limitation |
| * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| * and/or sell copies of the Software, and to permit persons to whom the |
| * Software is furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice (including the next |
| * paragraph) shall be included in all copies or substantial portions of the |
| * Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
| * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS |
| * IN THE SOFTWARE. |
| */ |
| |
| /* This is a new block-level load instruction scheduler where loads are grouped |
| * according to their indirection level within a basic block. An indirection |
| * is when a result of one load is used as a source of another load. The result |
 * is that disjoint ALU opcode groups and load (texture) opcode groups are
 * created, where each successive load group is the next level of indirection.
| * It's done by finding the first and last load with the same indirection |
| * level, and moving all unrelated instructions between them after the last |
| * load except for load sources, which are moved before the first load. |
 * It naturally suits hardware that has limits on texture indirections, but
 * other hardware can benefit from it too. Only texture, image, and SSBO
 * load instructions are grouped; stores and atomics are never moved.
| * |
 * There is an option to group only those loads that use the same resource
 * variable. This increases the chance of cache hits compared to leaving
 * the loads spread out.
| * |
 * The increased register usage is offset by the increase in observed memory
 * bandwidth due to more cache hits (dependent on hw behavior), which
 * decreases the subgroup lifetime and thus allows registers to be
 * deallocated and reused sooner. In some bandwidth-bound cases, low register
 * usage doesn't help at all, while doubling the register usage and using
 * those registers to amplify observed bandwidth can improve performance
 * a lot.
| * |
| * It's recommended to run a hw-specific instruction scheduler after this to |
| * prevent spilling. |
| */ |
| |
| #include "nir.h" |
| |
/* Return whether the instruction is a memory load whose latency matters for
 * indirection counting. For example, load_ssbo and image_load match, while
 * store_ssbo and SSBO atomics are rejected by
 * nir_intrinsic_writes_external_memory, load_shared by the "shared"
 * substring check, and load_ubo by matching neither "ssbo" nor "image".
 */
static bool
is_memory_load(nir_instr *instr)
| { |
| /* Count texture_size too because it has the same latency as cache hits. */ |
| if (instr->type == nir_instr_type_tex) |
| return true; |
| |
| if (instr->type == nir_instr_type_intrinsic) { |
| nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); |
| const char *name = nir_intrinsic_infos[intr->intrinsic].name; |
| |
| /* TODO: nir_intrinsics.py could do this */ |
| /* load_ubo is ignored because it's usually cheap. */ |
| if (!nir_intrinsic_writes_external_memory(intr) && |
| !strstr(name, "shared") && |
| (strstr(name, "ssbo") || strstr(name, "image"))) |
| return true; |
| } |
| |
| return false; |
| } |
| |
| static nir_instr * |
| get_intrinsic_resource(nir_intrinsic_instr *intr) |
| { |
| /* This is also the list of intrinsics that are grouped. */ |
| /* load_ubo is ignored because it's usually cheap. */ |
   switch (intr->intrinsic) {
   case nir_intrinsic_image_load:
   case nir_intrinsic_image_deref_load:
   case nir_intrinsic_image_sparse_load:
   case nir_intrinsic_image_deref_sparse_load:
   case nir_intrinsic_bindless_image_load:
   case nir_intrinsic_bindless_image_sparse_load:
   /* Group image_size too because it has the same latency as cache hits. */
   case nir_intrinsic_image_size:
   case nir_intrinsic_image_deref_size:
   case nir_intrinsic_bindless_image_size:
   case nir_intrinsic_image_samples_identical:
   case nir_intrinsic_image_deref_samples_identical:
   case nir_intrinsic_bindless_image_samples_identical:
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_image_fragment_mask_load_amd:
   case nir_intrinsic_image_deref_fragment_mask_load_amd:
   case nir_intrinsic_bindless_image_fragment_mask_load_amd:
      return intr->src[0].ssa->parent_instr;
| default: |
| return NULL; |
| } |
| } |
| |
| /* Track only those that we want to group. */ |
| static bool |
| is_grouped_load(nir_instr *instr) |
| { |
| /* Count texture_size too because it has the same latency as cache hits. */ |
| if (instr->type == nir_instr_type_tex) |
| return true; |
| |
| if (instr->type == nir_instr_type_intrinsic) |
| return get_intrinsic_resource(nir_instr_as_intrinsic(instr)) != NULL; |
| |
| return false; |
| } |
| |
| static bool |
| can_move(nir_instr *instr, uint8_t current_indirection_level) |
| { |
| /* Grouping is done by moving everything else out of the first/last |
| * instruction range of the indirection level. |
| */ |
| if (is_grouped_load(instr) && instr->pass_flags == current_indirection_level) |
| return false; |
| |
| if (instr->type == nir_instr_type_alu || |
| instr->type == nir_instr_type_deref || |
| instr->type == nir_instr_type_tex || |
| instr->type == nir_instr_type_load_const || |
| instr->type == nir_instr_type_undef) |
| return true; |
| |
| if (instr->type == nir_instr_type_intrinsic && |
| nir_intrinsic_can_reorder(nir_instr_as_intrinsic(instr))) |
| return true; |
| |
| return false; |
| } |
| |
/* Return the SSA instruction that identifies the resource a load uses
 * (the texture deref/handle for tex instructions, src[0] for the grouped
 * intrinsics), or NULL if the access is non-uniform or not grouped. With
 * nir_group_same_resource_only, two loads end up in the same group only if
 * this returns the same instruction for both.
 */
static nir_instr *
get_uniform_inst_resource(nir_instr *instr)
| { |
| if (instr->type == nir_instr_type_tex) { |
| nir_tex_instr *tex = nir_instr_as_tex(instr); |
| |
| if (tex->texture_non_uniform) |
| return NULL; |
| |
| for (unsigned i = 0; i < tex->num_srcs; i++) { |
| switch (tex->src[i].src_type) { |
| case nir_tex_src_texture_deref: |
| case nir_tex_src_texture_handle: |
| return tex->src[i].src.ssa->parent_instr; |
| default: |
| break; |
| } |
| } |
| return NULL; |
| } |
| |
| if (instr->type == nir_instr_type_intrinsic) |
| return get_intrinsic_resource(nir_instr_as_intrinsic(instr)); |
| |
| return NULL; |
| } |
| |
| struct check_sources_state { |
| nir_block *block; |
| uint32_t first_index; |
| }; |
| |
| static bool |
| has_only_sources_less_than(nir_src *src, void *data) |
| { |
| struct check_sources_state *state = (struct check_sources_state *)data; |
| |
| /* true if nir_foreach_src should keep going */ |
| return state->block != src->ssa->parent_instr->block || |
| src->ssa->parent_instr->index < state->first_index; |
| } |
| |
/* Make the grouped loads between "first" and "last" contiguous:
 * instructions in between whose results are only used after "last" are
 * moved after it, and instructions whose sources are all defined before
 * "first" are moved before it.
 */
static void
group_loads(nir_instr *first, nir_instr *last)
| { |
| /* Walk the instruction range between the first and last backward, and |
| * move those that have no uses within the range after the last one. |
| */ |
| for (nir_instr *instr = exec_node_data_backward(nir_instr, |
| last->node.prev, node); |
| instr != first; |
| instr = exec_node_data_backward(nir_instr, instr->node.prev, node)) { |
| /* Only move instructions without side effects. */ |
| if (!can_move(instr, first->pass_flags)) |
| continue; |
| |
| nir_def *def = nir_instr_def(instr); |
| if (def) { |
| bool all_uses_after_last = true; |
| |
| nir_foreach_use(use, def) { |
| if (nir_src_parent_instr(use)->block == instr->block && |
| nir_src_parent_instr(use)->index <= last->index) { |
| all_uses_after_last = false; |
| break; |
| } |
| } |
| |
| if (all_uses_after_last) { |
| nir_instr *move_instr = instr; |
            /* Advance the iterator first because moving the current
             * instruction unlinks it from this position in the list.
             */
            instr = exec_node_data_forward(nir_instr, instr->node.next, node);
| |
| /* Move the instruction after the last and update its index |
| * to indicate that it's after it. |
| */ |
| nir_instr_move(nir_after_instr(last), move_instr); |
| move_instr->index = last->index + 1; |
| } |
| } |
| } |
| |
| struct check_sources_state state; |
| state.block = first->block; |
| state.first_index = first->index; |
| |
| /* Walk the instruction range between the first and last forward, and move |
| * those that have no sources within the range before the first one. |
| */ |
| for (nir_instr *instr = exec_node_data_forward(nir_instr, |
| first->node.next, node); |
| instr != last; |
| instr = exec_node_data_forward(nir_instr, instr->node.next, node)) { |
| /* Only move instructions without side effects. */ |
| if (!can_move(instr, first->pass_flags)) |
| continue; |
| |
| if (nir_foreach_src(instr, has_only_sources_less_than, &state)) { |
| nir_instr *move_instr = instr; |
         /* Step the iterator back first because moving the current
          * instruction unlinks it from this position in the list.
          */
         instr = exec_node_data_backward(nir_instr, instr->node.prev, node);
| |
| /* Move the instruction before the first and update its index |
| * to indicate that it's before it. |
| */ |
| nir_instr_move(nir_before_instr(first), move_instr); |
| move_instr->index = first->index - 1; |
| } |
| } |
| } |
| |
| static bool |
| is_pseudo_inst(nir_instr *instr) |
| { |
| /* Other instructions do not usually contribute to the shader binary size. */ |
| return instr->type != nir_instr_type_alu && |
| instr->type != nir_instr_type_call && |
| instr->type != nir_instr_type_tex && |
| instr->type != nir_instr_type_intrinsic; |
| } |
| |
| static void |
| set_instr_indices(nir_block *block) |
| { |
   /* Start with 1 because we'll move instructions before the first one
    * and will want to label them 0.
    */
| unsigned counter = 1; |
| nir_instr *last = NULL; |
| |
| nir_foreach_instr(instr, block) { |
| /* Make sure grouped instructions don't have the same index as pseudo |
| * instructions. |
| */ |
| if (last && is_pseudo_inst(last) && is_grouped_load(instr)) |
| counter++; |
| |
| /* Set each instruction's index within the block. */ |
| instr->index = counter; |
| |
| /* Only count non-pseudo instructions. */ |
| if (!is_pseudo_inst(instr)) |
| counter++; |
| |
| last = instr; |
| } |
| } |
| |
/* Group the load range gathered so far and reset it, either because adding
 * "current" would make the range longer than max_distance, or
 * unconditionally when current is NULL (end of block or a barrier).
 */
static void
handle_load_range(nir_instr **first, nir_instr **last,
                  nir_instr *current, unsigned max_distance)
| { |
| if (*first && *last && |
| (!current || current->index > (*first)->index + max_distance)) { |
| assert(*first != *last); |
| group_loads(*first, *last); |
| set_instr_indices((*first)->block); |
| *first = NULL; |
| *last = NULL; |
| } |
| } |
| |
/* Return whether the instruction must split load groups: discards and
 * terminates, plus anything whose intrinsic name contains "barrier".
 */
static bool
is_barrier(nir_instr *instr)
| { |
| if (instr->type == nir_instr_type_intrinsic) { |
| nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); |
| const char *name = nir_intrinsic_infos[intr->intrinsic].name; |
| |
| if (intr->intrinsic == nir_intrinsic_discard || |
| intr->intrinsic == nir_intrinsic_discard_if || |
| intr->intrinsic == nir_intrinsic_terminate || |
| intr->intrinsic == nir_intrinsic_terminate_if || |
| /* TODO: nir_intrinsics.py could do this */ |
| strstr(name, "barrier")) |
| return true; |
| } |
| |
| return false; |
| } |
| |
| struct indirection_state { |
| nir_block *block; |
| unsigned indirections; |
| }; |
| |
| static unsigned |
| get_num_indirections(nir_instr *instr); |
| |
| static bool |
| gather_indirections(nir_src *src, void *data) |
| { |
| struct indirection_state *state = (struct indirection_state *)data; |
| nir_instr *instr = src->ssa->parent_instr; |
| |
| /* We only count indirections within the same block. */ |
| if (instr->block == state->block) { |
| unsigned indirections = get_num_indirections(src->ssa->parent_instr); |
| |
| if (instr->type == nir_instr_type_tex || is_memory_load(instr)) |
| indirections++; |
| |
| state->indirections = MAX2(state->indirections, indirections); |
| } |
| |
| return true; /* whether nir_foreach_src should keep going */ |
| } |
| |
| /* Return the number of load indirections within the block. */ |
| static unsigned |
| get_num_indirections(nir_instr *instr) |
| { |
| /* Don't traverse phis because we could end up in an infinite recursion |
| * if the phi points to the current block (such as a loop body). |
| */ |
| if (instr->type == nir_instr_type_phi) |
| return 0; |
| |
| if (instr->index != UINT32_MAX) |
| return instr->index; /* we've visited this instruction before */ |
| |
| struct indirection_state state; |
| state.block = instr->block; |
| state.indirections = 0; |
| |
| nir_foreach_src(instr, gather_indirections, &state); |
| |
| instr->index = state.indirections; |
| return state.indirections; |
| } |
| |
| static void |
| process_block(nir_block *block, nir_load_grouping grouping, |
| unsigned max_distance) |
| { |
| int max_indirection = -1; |
| unsigned num_inst_per_level[256] = { 0 }; |
| |
   /* UINT32_MAX means the instruction has not been visited. Once
    * an instruction has been visited and its indirection level has been
    * determined, we store the indirection level in its index. Later queries
    * reuse the stored index instead of recomputing the indirection level,
    * which would otherwise result in exponential time complexity.
    */
| nir_foreach_instr(instr, block) { |
| instr->index = UINT32_MAX; /* unknown */ |
| } |
| |
| /* Count the number of load indirections for each load instruction |
| * within this block. Store it in pass_flags. |
| */ |
| nir_foreach_instr(instr, block) { |
| if (is_grouped_load(instr)) { |
| unsigned indirections = get_num_indirections(instr); |
| |
| /* pass_flags has only 8 bits */ |
| indirections = MIN2(indirections, 255); |
| num_inst_per_level[indirections]++; |
| instr->pass_flags = indirections; |
| |
| max_indirection = MAX2(max_indirection, (int)indirections); |
| } |
| } |
| |
| /* 255 contains all indirection levels >= 255, so ignore them. */ |
| max_indirection = MIN2(max_indirection, 254); |
| |
| /* Each indirection level is grouped. */ |
| for (int level = 0; level <= max_indirection; level++) { |
| if (num_inst_per_level[level] <= 1) |
| continue; |
| |
| set_instr_indices(block); |
| |
| nir_instr *resource = NULL; |
| nir_instr *first_load = NULL, *last_load = NULL; |
| |
| /* Find the first and last instruction that use the same |
| * resource and are within a certain distance of each other. |
| * If found, group them by moving all movable instructions |
| * between them out. |
| */ |
| nir_foreach_instr(current, block) { |
| /* Don't group across barriers. */ |
| if (is_barrier(current)) { |
| /* Group unconditionally. */ |
| handle_load_range(&first_load, &last_load, NULL, 0); |
| first_load = NULL; |
| last_load = NULL; |
| continue; |
| } |
| |
| /* Only group load instructions with the same indirection level. */ |
| if (is_grouped_load(current) && current->pass_flags == level) { |
| nir_instr *current_resource; |
| |
| switch (grouping) { |
| case nir_group_all: |
| if (!first_load) |
| first_load = current; |
| else |
| last_load = current; |
| break; |
| |
| case nir_group_same_resource_only: |
| current_resource = get_uniform_inst_resource(current); |
| |
| if (current_resource) { |
| if (!first_load) { |
| first_load = current; |
| resource = current_resource; |
| } else if (current_resource == resource) { |
| last_load = current; |
| } |
| } |
| } |
| } |
| |
| /* Group only if we exceeded the maximum distance. */ |
| handle_load_range(&first_load, &last_load, current, max_distance); |
| } |
| |
| /* Group unconditionally. */ |
| handle_load_range(&first_load, &last_load, NULL, 0); |
| } |
| } |
| |
| /* max_distance is the maximum distance between the first and last instruction |
| * in a group. |
| */ |
| void |
| nir_group_loads(nir_shader *shader, nir_load_grouping grouping, |
| unsigned max_distance) |
| { |
| nir_foreach_function_impl(impl, shader) { |
| nir_foreach_block(block, impl) { |
| process_block(block, grouping, max_distance); |
| } |
| |
| nir_metadata_preserve(impl, nir_metadata_block_index | |
| nir_metadata_dominance | |
| nir_metadata_loop_analysis); |
| } |
| } |