| /* |
| * Copyright © 2019 Broadcom |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a |
| * copy of this software and associated documentation files (the "Software"), |
| * to deal in the Software without restriction, including without limitation |
| * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| * and/or sell copies of the Software, and to permit persons to whom the |
| * Software is furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice (including the next |
| * paragraph) shall be included in all copies or substantial portions of the |
| * Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
| * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS |
| * IN THE SOFTWARE. |
| */ |
| |
| #include "nir_schedule.h" |
| #include "util/dag.h" |
| #include "util/u_dynarray.h" |
| |
| /** @file |
| * |
| * Implements basic-block-level prepass instruction scheduling in NIR to |
| * manage register pressure. |
| * |
| * This is based on the Goodman/Hsu paper (1988, cached copy at |
| * https://people.freedesktop.org/~anholt/scheduling-goodman-hsu.pdf). We |
* build the data dependency graph (DDG) for NIR (mostly from the NIR def/use
* chains of SSA instructions, plus some edges ordering register writes
* vs. reads, and some more ordering intrinsics). Then we repeatedly pick
* heads off of the DDG using their heuristic and emit the NIR instructions
* back into the block in their new order.
| * |
* The hard case for prepass scheduling on GPUs seems to always be consuming
* texture/UBO results. The register pressure heuristic doesn't want to pick
* an instr that starts consuming texture results, because that instr usually
* won't be the result's only user, so scheduling it increases pressure.
| * |
* If you always force consumption of tex results, then in a case where a
* single sample is used for many outputs, you'll end up picking every other
* user and increasing register pressure. The partially_evaluated_path flag
| * helps tremendously, in that if you happen for whatever reason to pick a |
| * texture sample's output, then you'll try to finish off that sample. Future |
| * work may include doing some local search before locking in a choice, to try |
| * to more reliably find the case where just a few choices going against the |
| * heuristic can manage to free the whole vector. |
| */ |
| |
| static bool debug; |
| |
| /** |
| * Represents a node in the DDG for a NIR instruction. |
| */ |
| typedef struct { |
| struct dag_node dag; /* must be first for our u_dynarray_foreach */ |
| nir_instr *instr; |
| bool partially_evaluated_path; |
| |
/* Rough estimate of the delay between starting this instruction and
* its results being available.
*
* Accuracy is not too important, given that we're prepass scheduling here
* and just trying to stretch out the live intervals of expensive
* instructions, reducing the excess dependencies a register allocator
* would otherwise introduce.
| */ |
| uint32_t delay; |
| |
| /* Cost of the maximum-delay path from this node to the leaves. */ |
| uint32_t max_delay; |
| |
| /* scoreboard->time value when this instruction can be scheduled without |
| * any stalls expected. |
| */ |
| uint32_t ready_time; |
| } nir_schedule_node; |
| |
| typedef struct { |
| struct dag *dag; |
| |
| nir_shader *shader; |
| |
/* Mapping from nir_register * or nir_ssa_def * to a struct set of the
* not-yet-scheduled instructions that still use that value.
| */ |
| struct hash_table *remaining_uses; |
| |
| /* Map from nir_instr to nir_schedule_node * */ |
| struct hash_table *instr_map; |
| |
| /* Set of nir_register * or nir_ssa_def * that have had any instruction |
| * scheduled on them. |
| */ |
| struct set *live_values; |
| |
/* An abstract approximation of the number of nir_schedule_node->delay
| * units since the start of the shader. |
| */ |
| uint32_t time; |
| |
| /* Number of channels currently used by the NIR instructions that have been |
| * scheduled. |
| */ |
| int pressure; |
| |
| /* Options specified by the backend */ |
| const nir_schedule_options *options; |
| } nir_schedule_scoreboard; |
| |
| /* When walking the instructions in reverse, we use this flag to swap |
| * before/after in add_dep(). |
| */ |
| enum direction { F, R }; |
| |
| struct nir_schedule_class_dep { |
| int klass; |
| nir_schedule_node *node; |
| struct nir_schedule_class_dep *next; |
| }; |
| |
| typedef struct { |
| nir_schedule_scoreboard *scoreboard; |
| |
| /* Map from nir_register to nir_schedule_node * */ |
| struct hash_table *reg_map; |
| |
/* Scheduler nodes for the last instruction involved in each class of
* dependency.
| */ |
| nir_schedule_node *load_input; |
| nir_schedule_node *store_shared; |
| nir_schedule_node *unknown_intrinsic; |
| nir_schedule_node *discard; |
| nir_schedule_node *jump; |
| |
| struct nir_schedule_class_dep *class_deps; |
| |
| enum direction dir; |
| } nir_deps_state; |
| |
| static void * |
| _mesa_hash_table_search_data(struct hash_table *ht, void *key) |
| { |
| struct hash_entry *entry = _mesa_hash_table_search(ht, key); |
| if (!entry) |
| return NULL; |
| return entry->data; |
| } |
| |
| static nir_schedule_node * |
| nir_schedule_get_node(struct hash_table *instr_map, nir_instr *instr) |
| { |
| return _mesa_hash_table_search_data(instr_map, instr); |
| } |
| |
| static struct set * |
| nir_schedule_scoreboard_get_src(nir_schedule_scoreboard *scoreboard, nir_src *src) |
| { |
| if (src->is_ssa) { |
| return _mesa_hash_table_search_data(scoreboard->remaining_uses, src->ssa); |
| } else { |
| return _mesa_hash_table_search_data(scoreboard->remaining_uses, |
| src->reg.reg); |
| } |
| } |
| |
| static int |
| nir_schedule_def_pressure(nir_ssa_def *def) |
| { |
| return def->num_components; |
| } |
| |
| static int |
| nir_schedule_src_pressure(nir_src *src) |
| { |
| if (src->is_ssa) |
| return nir_schedule_def_pressure(src->ssa); |
| else |
| return src->reg.reg->num_components; |
| } |
| |
| static int |
| nir_schedule_dest_pressure(nir_dest *dest) |
| { |
| if (dest->is_ssa) |
| return nir_schedule_def_pressure(&dest->ssa); |
| else |
| return dest->reg.reg->num_components; |
| } |
| |
| /** |
| * Adds a dependency such that @after must appear in the final program after |
| * @before. |
| * |
* In the forward direction, we add @before as the parent of @after, so that
* DAG heads are the instructions whose dependencies have all been scheduled
* and we make our scheduling decisions top to bottom.
| */ |
| static void |
| add_dep(nir_deps_state *state, |
| nir_schedule_node *before, |
| nir_schedule_node *after) |
| { |
| if (!before || !after) |
| return; |
| |
| assert(before != after); |
| |
| if (state->dir == F) |
| dag_add_edge(&before->dag, &after->dag, NULL); |
| else |
| dag_add_edge(&after->dag, &before->dag, NULL); |
| } |
| |
| static void |
| add_read_dep(nir_deps_state *state, |
| nir_schedule_node *before, |
| nir_schedule_node *after) |
| { |
| add_dep(state, before, after); |
| } |
| |
| static void |
| add_write_dep(nir_deps_state *state, |
| nir_schedule_node **before, |
| nir_schedule_node *after) |
| { |
| add_dep(state, *before, after); |
| *before = after; |
| } |
| |
| static bool |
| nir_schedule_reg_src_deps(nir_src *src, void *in_state) |
| { |
| nir_deps_state *state = in_state; |
| |
| if (src->is_ssa) |
| return true; |
| |
| struct hash_entry *entry = _mesa_hash_table_search(state->reg_map, |
| src->reg.reg); |
| if (!entry) |
| return true; |
| nir_schedule_node *dst_n = entry->data; |
| |
| nir_schedule_node *src_n = |
| nir_schedule_get_node(state->scoreboard->instr_map, |
| src->parent_instr); |
| |
| add_dep(state, dst_n, src_n); |
| |
| return true; |
| } |
| |
| static bool |
| nir_schedule_reg_dest_deps(nir_dest *dest, void *in_state) |
| { |
| nir_deps_state *state = in_state; |
| |
| if (dest->is_ssa) |
| return true; |
| |
| nir_schedule_node *dest_n = |
| nir_schedule_get_node(state->scoreboard->instr_map, |
| dest->reg.parent_instr); |
| |
| struct hash_entry *entry = _mesa_hash_table_search(state->reg_map, |
| dest->reg.reg); |
| if (!entry) { |
| _mesa_hash_table_insert(state->reg_map, dest->reg.reg, dest_n); |
| return true; |
| } |
| nir_schedule_node **before = (nir_schedule_node **)&entry->data; |
| |
| add_write_dep(state, before, dest_n); |
| |
| return true; |
| } |
| |
| static bool |
| nir_schedule_ssa_deps(nir_ssa_def *def, void *in_state) |
| { |
| nir_deps_state *state = in_state; |
| struct hash_table *instr_map = state->scoreboard->instr_map; |
| nir_schedule_node *def_n = nir_schedule_get_node(instr_map, def->parent_instr); |
| |
| nir_foreach_use(src, def) { |
| nir_schedule_node *use_n = nir_schedule_get_node(instr_map, |
| src->parent_instr); |
| |
| add_read_dep(state, def_n, use_n); |
| } |
| |
| return true; |
| } |
| |
| static struct nir_schedule_class_dep * |
| nir_schedule_get_class_dep(nir_deps_state *state, |
| int klass) |
| { |
| for (struct nir_schedule_class_dep *class_dep = state->class_deps; |
| class_dep != NULL; |
| class_dep = class_dep->next) { |
| if (class_dep->klass == klass) |
| return class_dep; |
| } |
| |
| struct nir_schedule_class_dep *class_dep = |
| ralloc(state->reg_map, struct nir_schedule_class_dep); |
| |
| class_dep->klass = klass; |
| class_dep->node = NULL; |
| class_dep->next = state->class_deps; |
| |
| state->class_deps = class_dep; |
| |
| return class_dep; |
| } |
| |
| static void |
| nir_schedule_intrinsic_deps(nir_deps_state *state, |
| nir_intrinsic_instr *instr) |
| { |
| nir_schedule_node *n = nir_schedule_get_node(state->scoreboard->instr_map, |
| &instr->instr); |
| const nir_schedule_options *options = state->scoreboard->options; |
| nir_schedule_dependency dep; |
| |
| if (options->intrinsic_cb && |
| options->intrinsic_cb(instr, &dep, options->intrinsic_cb_data)) { |
| struct nir_schedule_class_dep *class_dep = |
| nir_schedule_get_class_dep(state, dep.klass); |
| |
| switch (dep.type) { |
| case NIR_SCHEDULE_READ_DEPENDENCY: |
| add_read_dep(state, class_dep->node, n); |
| break; |
| case NIR_SCHEDULE_WRITE_DEPENDENCY: |
| add_write_dep(state, &class_dep->node, n); |
| break; |
| } |
| } |
| |
| switch (instr->intrinsic) { |
| case nir_intrinsic_load_uniform: |
| case nir_intrinsic_load_ubo: |
| case nir_intrinsic_load_front_face: |
| break; |
| |
| case nir_intrinsic_discard: |
| case nir_intrinsic_discard_if: |
/* We are adding two dependencies:
*
* * An individual one that we could use to add a read_dep while handling
*   nir_instr_type_tex
*
* * One on the unknown_intrinsic set, as we want discards to be
*   serialized in the same order relative to intervening stores or
*   atomic accesses to SSBOs and images
*/
| add_write_dep(state, &state->discard, n); |
| add_write_dep(state, &state->unknown_intrinsic, n); |
| break; |
| |
| case nir_intrinsic_store_output: |
| /* For some hardware and stages, output stores affect the same shared |
| * memory as input loads. |
| */ |
| if ((state->scoreboard->options->stages_with_shared_io_memory & |
| (1 << state->scoreboard->shader->info.stage))) |
| add_write_dep(state, &state->load_input, n); |
| |
| /* Make sure that preceding discards stay before the store_output */ |
| add_read_dep(state, state->discard, n); |
| |
| break; |
| |
| case nir_intrinsic_load_input: |
| case nir_intrinsic_load_per_vertex_input: |
| add_read_dep(state, state->load_input, n); |
| break; |
| |
| case nir_intrinsic_load_shared: |
/* Don't move load_shared beyond a following store_shared, as that could
* change the value being loaded
*/
| add_read_dep(state, state->store_shared, n); |
| break; |
| |
| case nir_intrinsic_store_shared: |
| add_write_dep(state, &state->store_shared, n); |
| break; |
| |
| case nir_intrinsic_control_barrier: |
| case nir_intrinsic_memory_barrier_shared: |
| add_write_dep(state, &state->store_shared, n); |
| |
| /* Serialize against ssbos/atomics/etc. */ |
| add_write_dep(state, &state->unknown_intrinsic, n); |
| break; |
| |
| default: |
| /* Attempt to handle other intrinsics that we haven't individually |
| * categorized by serializing them in the same order relative to each |
| * other. |
| */ |
| add_write_dep(state, &state->unknown_intrinsic, n); |
| break; |
| } |
| } |
| |
| /** |
| * Common code for dependencies that need to be tracked both forward and |
| * backward. |
| * |
| * This is for things like "all reads of r4 have to happen between the r4 |
| * writes that surround them". |
| */ |
| static void |
| nir_schedule_calculate_deps(nir_deps_state *state, nir_schedule_node *n) |
| { |
| nir_instr *instr = n->instr; |
| |
| /* For NIR SSA defs, we only need to do a single pass of making the uses |
| * depend on the def. |
| */ |
| if (state->dir == F) |
| nir_foreach_ssa_def(instr, nir_schedule_ssa_deps, state); |
| |
| /* For NIR regs, track the last writer in the scheduler state so that we |
| * can keep the writes in order and let reads get reordered only between |
| * each write. |
| */ |
| nir_foreach_src(instr, nir_schedule_reg_src_deps, state); |
| |
| nir_foreach_dest(instr, nir_schedule_reg_dest_deps, state); |
| |
| /* Make sure any other instructions keep their positions relative to |
| * jumps. |
| */ |
| if (instr->type != nir_instr_type_jump) |
| add_read_dep(state, state->jump, n); |
| |
| switch (instr->type) { |
| case nir_instr_type_ssa_undef: |
| case nir_instr_type_load_const: |
| case nir_instr_type_alu: |
| case nir_instr_type_deref: |
| break; |
| |
| case nir_instr_type_tex: |
| /* Don't move texture ops before a discard, as that could increase |
| * memory bandwidth for reading the discarded samples. |
| */ |
| add_read_dep(state, state->discard, n); |
| break; |
| |
| case nir_instr_type_jump: |
| add_write_dep(state, &state->jump, n); |
| break; |
| |
| case nir_instr_type_call: |
| unreachable("Calls should have been lowered"); |
| break; |
| |
| case nir_instr_type_parallel_copy: |
| unreachable("Parallel copies should have been lowered"); |
| break; |
| |
| case nir_instr_type_phi: |
| unreachable("nir_schedule() should be called after lowering from SSA"); |
| break; |
| |
| case nir_instr_type_intrinsic: |
| nir_schedule_intrinsic_deps(state, nir_instr_as_intrinsic(instr)); |
| break; |
| } |
| } |
| |
| static void |
| calculate_forward_deps(nir_schedule_scoreboard *scoreboard, nir_block *block) |
| { |
| nir_deps_state state = { |
| .scoreboard = scoreboard, |
| .dir = F, |
| .reg_map = _mesa_pointer_hash_table_create(NULL), |
| }; |
| |
| nir_foreach_instr(instr, block) { |
| nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map, |
| instr); |
| nir_schedule_calculate_deps(&state, node); |
| } |
| |
| ralloc_free(state.reg_map); |
| } |
| |
| static void |
| calculate_reverse_deps(nir_schedule_scoreboard *scoreboard, nir_block *block) |
| { |
| nir_deps_state state = { |
| .scoreboard = scoreboard, |
| .dir = R, |
| .reg_map = _mesa_pointer_hash_table_create(NULL), |
| }; |
| |
| nir_foreach_instr_reverse(instr, block) { |
| nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map, |
| instr); |
| nir_schedule_calculate_deps(&state, node); |
| } |
| |
| ralloc_free(state.reg_map); |
| } |
| |
| typedef struct { |
| nir_schedule_scoreboard *scoreboard; |
| int regs_freed; |
| } nir_schedule_regs_freed_state; |
| |
| static bool |
| nir_schedule_regs_freed_src_cb(nir_src *src, void *in_state) |
| { |
| nir_schedule_regs_freed_state *state = in_state; |
| nir_schedule_scoreboard *scoreboard = state->scoreboard; |
| struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src); |
| |
| if (remaining_uses->entries == 1 && |
| _mesa_set_search(remaining_uses, src->parent_instr)) { |
| state->regs_freed += nir_schedule_src_pressure(src); |
| } |
| |
| return true; |
| } |
| |
| static bool |
| nir_schedule_regs_freed_def_cb(nir_ssa_def *def, void *in_state) |
| { |
| nir_schedule_regs_freed_state *state = in_state; |
| |
| state->regs_freed -= nir_schedule_def_pressure(def); |
| |
| return true; |
| } |
| |
| static bool |
| nir_schedule_regs_freed_dest_cb(nir_dest *dest, void *in_state) |
| { |
| nir_schedule_regs_freed_state *state = in_state; |
| nir_schedule_scoreboard *scoreboard = state->scoreboard; |
| |
| if (dest->is_ssa) |
| return true; |
| |
| nir_register *reg = dest->reg.reg; |
| |
| /* Only the first def of a reg counts against register pressure. */ |
| if (!_mesa_set_search(scoreboard->live_values, reg)) |
| state->regs_freed -= nir_schedule_dest_pressure(dest); |
| |
| return true; |
| } |
| |
| static int |
| nir_schedule_regs_freed(nir_schedule_scoreboard *scoreboard, nir_schedule_node *n) |
| { |
| nir_schedule_regs_freed_state state = { |
| .scoreboard = scoreboard, |
| }; |
| |
| nir_foreach_src(n->instr, nir_schedule_regs_freed_src_cb, &state); |
| |
| nir_foreach_ssa_def(n->instr, nir_schedule_regs_freed_def_cb, &state); |
| |
| nir_foreach_dest(n->instr, nir_schedule_regs_freed_dest_cb, &state); |
| |
| return state.regs_freed; |
| } |
| |
| /** |
* Chooses the instruction that minimizes register pressure. This should only
* be used as a fallback when regular scheduling produces a shader whose
* register allocation fails.
| */ |
| static nir_schedule_node * |
| nir_schedule_choose_instruction_fallback(nir_schedule_scoreboard *scoreboard) |
| { |
| nir_schedule_node *chosen = NULL; |
| |
/* Find the leader in the ready (shouldn't-stall) set with the minimum
| * cost. |
| */ |
| list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { |
| if (scoreboard->time < n->ready_time) |
| continue; |
| |
| if (!chosen || chosen->max_delay > n->max_delay) |
| chosen = n; |
| } |
| if (chosen) { |
| if (debug) { |
| fprintf(stderr, "chose (ready fallback): "); |
| nir_print_instr(chosen->instr, stderr); |
| fprintf(stderr, "\n"); |
| } |
| |
| return chosen; |
| } |
| |
| /* Otherwise, choose the leader with the minimum cost. */ |
| list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { |
| if (!chosen || chosen->max_delay > n->max_delay) |
| chosen = n; |
| } |
| if (debug) { |
| fprintf(stderr, "chose (leader fallback): "); |
| nir_print_instr(chosen->instr, stderr); |
| fprintf(stderr, "\n"); |
| } |
| |
| return chosen; |
| } |
| |
| /** |
| * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSP (Code |
| * Scheduling for Parallelism) heuristic. |
| * |
* Picks an instruction on the critical path that's ready to execute without
* stalls, if possible; otherwise picks the instruction on the critical path.
| */ |
| static nir_schedule_node * |
| nir_schedule_choose_instruction_csp(nir_schedule_scoreboard *scoreboard) |
| { |
| nir_schedule_node *chosen = NULL; |
| |
| /* Find the leader in the ready (shouldn't-stall) set with the maximum |
| * cost. |
| */ |
| list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { |
| if (scoreboard->time < n->ready_time) |
| continue; |
| |
| if (!chosen || chosen->max_delay < n->max_delay) |
| chosen = n; |
| } |
| if (chosen) { |
| if (debug) { |
| fprintf(stderr, "chose (ready): "); |
| nir_print_instr(chosen->instr, stderr); |
| fprintf(stderr, "\n"); |
| } |
| |
| return chosen; |
| } |
| |
| /* Otherwise, choose the leader with the maximum cost. */ |
| list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { |
| if (!chosen || chosen->max_delay < n->max_delay) |
| chosen = n; |
| } |
| if (debug) { |
| fprintf(stderr, "chose (leader): "); |
| nir_print_instr(chosen->instr, stderr); |
| fprintf(stderr, "\n"); |
| } |
| |
| return chosen; |
| } |
| |
| /** |
| * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSR (Code |
| * Scheduling for Register pressure) heuristic. |
| */ |
| static nir_schedule_node * |
| nir_schedule_choose_instruction_csr(nir_schedule_scoreboard *scoreboard) |
| { |
| nir_schedule_node *chosen = NULL; |
| |
| /* Find a ready inst with regs freed and pick the one with max cost. */ |
| list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { |
| if (n->ready_time > scoreboard->time) |
| continue; |
| |
| int regs_freed = nir_schedule_regs_freed(scoreboard, n); |
| |
| if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) { |
| chosen = n; |
| } |
| } |
| if (chosen) { |
| if (debug) { |
| fprintf(stderr, "chose (freed+ready): "); |
| nir_print_instr(chosen->instr, stderr); |
| fprintf(stderr, "\n"); |
| } |
| |
| return chosen; |
| } |
| |
| /* Find a leader with regs freed and pick the one with max cost. */ |
| list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { |
| int regs_freed = nir_schedule_regs_freed(scoreboard, n); |
| |
| if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) { |
| chosen = n; |
| } |
| } |
| if (chosen) { |
| if (debug) { |
| fprintf(stderr, "chose (regs freed): "); |
| nir_print_instr(chosen->instr, stderr); |
| fprintf(stderr, "\n"); |
| } |
| |
| return chosen; |
| } |
| |
| /* Find a partially evaluated path and try to finish it off */ |
| list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { |
| if (n->partially_evaluated_path && |
| (!chosen || chosen->max_delay < n->max_delay)) { |
| chosen = n; |
| } |
| } |
| if (chosen) { |
| if (debug) { |
| fprintf(stderr, "chose (partial path): "); |
| nir_print_instr(chosen->instr, stderr); |
| fprintf(stderr, "\n"); |
| } |
| |
| return chosen; |
| } |
| |
/* Contra the paper, pick a leader with no effect on used regs. This may
* open up new opportunities: otherwise, a single-operand instr that
* consumes a value without freeing it would block us from ever reaching
* the instruction that finally frees that value. This had a massive
* effect on reducing spilling on V3D.
| * |
| * XXX: Should this prioritize ready? |
| */ |
| list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { |
| if (nir_schedule_regs_freed(scoreboard, n) != 0) |
| continue; |
| |
| if (!chosen || chosen->max_delay < n->max_delay) |
| chosen = n; |
| } |
| if (chosen) { |
| if (debug) { |
| fprintf(stderr, "chose (regs no-op): "); |
| nir_print_instr(chosen->instr, stderr); |
| fprintf(stderr, "\n"); |
| } |
| |
| return chosen; |
| } |
| |
| /* Pick the max delay of the remaining ready set. */ |
| list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { |
| if (n->ready_time > scoreboard->time) |
| continue; |
| if (!chosen || chosen->max_delay < n->max_delay) |
| chosen = n; |
| } |
| if (chosen) { |
| if (debug) { |
| fprintf(stderr, "chose (ready max delay): "); |
| nir_print_instr(chosen->instr, stderr); |
| fprintf(stderr, "\n"); |
| } |
| return chosen; |
| } |
| |
| /* Pick the max delay of the remaining leaders. */ |
| list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { |
| if (!chosen || chosen->max_delay < n->max_delay) |
| chosen = n; |
| } |
| |
| if (debug) { |
| fprintf(stderr, "chose (max delay): "); |
| nir_print_instr(chosen->instr, stderr); |
| fprintf(stderr, "\n"); |
| } |
| |
| return chosen; |
| } |
| |
| static void |
| dump_state(nir_schedule_scoreboard *scoreboard) |
| { |
| list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { |
| fprintf(stderr, "maxdel %5d ", n->max_delay); |
| nir_print_instr(n->instr, stderr); |
| fprintf(stderr, "\n"); |
| |
| util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) { |
| nir_schedule_node *child = (nir_schedule_node *)edge->child; |
| |
| fprintf(stderr, " -> (%d parents) ", child->dag.parent_count); |
| nir_print_instr(child->instr, stderr); |
| fprintf(stderr, "\n"); |
| } |
| } |
| } |
| |
| static void |
| nir_schedule_mark_use(nir_schedule_scoreboard *scoreboard, |
| void *reg_or_def, |
| nir_instr *reg_or_def_parent, |
| int pressure) |
| { |
| /* Make the value live if it's the first time it's been used. */ |
| if (!_mesa_set_search(scoreboard->live_values, reg_or_def)) { |
| _mesa_set_add(scoreboard->live_values, reg_or_def); |
| scoreboard->pressure += pressure; |
| } |
| |
| /* Make the value dead if it's the last remaining use. Be careful when one |
| * instruction uses a value twice to not decrement pressure twice. |
| */ |
| struct set *remaining_uses = |
| _mesa_hash_table_search_data(scoreboard->remaining_uses, reg_or_def); |
| struct set_entry *entry = _mesa_set_search(remaining_uses, reg_or_def_parent); |
| if (entry) { |
| _mesa_set_remove(remaining_uses, entry); |
| |
| if (remaining_uses->entries == 0) |
| scoreboard->pressure -= pressure; |
| } |
| } |
| |
| static bool |
| nir_schedule_mark_src_scheduled(nir_src *src, void *state) |
| { |
| nir_schedule_scoreboard *scoreboard = state; |
| struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src); |
| |
| struct set_entry *entry = _mesa_set_search(remaining_uses, |
| src->parent_instr); |
| if (entry) { |
| /* Once we've used an SSA value in one instruction, bump the priority of |
| * the other uses so the SSA value can get fully consumed. |
| * |
* We don't do this for registers, as it would be a hassle and it's
* unclear whether it would help. Also, skip it for constants, as they're
* often folded as immediates into backend instructions, with many
* unrelated instructions all referencing the same value (0).
| */ |
| if (src->is_ssa && |
| src->ssa->parent_instr->type != nir_instr_type_load_const) { |
| nir_foreach_use(other_src, src->ssa) { |
| if (other_src->parent_instr == src->parent_instr) |
| continue; |
| |
| nir_schedule_node *n = |
| nir_schedule_get_node(scoreboard->instr_map, |
| other_src->parent_instr); |
| |
| if (n && !n->partially_evaluated_path) { |
| if (debug) { |
| fprintf(stderr, " New partially evaluated path: "); |
| nir_print_instr(n->instr, stderr); |
| fprintf(stderr, "\n"); |
| } |
| |
| n->partially_evaluated_path = true; |
| } |
| } |
| } |
| } |
| |
| nir_schedule_mark_use(scoreboard, |
| src->is_ssa ? (void *)src->ssa : (void *)src->reg.reg, |
| src->parent_instr, |
| nir_schedule_src_pressure(src)); |
| |
| return true; |
| } |
| |
| static bool |
| nir_schedule_mark_def_scheduled(nir_ssa_def *def, void *state) |
| { |
| nir_schedule_scoreboard *scoreboard = state; |
| |
| nir_schedule_mark_use(scoreboard, def, def->parent_instr, |
| nir_schedule_def_pressure(def)); |
| |
| return true; |
| } |
| |
| static bool |
| nir_schedule_mark_dest_scheduled(nir_dest *dest, void *state) |
| { |
| nir_schedule_scoreboard *scoreboard = state; |
| |
/* SSA defs were handled in nir_schedule_mark_def_scheduled() */
| if (dest->is_ssa) |
| return true; |
| |
| /* XXX: This is not actually accurate for regs -- the last use of a reg may |
| * have a live interval that extends across control flow. We should |
| * calculate the live ranges of regs, and have scheduler nodes for the CF |
| * nodes that also "use" the reg. |
| */ |
| nir_schedule_mark_use(scoreboard, dest->reg.reg, |
| dest->reg.parent_instr, |
| nir_schedule_dest_pressure(dest)); |
| |
| return true; |
| } |
| |
| static void |
| nir_schedule_mark_node_scheduled(nir_schedule_scoreboard *scoreboard, |
| nir_schedule_node *n) |
| { |
| nir_foreach_src(n->instr, nir_schedule_mark_src_scheduled, scoreboard); |
| nir_foreach_ssa_def(n->instr, nir_schedule_mark_def_scheduled, scoreboard); |
| nir_foreach_dest(n->instr, nir_schedule_mark_dest_scheduled, scoreboard); |
| |
| util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) { |
| nir_schedule_node *child = (nir_schedule_node *)edge->child; |
| |
| child->ready_time = MAX2(child->ready_time, |
| scoreboard->time + n->delay); |
| |
| if (child->dag.parent_count == 1) { |
| if (debug) { |
| fprintf(stderr, " New DAG head: "); |
| nir_print_instr(child->instr, stderr); |
| fprintf(stderr, "\n"); |
| } |
| } |
| } |
| |
| dag_prune_head(scoreboard->dag, &n->dag); |
| |
| scoreboard->time = MAX2(n->ready_time, scoreboard->time); |
| scoreboard->time++; |
| } |
| |
| static void |
| nir_schedule_instructions(nir_schedule_scoreboard *scoreboard, nir_block *block) |
| { |
| while (!list_is_empty(&scoreboard->dag->heads)) { |
| if (debug) { |
| fprintf(stderr, "current list:\n"); |
| dump_state(scoreboard); |
| } |
| |
| nir_schedule_node *chosen; |
| if (scoreboard->options->fallback) |
| chosen = nir_schedule_choose_instruction_fallback(scoreboard); |
| else if (scoreboard->pressure < scoreboard->options->threshold) |
| chosen = nir_schedule_choose_instruction_csp(scoreboard); |
| else |
| chosen = nir_schedule_choose_instruction_csr(scoreboard); |
| |
| /* Now that we've scheduled a new instruction, some of its children may |
| * be promoted to the list of instructions ready to be scheduled. |
| */ |
| nir_schedule_mark_node_scheduled(scoreboard, chosen); |
| |
| /* Move the instruction to the end (so our first chosen instructions are |
| * the start of the program). |
| */ |
| exec_node_remove(&chosen->instr->node); |
| exec_list_push_tail(&block->instr_list, &chosen->instr->node); |
| |
| if (debug) |
| fprintf(stderr, "\n"); |
| } |
| } |
| |
| static uint32_t |
| nir_schedule_get_delay(nir_instr *instr) |
| { |
| switch (instr->type) { |
| case nir_instr_type_ssa_undef: |
| case nir_instr_type_load_const: |
| case nir_instr_type_alu: |
| case nir_instr_type_deref: |
| case nir_instr_type_jump: |
| case nir_instr_type_parallel_copy: |
| case nir_instr_type_call: |
| case nir_instr_type_phi: |
| return 1; |
| |
| case nir_instr_type_intrinsic: |
| /* XXX: Pick a large number for UBO/SSBO/image/shared loads */ |
| return 1; |
| |
| case nir_instr_type_tex: |
| /* Pick some large number to try to fetch textures early and sample them |
| * late. |
| */ |
| return 100; |
| } |
| |
| return 0; |
| } |
| |
| static void |
| nir_schedule_dag_max_delay_cb(struct dag_node *node, void *state) |
| { |
| nir_schedule_node *n = (nir_schedule_node *)node; |
| uint32_t max_delay = 0; |
| |
| util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) { |
| nir_schedule_node *child = (nir_schedule_node *)edge->child; |
| max_delay = MAX2(child->max_delay, max_delay); |
| } |
| |
| n->max_delay = MAX2(n->max_delay, max_delay + n->delay); |
| } |
| |
| static void |
| nir_schedule_block(nir_schedule_scoreboard *scoreboard, nir_block *block) |
| { |
| void *mem_ctx = ralloc_context(NULL); |
| scoreboard->instr_map = _mesa_pointer_hash_table_create(mem_ctx); |
| |
| scoreboard->dag = dag_create(mem_ctx); |
| |
| nir_foreach_instr(instr, block) { |
| nir_schedule_node *n = |
| rzalloc(mem_ctx, nir_schedule_node); |
| |
| n->instr = instr; |
| n->delay = nir_schedule_get_delay(instr); |
| dag_init_node(scoreboard->dag, &n->dag); |
| |
| _mesa_hash_table_insert(scoreboard->instr_map, instr, n); |
| } |
| |
| calculate_forward_deps(scoreboard, block); |
| calculate_reverse_deps(scoreboard, block); |
| |
| dag_traverse_bottom_up(scoreboard->dag, nir_schedule_dag_max_delay_cb, NULL); |
| |
| nir_schedule_instructions(scoreboard, block); |
| |
| ralloc_free(mem_ctx); |
| scoreboard->instr_map = NULL; |
| } |
| |
| static bool |
| nir_schedule_ssa_def_init_scoreboard(nir_ssa_def *def, void *state) |
| { |
| nir_schedule_scoreboard *scoreboard = state; |
| struct set *def_uses = _mesa_pointer_set_create(scoreboard); |
| |
| _mesa_hash_table_insert(scoreboard->remaining_uses, def, def_uses); |
| |
| _mesa_set_add(def_uses, def->parent_instr); |
| |
| nir_foreach_use(src, def) { |
| _mesa_set_add(def_uses, src->parent_instr); |
| } |
| |
| /* XXX: Handle if uses */ |
| |
| return true; |
| } |
| |
| static nir_schedule_scoreboard * |
| nir_schedule_get_scoreboard(nir_shader *shader, |
| const nir_schedule_options *options) |
| { |
| nir_schedule_scoreboard *scoreboard = rzalloc(NULL, nir_schedule_scoreboard); |
| |
| scoreboard->shader = shader; |
| scoreboard->live_values = _mesa_pointer_set_create(scoreboard); |
| scoreboard->remaining_uses = _mesa_pointer_hash_table_create(scoreboard); |
| scoreboard->options = options; |
| scoreboard->pressure = 0; |
| |
| nir_foreach_function(function, shader) { |
| nir_foreach_register(reg, &function->impl->registers) { |
| struct set *register_uses = |
| _mesa_pointer_set_create(scoreboard); |
| |
| _mesa_hash_table_insert(scoreboard->remaining_uses, reg, register_uses); |
| |
| nir_foreach_use(src, reg) { |
| _mesa_set_add(register_uses, src->parent_instr); |
| } |
| |
| /* XXX: Handle if uses */ |
| |
| nir_foreach_def(dest, reg) { |
| _mesa_set_add(register_uses, dest->reg.parent_instr); |
| } |
| } |
| |
| nir_foreach_block(block, function->impl) { |
| nir_foreach_instr(instr, block) { |
| nir_foreach_ssa_def(instr, nir_schedule_ssa_def_init_scoreboard, |
| scoreboard); |
| } |
| |
| /* XXX: We're ignoring if uses, which may prioritize scheduling other |
| * uses of the if src even when it doesn't help. That's not many |
| * values, though, so meh. |
| */ |
| } |
| } |
| |
| return scoreboard; |
| } |
| |
| static void |
| nir_schedule_validate_uses(nir_schedule_scoreboard *scoreboard) |
| { |
| #ifdef NDEBUG |
| return; |
| #endif |
| |
| bool any_uses = false; |
| |
| hash_table_foreach(scoreboard->remaining_uses, entry) { |
| struct set *remaining_uses = entry->data; |
| |
| set_foreach(remaining_uses, instr_entry) { |
| if (!any_uses) { |
| fprintf(stderr, "Tracked uses remain after scheduling. " |
| "Affected instructions: \n"); |
| any_uses = true; |
| } |
| nir_print_instr(instr_entry->key, stderr); |
| fprintf(stderr, "\n"); |
| } |
| } |
| |
| assert(!any_uses); |
| } |
| |
| /** |
| * Schedules the NIR instructions to try to decrease stalls (for example, |
| * delaying texture reads) while managing register pressure. |
| * |
| * The threshold represents "number of NIR register/SSA def channels live |
| * before switching the scheduling heuristic to reduce register pressure", |
| * since most of our GPU architectures are scalar (extending to vector with a |
| * flag wouldn't be hard). This number should be a bit below the number of |
| * registers available (counting any that may be occupied by system value |
| * payload values, for example), since the heuristic may not always be able to |
| * free a register immediately. The amount below the limit is up to you to |
| * tune. |
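*
* A hypothetical invocation (the threshold value is illustrative, not tuned
* for any real hardware):
*
*    nir_schedule_options options = {
*       .threshold = 48,
*    };
*    nir_schedule(shader, &options);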
| */ |
| void |
| nir_schedule(nir_shader *shader, |
| const nir_schedule_options *options) |
| { |
| nir_schedule_scoreboard *scoreboard = nir_schedule_get_scoreboard(shader, |
| options); |
| |
| if (debug) { |
| fprintf(stderr, "NIR shader before scheduling:\n"); |
| nir_print_shader(shader, stderr); |
| } |
| |
| nir_foreach_function(function, shader) { |
| if (!function->impl) |
| continue; |
| |
| nir_foreach_block(block, function->impl) { |
| nir_schedule_block(scoreboard, block); |
| } |
| } |
| |
| nir_schedule_validate_uses(scoreboard); |
| |
| ralloc_free(scoreboard); |
| } |