| /* |
| * Copyright © 2010 Intel Corporation |
| * Copyright © 2014-2015 Broadcom |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a |
| * copy of this software and associated documentation files (the "Software"), |
| * to deal in the Software without restriction, including without limitation |
| * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| * and/or sell copies of the Software, and to permit persons to whom the |
| * Software is furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice (including the next |
| * paragraph) shall be included in all copies or substantial portions of the |
| * Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
| * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS |
| * IN THE SOFTWARE. |
| */ |
| |
| /** |
| * @file vc4_qir_schedule.c |
| * |
| * The basic model of the list scheduler is to take a basic block, compute a |
| * DAG of the dependencies from the bottom up, and make a list of the DAG |
| * heads. Heuristically pick a DAG head and schedule (remove) it, then put |
| * all the parents that are now DAG heads into the list of things to |
| * schedule. |
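 *
 * In code terms (see schedule_instructions() below), the core loop is
 * roughly:
 *
 *    while (!list_is_empty(&state->dag->heads)) {
 *            struct schedule_node *chosen = choose_instruction(state);
 *            ...emit chosen->inst and update bookkeeping...
 *            dag_prune_head(state->dag, &chosen->dag);
 *    }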
| * |
| * The goal of scheduling here, before register allocation and conversion to |
| * QPU instructions, is to reduce register pressure by reordering instructions |
| * to consume values when possible. |
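 *
 * For example, if a temp's only use sits far below its def, scheduling the
 * use closer to the def shortens the temp's live range and frees up a
 * register for the instructions in between.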
| */ |
| |
| #include "vc4_qir.h" |
| #include "util/dag.h" |
| |
/* Set to true for verbose dumps of scheduling state to stderr. */
static bool debug;
| |
| struct schedule_node { |
| struct dag_node dag; |
| struct list_head link; |
| struct qinst *inst; |
| |
        /* Length of the longest (latency) chain from a DAG head to this
         * instruction.
         */
| uint32_t delay; |
| |
        /* Longest time + latency_between(this, parent) of any parent of
         * this node.
         */
| uint32_t unblocked_time; |
| }; |
| |
struct schedule_state {
        struct dag *dag;

        uint32_t time;

        /* Number of writes to each temp that remain unscheduled. */
        uint32_t *temp_writes;

        /* Temps that are live at the current point of the bottom-up
         * schedule: consumed by something already scheduled, but with a
         * write still unscheduled.
         */
        BITSET_WORD *temp_live;
};
| |
/* When walking the instructions in reverse, we need to swap before/after in
 * add_dep().  F is a forward (top-to-bottom) walk, R a reverse walk.
 */
enum direction { F, R };
| |
| /** |
| * Marks a dependency between two intructions, that \p after must appear after |
| * \p before. |
| * |
| * Our dependencies are tracked as a DAG. Since we're scheduling bottom-up, |
| * the latest instructions with nothing left to schedule are the DAG heads, |
| * and their inputs are their children. |
| */ |
| static void |
| add_dep(enum direction dir, |
| struct schedule_node *before, |
| struct schedule_node *after) |
| { |
| if (!before || !after) |
| return; |
| |
| assert(before != after); |
| |
| if (dir == R) { |
| struct schedule_node *t = before; |
| before = after; |
| after = t; |
| } |
| |
| dag_add_edge(&after->dag, &before->dag, NULL); |
| } |
| |
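/* Adds a dependency on the previous node that wrote to this stream, and
 * makes \p after the new tail that subsequent accesses will depend on.
 */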
| static void |
| add_write_dep(enum direction dir, |
| struct schedule_node **before, |
| struct schedule_node *after) |
| { |
| add_dep(dir, *before, after); |
| *before = after; |
| } |
| |
| struct schedule_setup_state { |
| struct schedule_node **last_temp_write; |
| struct schedule_node *last_sf; |
| struct schedule_node *last_vary_read; |
| struct schedule_node *last_vpm_read; |
| struct schedule_node *last_vpm_write; |
| struct schedule_node *last_tex_coord; |
| struct schedule_node *last_tex_result; |
| struct schedule_node *last_tlb; |
| struct schedule_node *last_uniforms_reset; |
| enum direction dir; |
| |
| /** |
| * Texture FIFO tracking. This is done top-to-bottom, and is used to |
| * track the QOP_TEX_RESULTs and add dependencies on previous ones |
         * when trying to submit texture coords with TFREQ full or new texture
         * fetches with TFRCV full.
| */ |
| struct { |
| struct schedule_node *node; |
| int coords; |
| } tex_fifo[8]; |
| int tfreq_count; /**< Number of texture coords outstanding. */ |
| int tfrcv_count; /**< Number of texture results outstanding. */ |
| int tex_fifo_pos; |
| }; |
| |
/**
 * Blocks \p n on the oldest outstanding QOP_TEX_RESULT and retires that
 * FIFO entry, since scheduling past it guarantees FIFO space is free.
 */
static void
block_until_tex_result(struct schedule_setup_state *state, struct schedule_node *n)
| { |
| add_dep(state->dir, state->tex_fifo[0].node, n); |
| |
| state->tfreq_count -= state->tex_fifo[0].coords; |
| state->tfrcv_count--; |
| |
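        /* Pop the oldest request off our FIFO tracking. */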
| memmove(&state->tex_fifo[0], |
| &state->tex_fifo[1], |
| state->tex_fifo_pos * sizeof(state->tex_fifo[0])); |
| state->tex_fifo_pos--; |
| } |
| |
| /** |
| * Common code for dependencies that need to be tracked both forward and |
| * backward. |
| * |
| * This is for things like "all VPM reads have to happen in order." |
| */ |
| static void |
| calculate_deps(struct schedule_setup_state *state, struct schedule_node *n) |
| { |
| struct qinst *inst = n->inst; |
        enum direction dir = state->dir;

        /* Add deps for temp register and varying accesses.  Note that we
         * ignore uniform accesses, because qir_reorder_uniforms() happens
         * after this.
         */
| for (int i = 0; i < qir_get_nsrc(inst); i++) { |
| switch (inst->src[i].file) { |
| case QFILE_TEMP: |
| add_dep(dir, |
| state->last_temp_write[inst->src[i].index], n); |
| break; |
| |
| case QFILE_VARY: |
| add_write_dep(dir, &state->last_vary_read, n); |
| break; |
| |
| case QFILE_VPM: |
| add_write_dep(dir, &state->last_vpm_read, n); |
| break; |
| |
| default: |
| break; |
| } |
| } |
| |
| switch (inst->op) { |
| case QOP_VARY_ADD_C: |
| add_dep(dir, state->last_vary_read, n); |
| break; |
| |
| case QOP_TEX_RESULT: |
| /* Results have to be fetched in order. */ |
| add_write_dep(dir, &state->last_tex_result, n); |
| break; |
| |
| case QOP_THRSW: |
                /* After a new THRSW, one must collect all texture samples
                 * queued since the previous THRSW/program start.  For now,
                 * our input has one THRSW between each texture setup and its
                 * result collection, and we just make sure that ordering is
                 * maintained.
                 */
| add_write_dep(dir, &state->last_tex_coord, n); |
| add_write_dep(dir, &state->last_tex_result, n); |
| |
                /* Accumulators and flags are lost across thread switches. */
| add_write_dep(dir, &state->last_sf, n); |
| |
| /* Setup, like the varyings, will need to be drained before we |
| * thread switch. |
| */ |
| add_write_dep(dir, &state->last_vary_read, n); |
| |
| /* The TLB-locking operations have to stay after the last |
| * thread switch. |
| */ |
| add_write_dep(dir, &state->last_tlb, n); |
| break; |
| |
| case QOP_TLB_COLOR_READ: |
| case QOP_MS_MASK: |
| add_write_dep(dir, &state->last_tlb, n); |
| break; |
| |
| default: |
| break; |
| } |
| |
| switch (inst->dst.file) { |
| case QFILE_VPM: |
| add_write_dep(dir, &state->last_vpm_write, n); |
| break; |
| |
| case QFILE_TEMP: |
| add_write_dep(dir, &state->last_temp_write[inst->dst.index], n); |
| break; |
| |
| case QFILE_TLB_COLOR_WRITE: |
| case QFILE_TLB_COLOR_WRITE_MS: |
| case QFILE_TLB_Z_WRITE: |
| case QFILE_TLB_STENCIL_SETUP: |
| add_write_dep(dir, &state->last_tlb, n); |
| break; |
| |
| case QFILE_TEX_S_DIRECT: |
| case QFILE_TEX_S: |
| case QFILE_TEX_T: |
| case QFILE_TEX_R: |
| case QFILE_TEX_B: |
                /* Texture setup writes get scheduled in order, because
                 * the uniforms they reference have to land in a
                 * specific order.
| */ |
| add_write_dep(dir, &state->last_tex_coord, n); |
| break; |
| |
| default: |
| break; |
| } |
| |
| if (qir_depends_on_flags(inst)) |
| add_dep(dir, state->last_sf, n); |
| |
| if (inst->sf) |
| add_write_dep(dir, &state->last_sf, n); |
| } |
| |
| static void |
| calculate_forward_deps(struct vc4_compile *c, void *mem_ctx, |
| struct list_head *schedule_list) |
| { |
| struct schedule_setup_state state; |
| |
| memset(&state, 0, sizeof(state)); |
| state.last_temp_write = rzalloc_array(mem_ctx, struct schedule_node *, |
| c->num_temps); |
| state.dir = F; |
| |
| list_for_each_entry(struct schedule_node, n, schedule_list, link) { |
| struct qinst *inst = n->inst; |
| |
| calculate_deps(&state, n); |
| |
| for (int i = 0; i < qir_get_nsrc(inst); i++) { |
| switch (inst->src[i].file) { |
| case QFILE_UNIF: |
| add_dep(state.dir, state.last_uniforms_reset, n); |
| break; |
| default: |
| break; |
| } |
| } |
| |
| switch (inst->dst.file) { |
| case QFILE_TEX_S_DIRECT: |
| case QFILE_TEX_S: |
| case QFILE_TEX_T: |
| case QFILE_TEX_R: |
| case QFILE_TEX_B: |
| /* From the VC4 spec: |
| * |
| * "The TFREQ input FIFO holds two full lots of s, |
| * t, r, b data, plus associated setup data, per |
| * QPU, that is, there are eight data slots. For |
| * each texture request, slots are only consumed |
| * for the components of s, t, r, and b actually |
| * written. Thus the FIFO can hold four requests |
| * of just (s, t) data, or eight requests of just |
| * s data (for direct addressed data lookups). |
| * |
| * Note that there is one FIFO per QPU, and the |
| * FIFO has no concept of threads - that is, |
| * multi-threaded shaders must be careful to use |
| * only 1/2 the FIFO depth before reading |
| * back. Multi-threaded programs must also |
| * therefore always thread switch on texture |
| * fetch as the other thread may have data |
| * waiting in the FIFO." |
| * |
| * If the texture coordinate fifo is full, block this |
| * on the last QOP_TEX_RESULT. |
| */ |
| if (state.tfreq_count == (c->fs_threaded ? 4 : 8)) { |
| block_until_tex_result(&state, n); |
| } |
| |
| /* From the VC4 spec: |
| * |
| * "Since the maximum number of texture requests |
| * in the input (TFREQ) FIFO is four lots of (s, |
                         * t) data, the output (TFRCV) FIFO is sized to
                         * hold four lots of max-size color data per
| * QPU. For non-float color, reads are packed |
| * RGBA8888 data (one read per pixel). For 16-bit |
| * float color, two reads are necessary per |
| * pixel, with reads packed as RG1616 then |
| * BA1616. So per QPU there are eight color slots |
| * in the TFRCV FIFO." |
| * |
| * If the texture result fifo is full, block adding |
| * any more to it until the last QOP_TEX_RESULT. |
| */ |
| if (inst->dst.file == QFILE_TEX_S || |
| inst->dst.file == QFILE_TEX_S_DIRECT) { |
| if (state.tfrcv_count == |
| (c->fs_threaded ? 2 : 4)) |
| block_until_tex_result(&state, n); |
| state.tfrcv_count++; |
| } |
| |
| state.tex_fifo[state.tex_fifo_pos].coords++; |
| state.tfreq_count++; |
| break; |
| |
| default: |
| break; |
| } |
| |
| switch (inst->op) { |
| case QOP_TEX_RESULT: |
| /* Results have to be fetched after the |
| * coordinate setup. Note that we're assuming |
| * here that our input shader has the texture |
| * coord setup and result fetch in order, |
                         * which is true initially but not necessarily true
                         * of our instruction stream after this pass.
| */ |
| add_dep(state.dir, state.last_tex_coord, n); |
| |
| state.tex_fifo[state.tex_fifo_pos].node = n; |
| |
| state.tex_fifo_pos++; |
| memset(&state.tex_fifo[state.tex_fifo_pos], 0, |
| sizeof(state.tex_fifo[0])); |
| break; |
| |
| case QOP_UNIFORMS_RESET: |
| add_write_dep(state.dir, &state.last_uniforms_reset, n); |
| break; |
| |
| default: |
| break; |
| } |
| } |
| } |
| |
| static void |
| calculate_reverse_deps(struct vc4_compile *c, void *mem_ctx, |
| struct list_head *schedule_list) |
| { |
| struct schedule_setup_state state; |
| |
| memset(&state, 0, sizeof(state)); |
| state.dir = R; |
| state.last_temp_write = rzalloc_array(mem_ctx, struct schedule_node *, |
| c->num_temps); |
| |
| list_for_each_entry_rev(struct schedule_node, n, schedule_list, link) { |
| calculate_deps(&state, n); |
| } |
| } |
| |
| static int |
| get_register_pressure_cost(struct schedule_state *state, struct qinst *inst) |
| { |
| int cost = 0; |
| |
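        /* Scheduling the last remaining write of a temp (bottom-up) closes
         * its live range, freeing a register.
         */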
| if (inst->dst.file == QFILE_TEMP && |
| state->temp_writes[inst->dst.index] == 1) |
| cost--; |
| |
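        /* Each distinct temp source that isn't already live below this
         * point starts a new live range when scheduled, costing a register.
         */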
| for (int i = 0; i < qir_get_nsrc(inst); i++) { |
| if (inst->src[i].file != QFILE_TEMP || |
| BITSET_TEST(state->temp_live, inst->src[i].index)) { |
| continue; |
| } |
| |
| bool already_counted = false; |
| for (int j = 0; j < i; j++) { |
| if (inst->src[i].file == inst->src[j].file && |
| inst->src[i].index == inst->src[j].index) { |
| already_counted = true; |
| } |
| } |
| if (!already_counted) |
| cost++; |
| } |
| |
| return cost; |
| } |
| |
| static bool |
| locks_scoreboard(struct qinst *inst) |
| { |
| if (inst->op == QOP_TLB_COLOR_READ) |
| return true; |
| |
| switch (inst->dst.file) { |
| case QFILE_TLB_Z_WRITE: |
| case QFILE_TLB_COLOR_WRITE: |
| case QFILE_TLB_COLOR_WRITE_MS: |
| return true; |
| default: |
| return false; |
| } |
| } |
| |
/**
 * Picks the next (bottom-up) instruction to schedule from the DAG heads.
 *
 * Heuristics apply in priority order: keep QOP_BRANCH as the block's last
 * instruction, prefer scoreboard-locking ops, then less expected stalling,
 * then register pressure reduction, then the deepest latency chain.
 */
static struct schedule_node *
choose_instruction(struct schedule_state *state)
| { |
| struct schedule_node *chosen = NULL; |
| |
| list_for_each_entry(struct schedule_node, n, &state->dag->heads, |
| dag.link) { |
| /* The branches aren't being tracked as dependencies. Make |
| * sure that they stay scheduled as the last instruction of |
| * the block, which is to say the first one we choose to |
| * schedule. |
| */ |
| if (n->inst->op == QOP_BRANCH) |
| return n; |
| |
| if (!chosen) { |
| chosen = n; |
| continue; |
| } |
| |
| /* Prefer scheduling things that lock the scoreboard, so that |
| * they appear late in the program and we get more parallelism |
| * between shaders on multiple QPUs hitting the same fragment. |
| */ |
| if (locks_scoreboard(n->inst) && |
| !locks_scoreboard(chosen->inst)) { |
| chosen = n; |
| continue; |
| } else if (!locks_scoreboard(n->inst) && |
| locks_scoreboard(chosen->inst)) { |
| continue; |
| } |
| |
| /* If we would block on the previously chosen node, but would |
| * block less on this one, then prefer it. |
| */ |
| if (chosen->unblocked_time > state->time && |
| n->unblocked_time < chosen->unblocked_time) { |
| chosen = n; |
| continue; |
| } else if (n->unblocked_time > state->time && |
| n->unblocked_time > chosen->unblocked_time) { |
| continue; |
| } |
| |
| /* If we can definitely reduce register pressure, do so |
| * immediately. |
| */ |
| int register_pressure_cost = |
| get_register_pressure_cost(state, n->inst); |
| int chosen_register_pressure_cost = |
| get_register_pressure_cost(state, chosen->inst); |
| |
| if (register_pressure_cost < chosen_register_pressure_cost) { |
| chosen = n; |
| continue; |
| } else if (register_pressure_cost > |
| chosen_register_pressure_cost) { |
| continue; |
| } |
| |
| /* Otherwise, prefer instructions with the deepest chain to |
| * the end of the program. This avoids the problem of |
| * "everything generates a temp, nothing finishes freeing one, |
| * guess I'll just keep emitting varying mul/adds". |
| */ |
| if (n->delay > chosen->delay) { |
| chosen = n; |
| continue; |
| } else if (n->delay < chosen->delay) { |
| continue; |
| } |
| } |
| |
| return chosen; |
| } |
| |
| static void |
| dump_state(struct vc4_compile *c, struct schedule_state *state) |
| { |
| uint32_t i = 0; |
| list_for_each_entry(struct schedule_node, n, &state->dag->heads, |
| dag.link) { |
| fprintf(stderr, "%3d: ", i++); |
| qir_dump_inst(c, n->inst); |
| fprintf(stderr, " (%d cost)\n", |
| get_register_pressure_cost(state, n->inst)); |
| |
| util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) { |
| struct schedule_node *child = |
| (struct schedule_node *)edge->child; |
| fprintf(stderr, " - "); |
| qir_dump_inst(c, child->inst); |
| fprintf(stderr, " (%d parents)\n", |
| child->dag.parent_count); |
| } |
| } |
| } |
| |
| /* Estimate of how many instructions we should schedule between operations. |
| * |
| * These aren't in real cycle counts, because we're just estimating cycle |
| * times anyway. QIR instructions will get paired up when turned into QPU |
| * instructions, or extra NOP delays will have to be added due to register |
| * allocation choices. |
| */ |
| static uint32_t |
| latency_between(struct schedule_node *before, struct schedule_node *after) |
| { |
| if ((before->inst->dst.file == QFILE_TEX_S || |
| before->inst->dst.file == QFILE_TEX_S_DIRECT) && |
| after->inst->op == QOP_TEX_RESULT) |
| return 100; |
| |
| switch (before->inst->op) { |
| case QOP_RCP: |
| case QOP_RSQ: |
| case QOP_EXP2: |
| case QOP_LOG2: |
| for (int i = 0; i < qir_get_nsrc(after->inst); i++) { |
| if (after->inst->src[i].file == |
| before->inst->dst.file && |
| after->inst->src[i].index == |
| before->inst->dst.index) { |
| /* There are two QPU delay slots before we can |
| * read a math result, which could be up to 4 |
| * QIR instructions if they packed well. |
| */ |
| return 4; |
| } |
| } |
| break; |
| default: |
| break; |
| } |
| |
| return 1; |
| } |
| |
| /** Recursive computation of the delay member of a node. */ |
| static void |
| compute_delay(struct dag_node *node, void *state) |
| { |
| struct schedule_node *n = (struct schedule_node *)node; |
| |
| /* The color read needs to be scheduled late, to avoid locking the |
| * scoreboard early. This is our best tool for encouraging that. The |
| * other scoreboard locking ops will have this happen by default, |
| * since they are generally the DAG heads or close to them. |
| */ |
| if (n->inst->op == QOP_TLB_COLOR_READ) |
| n->delay = 1000; |
| else |
| n->delay = 1; |
| |
| util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) { |
| struct schedule_node *child = |
| (struct schedule_node *)edge->child; |
| |
| n->delay = MAX2(n->delay, (child->delay + |
| latency_between(child, n))); |
| } |
| } |
| |
| static void |
| schedule_instructions(struct vc4_compile *c, |
| struct qblock *block, struct schedule_state *state) |
| { |
| if (debug) { |
| fprintf(stderr, "initial deps:\n"); |
| dump_state(c, state); |
| } |
| |
| state->time = 0; |
| while (!list_is_empty(&state->dag->heads)) { |
| struct schedule_node *chosen = choose_instruction(state); |
| struct qinst *inst = chosen->inst; |
| |
| if (debug) { |
| fprintf(stderr, "current list:\n"); |
| dump_state(c, state); |
| fprintf(stderr, "chose: "); |
| qir_dump_inst(c, inst); |
| fprintf(stderr, " (%d cost)\n", |
| get_register_pressure_cost(state, inst)); |
| } |
| |
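                /* Jump our clock forward if we'd stall waiting for results
                 * this instruction consumes (e.g. an outstanding texture
                 * fetch).
                 */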
| state->time = MAX2(state->time, chosen->unblocked_time); |
| |
                /* Schedule this instruction back onto the QIR list.  Since
                 * we emit bottom-up, adding at the head produces the final
                 * top-to-bottom order.
                 */
| list_add(&inst->link, &block->instructions); |
| |
| /* Now that we've scheduled a new instruction, some of its |
| * children can be promoted to the list of instructions ready to |
| * be scheduled. Update the children's unblocked time for this |
| * DAG edge as we do so. |
| */ |
| util_dynarray_foreach(&chosen->dag.edges, |
| struct dag_edge, edge) { |
| struct schedule_node *child = |
| (struct schedule_node *)edge->child; |
| |
| child->unblocked_time = MAX2(child->unblocked_time, |
| state->time + |
| latency_between(child, |
| chosen)); |
| } |
| dag_prune_head(state->dag, &chosen->dag); |
| |
| /* Update our tracking of register pressure. */ |
| for (int i = 0; i < qir_get_nsrc(inst); i++) { |
| if (inst->src[i].file == QFILE_TEMP) |
| BITSET_SET(state->temp_live, inst->src[i].index); |
| } |
| if (inst->dst.file == QFILE_TEMP) { |
| state->temp_writes[inst->dst.index]--; |
| if (state->temp_writes[inst->dst.index] == 0) |
| BITSET_CLEAR(state->temp_live, inst->dst.index); |
| } |
| |
| state->time++; |
| } |
| } |
| |
| static void |
| qir_schedule_instructions_block(struct vc4_compile *c, |
| struct qblock *block) |
| { |
| struct schedule_state *state = rzalloc(NULL, struct schedule_state); |
| |
| state->temp_writes = rzalloc_array(state, uint32_t, c->num_temps); |
| state->temp_live = rzalloc_array(state, BITSET_WORD, |
| BITSET_WORDS(c->num_temps)); |
| state->dag = dag_create(state); |
| |
| struct list_head setup_list; |
| list_inithead(&setup_list); |
| |
| /* Wrap each instruction in a scheduler structure. */ |
| qir_for_each_inst_safe(inst, block) { |
| struct schedule_node *n = rzalloc(state, struct schedule_node); |
| |
| n->inst = inst; |
| list_del(&inst->link); |
| list_addtail(&n->link, &setup_list); |
| dag_init_node(state->dag, &n->dag); |
| |
| if (inst->dst.file == QFILE_TEMP) |
| state->temp_writes[inst->dst.index]++; |
| } |
| |
| /* Dependencies tracked top-to-bottom. */ |
| calculate_forward_deps(c, state, &setup_list); |
| /* Dependencies tracked bottom-to-top. */ |
| calculate_reverse_deps(c, state, &setup_list); |
| |
| dag_traverse_bottom_up(state->dag, compute_delay, NULL); |
| |
| schedule_instructions(c, block, state); |
| |
| ralloc_free(state); |
| } |
| |
| void |
| qir_schedule_instructions(struct vc4_compile *c) |
| { |
| |
| if (debug) { |
| fprintf(stderr, "Pre-schedule instructions\n"); |
| qir_dump(c); |
| } |
| |
| qir_for_each_block(block, c) |
| qir_schedule_instructions_block(c, block); |
| |
| if (debug) { |
| fprintf(stderr, "Post-schedule instructions\n"); |
| qir_dump(c); |
| } |
| } |