src/gallium/drivers/lima/ir/pp/regalloc.c - platform/external/mesa3d - Git at Google

 /*
  * Copyright (c) 2017 Lima Project
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
  * to deal in the Software without restriction, including without limitation
  * the rights to use, copy, modify, merge, publish, distribute, sub license,
  * and/or sell copies of the Software, and to permit persons to whom the
  * Software is furnished to do so, subject to the following conditions:
  *
  * The above copyright notice and this permission notice (including the
  * next paragraph) shall be included in all copies or substantial portions
  * of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  * DEALINGS IN THE SOFTWARE.
  *
  */

 #include "util/ralloc.h"
 #include "util/register_allocate.h"
 #include "util/u_debug.h"

 #include "ppir.h"
 #include "lima_context.h"

 #define PPIR_FULL_REG_NUM  6

 #define PPIR_VEC1_REG_NUM       (PPIR_FULL_REG_NUM * 4) /* x, y, z, w */
 #define PPIR_VEC2_REG_NUM       (PPIR_FULL_REG_NUM * 3) /* xy, yz, zw */
 #define PPIR_VEC3_REG_NUM       (PPIR_FULL_REG_NUM * 2) /* xyz, yzw */
 #define PPIR_VEC4_REG_NUM       PPIR_FULL_REG_NUM       /* xyzw */
 #define PPIR_HEAD_VEC1_REG_NUM  PPIR_FULL_REG_NUM       /* x */
 #define PPIR_HEAD_VEC2_REG_NUM  PPIR_FULL_REG_NUM       /* xy */
 #define PPIR_HEAD_VEC3_REG_NUM  PPIR_FULL_REG_NUM       /* xyz */
 #define PPIR_HEAD_VEC4_REG_NUM  PPIR_FULL_REG_NUM       /* xyzw */

 #define PPIR_VEC1_REG_BASE       0
 #define PPIR_VEC2_REG_BASE       (PPIR_VEC1_REG_BASE + PPIR_VEC1_REG_NUM)
 #define PPIR_VEC3_REG_BASE       (PPIR_VEC2_REG_BASE + PPIR_VEC2_REG_NUM)
 #define PPIR_VEC4_REG_BASE       (PPIR_VEC3_REG_BASE + PPIR_VEC3_REG_NUM)
 #define PPIR_HEAD_VEC1_REG_BASE  (PPIR_VEC4_REG_BASE + PPIR_VEC4_REG_NUM)
 #define PPIR_HEAD_VEC2_REG_BASE  (PPIR_HEAD_VEC1_REG_BASE + PPIR_HEAD_VEC1_REG_NUM)
 #define PPIR_HEAD_VEC3_REG_BASE  (PPIR_HEAD_VEC2_REG_BASE + PPIR_HEAD_VEC2_REG_NUM)
 #define PPIR_HEAD_VEC4_REG_BASE  (PPIR_HEAD_VEC3_REG_BASE + PPIR_HEAD_VEC3_REG_NUM)
 #define PPIR_REG_COUNT           (PPIR_HEAD_VEC4_REG_BASE + PPIR_HEAD_VEC4_REG_NUM)

 enum ppir_ra_reg_class {
    ppir_ra_reg_class_vec1,
    ppir_ra_reg_class_vec2,
    ppir_ra_reg_class_vec3,
    ppir_ra_reg_class_vec4,

    /* 4 reg class for load/store instr regs:
     * load/store instr has no swizzle field, so the (virtual) register
     * must be allocated at the beginning of a (physical) register,
     */
    ppir_ra_reg_class_head_vec1,
    ppir_ra_reg_class_head_vec2,
    ppir_ra_reg_class_head_vec3,
    ppir_ra_reg_class_head_vec4,

    ppir_ra_reg_class_num,
 };

 static const int ppir_ra_reg_base[ppir_ra_reg_class_num + 1] = {
    [ppir_ra_reg_class_vec1]       = PPIR_VEC1_REG_BASE,
    [ppir_ra_reg_class_vec2]       = PPIR_VEC2_REG_BASE,
    [ppir_ra_reg_class_vec3]       = PPIR_VEC3_REG_BASE,
    [ppir_ra_reg_class_vec4]       = PPIR_VEC4_REG_BASE,
    [ppir_ra_reg_class_head_vec1]  = PPIR_HEAD_VEC1_REG_BASE,
    [ppir_ra_reg_class_head_vec2]  = PPIR_HEAD_VEC2_REG_BASE,
    [ppir_ra_reg_class_head_vec3]  = PPIR_HEAD_VEC3_REG_BASE,
    [ppir_ra_reg_class_head_vec4]  = PPIR_HEAD_VEC4_REG_BASE,
    [ppir_ra_reg_class_num]        = PPIR_REG_COUNT,
 };

 static unsigned int *
 ppir_ra_reg_q_values[ppir_ra_reg_class_num] = {
    (unsigned int []) {1, 2, 3, 4, 1, 2, 3, 4},
    (unsigned int []) {2, 3, 3, 3, 1, 2, 3, 3},
    (unsigned int []) {2, 2, 2, 2, 1, 2, 2, 2},
    (unsigned int []) {1, 1, 1, 1, 1, 1, 1, 1},
    (unsigned int []) {1, 1, 1, 1, 1, 1, 1, 1},
    (unsigned int []) {1, 1, 1, 1, 1, 1, 1, 1},
    (unsigned int []) {1, 1, 1, 1, 1, 1, 1, 1},
    (unsigned int []) {1, 1, 1, 1, 1, 1, 1, 1},
 };

 struct ra_regs *ppir_regalloc_init(void *mem_ctx)
 {
    struct ra_regs *ret = ra_alloc_reg_set(mem_ctx, PPIR_REG_COUNT, false);
    if (!ret)
       return NULL;

    /* (x, y, z, w) (xy, yz, zw) (xyz, yzw) (xyzw) (x) (xy) (xyz) (xyzw) */
    static const int class_reg_num[ppir_ra_reg_class_num] = {
       4, 3, 2, 1, 1, 1, 1, 1,
    };
    /* base reg (x, y, z, w) confliction with other regs */
    for (int h = 0; h < 4; h++) {
       int base_reg_mask = 1 << h;
       for (int i = 1; i < ppir_ra_reg_class_num; i++) {
          int class_reg_base_mask = (1 << ((i % 4) + 1)) - 1;
          for (int j = 0; j < class_reg_num[i]; j++) {
             if (base_reg_mask & (class_reg_base_mask << j)) {
                for (int k = 0; k < PPIR_FULL_REG_NUM; k++) {
                   ra_add_reg_conflict(ret, k * 4 + h,
                      ppir_ra_reg_base[i] + k * class_reg_num[i] + j);
                }
             }
          }
       }
    }
    /* build all other confliction by the base reg confliction */
    for (int i = 0; i < PPIR_VEC1_REG_NUM; i++)
       ra_make_reg_conflicts_transitive(ret, i);

    for (int i = 0; i < ppir_ra_reg_class_num; i++)
       ra_alloc_reg_class(ret);

    int reg_index = 0;
    for (int i = 0; i < ppir_ra_reg_class_num; i++) {
       while (reg_index < ppir_ra_reg_base[i + 1])
          ra_class_add_reg(ret, i, reg_index++);
    }

    ra_set_finalize(ret, ppir_ra_reg_q_values);
    return ret;
 }

 static ppir_reg *get_src_reg(ppir_src *src)
 {
    switch (src->type) {
    case ppir_target_ssa:
       return src->ssa;
    case ppir_target_register:
       return src->reg;
    default:
       return NULL;
    }
 }

 static void ppir_regalloc_update_reglist_ssa(ppir_compiler *comp)
 {
    list_for_each_entry(ppir_block, block, &comp->block_list, list) {
       list_for_each_entry(ppir_node, node, &block->node_list, list) {
          if (node->op == ppir_op_store_color)
             continue;

          if (!node->instr || node->op == ppir_op_const)
             continue;

          ppir_dest *dest = ppir_node_get_dest(node);
          if (dest) {
             ppir_reg *reg = NULL;

             if (dest->type == ppir_target_ssa) {
                reg = &dest->ssa;
                list_addtail(&reg->list, &comp->reg_list);
             }
          }
       }
    }
 }

 static ppir_reg *ppir_regalloc_build_liveness_info(ppir_compiler *comp)
 {
    ppir_reg *ret = NULL;

    list_for_each_entry(ppir_block, block, &comp->block_list, list) {
       list_for_each_entry(ppir_node, node, &block->node_list, list) {
          if (node->op == ppir_op_store_color) {
             ppir_store_node *store = ppir_node_to_store(node);
             if (store->src.type == ppir_target_ssa)
                ret = store->src.ssa;
             else
                ret = store->src.reg;
             ret->live_out = INT_MAX;
             continue;
          }

          if (!node->instr || node->op == ppir_op_const)
             continue;

          /* update reg live_in from node dest (write) */
          ppir_dest *dest = ppir_node_get_dest(node);
          if (dest) {
             ppir_reg *reg = NULL;

             if (dest->type == ppir_target_ssa) {
                reg = &dest->ssa;
             }
             else if (dest->type == ppir_target_register)
                reg = dest->reg;

             if (reg && node->instr->seq < reg->live_in)
                reg->live_in = node->instr->seq;
          }

          /* update reg live_out from node src (read) */
          for (int i = 0; i < ppir_node_get_src_num(node); i++)
          {
             ppir_reg *reg = get_src_reg(ppir_node_get_src(node, i));
             if (reg && node->instr->seq > reg->live_out)
                reg->live_out = node->instr->seq;
          }
       }
    }

    return ret;
 }

 static int get_phy_reg_index(int reg)
 {
    int i;

    for (i = 0; i < ppir_ra_reg_class_num; i++) {
       if (reg < ppir_ra_reg_base[i + 1]) {
          reg -= ppir_ra_reg_base[i];
          break;
       }
    }

    if (i < ppir_ra_reg_class_head_vec1)
       return reg / (4 - i) * 4 + reg % (4 - i);
    else
       return reg * 4;
 }

 static void ppir_regalloc_print_result(ppir_compiler *comp)
 {
    printf("======ppir regalloc result======\n");
    list_for_each_entry(ppir_block, block, &comp->block_list, list) {
       list_for_each_entry(ppir_instr, instr, &block->instr_list, list) {
          printf("%03d:", instr->index);
          for (int i = 0; i < PPIR_INSTR_SLOT_NUM; i++) {
             ppir_node *node = instr->slots[i];
             if (!node)
                continue;

             printf(" (%d|", node->index);

             ppir_dest *dest = ppir_node_get_dest(node);
             if (dest)
                printf("%d", ppir_target_get_dest_reg_index(dest));

             printf("|");

             for (int i = 0; i < ppir_node_get_src_num(node); i++) {
                if (i)
                   printf(" ");
                printf("%d", ppir_target_get_src_reg_index(ppir_node_get_src(node, i)));
             }

             printf(")");
          }
          printf("\n");
       }
    }
    printf("--------------------------\n");
 }

 static bool create_new_instr_after(ppir_block *block, ppir_instr *ref,
                                    ppir_node *node)
 {
    ppir_instr *newinstr = ppir_instr_create(block);
    if (unlikely(!newinstr))
       return false;

    list_del(&newinstr->list);
    list_add(&newinstr->list, &ref->list);

    if (!ppir_instr_insert_node(newinstr, node))
       return false;

    list_for_each_entry_from(ppir_instr, instr, ref, &block->instr_list, list) {
       instr->seq++;
    }
    newinstr->seq = ref->seq+1;
    newinstr->scheduled = true;
    return true;
 }

 static bool create_new_instr_before(ppir_block *block, ppir_instr *ref,
                                     ppir_node *node)
 {
    ppir_instr *newinstr = ppir_instr_create(block);
    if (unlikely(!newinstr))
       return false;

    list_del(&newinstr->list);
    list_addtail(&newinstr->list, &ref->list);

    if (!ppir_instr_insert_node(newinstr, node))
       return false;

    list_for_each_entry_from(ppir_instr, instr, ref, &block->instr_list, list) {
       instr->seq++;
    }
    newinstr->seq = ref->seq-1;
    newinstr->scheduled = true;
    return true;
 }

 static ppir_alu_node* ppir_update_spilled_src(ppir_compiler *comp,
                                               ppir_block *block,
                                               ppir_node *node, ppir_src *src,
                                               ppir_alu_node *move_alu)
 {
    /* alu nodes may have multiple references to the same value.
     * try to avoid unnecessary loads for the same alu node by
     * saving the node resulting from the temporary load */
    if (move_alu)
       goto update_src;

    /* alloc new node to load value */
    ppir_node *load_node = ppir_node_create(block, ppir_op_load_temp, -1, 0);
    if (!load_node)
       return NULL;
    list_addtail(&load_node->list, &node->list);
    comp->num_fills++;

    ppir_load_node *load = ppir_node_to_load(load_node);

    load->index = -comp->prog->stack_size; /* index sizes are negative */
    load->num_components = 4;

    ppir_dest *ld_dest = &load->dest;
    ld_dest->type = ppir_target_pipeline;
    ld_dest->pipeline = ppir_pipeline_reg_uniform;
    ld_dest->write_mask = 0xf;

    create_new_instr_before(block, node->instr, load_node);

    /* Create move node */
    ppir_node *move_node = ppir_node_create(block, ppir_op_mov, -1 , 0);
    if (unlikely(!move_node))
       return false;
    list_addtail(&move_node->list, &node->list);

    move_alu = ppir_node_to_alu(move_node);

    move_alu->num_src = 1;
    move_alu->src->type = ppir_target_pipeline;
    move_alu->src->pipeline = ppir_pipeline_reg_uniform;
    for (int i = 0; i < 4; i++)
       move_alu->src->swizzle[i] = i;

    ppir_dest *alu_dest = &move_alu->dest;
    alu_dest->type = ppir_target_ssa;
    alu_dest->ssa.num_components = 4;
    alu_dest->ssa.live_in = INT_MAX;
    alu_dest->ssa.live_out = 0;
    alu_dest->write_mask = 0xf;

    list_addtail(&alu_dest->ssa.list, &comp->reg_list);

    if (!ppir_instr_insert_node(load_node->instr, move_node))
       return false;

    /* insert the new node as predecessor */
    ppir_node_foreach_pred_safe(node, dep) {
       ppir_node *pred = dep->pred;
       ppir_node_remove_dep(dep);
       ppir_node_add_dep(load_node, pred);
    }
    ppir_node_add_dep(node, move_node);
    ppir_node_add_dep(move_node, load_node);

 update_src:
    /* switch node src to use the new ssa instead */
    src->type = ppir_target_ssa;
    src->ssa = &move_alu->dest.ssa;

    return move_alu;
 }

 static ppir_reg *create_reg(ppir_compiler *comp, int num_components)
 {
    ppir_reg *r = rzalloc(comp, ppir_reg);
    if (!r)
       return NULL;

    r->num_components = num_components;
    r->live_in = INT_MAX;
    r->live_out = 0;
    r->is_head = false;
    list_addtail(&r->list, &comp->reg_list);

    return r;
 }

 static bool ppir_update_spilled_dest(ppir_compiler *comp, ppir_block *block,
                                      ppir_node *node, ppir_dest *dest)
 {
    assert(dest != NULL);
    ppir_reg *reg = NULL;
    if (dest->type == ppir_target_register) {
       reg = dest->reg;
       reg->num_components = 4;
       reg->spilled = true;
    }
    else {
       reg = create_reg(comp, 4);
       reg->spilled = true;
       list_del(&dest->ssa.list);
    }

    /* alloc new node to load value */
    ppir_node *load_node = ppir_node_create(block, ppir_op_load_temp, -1, 0);
    if (!load_node)
       return NULL;
    list_addtail(&load_node->list, &node->list);
    comp->num_fills++;

    ppir_load_node *load = ppir_node_to_load(load_node);

    load->index = -comp->prog->stack_size; /* index sizes are negative */
    load->num_components = 4;

    load->dest.type = ppir_target_pipeline;
    load->dest.pipeline = ppir_pipeline_reg_uniform;
    load->dest.write_mask = 0xf;

    create_new_instr_before(block, node->instr, load_node);

    /* Create move node */
    ppir_node *move_node = ppir_node_create(block, ppir_op_mov, -1 , 0);
    if (unlikely(!move_node))
       return false;
    list_addtail(&move_node->list, &node->list);

    ppir_alu_node *move_alu = ppir_node_to_alu(move_node);

    move_alu->num_src = 1;
    move_alu->src->type = ppir_target_pipeline;
    move_alu->src->pipeline = ppir_pipeline_reg_uniform;
    for (int i = 0; i < 4; i++)
       move_alu->src->swizzle[i] = i;

    move_alu->dest.type = ppir_target_register;
    move_alu->dest.reg = reg;
    move_alu->dest.write_mask = 0x0f;

    if (!ppir_instr_insert_node(load_node->instr, move_node))
       return false;

    ppir_node_foreach_pred_safe(node, dep) {
       ppir_node *pred = dep->pred;
       ppir_node_remove_dep(dep);
       ppir_node_add_dep(load_node, pred);
    }
    ppir_node_add_dep(node, move_node);
    ppir_node_add_dep(move_node, load_node);

    dest->type = ppir_target_register;
    dest->reg = reg;

    /* alloc new node to store value */
    ppir_node *store_node = ppir_node_create(block, ppir_op_store_temp, -1, 0);
    if (!store_node)
       return false;
    list_addtail(&store_node->list, &node->list);
    comp->num_spills++;

    ppir_store_node *store = ppir_node_to_store(store_node);

    store->index = -comp->prog->stack_size; /* index sizes are negative */
    store->num_components = 4;

    store->src.type = ppir_target_register;
    store->src.reg = dest->reg;

    /* insert the new node as successor */
    ppir_node_foreach_succ_safe(node, dep) {
       ppir_node *succ = dep->succ;
       ppir_node_remove_dep(dep);
       ppir_node_add_dep(succ, store_node);
    }
    ppir_node_add_dep(store_node, node);

    create_new_instr_after(block, node->instr, store_node);

    return true;
 }

 static bool ppir_regalloc_spill_reg(ppir_compiler *comp, ppir_reg *chosen)
 {
    list_for_each_entry(ppir_block, block, &comp->block_list, list) {
       list_for_each_entry(ppir_node, node, &block->node_list, list) {

          ppir_dest *dest = ppir_node_get_dest(node);
          ppir_reg *reg = NULL;
          if (dest) {
             if (dest->type == ppir_target_ssa)
                reg = &dest->ssa;
             else if (dest->type == ppir_target_register)
                reg = dest->reg;

             if (reg == chosen)
                ppir_update_spilled_dest(comp, block, node, dest);
          }

          switch (node->type) {
          case ppir_node_type_alu:
          {
             /* alu nodes may have multiple references to the same value.
              * try to avoid unnecessary loads for the same alu node by
              * saving the node resulting from the temporary load */
             ppir_alu_node *move_alu = NULL;
             ppir_alu_node *alu = ppir_node_to_alu(node);
             for (int i = 0; i < alu->num_src; i++) {
                reg = get_src_reg(alu->src + i);
                if (reg == chosen) {
                   move_alu = ppir_update_spilled_src(comp, block, node,
                                                      alu->src + i, move_alu);
                }
             }
             break;
          }
          default:
          {
             for (int i = 0; i < ppir_node_get_src_num(node); i++) {
                ppir_src *src = ppir_node_get_src(node, i);
                reg = get_src_reg(src);
                if (reg == chosen) {
                   ppir_update_spilled_src(comp, block, node, src, NULL);
                }
             }
             break;
          }
          }
       }
    }

    return true;
 }

 static ppir_reg *ppir_regalloc_choose_spill_node(ppir_compiler *comp,
                                                  struct ra_graph *g)
 {
    int i = 0;
    ppir_reg *chosen = NULL;

    list_for_each_entry(ppir_reg, reg, &comp->reg_list, list) {
       if (reg->spilled || reg->live_out == INT_MAX) {
          /* not considered for spilling */
          ra_set_node_spill_cost(g, i++, 0.0f);
          continue;
       }

       /* It is beneficial to spill registers with higher component number,
        * so increase the cost of spilling registers with few components */
       float spill_cost = 4.0f / (float)reg->num_components;
       ra_set_node_spill_cost(g, i++, spill_cost);
    }

    int r = ra_get_best_spill_node(g);
    if (r == -1)
       return NULL;

    i = 0;
    list_for_each_entry(ppir_reg, reg, &comp->reg_list, list) {
       if (i++ == r) {
          chosen = reg;
          break;
       }
    }
    assert(chosen);
    chosen->spilled = true;

    return chosen;
 }

 static void ppir_regalloc_reset_liveness_info(ppir_compiler *comp)
 {
    list_for_each_entry(ppir_reg, reg, &comp->reg_list, list) {
       reg->live_in = INT_MAX;
       reg->live_out = 0;
    }
 }

 int lima_ppir_force_spilling = 0;

 static bool ppir_regalloc_prog_try(ppir_compiler *comp, bool *spilled)
 {
    ppir_reg *end_reg;

    ppir_regalloc_reset_liveness_info(comp);
    end_reg = ppir_regalloc_build_liveness_info(comp);

    struct ra_graph *g = ra_alloc_interference_graph(
       comp->ra, list_length(&comp->reg_list));

    int n = 0, end_reg_index = 0;
    list_for_each_entry(ppir_reg, reg, &comp->reg_list, list) {
       int c = ppir_ra_reg_class_vec1 + (reg->num_components - 1);
       if (reg->is_head)
          c += 4;
       if (reg == end_reg)
          end_reg_index = n;
       ra_set_node_class(g, n++, c);
    }

    int n1 = 0;
    list_for_each_entry(ppir_reg, reg1, &comp->reg_list, list) {
       int n2 = n1 + 1;
       list_for_each_entry_from(ppir_reg, reg2, reg1->list.next,
                                &comp->reg_list, list) {
          bool interference = false;
          if (reg1->live_in < reg2->live_in) {
             if (reg1->live_out > reg2->live_in)
                interference = true;
          }
          else if (reg1->live_in > reg2->live_in) {
             if (reg2->live_out > reg1->live_in)
                interference = true;
          }
          else
             interference = true;

          if (interference)
             ra_add_node_interference(g, n1, n2);

          n2++;
       }
       n1++;
    }

    ra_set_node_reg(g, end_reg_index, ppir_ra_reg_base[ppir_ra_reg_class_vec4]);

    *spilled = false;
    bool ok = ra_allocate(g);
    if (!ok || (comp->force_spilling-- > 0)) {
       ppir_reg *chosen = ppir_regalloc_choose_spill_node(comp, g);
       if (chosen) {
          /* stack_size will be used to assemble the frame reg in lima_draw.
           * It is also be used in the spilling code, as negative indices
           * starting from -1, to create stack addresses. */
          comp->prog->stack_size++;
          ppir_regalloc_spill_reg(comp, chosen);
          /* Ask the outer loop to call back in. */
          *spilled = true;

          ppir_debug("spilled register\n");
          goto err_out;
       }

       ppir_error("regalloc fail\n");
       goto err_out;
    }

    n = 0;
    list_for_each_entry(ppir_reg, reg, &comp->reg_list, list) {
       int reg_index = ra_get_node_reg(g, n++);
       reg->index = get_phy_reg_index(reg_index);
    }

    ralloc_free(g);

    if (lima_debug & LIMA_DEBUG_PP)
       ppir_regalloc_print_result(comp);

    return true;

 err_out:
    ralloc_free(g);
    return false;
 }

 bool ppir_regalloc_prog(ppir_compiler *comp)
 {
    bool spilled = false;
    comp->prog->stack_size = 0;

    /* Set from an environment variable to force spilling
     * for debugging purposes, see lima_screen.c */
    comp->force_spilling = lima_ppir_force_spilling;

    ppir_regalloc_update_reglist_ssa(comp);

    /* No registers? Probably shader consists of discard instruction */
    if (list_empty(&comp->reg_list))
       return true;

    /* this will most likely succeed in the first
     * try, except for very complicated shaders */
    while (!ppir_regalloc_prog_try(comp, &spilled))
       if (!spilled)
          return false;

    return true;
 }
	/*
	* Copyright (c) 2017 Lima Project
	*
	* Permission is hereby granted, free of charge, to any person obtaining a
	* copy of this software and associated documentation files (the "Software"),
	* to deal in the Software without restriction, including without limitation
	* the rights to use, copy, modify, merge, publish, distribute, sub license,
	* and/or sell copies of the Software, and to permit persons to whom the
	* Software is furnished to do so, subject to the following conditions:
	*
	* The above copyright notice and this permission notice (including the
	* next paragraph) shall be included in all copies or substantial portions
	* of the Software.
	*
	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
	* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
	* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
	* DEALINGS IN THE SOFTWARE.
	*
	*/

	#include "util/ralloc.h"
	#include "util/register_allocate.h"
	#include "util/u_debug.h"

	#include "ppir.h"
	#include "lima_context.h"

	#define PPIR_FULL_REG_NUM 6

	#define PPIR_VEC1_REG_NUM (PPIR_FULL_REG_NUM * 4) /* x, y, z, w */
	#define PPIR_VEC2_REG_NUM (PPIR_FULL_REG_NUM * 3) /* xy, yz, zw */
	#define PPIR_VEC3_REG_NUM (PPIR_FULL_REG_NUM * 2) /* xyz, yzw */
	#define PPIR_VEC4_REG_NUM PPIR_FULL_REG_NUM /* xyzw */
	#define PPIR_HEAD_VEC1_REG_NUM PPIR_FULL_REG_NUM /* x */
	#define PPIR_HEAD_VEC2_REG_NUM PPIR_FULL_REG_NUM /* xy */
	#define PPIR_HEAD_VEC3_REG_NUM PPIR_FULL_REG_NUM /* xyz */
	#define PPIR_HEAD_VEC4_REG_NUM PPIR_FULL_REG_NUM /* xyzw */

	#define PPIR_VEC1_REG_BASE 0
	#define PPIR_VEC2_REG_BASE (PPIR_VEC1_REG_BASE + PPIR_VEC1_REG_NUM)
	#define PPIR_VEC3_REG_BASE (PPIR_VEC2_REG_BASE + PPIR_VEC2_REG_NUM)
	#define PPIR_VEC4_REG_BASE (PPIR_VEC3_REG_BASE + PPIR_VEC3_REG_NUM)
	#define PPIR_HEAD_VEC1_REG_BASE (PPIR_VEC4_REG_BASE + PPIR_VEC4_REG_NUM)
	#define PPIR_HEAD_VEC2_REG_BASE (PPIR_HEAD_VEC1_REG_BASE + PPIR_HEAD_VEC1_REG_NUM)
	#define PPIR_HEAD_VEC3_REG_BASE (PPIR_HEAD_VEC2_REG_BASE + PPIR_HEAD_VEC2_REG_NUM)
	#define PPIR_HEAD_VEC4_REG_BASE (PPIR_HEAD_VEC3_REG_BASE + PPIR_HEAD_VEC3_REG_NUM)
	#define PPIR_REG_COUNT (PPIR_HEAD_VEC4_REG_BASE + PPIR_HEAD_VEC4_REG_NUM)

	enum ppir_ra_reg_class {
	ppir_ra_reg_class_vec1,
	ppir_ra_reg_class_vec2,
	ppir_ra_reg_class_vec3,
	ppir_ra_reg_class_vec4,

	/* 4 reg class for load/store instr regs:
	* load/store instr has no swizzle field, so the (virtual) register
	* must be allocated at the beginning of a (physical) register,
	*/
	ppir_ra_reg_class_head_vec1,
	ppir_ra_reg_class_head_vec2,
	ppir_ra_reg_class_head_vec3,
	ppir_ra_reg_class_head_vec4,

	ppir_ra_reg_class_num,
	};

	static const int ppir_ra_reg_base[ppir_ra_reg_class_num + 1] = {
	[ppir_ra_reg_class_vec1] = PPIR_VEC1_REG_BASE,
	[ppir_ra_reg_class_vec2] = PPIR_VEC2_REG_BASE,
	[ppir_ra_reg_class_vec3] = PPIR_VEC3_REG_BASE,
	[ppir_ra_reg_class_vec4] = PPIR_VEC4_REG_BASE,
	[ppir_ra_reg_class_head_vec1] = PPIR_HEAD_VEC1_REG_BASE,
	[ppir_ra_reg_class_head_vec2] = PPIR_HEAD_VEC2_REG_BASE,
	[ppir_ra_reg_class_head_vec3] = PPIR_HEAD_VEC3_REG_BASE,
	[ppir_ra_reg_class_head_vec4] = PPIR_HEAD_VEC4_REG_BASE,
	[ppir_ra_reg_class_num] = PPIR_REG_COUNT,
	};

	static unsigned int *
	ppir_ra_reg_q_values[ppir_ra_reg_class_num] = {
	(unsigned int []) {1, 2, 3, 4, 1, 2, 3, 4},
	(unsigned int []) {2, 3, 3, 3, 1, 2, 3, 3},
	(unsigned int []) {2, 2, 2, 2, 1, 2, 2, 2},
	(unsigned int []) {1, 1, 1, 1, 1, 1, 1, 1},
	(unsigned int []) {1, 1, 1, 1, 1, 1, 1, 1},
	(unsigned int []) {1, 1, 1, 1, 1, 1, 1, 1},
	(unsigned int []) {1, 1, 1, 1, 1, 1, 1, 1},
	(unsigned int []) {1, 1, 1, 1, 1, 1, 1, 1},
	};

	struct ra_regs ppir_regalloc_init(void mem_ctx)
	{
	struct ra_regs *ret = ra_alloc_reg_set(mem_ctx, PPIR_REG_COUNT, false);
	if (!ret)
	return NULL;

	/* (x, y, z, w) (xy, yz, zw) (xyz, yzw) (xyzw) (x) (xy) (xyz) (xyzw) */
	static const int class_reg_num[ppir_ra_reg_class_num] = {
	4, 3, 2, 1, 1, 1, 1, 1,
	};
	/* base reg (x, y, z, w) confliction with other regs */
	for (int h = 0; h < 4; h++) {
	int base_reg_mask = 1 << h;
	for (int i = 1; i < ppir_ra_reg_class_num; i++) {
	int class_reg_base_mask = (1 << ((i % 4) + 1)) - 1;
	for (int j = 0; j < class_reg_num[i]; j++) {
	if (base_reg_mask & (class_reg_base_mask << j)) {
	for (int k = 0; k < PPIR_FULL_REG_NUM; k++) {
	ra_add_reg_conflict(ret, k * 4 + h,
	ppir_ra_reg_base[i] + k * class_reg_num[i] + j);
	}
	}
	}
	}
	}
	/* build all other confliction by the base reg confliction */
	for (int i = 0; i < PPIR_VEC1_REG_NUM; i++)
	ra_make_reg_conflicts_transitive(ret, i);

	for (int i = 0; i < ppir_ra_reg_class_num; i++)
	ra_alloc_reg_class(ret);

	int reg_index = 0;
	for (int i = 0; i < ppir_ra_reg_class_num; i++) {
	while (reg_index < ppir_ra_reg_base[i + 1])
	ra_class_add_reg(ret, i, reg_index++);
	}

	ra_set_finalize(ret, ppir_ra_reg_q_values);
	return ret;
	}

	static ppir_reg get_src_reg(ppir_src src)
	{
	switch (src->type) {
	case ppir_target_ssa:
	return src->ssa;
	case ppir_target_register:
	return src->reg;
	default:
	return NULL;
	}
	}

	static void ppir_regalloc_update_reglist_ssa(ppir_compiler *comp)
	{
	list_for_each_entry(ppir_block, block, &comp->block_list, list) {
	list_for_each_entry(ppir_node, node, &block->node_list, list) {
	if (node->op == ppir_op_store_color)
	continue;

	if (!node->instr \|\| node->op == ppir_op_const)
	continue;

	ppir_dest *dest = ppir_node_get_dest(node);
	if (dest) {
	ppir_reg *reg = NULL;

	if (dest->type == ppir_target_ssa) {
	reg = &dest->ssa;
	list_addtail(&reg->list, &comp->reg_list);
	}
	}
	}
	}
	}

	static ppir_reg ppir_regalloc_build_liveness_info(ppir_compiler comp)
	{
	ppir_reg *ret = NULL;

	list_for_each_entry(ppir_block, block, &comp->block_list, list) {
	list_for_each_entry(ppir_node, node, &block->node_list, list) {
	if (node->op == ppir_op_store_color) {
	ppir_store_node *store = ppir_node_to_store(node);
	if (store->src.type == ppir_target_ssa)
	ret = store->src.ssa;
	else
	ret = store->src.reg;
	ret->live_out = INT_MAX;
	continue;
	}

	if (!node->instr \|\| node->op == ppir_op_const)
	continue;

	/* update reg live_in from node dest (write) */
	ppir_dest *dest = ppir_node_get_dest(node);
	if (dest) {
	ppir_reg *reg = NULL;

	if (dest->type == ppir_target_ssa) {
	reg = &dest->ssa;
	}
	else if (dest->type == ppir_target_register)
	reg = dest->reg;

	if (reg && node->instr->seq < reg->live_in)
	reg->live_in = node->instr->seq;
	}

	/* update reg live_out from node src (read) */
	for (int i = 0; i < ppir_node_get_src_num(node); i++)
	{
	ppir_reg *reg = get_src_reg(ppir_node_get_src(node, i));
	if (reg && node->instr->seq > reg->live_out)
	reg->live_out = node->instr->seq;
	}
	}
	}

	return ret;
	}

	static int get_phy_reg_index(int reg)
	{
	int i;

	for (i = 0; i < ppir_ra_reg_class_num; i++) {
	if (reg < ppir_ra_reg_base[i + 1]) {
	reg -= ppir_ra_reg_base[i];
	break;
	}
	}

	if (i < ppir_ra_reg_class_head_vec1)
	return reg / (4 - i) * 4 + reg % (4 - i);
	else
	return reg * 4;
	}

	static void ppir_regalloc_print_result(ppir_compiler *comp)
	{
	printf("======ppir regalloc result======\n");
	list_for_each_entry(ppir_block, block, &comp->block_list, list) {
	list_for_each_entry(ppir_instr, instr, &block->instr_list, list) {
	printf("%03d:", instr->index);
	for (int i = 0; i < PPIR_INSTR_SLOT_NUM; i++) {
	ppir_node *node = instr->slots[i];
	if (!node)
	continue;

	printf(" (%d\|", node->index);

	ppir_dest *dest = ppir_node_get_dest(node);
	if (dest)
	printf("%d", ppir_target_get_dest_reg_index(dest));

	printf("\|");

	for (int i = 0; i < ppir_node_get_src_num(node); i++) {
	if (i)
	printf(" ");
	printf("%d", ppir_target_get_src_reg_index(ppir_node_get_src(node, i)));
	}

	printf(")");
	}
	printf("\n");
	}
	}
	printf("--------------------------\n");
	}

	static bool create_new_instr_after(ppir_block block, ppir_instr ref,
	ppir_node *node)
	{
	ppir_instr *newinstr = ppir_instr_create(block);
	if (unlikely(!newinstr))
	return false;

	list_del(&newinstr->list);
	list_add(&newinstr->list, &ref->list);

	if (!ppir_instr_insert_node(newinstr, node))
	return false;

	list_for_each_entry_from(ppir_instr, instr, ref, &block->instr_list, list) {
	instr->seq++;
	}
	newinstr->seq = ref->seq+1;
	newinstr->scheduled = true;
	return true;
	}

	static bool create_new_instr_before(ppir_block block, ppir_instr ref,
	ppir_node *node)
	{
	ppir_instr *newinstr = ppir_instr_create(block);
	if (unlikely(!newinstr))
	return false;

	list_del(&newinstr->list);
	list_addtail(&newinstr->list, &ref->list);

	if (!ppir_instr_insert_node(newinstr, node))
	return false;

	list_for_each_entry_from(ppir_instr, instr, ref, &block->instr_list, list) {
	instr->seq++;
	}
	newinstr->seq = ref->seq-1;
	newinstr->scheduled = true;
	return true;
	}

	static ppir_alu_node* ppir_update_spilled_src(ppir_compiler *comp,
	ppir_block *block,
	ppir_node node, ppir_src src,
	ppir_alu_node *move_alu)
	{
	/* alu nodes may have multiple references to the same value.
	* try to avoid unnecessary loads for the same alu node by
	* saving the node resulting from the temporary load */
	if (move_alu)
	goto update_src;

	/* alloc new node to load value */
	ppir_node *load_node = ppir_node_create(block, ppir_op_load_temp, -1, 0);
	if (!load_node)
	return NULL;
	list_addtail(&load_node->list, &node->list);
	comp->num_fills++;

	ppir_load_node *load = ppir_node_to_load(load_node);

	load->index = -comp->prog->stack_size; /* index sizes are negative */
	load->num_components = 4;

	ppir_dest *ld_dest = &load->dest;
	ld_dest->type = ppir_target_pipeline;
	ld_dest->pipeline = ppir_pipeline_reg_uniform;
	ld_dest->write_mask = 0xf;

	create_new_instr_before(block, node->instr, load_node);

	/* Create move node */
	ppir_node *move_node = ppir_node_create(block, ppir_op_mov, -1 , 0);
	if (unlikely(!move_node))
	return false;
	list_addtail(&move_node->list, &node->list);

	move_alu = ppir_node_to_alu(move_node);

	move_alu->num_src = 1;
	move_alu->src->type = ppir_target_pipeline;
	move_alu->src->pipeline = ppir_pipeline_reg_uniform;
	for (int i = 0; i < 4; i++)
	move_alu->src->swizzle[i] = i;

	ppir_dest *alu_dest = &move_alu->dest;
	alu_dest->type = ppir_target_ssa;
	alu_dest->ssa.num_components = 4;
	alu_dest->ssa.live_in = INT_MAX;
	alu_dest->ssa.live_out = 0;
	alu_dest->write_mask = 0xf;

	list_addtail(&alu_dest->ssa.list, &comp->reg_list);

	if (!ppir_instr_insert_node(load_node->instr, move_node))
	return false;

	/* insert the new node as predecessor */
	ppir_node_foreach_pred_safe(node, dep) {
	ppir_node *pred = dep->pred;
	ppir_node_remove_dep(dep);
	ppir_node_add_dep(load_node, pred);
	}
	ppir_node_add_dep(node, move_node);
	ppir_node_add_dep(move_node, load_node);

	update_src:
	/* switch node src to use the new ssa instead */
	src->type = ppir_target_ssa;
	src->ssa = &move_alu->dest.ssa;

	return move_alu;
	}

	static ppir_reg create_reg(ppir_compiler comp, int num_components)
	{
	ppir_reg *r = rzalloc(comp, ppir_reg);
	if (!r)
	return NULL;

	r->num_components = num_components;
	r->live_in = INT_MAX;
	r->live_out = 0;
	r->is_head = false;
	list_addtail(&r->list, &comp->reg_list);

	return r;
	}

	static bool ppir_update_spilled_dest(ppir_compiler comp, ppir_block block,
	ppir_node node, ppir_dest dest)
	{
	assert(dest != NULL);
	ppir_reg *reg = NULL;
	if (dest->type == ppir_target_register) {
	reg = dest->reg;
	reg->num_components = 4;
	reg->spilled = true;
	}
	else {
	reg = create_reg(comp, 4);
	reg->spilled = true;
	list_del(&dest->ssa.list);
	}

	/* alloc new node to load value */
	ppir_node *load_node = ppir_node_create(block, ppir_op_load_temp, -1, 0);
	if (!load_node)
	return NULL;
	list_addtail(&load_node->list, &node->list);
	comp->num_fills++;

	ppir_load_node *load = ppir_node_to_load(load_node);

	load->index = -comp->prog->stack_size; /* index sizes are negative */
	load->num_components = 4;

	load->dest.type = ppir_target_pipeline;
	load->dest.pipeline = ppir_pipeline_reg_uniform;
	load->dest.write_mask = 0xf;

	create_new_instr_before(block, node->instr, load_node);

	/* Create move node */
	ppir_node *move_node = ppir_node_create(block, ppir_op_mov, -1 , 0);
	if (unlikely(!move_node))
	return false;
	list_addtail(&move_node->list, &node->list);

	ppir_alu_node *move_alu = ppir_node_to_alu(move_node);

	move_alu->num_src = 1;
	move_alu->src->type = ppir_target_pipeline;
	move_alu->src->pipeline = ppir_pipeline_reg_uniform;
	for (int i = 0; i < 4; i++)
	move_alu->src->swizzle[i] = i;

	move_alu->dest.type = ppir_target_register;
	move_alu->dest.reg = reg;
	move_alu->dest.write_mask = 0x0f;

	if (!ppir_instr_insert_node(load_node->instr, move_node))
	return false;

	ppir_node_foreach_pred_safe(node, dep) {
	ppir_node *pred = dep->pred;
	ppir_node_remove_dep(dep);
	ppir_node_add_dep(load_node, pred);
	}
	ppir_node_add_dep(node, move_node);
	ppir_node_add_dep(move_node, load_node);

	dest->type = ppir_target_register;
	dest->reg = reg;

	/* alloc new node to store value */
	ppir_node *store_node = ppir_node_create(block, ppir_op_store_temp, -1, 0);
	if (!store_node)
	return false;
	list_addtail(&store_node->list, &node->list);
	comp->num_spills++;

	ppir_store_node *store = ppir_node_to_store(store_node);

	store->index = -comp->prog->stack_size; /* index sizes are negative */
	store->num_components = 4;

	store->src.type = ppir_target_register;
	store->src.reg = dest->reg;

	/* insert the new node as successor */
	ppir_node_foreach_succ_safe(node, dep) {
	ppir_node *succ = dep->succ;
	ppir_node_remove_dep(dep);
	ppir_node_add_dep(succ, store_node);
	}
	ppir_node_add_dep(store_node, node);

	create_new_instr_after(block, node->instr, store_node);

	return true;
	}

	static bool ppir_regalloc_spill_reg(ppir_compiler comp, ppir_reg chosen)
	{
	list_for_each_entry(ppir_block, block, &comp->block_list, list) {
	list_for_each_entry(ppir_node, node, &block->node_list, list) {

	ppir_dest *dest = ppir_node_get_dest(node);
	ppir_reg *reg = NULL;
	if (dest) {
	if (dest->type == ppir_target_ssa)
	reg = &dest->ssa;
	else if (dest->type == ppir_target_register)
	reg = dest->reg;

	if (reg == chosen)
	ppir_update_spilled_dest(comp, block, node, dest);
	}

	switch (node->type) {
	case ppir_node_type_alu:
	{
	/* alu nodes may have multiple references to the same value.
	* try to avoid unnecessary loads for the same alu node by
	* saving the node resulting from the temporary load */
	ppir_alu_node *move_alu = NULL;
	ppir_alu_node *alu = ppir_node_to_alu(node);
	for (int i = 0; i < alu->num_src; i++) {
	reg = get_src_reg(alu->src + i);
	if (reg == chosen) {
	move_alu = ppir_update_spilled_src(comp, block, node,
	alu->src + i, move_alu);
	}
	}
	break;
	}
	default:
	{
	for (int i = 0; i < ppir_node_get_src_num(node); i++) {
	ppir_src *src = ppir_node_get_src(node, i);
	reg = get_src_reg(src);
	if (reg == chosen) {
	ppir_update_spilled_src(comp, block, node, src, NULL);
	}
	}
	break;
	}
	}
	}
	}

	return true;
	}

	static ppir_reg ppir_regalloc_choose_spill_node(ppir_compiler comp,
	struct ra_graph *g)
	{
	int i = 0;
	ppir_reg *chosen = NULL;

	list_for_each_entry(ppir_reg, reg, &comp->reg_list, list) {
	if (reg->spilled \|\| reg->live_out == INT_MAX) {
	/* not considered for spilling */
	ra_set_node_spill_cost(g, i++, 0.0f);
	continue;
	}

	/* It is beneficial to spill registers with higher component number,
	* so increase the cost of spilling registers with few components */
	float spill_cost = 4.0f / (float)reg->num_components;
	ra_set_node_spill_cost(g, i++, spill_cost);
	}

	int r = ra_get_best_spill_node(g);
	if (r == -1)
	return NULL;

	i = 0;
	list_for_each_entry(ppir_reg, reg, &comp->reg_list, list) {
	if (i++ == r) {
	chosen = reg;
	break;
	}
	}
	assert(chosen);
	chosen->spilled = true;

	return chosen;
	}

	static void ppir_regalloc_reset_liveness_info(ppir_compiler *comp)
	{
	list_for_each_entry(ppir_reg, reg, &comp->reg_list, list) {
	reg->live_in = INT_MAX;
	reg->live_out = 0;
	}
	}

	int lima_ppir_force_spilling = 0;

	static bool ppir_regalloc_prog_try(ppir_compiler comp, bool spilled)
	{
	ppir_reg *end_reg;

	ppir_regalloc_reset_liveness_info(comp);
	end_reg = ppir_regalloc_build_liveness_info(comp);

	struct ra_graph *g = ra_alloc_interference_graph(
	comp->ra, list_length(&comp->reg_list));

	int n = 0, end_reg_index = 0;
	list_for_each_entry(ppir_reg, reg, &comp->reg_list, list) {
	int c = ppir_ra_reg_class_vec1 + (reg->num_components - 1);
	if (reg->is_head)
	c += 4;
	if (reg == end_reg)
	end_reg_index = n;
	ra_set_node_class(g, n++, c);
	}

	int n1 = 0;
	list_for_each_entry(ppir_reg, reg1, &comp->reg_list, list) {
	int n2 = n1 + 1;
	list_for_each_entry_from(ppir_reg, reg2, reg1->list.next,
	&comp->reg_list, list) {
	bool interference = false;
	if (reg1->live_in < reg2->live_in) {
	if (reg1->live_out > reg2->live_in)
	interference = true;
	}
	else if (reg1->live_in > reg2->live_in) {
	if (reg2->live_out > reg1->live_in)
	interference = true;
	}
	else
	interference = true;

	if (interference)
	ra_add_node_interference(g, n1, n2);

	n2++;
	}
	n1++;
	}

	ra_set_node_reg(g, end_reg_index, ppir_ra_reg_base[ppir_ra_reg_class_vec4]);

	*spilled = false;
	bool ok = ra_allocate(g);
	if (!ok \|\| (comp->force_spilling-- > 0)) {
	ppir_reg *chosen = ppir_regalloc_choose_spill_node(comp, g);
	if (chosen) {
	/* stack_size will be used to assemble the frame reg in lima_draw.
	* It is also be used in the spilling code, as negative indices
	* starting from -1, to create stack addresses. */
	comp->prog->stack_size++;
	ppir_regalloc_spill_reg(comp, chosen);
	/* Ask the outer loop to call back in. */
	*spilled = true;

	ppir_debug("spilled register\n");
	goto err_out;
	}

	ppir_error("regalloc fail\n");
	goto err_out;
	}

	n = 0;
	list_for_each_entry(ppir_reg, reg, &comp->reg_list, list) {
	int reg_index = ra_get_node_reg(g, n++);
	reg->index = get_phy_reg_index(reg_index);
	}

	ralloc_free(g);

	if (lima_debug & LIMA_DEBUG_PP)
	ppir_regalloc_print_result(comp);

	return true;

	err_out:
	ralloc_free(g);
	return false;
	}

	bool ppir_regalloc_prog(ppir_compiler *comp)
	{
	bool spilled = false;
	comp->prog->stack_size = 0;

	/* Set from an environment variable to force spilling
	* for debugging purposes, see lima_screen.c */
	comp->force_spilling = lima_ppir_force_spilling;

	ppir_regalloc_update_reglist_ssa(comp);

	/* No registers? Probably shader consists of discard instruction */
	if (list_empty(&comp->reg_list))
	return true;

	/* this will most likely succeed in the first
	* try, except for very complicated shaders */
	while (!ppir_regalloc_prog_try(comp, &spilled))
	if (!spilled)
	return false;

	return true;
	}