src/amd/compiler/aco_live_var_analysis.cpp - platform/external/mesa3d - Git at Google

 /*
  * Copyright © 2018 Valve Corporation
  * Copyright © 2018 Google
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
  * to deal in the Software without restriction, including without limitation
  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  * and/or sell copies of the Software, and to permit persons to whom the
  * Software is furnished to do so, subject to the following conditions:
  *
  * The above copyright notice and this permission notice (including the next
  * paragraph) shall be included in all copies or substantial portions of the
  * Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  * IN THE SOFTWARE.
  *
  * Authors:
  *    Daniel Schürmann (daniel.schuermann@campus.tu-berlin.de)
  *    Bas Nieuwenhuizen (bas@basnieuwenhuizen.nl)
  *
  */

 #include "aco_ir.h"
 #include "util/u_math.h"

 #include <set>
 #include <vector>

 #include "vulkan/radv_shader.h"

 namespace aco {
 RegisterDemand get_live_changes(aco_ptr<Instruction>& instr)
 {
    RegisterDemand changes;
    for (const Definition& def : instr->definitions) {
       if (!def.isTemp() || def.isKill())
          continue;
       changes += def.getTemp();
    }

    for (const Operand& op : instr->operands) {
       if (!op.isTemp() || !op.isFirstKill())
          continue;
       changes -= op.getTemp();
    }

    return changes;
 }

 RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr)
 {
    RegisterDemand temp_registers;

    for (Definition def : instr->definitions) {
       if (!def.isTemp())
          continue;
       if (def.isKill())
          temp_registers += def.getTemp();
    }

    for (Operand op : instr->operands) {
       if (op.isTemp() && op.isLateKill() && op.isFirstKill())
          temp_registers += op.getTemp();
    }

    return temp_registers;
 }

 RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr, aco_ptr<Instruction>& instr_before)
 {
    demand -= get_live_changes(instr);
    demand -= get_temp_registers(instr);
    if (instr_before)
       demand += get_temp_registers(instr_before);
    return demand;
 }

 namespace {
 void process_live_temps_per_block(Program *program, live& lives, Block* block,
                                   std::set<unsigned>& worklist, std::vector<uint16_t>& phi_sgpr_ops)
 {
    std::vector<RegisterDemand>& register_demand = lives.register_demand[block->index];
    RegisterDemand new_demand;

    register_demand.resize(block->instructions.size());
    block->register_demand = RegisterDemand();
    IDSet live = lives.live_out[block->index];

    /* add the live_out_exec to live */
    bool exec_live = false;
    if (block->live_out_exec != Temp()) {
       live.insert(block->live_out_exec.id());
       exec_live = true;
    }

    /* initialize register demand */
    for (unsigned t : live)
       new_demand += Temp(t, program->temp_rc[t]);
    new_demand.sgpr -= phi_sgpr_ops[block->index];

    /* traverse the instructions backwards */
    int idx;
    for (idx = block->instructions.size() -1; idx >= 0; idx--) {
       Instruction *insn = block->instructions[idx].get();
       if (is_phi(insn))
          break;

       /* substract the 1 or 2 sgprs from exec */
       if (exec_live)
          assert(new_demand.sgpr >= (int16_t) program->lane_mask.size());
       register_demand[idx] = RegisterDemand(new_demand.vgpr, new_demand.sgpr - (exec_live ? program->lane_mask.size() : 0));

       /* KILL */
       for (Definition& definition : insn->definitions) {
          if (!definition.isTemp()) {
             continue;
          }
          if ((definition.isFixed() || definition.hasHint()) && definition.physReg() == vcc)
             program->needs_vcc = true;

          const Temp temp = definition.getTemp();
          const size_t n = live.erase(temp.id());

          if (n) {
             new_demand -= temp;
             definition.setKill(false);
          } else {
             register_demand[idx] += temp;
             definition.setKill(true);
          }

          if (definition.isFixed() && definition.physReg() == exec)
             exec_live = false;
       }

       /* GEN */
       if (insn->opcode == aco_opcode::p_logical_end) {
          new_demand.sgpr += phi_sgpr_ops[block->index];
       } else {
          /* we need to do this in a separate loop because the next one can
           * setKill() for several operands at once and we don't want to
           * overwrite that in a later iteration */
          for (Operand& op : insn->operands)
             op.setKill(false);

          for (unsigned i = 0; i < insn->operands.size(); ++i)
          {
             Operand& operand = insn->operands[i];
             if (!operand.isTemp())
                continue;
             if (operand.isFixed() && operand.physReg() == vcc)
                program->needs_vcc = true;
             const Temp temp = operand.getTemp();
             const bool inserted = live.insert(temp.id()).second;
             if (inserted) {
                operand.setFirstKill(true);
                for (unsigned j = i + 1; j < insn->operands.size(); ++j) {
                   if (insn->operands[j].isTemp() && insn->operands[j].tempId() == operand.tempId()) {
                      insn->operands[j].setFirstKill(false);
                      insn->operands[j].setKill(true);
                   }
                }
                if (operand.isLateKill())
                   register_demand[idx] += temp;
                new_demand += temp;
             }

             if (operand.isFixed() && operand.physReg() == exec)
                exec_live = true;
          }
       }

       block->register_demand.update(register_demand[idx]);
    }

    /* update block's register demand for a last time */
    if (exec_live)
       assert(new_demand.sgpr >= (int16_t) program->lane_mask.size());
    new_demand.sgpr -= exec_live ? program->lane_mask.size() : 0;
    block->register_demand.update(new_demand);

    /* handle phi definitions */
    int phi_idx = idx;
    while (phi_idx >= 0) {
       register_demand[phi_idx] = new_demand;
       Instruction *insn = block->instructions[phi_idx].get();

       assert(is_phi(insn));
       assert(insn->definitions.size() == 1 && insn->definitions[0].isTemp());
       Definition& definition = insn->definitions[0];
       if ((definition.isFixed() || definition.hasHint()) && definition.physReg() == vcc)
          program->needs_vcc = true;
       const Temp temp = definition.getTemp();
       const size_t n = live.erase(temp.id());

       if (n)
          definition.setKill(false);
       else
          definition.setKill(true);

       phi_idx--;
    }

    /* now, we need to merge the live-ins into the live-out sets */
    for (unsigned t : live) {
       RegClass rc = program->temp_rc[t];
       std::vector<unsigned>& preds = rc.is_linear() ? block->linear_preds : block->logical_preds;

 #ifndef NDEBUG
       if (preds.empty())
          aco_err(program, "Temporary never defined or are defined after use: %%%d in BB%d", t, block->index);
 #endif

       for (unsigned pred_idx : preds) {
          auto it = lives.live_out[pred_idx].insert(t);
          if (it.second)
             worklist.insert(pred_idx);
       }
    }

    /* handle phi operands */
    phi_idx = idx;
    while (phi_idx >= 0) {
       Instruction *insn = block->instructions[phi_idx].get();
       assert(is_phi(insn));
       /* directly insert into the predecessors live-out set */
       std::vector<unsigned>& preds = insn->opcode == aco_opcode::p_phi
                                    ? block->logical_preds
                                    : block->linear_preds;
       for (unsigned i = 0; i < preds.size(); ++i) {
          Operand &operand = insn->operands[i];
          if (!operand.isTemp())
             continue;
          if (operand.isFixed() && operand.physReg() == vcc)
             program->needs_vcc = true;
          /* check if we changed an already processed block */
          const bool inserted = lives.live_out[preds[i]].insert(operand.tempId()).second;
          if (inserted) {
             operand.setKill(true);
             worklist.insert(preds[i]);
             if (insn->opcode == aco_opcode::p_phi && operand.getTemp().type() == RegType::sgpr)
                phi_sgpr_ops[preds[i]] += operand.size();
          }
       }
       phi_idx--;
    }

    assert(block->index != 0 || (new_demand == RegisterDemand() && live.empty()));
 }

 unsigned calc_waves_per_workgroup(Program *program)
 {
    /* When workgroup size is not known, just go with wave_size */
    unsigned workgroup_size = program->workgroup_size == UINT_MAX
                              ? program->wave_size
                              : program->workgroup_size;

    return align(workgroup_size, program->wave_size) / program->wave_size;
 }
 } /* end namespace */

 uint16_t get_extra_sgprs(Program *program)
 {
    if (program->chip_class >= GFX10) {
       assert(!program->needs_flat_scr);
       assert(!program->xnack_enabled);
       return 2;
    } else if (program->chip_class >= GFX8) {
       if (program->needs_flat_scr)
          return 6;
       else if (program->xnack_enabled)
          return 4;
       else if (program->needs_vcc)
          return 2;
       else
          return 0;
    } else {
       assert(!program->xnack_enabled);
       if (program->needs_flat_scr)
          return 4;
       else if (program->needs_vcc)
          return 2;
       else
          return 0;
    }
 }

 uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs)
 {
    assert(addressable_sgprs <= program->sgpr_limit);
    uint16_t sgprs = addressable_sgprs + get_extra_sgprs(program);
    uint16_t granule = program->sgpr_alloc_granule + 1;
    return align(std::max(sgprs, granule), granule);
 }

 uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs)
 {
    assert(addressable_vgprs <= program->vgpr_limit);
    uint16_t granule = program->vgpr_alloc_granule + 1;
    return align(std::max(addressable_vgprs, granule), granule);
 }

 uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves)
 {
     uint16_t sgprs = program->physical_sgprs / max_waves & ~program->sgpr_alloc_granule;
     sgprs -= get_extra_sgprs(program);
     return std::min(sgprs, program->sgpr_limit);
 }

 uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves)
 {
     uint16_t vgprs = 256 / max_waves & ~program->vgpr_alloc_granule;
     return std::min(vgprs, program->vgpr_limit);
 }

 void calc_min_waves(Program* program)
 {
    unsigned waves_per_workgroup = calc_waves_per_workgroup(program);
    /* currently min_waves is in wave64 waves */
    if (program->wave_size == 32)
       waves_per_workgroup = DIV_ROUND_UP(waves_per_workgroup, 2);

    unsigned simd_per_cu = 4; /* TODO: different on Navi */
    bool wgp = program->chip_class >= GFX10; /* assume WGP is used on Navi */
    unsigned simd_per_cu_wgp = wgp ? simd_per_cu * 2 : simd_per_cu;

    program->min_waves = DIV_ROUND_UP(waves_per_workgroup, simd_per_cu_wgp);
 }

 void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand)
 {
    /* TODO: max_waves_per_simd, simd_per_cu and the number of physical vgprs for Navi */
    unsigned max_waves_per_simd = 10;
    if ((program->family >= CHIP_POLARIS10 && program->family <= CHIP_VEGAM) || program->chip_class >= GFX10_3)
       max_waves_per_simd = 8;
    unsigned simd_per_cu = 4;

    bool wgp = program->chip_class >= GFX10; /* assume WGP is used on Navi */
    unsigned simd_per_cu_wgp = wgp ? simd_per_cu * 2 : simd_per_cu;
    unsigned lds_limit = wgp ? program->lds_limit * 2 : program->lds_limit;

    /* this won't compile, register pressure reduction necessary */
    if (new_demand.vgpr > program->vgpr_limit || new_demand.sgpr > program->sgpr_limit) {
       program->num_waves = 0;
       program->max_reg_demand = new_demand;
    } else {
       program->num_waves = program->physical_sgprs / get_sgpr_alloc(program, new_demand.sgpr);
       program->num_waves = std::min<uint16_t>(program->num_waves, 256 / get_vgpr_alloc(program, new_demand.vgpr));
       program->max_waves = max_waves_per_simd;

       /* adjust max_waves for workgroup and LDS limits */
       unsigned waves_per_workgroup = calc_waves_per_workgroup(program);
       unsigned workgroups_per_cu_wgp = max_waves_per_simd * simd_per_cu_wgp / waves_per_workgroup;
       if (program->config->lds_size) {
          unsigned lds = program->config->lds_size * program->lds_alloc_granule;
          workgroups_per_cu_wgp = std::min(workgroups_per_cu_wgp, lds_limit / lds);
       }
       if (waves_per_workgroup > 1 && program->chip_class < GFX10)
          workgroups_per_cu_wgp = std::min(workgroups_per_cu_wgp, 16u); /* TODO: is this a SI-only limit? what about Navi? */

       /* in cases like waves_per_workgroup=3 or lds=65536 and
        * waves_per_workgroup=1, we want the maximum possible number of waves per
        * SIMD and not the minimum. so DIV_ROUND_UP is used */
       program->max_waves = std::min<uint16_t>(program->max_waves, DIV_ROUND_UP(workgroups_per_cu_wgp * waves_per_workgroup, simd_per_cu_wgp));

       /* incorporate max_waves and calculate max_reg_demand */
       program->num_waves = std::min<uint16_t>(program->num_waves, program->max_waves);
       program->max_reg_demand.vgpr = get_addr_vgpr_from_waves(program, program->num_waves);
       program->max_reg_demand.sgpr = get_addr_sgpr_from_waves(program, program->num_waves);
    }
 }

 live live_var_analysis(Program* program,
                        const struct radv_nir_compiler_options *options)
 {
    live result;
    result.live_out.resize(program->blocks.size());
    result.register_demand.resize(program->blocks.size());
    std::set<unsigned> worklist;
    std::vector<uint16_t> phi_sgpr_ops(program->blocks.size());
    RegisterDemand new_demand;

    program->needs_vcc = false;

    /* this implementation assumes that the block idx corresponds to the block's position in program->blocks vector */
    for (Block& block : program->blocks)
       worklist.insert(block.index);
    while (!worklist.empty()) {
       std::set<unsigned>::reverse_iterator b_it = worklist.rbegin();
       unsigned block_idx = *b_it;
       worklist.erase(block_idx);
       process_live_temps_per_block(program, result, &program->blocks[block_idx], worklist, phi_sgpr_ops);
       new_demand.update(program->blocks[block_idx].register_demand);
    }

    /* calculate the program's register demand and number of waves */
    update_vgpr_sgpr_demand(program, new_demand);

    return result;
 }

 }
	/*
	* Copyright © 2018 Valve Corporation
	* Copyright © 2018 Google
	*
	* Permission is hereby granted, free of charge, to any person obtaining a
	* copy of this software and associated documentation files (the "Software"),
	* to deal in the Software without restriction, including without limitation
	* the rights to use, copy, modify, merge, publish, distribute, sublicense,
	* and/or sell copies of the Software, and to permit persons to whom the
	* Software is furnished to do so, subject to the following conditions:
	*
	* The above copyright notice and this permission notice (including the next
	* paragraph) shall be included in all copies or substantial portions of the
	* Software.
	*
	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
	* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
	* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
	* IN THE SOFTWARE.
	*
	* Authors:
	* Daniel Schürmann (daniel.schuermann@campus.tu-berlin.de)
	* Bas Nieuwenhuizen (bas@basnieuwenhuizen.nl)
	*
	*/

	#include "aco_ir.h"
	#include "util/u_math.h"

	#include <set>
	#include <vector>

	#include "vulkan/radv_shader.h"

	namespace aco {
	RegisterDemand get_live_changes(aco_ptr<Instruction>& instr)
	{
	RegisterDemand changes;
	for (const Definition& def : instr->definitions) {
	if (!def.isTemp() \|\| def.isKill())
	continue;
	changes += def.getTemp();
	}

	for (const Operand& op : instr->operands) {
	if (!op.isTemp() \|\| !op.isFirstKill())
	continue;
	changes -= op.getTemp();
	}

	return changes;
	}

	RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr)
	{
	RegisterDemand temp_registers;

	for (Definition def : instr->definitions) {
	if (!def.isTemp())
	continue;
	if (def.isKill())
	temp_registers += def.getTemp();
	}

	for (Operand op : instr->operands) {
	if (op.isTemp() && op.isLateKill() && op.isFirstKill())
	temp_registers += op.getTemp();
	}

	return temp_registers;
	}

	RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr, aco_ptr<Instruction>& instr_before)
	{
	demand -= get_live_changes(instr);
	demand -= get_temp_registers(instr);
	if (instr_before)
	demand += get_temp_registers(instr_before);
	return demand;
	}

	namespace {
	void process_live_temps_per_block(Program program, live& lives, Block block,
	std::set<unsigned>& worklist, std::vector<uint16_t>& phi_sgpr_ops)
	{
	std::vector<RegisterDemand>& register_demand = lives.register_demand[block->index];
	RegisterDemand new_demand;

	register_demand.resize(block->instructions.size());
	block->register_demand = RegisterDemand();
	IDSet live = lives.live_out[block->index];

	/* add the live_out_exec to live */
	bool exec_live = false;
	if (block->live_out_exec != Temp()) {
	live.insert(block->live_out_exec.id());
	exec_live = true;
	}

	/* initialize register demand */
	for (unsigned t : live)
	new_demand += Temp(t, program->temp_rc[t]);
	new_demand.sgpr -= phi_sgpr_ops[block->index];

	/* traverse the instructions backwards */
	int idx;
	for (idx = block->instructions.size() -1; idx >= 0; idx--) {
	Instruction *insn = block->instructions[idx].get();
	if (is_phi(insn))
	break;

	/* substract the 1 or 2 sgprs from exec */
	if (exec_live)
	assert(new_demand.sgpr >= (int16_t) program->lane_mask.size());
	register_demand[idx] = RegisterDemand(new_demand.vgpr, new_demand.sgpr - (exec_live ? program->lane_mask.size() : 0));

	/* KILL */
	for (Definition& definition : insn->definitions) {
	if (!definition.isTemp()) {
	continue;
	}
	if ((definition.isFixed() \|\| definition.hasHint()) && definition.physReg() == vcc)
	program->needs_vcc = true;

	const Temp temp = definition.getTemp();
	const size_t n = live.erase(temp.id());

	if (n) {
	new_demand -= temp;
	definition.setKill(false);
	} else {
	register_demand[idx] += temp;
	definition.setKill(true);
	}

	if (definition.isFixed() && definition.physReg() == exec)
	exec_live = false;
	}

	/* GEN */
	if (insn->opcode == aco_opcode::p_logical_end) {
	new_demand.sgpr += phi_sgpr_ops[block->index];
	} else {
	/* we need to do this in a separate loop because the next one can
	* setKill() for several operands at once and we don't want to
	* overwrite that in a later iteration */
	for (Operand& op : insn->operands)
	op.setKill(false);

	for (unsigned i = 0; i < insn->operands.size(); ++i)
	{
	Operand& operand = insn->operands[i];
	if (!operand.isTemp())
	continue;
	if (operand.isFixed() && operand.physReg() == vcc)
	program->needs_vcc = true;
	const Temp temp = operand.getTemp();
	const bool inserted = live.insert(temp.id()).second;
	if (inserted) {
	operand.setFirstKill(true);
	for (unsigned j = i + 1; j < insn->operands.size(); ++j) {
	if (insn->operands[j].isTemp() && insn->operands[j].tempId() == operand.tempId()) {
	insn->operands[j].setFirstKill(false);
	insn->operands[j].setKill(true);
	}
	}
	if (operand.isLateKill())
	register_demand[idx] += temp;
	new_demand += temp;
	}

	if (operand.isFixed() && operand.physReg() == exec)
	exec_live = true;
	}
	}

	block->register_demand.update(register_demand[idx]);
	}

	/* update block's register demand for a last time */
	if (exec_live)
	assert(new_demand.sgpr >= (int16_t) program->lane_mask.size());
	new_demand.sgpr -= exec_live ? program->lane_mask.size() : 0;
	block->register_demand.update(new_demand);

	/* handle phi definitions */
	int phi_idx = idx;
	while (phi_idx >= 0) {
	register_demand[phi_idx] = new_demand;
	Instruction *insn = block->instructions[phi_idx].get();

	assert(is_phi(insn));
	assert(insn->definitions.size() == 1 && insn->definitions[0].isTemp());
	Definition& definition = insn->definitions[0];
	if ((definition.isFixed() \|\| definition.hasHint()) && definition.physReg() == vcc)
	program->needs_vcc = true;
	const Temp temp = definition.getTemp();
	const size_t n = live.erase(temp.id());

	if (n)
	definition.setKill(false);
	else
	definition.setKill(true);

	phi_idx--;
	}

	/* now, we need to merge the live-ins into the live-out sets */
	for (unsigned t : live) {
	RegClass rc = program->temp_rc[t];
	std::vector<unsigned>& preds = rc.is_linear() ? block->linear_preds : block->logical_preds;

	#ifndef NDEBUG
	if (preds.empty())
	aco_err(program, "Temporary never defined or are defined after use: %%%d in BB%d", t, block->index);
	#endif

	for (unsigned pred_idx : preds) {
	auto it = lives.live_out[pred_idx].insert(t);
	if (it.second)
	worklist.insert(pred_idx);
	}
	}

	/* handle phi operands */
	phi_idx = idx;
	while (phi_idx >= 0) {
	Instruction *insn = block->instructions[phi_idx].get();
	assert(is_phi(insn));
	/* directly insert into the predecessors live-out set */
	std::vector<unsigned>& preds = insn->opcode == aco_opcode::p_phi
	? block->logical_preds
	: block->linear_preds;
	for (unsigned i = 0; i < preds.size(); ++i) {
	Operand &operand = insn->operands[i];
	if (!operand.isTemp())
	continue;
	if (operand.isFixed() && operand.physReg() == vcc)
	program->needs_vcc = true;
	/* check if we changed an already processed block */
	const bool inserted = lives.live_out[preds[i]].insert(operand.tempId()).second;
	if (inserted) {
	operand.setKill(true);
	worklist.insert(preds[i]);
	if (insn->opcode == aco_opcode::p_phi && operand.getTemp().type() == RegType::sgpr)
	phi_sgpr_ops[preds[i]] += operand.size();
	}
	}
	phi_idx--;
	}

	assert(block->index != 0 \|\| (new_demand == RegisterDemand() && live.empty()));
	}

	unsigned calc_waves_per_workgroup(Program *program)
	{
	/* When workgroup size is not known, just go with wave_size */
	unsigned workgroup_size = program->workgroup_size == UINT_MAX
	? program->wave_size
	: program->workgroup_size;

	return align(workgroup_size, program->wave_size) / program->wave_size;
	}
	} /* end namespace */

	uint16_t get_extra_sgprs(Program *program)
	{
	if (program->chip_class >= GFX10) {
	assert(!program->needs_flat_scr);
	assert(!program->xnack_enabled);
	return 2;
	} else if (program->chip_class >= GFX8) {
	if (program->needs_flat_scr)
	return 6;
	else if (program->xnack_enabled)
	return 4;
	else if (program->needs_vcc)
	return 2;
	else
	return 0;
	} else {
	assert(!program->xnack_enabled);
	if (program->needs_flat_scr)
	return 4;
	else if (program->needs_vcc)
	return 2;
	else
	return 0;
	}
	}

	uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs)
	{
	assert(addressable_sgprs <= program->sgpr_limit);
	uint16_t sgprs = addressable_sgprs + get_extra_sgprs(program);
	uint16_t granule = program->sgpr_alloc_granule + 1;
	return align(std::max(sgprs, granule), granule);
	}

	uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs)
	{
	assert(addressable_vgprs <= program->vgpr_limit);
	uint16_t granule = program->vgpr_alloc_granule + 1;
	return align(std::max(addressable_vgprs, granule), granule);
	}

	uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves)
	{
	uint16_t sgprs = program->physical_sgprs / max_waves & ~program->sgpr_alloc_granule;
	sgprs -= get_extra_sgprs(program);
	return std::min(sgprs, program->sgpr_limit);
	}

	uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves)
	{
	uint16_t vgprs = 256 / max_waves & ~program->vgpr_alloc_granule;
	return std::min(vgprs, program->vgpr_limit);
	}

	void calc_min_waves(Program* program)
	{
	unsigned waves_per_workgroup = calc_waves_per_workgroup(program);
	/* currently min_waves is in wave64 waves */
	if (program->wave_size == 32)
	waves_per_workgroup = DIV_ROUND_UP(waves_per_workgroup, 2);

	unsigned simd_per_cu = 4; /* TODO: different on Navi */
	bool wgp = program->chip_class >= GFX10; /* assume WGP is used on Navi */
	unsigned simd_per_cu_wgp = wgp ? simd_per_cu * 2 : simd_per_cu;

	program->min_waves = DIV_ROUND_UP(waves_per_workgroup, simd_per_cu_wgp);
	}

	void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand)
	{
	/* TODO: max_waves_per_simd, simd_per_cu and the number of physical vgprs for Navi */
	unsigned max_waves_per_simd = 10;
	if ((program->family >= CHIP_POLARIS10 && program->family <= CHIP_VEGAM) \|\| program->chip_class >= GFX10_3)
	max_waves_per_simd = 8;
	unsigned simd_per_cu = 4;

	bool wgp = program->chip_class >= GFX10; /* assume WGP is used on Navi */
	unsigned simd_per_cu_wgp = wgp ? simd_per_cu * 2 : simd_per_cu;
	unsigned lds_limit = wgp ? program->lds_limit * 2 : program->lds_limit;

	/* this won't compile, register pressure reduction necessary */
	if (new_demand.vgpr > program->vgpr_limit \|\| new_demand.sgpr > program->sgpr_limit) {
	program->num_waves = 0;
	program->max_reg_demand = new_demand;
	} else {
	program->num_waves = program->physical_sgprs / get_sgpr_alloc(program, new_demand.sgpr);
	program->num_waves = std::min<uint16_t>(program->num_waves, 256 / get_vgpr_alloc(program, new_demand.vgpr));
	program->max_waves = max_waves_per_simd;

	/* adjust max_waves for workgroup and LDS limits */
	unsigned waves_per_workgroup = calc_waves_per_workgroup(program);
	unsigned workgroups_per_cu_wgp = max_waves_per_simd * simd_per_cu_wgp / waves_per_workgroup;
	if (program->config->lds_size) {
	unsigned lds = program->config->lds_size * program->lds_alloc_granule;
	workgroups_per_cu_wgp = std::min(workgroups_per_cu_wgp, lds_limit / lds);
	}
	if (waves_per_workgroup > 1 && program->chip_class < GFX10)
	workgroups_per_cu_wgp = std::min(workgroups_per_cu_wgp, 16u); /* TODO: is this a SI-only limit? what about Navi? */

	/* in cases like waves_per_workgroup=3 or lds=65536 and
	* waves_per_workgroup=1, we want the maximum possible number of waves per
	* SIMD and not the minimum. so DIV_ROUND_UP is used */
	program->max_waves = std::min<uint16_t>(program->max_waves, DIV_ROUND_UP(workgroups_per_cu_wgp * waves_per_workgroup, simd_per_cu_wgp));

	/* incorporate max_waves and calculate max_reg_demand */
	program->num_waves = std::min<uint16_t>(program->num_waves, program->max_waves);
	program->max_reg_demand.vgpr = get_addr_vgpr_from_waves(program, program->num_waves);
	program->max_reg_demand.sgpr = get_addr_sgpr_from_waves(program, program->num_waves);
	}
	}

	live live_var_analysis(Program* program,
	const struct radv_nir_compiler_options *options)
	{
	live result;
	result.live_out.resize(program->blocks.size());
	result.register_demand.resize(program->blocks.size());
	std::set<unsigned> worklist;
	std::vector<uint16_t> phi_sgpr_ops(program->blocks.size());
	RegisterDemand new_demand;

	program->needs_vcc = false;

	/* this implementation assumes that the block idx corresponds to the block's position in program->blocks vector */
	for (Block& block : program->blocks)
	worklist.insert(block.index);
	while (!worklist.empty()) {
	std::set<unsigned>::reverse_iterator b_it = worklist.rbegin();
	unsigned block_idx = *b_it;
	worklist.erase(block_idx);
	process_live_temps_per_block(program, result, &program->blocks[block_idx], worklist, phi_sgpr_ops);
	new_demand.update(program->blocks[block_idx].register_demand);
	}

	/* calculate the program's register demand and number of waves */
	update_vgpr_sgpr_demand(program, new_demand);

	return result;
	}

	}