src/compiler/nir/nir_lower_idiv.c - platform/external/mesa3d - Git at Google

 /*
  * Copyright © 2015 Red Hat
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
  * to deal in the Software without restriction, including without limitation
  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  * and/or sell copies of the Software, and to permit persons to whom the
  * Software is furnished to do so, subject to the following conditions:
  *
  * The above copyright notice and this permission notice (including the next
  * paragraph) shall be included in all copies or substantial portions of the
  * Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  * IN THE SOFTWARE.
  *
  * Authors:
  *    Rob Clark <robclark@freedesktop.org>
  */

 #include "nir.h"
 #include "nir_builder.h"

 /* Has two paths
  * One (nir_lower_idiv_fast) lowers idiv/udiv/imod/umod/irem and is based on
  * NV50LegalizeSSA::handleDIV()
  *
  * Note that this path probably does not have enough precision for
  * compute shaders. Perhaps we want a second higher precision (looping)
  * version of this? Or perhaps we assume if you can do compute shaders you
  * can also branch out to a pre-optimized shader library routine..
  *
  * The other path (nir_lower_idiv_precise) is based off of code used by LLVM's
  * AMDGPU target. It should handle 32-bit idiv/irem/imod/udiv/umod exactly.
  */

 /* Lower one integer divide/modulo ALU instruction with the fast path.
  *
  * Handles 32-bit idiv, udiv, imod, umod and irem.  A float reciprocal gives
  * a first quotient estimate, the remaining error is pushed through the
  * float unit once more, and a final integer compare corrects the quotient
  * by at most one.
  *
  * The replacement is emitted right before `alu` and every use of its
  * destination is redirected to the new value; `alu` itself is not removed
  * here.
  *
  * Returns true if `alu` was lowered, false if it was left untouched.
  */
 static bool
 convert_instr(nir_builder *bld, nir_alu_instr *alu)
 {
    nir_ssa_def *numer, *denom, *af, *bf, *a, *b, *q, *r, *rt;
    nir_op op = alu->op;
    bool is_signed;

    if ((op != nir_op_idiv) &&
        (op != nir_op_udiv) &&
        (op != nir_op_imod) &&
        (op != nir_op_umod) &&
        (op != nir_op_irem))
       return false;

    /* Everything below is built from 32-bit conversions (i2f32, f2u32, ...)
     * and 32-bit immediates, so mixing it with operands of another width
     * would produce ill-sized instructions.  Leave such instructions alone,
     * exactly like the precise path does.
     */
    if (alu->dest.dest.ssa.bit_size != 32)
       return false;

    is_signed = (op == nir_op_idiv ||
                 op == nir_op_imod ||
                 op == nir_op_irem);

    bld->cursor = nir_before_instr(&alu->instr);

    numer = nir_ssa_for_alu_src(bld, alu, 0);
    denom = nir_ssa_for_alu_src(bld, alu, 1);

    /* af/bf are the float magnitudes, a/b the integer magnitudes. */
    if (is_signed) {
       af = nir_i2f32(bld, numer);
       bf = nir_i2f32(bld, denom);
       af = nir_fabs(bld, af);
       bf = nir_fabs(bld, bf);
       a  = nir_iabs(bld, numer);
       b  = nir_iabs(bld, denom);
    } else {
       af = nir_u2f32(bld, numer);
       bf = nir_u2f32(bld, denom);
       a  = numer;
       b  = denom;
    }

    /* get first result: */
    bf = nir_frcp(bld, bf);
    /* Integer subtract on the bit pattern of the float reciprocal (yes,
     * really).  Presumably this nudges the reciprocal down so that the
     * quotient estimate never overshoots -- confirm against
     * NV50LegalizeSSA::handleDIV().
     */
    bf = nir_isub(bld, bf, nir_imm_int(bld, 2));
    q  = nir_fmul(bld, af, bf);

    if (is_signed) {
       q = nir_f2i32(bld, q);
    } else {
       q = nir_f2u32(bld, q);
    }

    /* get error of first result: */
    r = nir_imul(bld, q, b);
    r = nir_isub(bld, a, r);
    r = nir_u2f32(bld, r);
    r = nir_fmul(bld, r, bf);
    r = nir_f2u32(bld, r);

    /* add quotients: */
    q = nir_iadd(bld, q, r);

    /* correction: if modulus >= divisor, add 1 */
    r = nir_imul(bld, q, b);
    r = nir_isub(bld, a, r);
    rt = nir_uge(bld, r, b);

    if (op == nir_op_umod) {
       /* The result is the remainder, reduced once more if still >= b. */
       q = nir_bcsel(bld, rt, nir_isub(bld, r, b), r);
    } else {
       r = nir_b2i32(bld, rt);

       q = nir_iadd(bld, q, r);
       if (is_signed)  {
          /* fix the sign: r becomes "operand signs differ" */
          r = nir_ixor(bld, numer, denom);
          r = nir_ilt(bld, r, nir_imm_int(bld, 0));
          b = nir_ineg(bld, q);
          q = nir_bcsel(bld, r, b, q);

          if (op == nir_op_imod || op == nir_op_irem) {
             /* remainder = numer - quotient * denom */
             q = nir_imul(bld, q, denom);
             q = nir_isub(bld, numer, q);
             if (op == nir_op_imod) {
                /* imod: a non-zero remainder is shifted by denom when the
                 * operand signs differ.
                 */
                q = nir_bcsel(bld, nir_ieq(bld, q, nir_imm_int(bld, 0)),
                              nir_imm_int(bld, 0),
                              nir_bcsel(bld, r, nir_iadd(bld, q, denom), q));
             }
          }
       }
    }

    assert(alu->dest.dest.is_ssa);
    nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, nir_src_for_ssa(q));

    return true;
 }

 /* Unsigned divide/modulo, ported from LLVM's
  * AMDGPUTargetLowering::LowerUDIVREM.
  *
  * Emits the sequence at the builder's current cursor and returns the
  * remainder when `modulo` is true, the quotient otherwise.
  */
 static nir_ssa_def *
 emit_udiv(nir_builder *bld, nir_ssa_def *numer, nir_ssa_def *denom, bool modulo)
 {
    /* Float reciprocal of the denominator, scaled by a constant slightly
     * below 2^32 (2^32 - 512) and converted back to an unsigned integer.
     */
    nir_ssa_def *denom_f = nir_u2f32(bld, denom);
    nir_ssa_def *recip = nir_frcp(bld, denom_f);
    recip = nir_fmul_imm(bld, recip, 4294966784.0);
    recip = nir_f2u32(bld, recip);

    /* One integer refinement step on the reciprocal. */
    nir_ssa_def *neg_denom = nir_ineg(bld, denom);
    nir_ssa_def *recip_err = nir_imul(bld, recip, neg_denom);
    nir_ssa_def *recip_adj = nir_umul_high(bld, recip, recip_err);
    recip = nir_iadd(bld, recip, recip_adj);

    /* Initial quotient/remainder estimate; two correction rounds follow. */
    nir_ssa_def *quot = nir_umul_high(bld, numer, recip);
    nir_ssa_def *quot_x_denom = nir_imul(bld, quot, denom);
    nir_ssa_def *rem = nir_isub(bld, numer, quot_x_denom);

    /* Round one: bump the quotient (only needed for division) and shrink
     * the remainder whenever the remainder still reaches the denominator.
     */
    nir_ssa_def *needs_fix = nir_uge(bld, rem, denom);
    if (!modulo) {
       nir_ssa_def *quot_next = nir_iadd_imm(bld, quot, 1);
       quot = nir_bcsel(bld, needs_fix, quot_next, quot);
    }
    nir_ssa_def *rem_next = nir_isub(bld, rem, denom);
    rem = nir_bcsel(bld, needs_fix, rem_next, rem);

    /* Round two: same test, but only the requested value is produced. */
    needs_fix = nir_uge(bld, rem, denom);
    if (!modulo) {
       nir_ssa_def *quot_final = nir_iadd_imm(bld, quot, 1);
       return nir_bcsel(bld, needs_fix, quot_final, quot);
    }

    nir_ssa_def *rem_final = nir_isub(bld, rem, denom);
    return nir_bcsel(bld, needs_fix, rem_final, rem);
 }

 /* Signed divide/remainder/modulo, ported from LLVM's
  * AMDGPUTargetLowering::LowerSDIVREM.
  *
  * Both operands are reduced to their magnitudes, the unsigned helper does
  * the actual work, and the sign is re-applied afterwards.  `op` selects
  * between nir_op_idiv (quotient) and the remainder flavours; nir_op_imod
  * gets one extra adjustment at the end.
  */
 static nir_ssa_def *
 emit_idiv(nir_builder *bld, nir_ssa_def *numer, nir_ssa_def *denom, nir_op op)
 {
    /* Sign masks: -1 for a negative operand, 0 otherwise. */
    nir_ssa_def *numer_mask = nir_ilt(bld, numer, nir_imm_int(bld, 0));
    nir_ssa_def *denom_mask = nir_ilt(bld, denom, nir_imm_int(bld, 0));
    numer_mask = nir_bcsel(bld, numer_mask, nir_imm_int(bld, -1), nir_imm_int(bld, 0));
    denom_mask = nir_bcsel(bld, denom_mask, nir_imm_int(bld, -1), nir_imm_int(bld, 0));

    /* (x + mask) ^ mask yields the magnitude of x. */
    nir_ssa_def *numer_abs = nir_iadd(bld, numer, numer_mask);
    nir_ssa_def *denom_abs = nir_iadd(bld, denom, denom_mask);
    numer_abs = nir_ixor(bld, numer_abs, numer_mask);
    denom_abs = nir_ixor(bld, denom_abs, denom_mask);

    const bool want_quotient = (op == nir_op_idiv);

    /* A quotient is negative when the operand signs differ; a remainder
     * follows the sign of the numerator.
     */
    nir_ssa_def *result_mask = numer_mask;
    if (want_quotient)
       result_mask = nir_ixor(bld, numer_mask, denom_mask);

    /* (x ^ mask) - mask conditionally negates x. */
    nir_ssa_def *result = emit_udiv(bld, numer_abs, denom_abs, !want_quotient);
    result = nir_ixor(bld, result, result_mask);
    result = nir_isub(bld, result, result_mask);

    if (op == nir_op_imod) {
       /* Keep the remainder when it is zero or the operand signs agree,
        * otherwise shift it by the denominator.
        */
       nir_ssa_def *is_zero = nir_ieq(bld, result, nir_imm_int(bld, 0));
       nir_ssa_def *same_sign = nir_ieq(bld, numer_mask, denom_mask);
       nir_ssa_def *keep = nir_ior(bld, same_sign, is_zero);
       nir_ssa_def *shifted = nir_iadd(bld, result, denom);
       result = nir_bcsel(bld, keep, result, shifted);
    }

    return result;
 }

 /* Lower one integer divide/modulo ALU instruction with the precise path.
  *
  * Only 32-bit idiv, imod, irem, udiv and umod are handled; anything else
  * is left untouched and false is returned.  On success the replacement is
  * emitted before `alu`, all uses of its destination are redirected to the
  * new value, and true is returned.
  */
 static bool
 convert_instr_precise(nir_builder *bld, nir_alu_instr *alu)
 {
    const nir_op opcode = alu->op;
    bool is_unsigned;

    switch (opcode) {
    case nir_op_udiv:
    case nir_op_umod:
       is_unsigned = true;
       break;
    case nir_op_idiv:
    case nir_op_imod:
    case nir_op_irem:
       is_unsigned = false;
       break;
    default:
       return false;
    }

    if (alu->dest.dest.ssa.bit_size != 32)
       return false;

    bld->cursor = nir_before_instr(&alu->instr);

    nir_ssa_def *lhs = nir_ssa_for_alu_src(bld, alu, 0);
    nir_ssa_def *rhs = nir_ssa_for_alu_src(bld, alu, 1);

    nir_ssa_def *lowered;
    if (is_unsigned) {
       const bool modulo = (opcode == nir_op_umod);
       lowered = emit_udiv(bld, lhs, rhs, modulo);
    } else {
       lowered = emit_idiv(bld, lhs, rhs, opcode);
    }

    assert(alu->dest.dest.is_ssa);
    nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, nir_src_for_ssa(lowered));

    return true;
 }

 /* Run the lowering over every ALU instruction of one function
  * implementation, using the precise sequence when `path` is
  * nir_lower_idiv_precise and the fast one otherwise.
  *
  * Returns true if at least one instruction was lowered.
  */
 static bool
 convert_impl(nir_function_impl *impl, enum nir_lower_idiv_path path)
 {
    nir_builder b;
    nir_builder_init(&b, impl);
    bool progress = false;

    nir_foreach_block(block, impl) {
       nir_foreach_instr_safe(instr, block) {
          if (instr->type != nir_instr_type_alu)
             continue;

          nir_alu_instr *alu = nir_instr_as_alu(instr);
          if (path == nir_lower_idiv_precise)
             progress |= convert_instr_precise(&b, alu);
          else
             progress |= convert_instr(&b, alu);
       }
    }

    /* Only instructions are inserted, so block indices and dominance stay
     * valid.  When nothing was lowered the function is unchanged and no
     * metadata needs to be dropped at all.
     */
    if (progress) {
       nir_metadata_preserve(impl, nir_metadata_block_index |
                                   nir_metadata_dominance);
    }

    return progress;
 }

 /* Lower integer division and modulo in every function of `shader` that has
  * an implementation, using the sequence selected by `path`.
  *
  * Returns true if any function was changed.
  */
 bool
 nir_lower_idiv(nir_shader *shader, enum nir_lower_idiv_path path)
 {
    bool any_lowered = false;

    nir_foreach_function(function, shader) {
       if (!function->impl)
          continue;

       if (convert_impl(function->impl, path))
          any_lowered = true;
    }

    return any_lowered;
 }
	/*
	* Copyright © 2015 Red Hat
	*
	* Permission is hereby granted, free of charge, to any person obtaining a
	* copy of this software and associated documentation files (the "Software"),
	* to deal in the Software without restriction, including without limitation
	* the rights to use, copy, modify, merge, publish, distribute, sublicense,
	* and/or sell copies of the Software, and to permit persons to whom the
	* Software is furnished to do so, subject to the following conditions:
	*
	* The above copyright notice and this permission notice (including the next
	* paragraph) shall be included in all copies or substantial portions of the
	* Software.
	*
	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
	* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
	* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
	* IN THE SOFTWARE.
	*
	* Authors:
	* Rob Clark <robclark@freedesktop.org>
	*/

	#include "nir.h"
	#include "nir_builder.h"

	/* Has two paths
	* One (nir_lower_idiv_fast) lowers idiv/udiv/imod/umod/irem and is based on
	* NV50LegalizeSSA::handleDIV()
	*
	* Note that this path probably does not have enough precision for
	* compute shaders. Perhaps we want a second higher precision (looping)
	* version of this? Or perhaps we assume if you can do compute shaders you
	* can also branch out to a pre-optimized shader library routine..
	*
	* The other path (nir_lower_idiv_precise) is based off of code used by LLVM's
	* AMDGPU target. It should handle 32-bit idiv/irem/imod/udiv/umod exactly.
	*/

	/* Lower one integer divide/modulo ALU instruction with the fast path.
	 *
	 * Handles 32-bit idiv, udiv, imod, umod and irem.  A float reciprocal gives
	 * a first quotient estimate, the remaining error is pushed through the
	 * float unit once more, and a final integer compare corrects the quotient
	 * by at most one.
	 *
	 * The replacement is emitted right before `alu` and every use of its
	 * destination is redirected to the new value; `alu` itself is not removed
	 * here.
	 *
	 * Returns true if `alu` was lowered, false if it was left untouched.
	 */
	static bool
	convert_instr(nir_builder *bld, nir_alu_instr *alu)
	{
	   nir_ssa_def *numer, *denom, *af, *bf, *a, *b, *q, *r, *rt;
	   nir_op op = alu->op;
	   bool is_signed;

	   if ((op != nir_op_idiv) &&
	       (op != nir_op_udiv) &&
	       (op != nir_op_imod) &&
	       (op != nir_op_umod) &&
	       (op != nir_op_irem))
	      return false;

	   /* Everything below is built from 32-bit conversions (i2f32, f2u32, ...)
	    * and 32-bit immediates, so mixing it with operands of another width
	    * would produce ill-sized instructions.  Leave such instructions alone,
	    * exactly like the precise path does.
	    */
	   if (alu->dest.dest.ssa.bit_size != 32)
	      return false;

	   is_signed = (op == nir_op_idiv ||
	                op == nir_op_imod ||
	                op == nir_op_irem);

	   bld->cursor = nir_before_instr(&alu->instr);

	   numer = nir_ssa_for_alu_src(bld, alu, 0);
	   denom = nir_ssa_for_alu_src(bld, alu, 1);

	   /* af/bf are the float magnitudes, a/b the integer magnitudes. */
	   if (is_signed) {
	      af = nir_i2f32(bld, numer);
	      bf = nir_i2f32(bld, denom);
	      af = nir_fabs(bld, af);
	      bf = nir_fabs(bld, bf);
	      a  = nir_iabs(bld, numer);
	      b  = nir_iabs(bld, denom);
	   } else {
	      af = nir_u2f32(bld, numer);
	      bf = nir_u2f32(bld, denom);
	      a  = numer;
	      b  = denom;
	   }

	   /* get first result: */
	   bf = nir_frcp(bld, bf);
	   /* Integer subtract on the bit pattern of the float reciprocal (yes,
	    * really).  Presumably this nudges the reciprocal down so that the
	    * quotient estimate never overshoots -- confirm against
	    * NV50LegalizeSSA::handleDIV().
	    */
	   bf = nir_isub(bld, bf, nir_imm_int(bld, 2));
	   q  = nir_fmul(bld, af, bf);

	   if (is_signed) {
	      q = nir_f2i32(bld, q);
	   } else {
	      q = nir_f2u32(bld, q);
	   }

	   /* get error of first result: */
	   r = nir_imul(bld, q, b);
	   r = nir_isub(bld, a, r);
	   r = nir_u2f32(bld, r);
	   r = nir_fmul(bld, r, bf);
	   r = nir_f2u32(bld, r);

	   /* add quotients: */
	   q = nir_iadd(bld, q, r);

	   /* correction: if modulus >= divisor, add 1 */
	   r = nir_imul(bld, q, b);
	   r = nir_isub(bld, a, r);
	   rt = nir_uge(bld, r, b);

	   if (op == nir_op_umod) {
	      /* The result is the remainder, reduced once more if still >= b. */
	      q = nir_bcsel(bld, rt, nir_isub(bld, r, b), r);
	   } else {
	      r = nir_b2i32(bld, rt);

	      q = nir_iadd(bld, q, r);
	      if (is_signed) {
	         /* fix the sign: r becomes "operand signs differ" */
	         r = nir_ixor(bld, numer, denom);
	         r = nir_ilt(bld, r, nir_imm_int(bld, 0));
	         b = nir_ineg(bld, q);
	         q = nir_bcsel(bld, r, b, q);

	         if (op == nir_op_imod || op == nir_op_irem) {
	            /* remainder = numer - quotient * denom */
	            q = nir_imul(bld, q, denom);
	            q = nir_isub(bld, numer, q);
	            if (op == nir_op_imod) {
	               /* imod: a non-zero remainder is shifted by denom when the
	                * operand signs differ.
	                */
	               q = nir_bcsel(bld, nir_ieq(bld, q, nir_imm_int(bld, 0)),
	                             nir_imm_int(bld, 0),
	                             nir_bcsel(bld, r, nir_iadd(bld, q, denom), q));
	            }
	         }
	      }
	   }

	   assert(alu->dest.dest.is_ssa);
	   nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, nir_src_for_ssa(q));

	   return true;
	}

	/* Unsigned divide/modulo, ported from LLVM's
	 * AMDGPUTargetLowering::LowerUDIVREM.
	 *
	 * Emits the sequence at the builder's current cursor and returns the
	 * remainder when `modulo` is true, the quotient otherwise.
	 */
	static nir_ssa_def *
	emit_udiv(nir_builder *bld, nir_ssa_def *numer, nir_ssa_def *denom, bool modulo)
	{
	   /* Float reciprocal of the denominator, scaled by a constant slightly
	    * below 2^32 (2^32 - 512) and converted back to an unsigned integer.
	    */
	   nir_ssa_def *rcp = nir_frcp(bld, nir_u2f32(bld, denom));
	   rcp = nir_f2u32(bld, nir_fmul_imm(bld, rcp, 4294966784.0));

	   /* One integer refinement step on the reciprocal. */
	   nir_ssa_def *neg_rcp_times_denom =
	      nir_imul(bld, rcp, nir_ineg(bld, denom));
	   rcp = nir_iadd(bld, rcp, nir_umul_high(bld, rcp, neg_rcp_times_denom));

	   /* Get initial estimate for quotient/remainder, then refine the estimate
	    * in two iterations after */
	   nir_ssa_def *quotient = nir_umul_high(bld, numer, rcp);
	   nir_ssa_def *num_s_remainder = nir_imul(bld, quotient, denom);
	   nir_ssa_def *remainder = nir_isub(bld, numer, num_s_remainder);

	   /* First refinement step */
	   nir_ssa_def *remainder_ge_den = nir_uge(bld, remainder, denom);
	   if (!modulo) {
	      quotient = nir_bcsel(bld, remainder_ge_den,
	                           nir_iadd_imm(bld, quotient, 1), quotient);
	   }
	   remainder = nir_bcsel(bld, remainder_ge_den,
	                         nir_isub(bld, remainder, denom), remainder);

	   /* Second refinement step */
	   remainder_ge_den = nir_uge(bld, remainder, denom);
	   if (modulo) {
	      return nir_bcsel(bld, remainder_ge_den, nir_isub(bld, remainder, denom),
	                       remainder);
	   } else {
	      return nir_bcsel(bld, remainder_ge_den, nir_iadd_imm(bld, quotient, 1),
	                       quotient);
	   }
	}

	/* Signed divide/remainder/modulo, ported from LLVM's
	 * AMDGPUTargetLowering::LowerSDIVREM.
	 *
	 * Both operands are reduced to their magnitudes, the unsigned helper does
	 * the actual work, and the sign is re-applied afterwards.  `op` selects
	 * between nir_op_idiv (quotient) and the remainder flavours; nir_op_imod
	 * gets one extra adjustment at the end.
	 */
	static nir_ssa_def *
	emit_idiv(nir_builder *bld, nir_ssa_def *numer, nir_ssa_def *denom, nir_op op)
	{
	   /* Sign masks: -1 for a negative operand, 0 otherwise. */
	   nir_ssa_def *lh_sign = nir_ilt(bld, numer, nir_imm_int(bld, 0));
	   nir_ssa_def *rh_sign = nir_ilt(bld, denom, nir_imm_int(bld, 0));
	   lh_sign = nir_bcsel(bld, lh_sign, nir_imm_int(bld, -1), nir_imm_int(bld, 0));
	   rh_sign = nir_bcsel(bld, rh_sign, nir_imm_int(bld, -1), nir_imm_int(bld, 0));

	   /* (x + mask) ^ mask yields the magnitude of x. */
	   nir_ssa_def *lhs = nir_iadd(bld, numer, lh_sign);
	   nir_ssa_def *rhs = nir_iadd(bld, denom, rh_sign);
	   lhs = nir_ixor(bld, lhs, lh_sign);
	   rhs = nir_ixor(bld, rhs, rh_sign);

	   if (op == nir_op_idiv) {
	      /* The quotient is negated when the operand signs differ. */
	      nir_ssa_def *d_sign = nir_ixor(bld, lh_sign, rh_sign);
	      nir_ssa_def *res = emit_udiv(bld, lhs, rhs, false);
	      res = nir_ixor(bld, res, d_sign);
	      return nir_isub(bld, res, d_sign);
	   } else {
	      /* The remainder follows the sign of the numerator. */
	      nir_ssa_def *res = emit_udiv(bld, lhs, rhs, true);
	      res = nir_ixor(bld, res, lh_sign);
	      res = nir_isub(bld, res, lh_sign);
	      if (op == nir_op_imod) {
	         /* Keep the remainder when it is zero or the operand signs agree,
	          * otherwise shift it by the denominator.
	          */
	         nir_ssa_def *cond = nir_ieq(bld, res, nir_imm_int(bld, 0));
	         cond = nir_ior(bld, nir_ieq(bld, lh_sign, rh_sign), cond);
	         res = nir_bcsel(bld, cond, res, nir_iadd(bld, res, denom));
	      }
	      return res;
	   }
	}

	/* Lower one integer divide/modulo ALU instruction with the precise path.
	 *
	 * Only 32-bit idiv, imod, irem, udiv and umod are handled; anything else
	 * is left untouched and false is returned.  On success the replacement is
	 * emitted before `alu`, all uses of its destination are redirected to the
	 * new value, and true is returned.
	 */
	static bool
	convert_instr_precise(nir_builder *bld, nir_alu_instr *alu)
	{
	   nir_op op = alu->op;

	   if ((op != nir_op_idiv) &&
	       (op != nir_op_imod) &&
	       (op != nir_op_irem) &&
	       (op != nir_op_udiv) &&
	       (op != nir_op_umod))
	      return false;

	   if (alu->dest.dest.ssa.bit_size != 32)
	      return false;

	   bld->cursor = nir_before_instr(&alu->instr);

	   nir_ssa_def *numer = nir_ssa_for_alu_src(bld, alu, 0);
	   nir_ssa_def *denom = nir_ssa_for_alu_src(bld, alu, 1);

	   nir_ssa_def *res = NULL;

	   if (op == nir_op_udiv || op == nir_op_umod)
	      res = emit_udiv(bld, numer, denom, op == nir_op_umod);
	   else
	      res = emit_idiv(bld, numer, denom, op);

	   assert(alu->dest.dest.is_ssa);
	   nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, nir_src_for_ssa(res));

	   return true;
	}

	/* Run the lowering over every ALU instruction of one function
	 * implementation, using the precise sequence when `path` is
	 * nir_lower_idiv_precise and the fast one otherwise.
	 *
	 * Returns true if at least one instruction was lowered.
	 */
	static bool
	convert_impl(nir_function_impl *impl, enum nir_lower_idiv_path path)
	{
	   nir_builder b;
	   nir_builder_init(&b, impl);
	   bool progress = false;

	   nir_foreach_block(block, impl) {
	      nir_foreach_instr_safe(instr, block) {
	         if (instr->type != nir_instr_type_alu)
	            continue;

	         nir_alu_instr *alu = nir_instr_as_alu(instr);
	         if (path == nir_lower_idiv_precise)
	            progress |= convert_instr_precise(&b, alu);
	         else
	            progress |= convert_instr(&b, alu);
	      }
	   }

	   /* Only instructions are inserted, so block indices and dominance stay
	    * valid.  When nothing was lowered the function is unchanged and no
	    * metadata needs to be dropped at all.
	    */
	   if (progress) {
	      nir_metadata_preserve(impl, nir_metadata_block_index |
	                                  nir_metadata_dominance);
	   }

	   return progress;
	}

	/* Lower integer division and modulo in every function of `shader` that has
	 * an implementation, using the sequence selected by `path`.
	 *
	 * Returns true if any function was changed.
	 */
	bool
	nir_lower_idiv(nir_shader *shader, enum nir_lower_idiv_path path)
	{
	   bool progress = false;

	   nir_foreach_function(function, shader) {
	      if (function->impl)
	         progress |= convert_impl(function->impl, path);
	   }

	   return progress;
	}