/* src/util/fast_idiv_by_const.h - platform/external/mesa3d - Git at Google */

 /*
  * Copyright © 2018 Advanced Micro Devices, Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
  * to deal in the Software without restriction, including without limitation
  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  * and/or sell copies of the Software, and to permit persons to whom the
  * Software is furnished to do so, subject to the following conditions:
  *
  * The above copyright notice and this permission notice (including the next
  * paragraph) shall be included in all copies or substantial portions of the
  * Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  * IN THE SOFTWARE.
  */

 #ifndef FAST_IDIV_BY_CONST_H
 #define FAST_IDIV_BY_CONST_H

 /* Imported from:
  *   https://raw.githubusercontent.com/ridiculousfish/libdivide/master/divide_by_constants_codegen_reference.c
  */

 #include <inttypes.h>
 #include <limits.h>
 #include <assert.h>

 #ifdef __cplusplus
 extern "C" {
 #endif

 /* Computes "magic info" for performing signed division by a fixed integer D.
  * The type 'sint_t' is assumed to be defined as a signed integer type large
  * enough to hold both the dividend and the divisor.
  * Here >> is arithmetic (signed) shift, and >>> is logical shift.
  *
  * To emit code for n/D, rounding towards zero, use the following sequence:
  *
  *   m = util_compute_fast_sdiv_info(D, SINT_BITS)
  *   emit("result = (m.multiplier * n) >> SINT_BITS");
  *   if D > 0 and m.multiplier < 0: emit("result += n")
  *   if D < 0 and m.multiplier > 0: emit("result -= n")
  *   if m.shift > 0: emit("result >>= m.shift")
  *   emit("result += (result < 0)")
  *
  * The shifts by SINT_BITS may be "free" if the high half of the full multiply
  * is put in a separate register.
  *
  * The final add can of course be implemented via the sign bit, e.g.
  *    result += (result >>> (SINT_BITS - 1))
  * or
  *    result -= (result >> (SINT_BITS - 1))
  *
  * This code is heavily indebted to Hacker's Delight by Henry Warren.
  * See http://www.hackersdelight.org/HDcode/magic.c.txt
  * Used with permission from http://www.hackersdelight.org/permissions.htm
  */

 /* Constants returned by util_compute_fast_sdiv_info() for replacing a signed
  * division by a fixed divisor with a multiply followed by a shift.
  */
 struct util_fast_sdiv_info {
    int64_t multiplier; /* the "magic number" multiplier; may be negative */
    unsigned shift; /* arithmetic right shift applied to the result after
                       multiplying; nothing needs to be emitted when zero */
 };

 /* Computes the multiplier and shift used to divide a signed value by the
  * fixed divisor D.  SINT_BITS is presumably the bit width of the signed
  * integer type the emitted arithmetic uses -- TODO confirm against the
  * definition, which is not part of this header.
  */
 struct util_fast_sdiv_info
 util_compute_fast_sdiv_info(int64_t D, unsigned SINT_BITS);

 /* Computes "magic info" for performing unsigned division by a fixed positive
  * integer D.  UINT_BITS is the bit size at which the final "magic"
  * calculation will be performed; it is assumed to be large enough to hold
  * both the dividend and the divisor.  num_bits can be set to a smaller value
  * if n is known to need fewer than UINT_BITS bits; if this is not known,
  * pass UINT_BITS for num_bits.
  *
  * Assume we have a hardware register of width UINT_BITS, a known constant D
  * which is not zero and not a power of 2, and a variable n of width num_bits
  * (which may be up to UINT_BITS). To emit code for n/d, use one of the two
  * following sequences (here >>> refers to a logical bitshift):
  *
  *   m = util_compute_fast_udiv_info(D, num_bits, UINT_BITS)
  *   if m.pre_shift > 0: emit("n >>>= m.pre_shift")
  *   if m.increment: emit("n = saturated_increment(n)")
  *   emit("result = (m.multiplier * n) >>> UINT_BITS")
  *   if m.post_shift > 0: emit("result >>>= m.post_shift")
  *
  * or
  *
  *   m = util_compute_fast_udiv_info(D, num_bits, UINT_BITS)
  *   if m.pre_shift > 0: emit("n >>>= m.pre_shift")
  *   emit("result = m.multiplier * n")
  *   if m.increment: emit("result = result + m.multiplier")
  *   emit("result >>>= UINT_BITS")
  *   if m.post_shift > 0: emit("result >>>= m.post_shift")
  *
  * This second version works even if D is 1.  The shifts by UINT_BITS may be
  * "free" if the high half of the full multiply is put in a separate register.
  *
  * saturated_increment(n) means "increment n unless it would wrap to 0," i.e.
  *   if n == (1 << UINT_BITS)-1: result = n
  *   else: result = n+1
  * A common way to implement this is with the carry bit. For example, on x86:
  *   add 1
  *   sbb 0
  *
  * Some invariants:
  *   1: At least one of pre_shift and increment is zero
  *   2: multiplier is never zero
  *
  * This code incorporates the "round down" optimization per ridiculous_fish.
  */

 /* Constants returned by util_compute_fast_udiv_info() for replacing an
  * unsigned division by a fixed divisor with shifts and a multiply.
  * At least one of pre_shift and increment is zero.
  */
 struct util_fast_udiv_info {
    uint64_t multiplier; /* the "magic number" multiplier; never zero */
    unsigned pre_shift; /* logical right shift applied to the dividend before
                           multiplying */
    unsigned post_shift; /* logical right shift applied to the result after
                            multiplying */
    int increment; /* 0 or 1; if set then increment the numerator, either with
                      a saturating increment of n before the multiply or by
                      adding multiplier to the full product */
 };

 /* Computes the multiplier, shifts and increment flag used to divide an
  * unsigned value of num_bits bits by the fixed divisor D, with the final
  * calculation performed at UINT_BITS bits.  Declaration only; the definition
  * is not part of this header.
  */
 struct util_fast_udiv_info
 util_compute_fast_udiv_info(uint64_t D, unsigned num_bits, unsigned UINT_BITS);

 /* Below are possible options for dividing by a uniform in a shader where
  * the divisor is constant but not known at compile time.
  */

 /* Full version: places no restriction on n (compare util_fast_udiv32_nuw). */
 static inline uint32_t
 util_fast_udiv32(uint32_t n, struct util_fast_udiv_info info)
 {
    const uint32_t shifted = n >> info.pre_shift;

    /* The increment is added in 64 bits so that it cannot wrap.  When the
     * divisor is known not to be 1, a 32-bit ADD that clamps to UINT_MAX
     * works as well; dividing by 1 needs the full 64-bit ADD.
     *
     * With an unsigned 64-bit MAD taking 32-bit inputs, the same value can be
     * computed as:
     *    increment = increment ? multiplier : 0; // on the CPU
     *    (n * multiplier + increment) // on the GPU using unsigned 64-bit MAD
     */
    const uint64_t bumped = (uint64_t)shifted + info.increment;
    const uint32_t high = (uint32_t)((bumped * info.multiplier) >> 32);

    return high >> info.post_shift;
 }

 /* Slightly cheaper variant for callers that can guarantee n != UINT_MAX:
  * the increment is then added in 32 bits without any unsigned wraparound.
  */
 static inline uint32_t
 util_fast_udiv32_nuw(uint32_t n, struct util_fast_udiv_info info)
 {
    assert(n != UINT32_MAX);

    const uint32_t adjusted = (n >> info.pre_shift) + info.increment;
    const uint64_t product = (uint64_t)adjusted * info.multiplier;

    /* Keep the high 32 bits of the product, then apply the post shift. */
    return (uint32_t)(product >> 32) >> info.post_shift;
 }

 /* Fastest variant.  Requirements:
  *   - both operands are 31-bit unsigned integers,
  *   - the divisor is greater than 1,
  *   - info was computed with num_bits == 31.
  * Under those conditions no pre-shift or increment is expected, which the
  * asserts below check.
  */
 static inline uint32_t
 util_fast_udiv32_u31_d_not_one(uint32_t n, struct util_fast_udiv_info info)
 {
    assert(info.pre_shift == 0);
    assert(info.increment == 0);

    const uint64_t product = (uint64_t)n * info.multiplier;

    /* Keep the high 32 bits of the product, then apply the post shift. */
    return (uint32_t)(product >> 32) >> info.post_shift;
 }

 #ifdef __cplusplus
 } /* extern C */
 #endif

 #endif /* FAST_IDIV_BY_CONST_H */
	/*
	* Copyright © 2018 Advanced Micro Devices, Inc.
	*
	* Permission is hereby granted, free of charge, to any person obtaining a
	* copy of this software and associated documentation files (the "Software"),
	* to deal in the Software without restriction, including without limitation
	* the rights to use, copy, modify, merge, publish, distribute, sublicense,
	* and/or sell copies of the Software, and to permit persons to whom the
	* Software is furnished to do so, subject to the following conditions:
	*
	* The above copyright notice and this permission notice (including the next
	* paragraph) shall be included in all copies or substantial portions of the
	* Software.
	*
	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
	* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
	* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
	* IN THE SOFTWARE.
	*/

	#ifndef FAST_IDIV_BY_CONST_H
	#define FAST_IDIV_BY_CONST_H

	/* Imported from:
	* https://raw.githubusercontent.com/ridiculousfish/libdivide/master/divide_by_constants_codegen_reference.c
	*/

	#include <inttypes.h>
	#include <limits.h>
	#include <assert.h>

	#ifdef __cplusplus
	extern "C" {
	#endif

	/* Computes "magic info" for performing signed division by a fixed integer D.
	* The type 'sint_t' is assumed to be defined as a signed integer type large
	* enough to hold both the dividend and the divisor.
	* Here >> is arithmetic (signed) shift, and >>> is logical shift.
	*
	* To emit code for n/D, rounding towards zero, use the following sequence:
	*
	* m = util_compute_fast_sdiv_info(D, SINT_BITS)
	* emit("result = (m.multiplier * n) >> SINT_BITS");
	* if D > 0 and m.multiplier < 0: emit("result += n")
	* if D < 0 and m.multiplier > 0: emit("result -= n")
	* if m.shift > 0: emit("result >>= m.shift")
	* emit("result += (result < 0)")
	*
	* The shifts by SINT_BITS may be "free" if the high half of the full multiply
	* is put in a separate register.
	*
	* The final add can of course be implemented via the sign bit, e.g.
	* result += (result >>> (SINT_BITS - 1))
	* or
	* result -= (result >> (SINT_BITS - 1))
	*
	* This code is heavily indebted to Hacker's Delight by Henry Warren.
	* See http://www.hackersdelight.org/HDcode/magic.c.txt
	* Used with permission from http://www.hackersdelight.org/permissions.htm
	*/

	/* Constants returned by util_compute_fast_sdiv_info() for replacing a signed
	* division by a fixed divisor with a multiply followed by a shift.
	*/
	struct util_fast_sdiv_info {
	int64_t multiplier; /* the "magic number" multiplier; may be negative */
	unsigned shift; /* arithmetic right shift applied to the result after
	multiplying; nothing needs to be emitted when zero */
	};

	/* Computes the multiplier and shift used to divide a signed value by the
	* fixed divisor D. SINT_BITS is presumably the bit width of the signed
	* integer type the emitted arithmetic uses -- TODO confirm against the
	* definition, which is not part of this header.
	*/
	struct util_fast_sdiv_info
	util_compute_fast_sdiv_info(int64_t D, unsigned SINT_BITS);

	/* Computes "magic info" for performing unsigned division by a fixed positive
	* integer D. UINT_BITS is the bit size at which the final "magic"
	* calculation will be performed; it is assumed to be large enough to hold
	* both the dividend and the divisor. num_bits can be set to a smaller value
	* if n is known to need fewer than UINT_BITS bits; if this is not known,
	* pass UINT_BITS for num_bits.
	*
	* Assume we have a hardware register of width UINT_BITS, a known constant D
	* which is not zero and not a power of 2, and a variable n of width num_bits
	* (which may be up to UINT_BITS). To emit code for n/d, use one of the two
	* following sequences (here >>> refers to a logical bitshift):
	*
	* m = util_compute_fast_udiv_info(D, num_bits, UINT_BITS)
	* if m.pre_shift > 0: emit("n >>>= m.pre_shift")
	* if m.increment: emit("n = saturated_increment(n)")
	* emit("result = (m.multiplier * n) >>> UINT_BITS")
	* if m.post_shift > 0: emit("result >>>= m.post_shift")
	*
	* or
	*
	* m = util_compute_fast_udiv_info(D, num_bits, UINT_BITS)
	* if m.pre_shift > 0: emit("n >>>= m.pre_shift")
	* emit("result = m.multiplier * n")
	* if m.increment: emit("result = result + m.multiplier")
	* emit("result >>>= UINT_BITS")
	* if m.post_shift > 0: emit("result >>>= m.post_shift")
	*
	* This second version works even if D is 1. The shifts by UINT_BITS may be
	* "free" if the high half of the full multiply is put in a separate register.
	*
	* saturated_increment(n) means "increment n unless it would wrap to 0," i.e.
	* if n == (1 << UINT_BITS)-1: result = n
	* else: result = n+1
	* A common way to implement this is with the carry bit. For example, on x86:
	* add 1
	* sbb 0
	*
	* Some invariants:
	* 1: At least one of pre_shift and increment is zero
	* 2: multiplier is never zero
	*
	* This code incorporates the "round down" optimization per ridiculous_fish.
	*/

	/* Constants returned by util_compute_fast_udiv_info() for replacing an
	* unsigned division by a fixed divisor with shifts and a multiply.
	* At least one of pre_shift and increment is zero.
	*/
	struct util_fast_udiv_info {
	uint64_t multiplier; /* the "magic number" multiplier; never zero */
	unsigned pre_shift; /* logical right shift applied to the dividend before
	multiplying */
	unsigned post_shift; /* logical right shift applied to the result after
	multiplying */
	int increment; /* 0 or 1; if set then increment the numerator, either with
	a saturating increment of n before the multiply or by adding multiplier
	to the full product */
	};

	/* Computes the multiplier, shifts and increment flag used to divide an
	* unsigned value of num_bits bits by the fixed divisor D, with the final
	* calculation performed at UINT_BITS bits. Declaration only; the definition
	* is not part of this header.
	*/
	struct util_fast_udiv_info
	util_compute_fast_udiv_info(uint64_t D, unsigned num_bits, unsigned UINT_BITS);

	/* Below are possible options for dividing by a uniform in a shader where
	* the divisor is constant but not known at compile time.
	*/

	/* Full version: places no restriction on n (compare util_fast_udiv32_nuw). */
	static inline uint32_t
	util_fast_udiv32(uint32_t n, struct util_fast_udiv_info info)
	{
	   const uint32_t shifted = n >> info.pre_shift;

	   /* The increment is added in 64 bits so that it cannot wrap.  When the
	    * divisor is known not to be 1, a 32-bit ADD that clamps to UINT_MAX
	    * works as well; dividing by 1 needs the full 64-bit ADD.
	    *
	    * With an unsigned 64-bit MAD taking 32-bit inputs, the same value can
	    * be computed as:
	    *    increment = increment ? multiplier : 0; // on the CPU
	    *    (n * multiplier + increment) // on the GPU using unsigned 64-bit MAD
	    */
	   const uint64_t bumped = (uint64_t)shifted + info.increment;
	   const uint32_t high = (uint32_t)((bumped * info.multiplier) >> 32);

	   return high >> info.post_shift;
	}

	/* Slightly cheaper variant for callers that can guarantee n != UINT_MAX:
	 * the increment is then added in 32 bits without any unsigned wraparound.
	 */
	static inline uint32_t
	util_fast_udiv32_nuw(uint32_t n, struct util_fast_udiv_info info)
	{
	   assert(n != UINT32_MAX);

	   const uint32_t adjusted = (n >> info.pre_shift) + info.increment;
	   const uint64_t product = (uint64_t)adjusted * info.multiplier;

	   /* Keep the high 32 bits of the product, then apply the post shift. */
	   return (uint32_t)(product >> 32) >> info.post_shift;
	}

	/* Fastest variant.  Requirements:
	 *   - both operands are 31-bit unsigned integers,
	 *   - the divisor is greater than 1,
	 *   - info was computed with num_bits == 31.
	 * Under those conditions no pre-shift or increment is expected, which the
	 * asserts below check.
	 */
	static inline uint32_t
	util_fast_udiv32_u31_d_not_one(uint32_t n, struct util_fast_udiv_info info)
	{
	   assert(info.pre_shift == 0);
	   assert(info.increment == 0);

	   const uint64_t product = (uint64_t)n * info.multiplier;

	   /* Keep the high 32 bits of the product, then apply the post shift. */
	   return (uint32_t)(product >> 32) >> info.post_shift;
	}

	#ifdef __cplusplus
	} /* extern C */
	#endif

	#endif /* FAST_IDIV_BY_CONST_H */