src/compiler/nir/nir_lower_amul.c - platform/external/mesa3d - Git at Google

 /*
  * Copyright © 2019 Google, Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
  * to deal in the Software without restriction, including without limitation
  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  * and/or sell copies of the Software, and to permit persons to whom the
  * Software is furnished to do so, subject to the following conditions:
  *
  * The above copyright notice and this permission notice (including the next
  * paragraph) shall be included in all copies or substantial portions of the
  * Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */

 #include "nir.h"
 #include "nir_vla.h"

 /* Lowering for amul instructions, for drivers that support imul24.
  * This pass will analyze indirect derefs, and convert corresponding
  * amul instructions to either imul or imul24, depending on the
  * required range.
  *
  * 1) Analyze the uniform variables and build a table of UBOs and SSBOs
  *    that are either too large, or might be too large (unknown size)
  *    for imul24
  *
  * 2) Loop thru looking at all the intrinsics, finding dereferences of
  *    large variables, and recursively replacing all amul instructions
  *    used with imul
  *
  * 3) Finally loop again thru all instructions replacing any remaining
  *    amul with imul24.  At this point any remaining amul instructions
  *    are not involved in calculating an offset into a large variable,
  *    thanks to the 2nd step, so they can be safely replace with imul24.
  *
  * Using two passes over all the instructions lets us handle the case
  * where, due to CSE, an amul is used to calculate an offset into both
  * a large and small variable.
  */

 typedef struct {
    nir_shader *shader;

    int (*type_size)(const struct glsl_type *, bool);

    /* Tables of UBOs and SSBOs mapping driver_location/base whether
     * they are too large to use imul24:
     */
    bool *large_ubos;
    bool *large_ssbos;

    /* for cases that we cannot determine UBO/SSBO index, track if *any*
     * UBO/SSBO is too large for imul24:
     */
    bool has_large_ubo;
    bool has_large_ssbo;

    unsigned max_slot;
 } lower_state;

 /* Lower 'amul's in offset src of large variables to 'imul': */
 static bool
 lower_large_src(nir_src *src, void *s)
 {
    lower_state *state = s;

    assert(src->is_ssa);

    nir_instr *parent = src->ssa->parent_instr;

    /* No need to visit instructions we've already visited.. this also
     * avoids infinite recursion when phi's are involved:
     */
    if (parent->pass_flags)
       return false;

    bool progress = nir_foreach_src(parent, lower_large_src, state);

    if (parent->type == nir_instr_type_alu) {
       nir_alu_instr *alu = nir_instr_as_alu(parent);
       if (alu->op == nir_op_amul) {
          alu->op = nir_op_imul;
          progress = true;
       }
    }

    parent->pass_flags = 1;

    return progress;
 }

 static bool
 large_ubo(lower_state *state, nir_src src)
 {
    if (!nir_src_is_const(src))
       return state->has_large_ubo;
    unsigned idx = nir_src_as_uint(src);
    assert(idx < state->shader->info.num_ubos);
    return state->large_ubos[idx];
 }

 static bool
 large_ssbo(lower_state *state, nir_src src)
 {
    if (!nir_src_is_const(src))
       return state->has_large_ssbo;
    unsigned idx = nir_src_as_uint(src);
    assert(idx < state->shader->info.num_ssbos);
    return state->large_ssbos[idx];
 }

 static bool
 lower_intrinsic(lower_state *state, nir_intrinsic_instr *intr)
 {
    switch (intr->intrinsic) {
    case nir_intrinsic_load_ubo:
       //# src[] = { buffer_index, offset }.
       if (large_ubo(state, intr->src[0]))
          return lower_large_src(&intr->src[1], state);
       return false;

    case nir_intrinsic_load_ssbo:
       //# src[] = { buffer_index, offset }.
       if (large_ssbo(state, intr->src[0]))
          return lower_large_src(&intr->src[1], state);
       return false;

    case nir_intrinsic_store_ssbo:
       //# src[] = { value, block_index, offset }
       if (large_ssbo(state, intr->src[1]))
          return lower_large_src(&intr->src[2], state);
       return false;

    case nir_intrinsic_ssbo_atomic_add:
    case nir_intrinsic_ssbo_atomic_imin:
    case nir_intrinsic_ssbo_atomic_umin:
    case nir_intrinsic_ssbo_atomic_imax:
    case nir_intrinsic_ssbo_atomic_umax:
    case nir_intrinsic_ssbo_atomic_and:
    case nir_intrinsic_ssbo_atomic_or:
    case nir_intrinsic_ssbo_atomic_xor:
    case nir_intrinsic_ssbo_atomic_exchange:
    case nir_intrinsic_ssbo_atomic_comp_swap:
    case nir_intrinsic_ssbo_atomic_fadd:
    case nir_intrinsic_ssbo_atomic_fmin:
    case nir_intrinsic_ssbo_atomic_fmax:
    case nir_intrinsic_ssbo_atomic_fcomp_swap:
       /* 0: SSBO index
        * 1: offset
        */
       if (large_ssbo(state, intr->src[0]))
          return lower_large_src(&intr->src[1], state);
       return false;

    case nir_intrinsic_global_atomic_add:
    case nir_intrinsic_global_atomic_imin:
    case nir_intrinsic_global_atomic_umin:
    case nir_intrinsic_global_atomic_imax:
    case nir_intrinsic_global_atomic_umax:
    case nir_intrinsic_global_atomic_and:
    case nir_intrinsic_global_atomic_or:
    case nir_intrinsic_global_atomic_xor:
    case nir_intrinsic_global_atomic_exchange:
    case nir_intrinsic_global_atomic_comp_swap:
    case nir_intrinsic_global_atomic_fadd:
    case nir_intrinsic_global_atomic_fmin:
    case nir_intrinsic_global_atomic_fmax:
    case nir_intrinsic_global_atomic_fcomp_swap:
       /* just assume we that 24b is not sufficient: */
       return lower_large_src(&intr->src[0], state);

    /* These should all be small enough to unconditionally use imul24: */
    case nir_intrinsic_shared_atomic_add:
    case nir_intrinsic_shared_atomic_imin:
    case nir_intrinsic_shared_atomic_umin:
    case nir_intrinsic_shared_atomic_imax:
    case nir_intrinsic_shared_atomic_umax:
    case nir_intrinsic_shared_atomic_and:
    case nir_intrinsic_shared_atomic_or:
    case nir_intrinsic_shared_atomic_xor:
    case nir_intrinsic_shared_atomic_exchange:
    case nir_intrinsic_shared_atomic_comp_swap:
    case nir_intrinsic_shared_atomic_fadd:
    case nir_intrinsic_shared_atomic_fmin:
    case nir_intrinsic_shared_atomic_fmax:
    case nir_intrinsic_shared_atomic_fcomp_swap:
    case nir_intrinsic_load_uniform:
    case nir_intrinsic_load_input:
    case nir_intrinsic_load_output:
    case nir_intrinsic_store_output:
    default:
       return false;
    }
 }

 static bool
 lower_instr(lower_state *state, nir_instr *instr)
 {
    bool progress = false;

    if (instr->type == nir_instr_type_intrinsic) {
       progress |= lower_intrinsic(state, nir_instr_as_intrinsic(instr));
    }

    return progress;
 }

 static bool
 is_large(lower_state *state, nir_variable *var)
 {
    const struct glsl_type *type = glsl_without_array(var->type);
    unsigned size = state->type_size(type, false);

    /* if size is not known (ie. VLA) then assume the worst: */
    if (!size)
       return true;

    return size >= (1 << 23);
 }

 bool
 nir_lower_amul(nir_shader *shader,
                int (*type_size)(const struct glsl_type *, bool))
 {
    assert(shader->options->has_imul24);
    assert(type_size);

    NIR_VLA_FILL(bool, large_ubos, shader->info.num_ubos, 0);
    NIR_VLA_FILL(bool, large_ssbos, shader->info.num_ssbos, 0);

    lower_state state = {
       .shader = shader,
       .type_size = type_size,
       .large_ubos = large_ubos,
       .large_ssbos = large_ssbos,
    };

    /* Figure out which UBOs or SSBOs are large enough to be
     * disqualified from imul24:
     */
    nir_foreach_variable_in_shader (var, shader) {
       if (var->data.mode == nir_var_mem_ubo) {
          if (is_large(&state, var)) {
             state.has_large_ubo = true;
             unsigned size = MAX2(1, glsl_array_size(var->type));
             for (unsigned i = 0; i < size; i++)
                state.large_ubos[var->data.binding + i] = true;
          }
       } else if (var->data.mode == nir_var_mem_ssbo) {
          if (is_large(&state, var)) {
             state.has_large_ssbo = true;
             unsigned size = MAX2(1, glsl_array_size(var->type));
             for (unsigned i = 0; i < size; i++)
                state.large_ssbos[var->data.binding + i] = true;
          }
       }
    }

    /* clear pass flags: */
    nir_foreach_function(function, shader) {
       nir_function_impl *impl = function->impl;
       if (!impl)
          continue;

       nir_foreach_block(block, impl) {
          nir_foreach_instr(instr, block) {
             instr->pass_flags = 0;
          }
       }
    }

    bool progress = false;
    nir_foreach_function(function, shader) {
       nir_function_impl *impl = function->impl;

       if (!impl)
          continue;

       nir_foreach_block(block, impl) {
          nir_foreach_instr(instr, block) {
             progress |= lower_instr(&state, instr);
          }
       }
    }

    /* At this point, all 'amul's used in calculating an offset into
     * a large variable have been replaced with 'imul'.  So remaining
     * 'amul's can be replaced with 'imul24':
     */
    nir_foreach_function(function, shader) {
       nir_function_impl *impl = function->impl;

       if (!impl)
          continue;

       nir_foreach_block(block, impl) {
          nir_foreach_instr(instr, block) {
             if (instr->type != nir_instr_type_alu)
                continue;

             nir_alu_instr *alu = nir_instr_as_alu(instr);
             if (alu->op != nir_op_amul)
                continue;

             alu->op = nir_op_imul24;
             progress |= true;
          }
       }

       nir_metadata_preserve(impl, nir_metadata_block_index |
                                   nir_metadata_dominance);

    }

    return progress;
 }
	/*
	* Copyright © 2019 Google, Inc.
	*
	* Permission is hereby granted, free of charge, to any person obtaining a
	* copy of this software and associated documentation files (the "Software"),
	* to deal in the Software without restriction, including without limitation
	* the rights to use, copy, modify, merge, publish, distribute, sublicense,
	* and/or sell copies of the Software, and to permit persons to whom the
	* Software is furnished to do so, subject to the following conditions:
	*
	* The above copyright notice and this permission notice (including the next
	* paragraph) shall be included in all copies or substantial portions of the
	* Software.
	*
	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
	* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	* SOFTWARE.
	*/

	#include "nir.h"
	#include "nir_vla.h"

	/* Lowering for amul instructions, for drivers that support imul24.
	* This pass will analyze indirect derefs, and convert corresponding
	* amul instructions to either imul or imul24, depending on the
	* required range.
	*
	* 1) Analyze the uniform variables and build a table of UBOs and SSBOs
	* that are either too large, or might be too large (unknown size)
	* for imul24
	*
	* 2) Loop thru looking at all the intrinsics, finding dereferences of
	* large variables, and recursively replacing all amul instructions
	* used with imul
	*
	* 3) Finally loop again thru all instructions replacing any remaining
	* amul with imul24. At this point any remaining amul instructions
	* are not involved in calculating an offset into a large variable,
	* thanks to the 2nd step, so they can be safely replace with imul24.
	*
	* Using two passes over all the instructions lets us handle the case
	* where, due to CSE, an amul is used to calculate an offset into both
	* a large and small variable.
	*/

	typedef struct {
	nir_shader *shader;

	int (type_size)(const struct glsl_type , bool);

	/* Tables of UBOs and SSBOs mapping driver_location/base whether
	* they are too large to use imul24:
	*/
	bool *large_ubos;
	bool *large_ssbos;

	/* for cases that we cannot determine UBO/SSBO index, track if any
	* UBO/SSBO is too large for imul24:
	*/
	bool has_large_ubo;
	bool has_large_ssbo;

	unsigned max_slot;
	} lower_state;

	/* Lower 'amul's in offset src of large variables to 'imul': */
	static bool
	lower_large_src(nir_src src, void s)
	{
	lower_state *state = s;

	assert(src->is_ssa);

	nir_instr *parent = src->ssa->parent_instr;

	/* No need to visit instructions we've already visited.. this also
	* avoids infinite recursion when phi's are involved:
	*/
	if (parent->pass_flags)
	return false;

	bool progress = nir_foreach_src(parent, lower_large_src, state);

	if (parent->type == nir_instr_type_alu) {
	nir_alu_instr *alu = nir_instr_as_alu(parent);
	if (alu->op == nir_op_amul) {
	alu->op = nir_op_imul;
	progress = true;
	}
	}

	parent->pass_flags = 1;

	return progress;
	}

	static bool
	large_ubo(lower_state *state, nir_src src)
	{
	if (!nir_src_is_const(src))
	return state->has_large_ubo;
	unsigned idx = nir_src_as_uint(src);
	assert(idx < state->shader->info.num_ubos);
	return state->large_ubos[idx];
	}

	static bool
	large_ssbo(lower_state *state, nir_src src)
	{
	if (!nir_src_is_const(src))
	return state->has_large_ssbo;
	unsigned idx = nir_src_as_uint(src);
	assert(idx < state->shader->info.num_ssbos);
	return state->large_ssbos[idx];
	}

	static bool
	lower_intrinsic(lower_state state, nir_intrinsic_instr intr)
	{
	switch (intr->intrinsic) {
	case nir_intrinsic_load_ubo:
	//# src[] = { buffer_index, offset }.
	if (large_ubo(state, intr->src[0]))
	return lower_large_src(&intr->src[1], state);
	return false;

	case nir_intrinsic_load_ssbo:
	//# src[] = { buffer_index, offset }.
	if (large_ssbo(state, intr->src[0]))
	return lower_large_src(&intr->src[1], state);
	return false;

	case nir_intrinsic_store_ssbo:
	//# src[] = { value, block_index, offset }
	if (large_ssbo(state, intr->src[1]))
	return lower_large_src(&intr->src[2], state);
	return false;

	case nir_intrinsic_ssbo_atomic_add:
	case nir_intrinsic_ssbo_atomic_imin:
	case nir_intrinsic_ssbo_atomic_umin:
	case nir_intrinsic_ssbo_atomic_imax:
	case nir_intrinsic_ssbo_atomic_umax:
	case nir_intrinsic_ssbo_atomic_and:
	case nir_intrinsic_ssbo_atomic_or:
	case nir_intrinsic_ssbo_atomic_xor:
	case nir_intrinsic_ssbo_atomic_exchange:
	case nir_intrinsic_ssbo_atomic_comp_swap:
	case nir_intrinsic_ssbo_atomic_fadd:
	case nir_intrinsic_ssbo_atomic_fmin:
	case nir_intrinsic_ssbo_atomic_fmax:
	case nir_intrinsic_ssbo_atomic_fcomp_swap:
	/* 0: SSBO index
	* 1: offset
	*/
	if (large_ssbo(state, intr->src[0]))
	return lower_large_src(&intr->src[1], state);
	return false;

	case nir_intrinsic_global_atomic_add:
	case nir_intrinsic_global_atomic_imin:
	case nir_intrinsic_global_atomic_umin:
	case nir_intrinsic_global_atomic_imax:
	case nir_intrinsic_global_atomic_umax:
	case nir_intrinsic_global_atomic_and:
	case nir_intrinsic_global_atomic_or:
	case nir_intrinsic_global_atomic_xor:
	case nir_intrinsic_global_atomic_exchange:
	case nir_intrinsic_global_atomic_comp_swap:
	case nir_intrinsic_global_atomic_fadd:
	case nir_intrinsic_global_atomic_fmin:
	case nir_intrinsic_global_atomic_fmax:
	case nir_intrinsic_global_atomic_fcomp_swap:
	/* just assume we that 24b is not sufficient: */
	return lower_large_src(&intr->src[0], state);

	/* These should all be small enough to unconditionally use imul24: */
	case nir_intrinsic_shared_atomic_add:
	case nir_intrinsic_shared_atomic_imin:
	case nir_intrinsic_shared_atomic_umin:
	case nir_intrinsic_shared_atomic_imax:
	case nir_intrinsic_shared_atomic_umax:
	case nir_intrinsic_shared_atomic_and:
	case nir_intrinsic_shared_atomic_or:
	case nir_intrinsic_shared_atomic_xor:
	case nir_intrinsic_shared_atomic_exchange:
	case nir_intrinsic_shared_atomic_comp_swap:
	case nir_intrinsic_shared_atomic_fadd:
	case nir_intrinsic_shared_atomic_fmin:
	case nir_intrinsic_shared_atomic_fmax:
	case nir_intrinsic_shared_atomic_fcomp_swap:
	case nir_intrinsic_load_uniform:
	case nir_intrinsic_load_input:
	case nir_intrinsic_load_output:
	case nir_intrinsic_store_output:
	default:
	return false;
	}
	}

	static bool
	lower_instr(lower_state state, nir_instr instr)
	{
	bool progress = false;

	if (instr->type == nir_instr_type_intrinsic) {
	progress \|= lower_intrinsic(state, nir_instr_as_intrinsic(instr));
	}

	return progress;
	}

	static bool
	is_large(lower_state state, nir_variable var)
	{
	const struct glsl_type *type = glsl_without_array(var->type);
	unsigned size = state->type_size(type, false);

	/* if size is not known (ie. VLA) then assume the worst: */
	if (!size)
	return true;

	return size >= (1 << 23);
	}

	bool
	nir_lower_amul(nir_shader *shader,
	int (type_size)(const struct glsl_type , bool))
	{
	assert(shader->options->has_imul24);
	assert(type_size);

	NIR_VLA_FILL(bool, large_ubos, shader->info.num_ubos, 0);
	NIR_VLA_FILL(bool, large_ssbos, shader->info.num_ssbos, 0);

	lower_state state = {
	.shader = shader,
	.type_size = type_size,
	.large_ubos = large_ubos,
	.large_ssbos = large_ssbos,
	};

	/* Figure out which UBOs or SSBOs are large enough to be
	* disqualified from imul24:
	*/
	nir_foreach_variable_in_shader (var, shader) {
	if (var->data.mode == nir_var_mem_ubo) {
	if (is_large(&state, var)) {
	state.has_large_ubo = true;
	unsigned size = MAX2(1, glsl_array_size(var->type));
	for (unsigned i = 0; i < size; i++)
	state.large_ubos[var->data.binding + i] = true;
	}
	} else if (var->data.mode == nir_var_mem_ssbo) {
	if (is_large(&state, var)) {
	state.has_large_ssbo = true;
	unsigned size = MAX2(1, glsl_array_size(var->type));
	for (unsigned i = 0; i < size; i++)
	state.large_ssbos[var->data.binding + i] = true;
	}
	}
	}

	/* clear pass flags: */
	nir_foreach_function(function, shader) {
	nir_function_impl *impl = function->impl;
	if (!impl)
	continue;

	nir_foreach_block(block, impl) {
	nir_foreach_instr(instr, block) {
	instr->pass_flags = 0;
	}
	}
	}

	bool progress = false;
	nir_foreach_function(function, shader) {
	nir_function_impl *impl = function->impl;

	if (!impl)
	continue;

	nir_foreach_block(block, impl) {
	nir_foreach_instr(instr, block) {
	progress \|= lower_instr(&state, instr);
	}
	}
	}

	/* At this point, all 'amul's used in calculating an offset into
	* a large variable have been replaced with 'imul'. So remaining
	* 'amul's can be replaced with 'imul24':
	*/
	nir_foreach_function(function, shader) {
	nir_function_impl *impl = function->impl;

	if (!impl)
	continue;

	nir_foreach_block(block, impl) {
	nir_foreach_instr(instr, block) {
	if (instr->type != nir_instr_type_alu)
	continue;

	nir_alu_instr *alu = nir_instr_as_alu(instr);
	if (alu->op != nir_op_amul)
	continue;

	alu->op = nir_op_imul24;
	progress \|= true;
	}
	}

	nir_metadata_preserve(impl, nir_metadata_block_index \|
	nir_metadata_dominance);

	}

	return progress;
	}