src/intel/compiler/brw_compile_gs.cpp - platform/external/mesa3d - Git at Google

 /*
  * Copyright © 2013 Intel Corporation
  * SPDX-License-Identifier: MIT
  */

 #include "brw_eu.h"
 #include "brw_fs.h"
 #include "brw_builder.h"
 #include "brw_generator.h"
 #include "brw_prim.h"
 #include "brw_nir.h"
 #include "brw_private.h"
 #include "dev/intel_debug.h"

 using namespace brw;

 static const GLuint gl_prim_to_hw_prim[MESA_PRIM_TRIANGLE_STRIP_ADJACENCY+1] = {
    [MESA_PRIM_POINTS] =_3DPRIM_POINTLIST,
    [MESA_PRIM_LINES] = _3DPRIM_LINELIST,
    [MESA_PRIM_LINE_LOOP] = _3DPRIM_LINELOOP,
    [MESA_PRIM_LINE_STRIP] = _3DPRIM_LINESTRIP,
    [MESA_PRIM_TRIANGLES] = _3DPRIM_TRILIST,
    [MESA_PRIM_TRIANGLE_STRIP] = _3DPRIM_TRISTRIP,
    [MESA_PRIM_TRIANGLE_FAN] = _3DPRIM_TRIFAN,
    [MESA_PRIM_QUADS] = _3DPRIM_QUADLIST,
    [MESA_PRIM_QUAD_STRIP] = _3DPRIM_QUADSTRIP,
    [MESA_PRIM_POLYGON] = _3DPRIM_POLYGON,
    [MESA_PRIM_LINES_ADJACENCY] = _3DPRIM_LINELIST_ADJ,
    [MESA_PRIM_LINE_STRIP_ADJACENCY] = _3DPRIM_LINESTRIP_ADJ,
    [MESA_PRIM_TRIANGLES_ADJACENCY] = _3DPRIM_TRILIST_ADJ,
    [MESA_PRIM_TRIANGLE_STRIP_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
 };

 static void
 brw_emit_gs_thread_end(fs_visitor &s)
 {
    assert(s.stage == MESA_SHADER_GEOMETRY);

    struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(s.prog_data);

    if (s.gs.control_data_header_size_bits > 0) {
       s.emit_gs_control_data_bits(s.final_gs_vertex_count);
    }

    const brw_builder abld = brw_builder(&s).at_end().annotate("thread end");
    brw_inst *inst;

    if (gs_prog_data->static_vertex_count != -1) {
       /* Try and tag the last URB write with EOT instead of emitting a whole
        * separate write just to finish the thread.
        */
       if (s.mark_last_urb_write_with_eot())
          return;

       brw_reg srcs[URB_LOGICAL_NUM_SRCS];
       srcs[URB_LOGICAL_SRC_HANDLE] = s.gs_payload().urb_handles;
       srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(0);
       inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
                        srcs, ARRAY_SIZE(srcs));
    } else {
       brw_reg srcs[URB_LOGICAL_NUM_SRCS];
       srcs[URB_LOGICAL_SRC_HANDLE] = s.gs_payload().urb_handles;
       srcs[URB_LOGICAL_SRC_DATA] = s.final_gs_vertex_count;
       srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(1);
       inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
                        srcs, ARRAY_SIZE(srcs));
    }
    inst->eot = true;
    inst->offset = 0;
 }

 static void
 brw_assign_gs_urb_setup(fs_visitor &s)
 {
    assert(s.stage == MESA_SHADER_GEOMETRY);

    struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(s.prog_data);

    s.first_non_payload_grf +=
       8 * vue_prog_data->urb_read_length * s.nir->info.gs.vertices_in;

    foreach_block_and_inst(block, brw_inst, inst, s.cfg) {
       /* Rewrite all ATTR file references to GRFs. */
       s.convert_attr_sources_to_hw_regs(inst);
    }
 }

 static bool
 run_gs(fs_visitor &s)
 {
    assert(s.stage == MESA_SHADER_GEOMETRY);

    s.payload_ = new gs_thread_payload(s);

    const brw_builder bld = brw_builder(&s).at_end();

    s.final_gs_vertex_count = bld.vgrf(BRW_TYPE_UD);

    if (s.gs.control_data_header_size_bits > 0) {
       /* Create a VGRF to store accumulated control data bits. */
       s.control_data_bits = bld.vgrf(BRW_TYPE_UD);

       /* If we're outputting more than 32 control data bits, then EmitVertex()
        * will set control_data_bits to 0 after emitting the first vertex.
        * Otherwise, we need to initialize it to 0 here.
        */
       if (s.gs.control_data_header_size_bits <= 32) {
          const brw_builder abld = bld.annotate("initialize control data bits");
          abld.MOV(s.control_data_bits, brw_imm_ud(0u));
       }
    }

    brw_from_nir(&s);

    brw_emit_gs_thread_end(s);

    if (s.failed)
       return false;

    brw_calculate_cfg(s);

    brw_optimize(s);

    s.assign_curb_setup();
    brw_assign_gs_urb_setup(s);

    brw_lower_3src_null_dest(s);
    brw_workaround_memory_fence_before_eot(s);
    brw_workaround_emit_dummy_mov_instruction(s);

    brw_allocate_registers(s, true /* allow_spilling */);

    brw_workaround_source_arf_before_eot(s);

    return !s.failed;
 }

 extern "C" const unsigned *
 brw_compile_gs(const struct brw_compiler *compiler,
                struct brw_compile_gs_params *params)
 {
    nir_shader *nir = params->base.nir;
    const struct brw_gs_prog_key *key = params->key;
    struct brw_gs_prog_data *prog_data = params->prog_data;
    const unsigned dispatch_width = brw_geometry_stage_dispatch_width(compiler->devinfo);

    struct intel_vue_map input_vue_map = {0};

    unsigned control_data_bits_per_vertex = 0;
    unsigned control_data_header_size_bits = 0;

    const bool debug_enabled = brw_should_print_shader(nir, DEBUG_GS);

    prog_data->base.base.stage = MESA_SHADER_GEOMETRY;
    prog_data->base.base.ray_queries = nir->info.ray_queries;
    prog_data->base.base.total_scratch = 0;

    /* The GLSL linker will have already matched up GS inputs and the outputs
     * of prior stages.  The driver does extend VS outputs in some cases, but
     * only for legacy OpenGL or Gfx4-5 hardware, neither of which offer
     * geometry shader support.  So we can safely ignore that.
     *
     * For SSO pipelines, we use a fixed VUE map layout based on variable
     * locations, so we can rely on rendezvous-by-location making this work.
     */
    GLbitfield64 inputs_read = nir->info.inputs_read;
    brw_compute_vue_map(compiler->devinfo,
                        &input_vue_map, inputs_read,
                        nir->info.separate_shader, 1);

    brw_nir_apply_key(nir, compiler, &key->base, dispatch_width);
    brw_nir_lower_vue_inputs(nir, &input_vue_map);
    brw_nir_lower_vue_outputs(nir);
    brw_postprocess_nir(nir, compiler, debug_enabled,
                        key->base.robust_flags);

    prog_data->base.clip_distance_mask =
       ((1 << nir->info.clip_distance_array_size) - 1);
    prog_data->base.cull_distance_mask =
       ((1 << nir->info.cull_distance_array_size) - 1) <<
       nir->info.clip_distance_array_size;

    prog_data->include_primitive_id =
       BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);

    prog_data->invocations = nir->info.gs.invocations;

    nir_gs_count_vertices_and_primitives(
       nir, &prog_data->static_vertex_count, nullptr, nullptr, 1u);

    if (nir->info.gs.output_primitive == MESA_PRIM_POINTS) {
       /* When the output type is points, the geometry shader may output data
        * to multiple streams, and EndPrimitive() has no effect.  So we
        * configure the hardware to interpret the control data as stream ID.
        */
       prog_data->control_data_format = GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_SID;

       /* We only have to emit control bits if we are using non-zero streams */
       if (nir->info.gs.active_stream_mask != (1 << 0))
          control_data_bits_per_vertex = 2;
       else
          control_data_bits_per_vertex = 0;
    } else {
       /* When the output type is triangle_strip or line_strip, EndPrimitive()
        * may be used to terminate the current strip and start a new one
        * (similar to primitive restart), and outputting data to multiple
        * streams is not supported.  So we configure the hardware to interpret
        * the control data as EndPrimitive information (a.k.a. "cut bits").
        */
       prog_data->control_data_format = GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT;

       /* We only need to output control data if the shader actually calls
        * EndPrimitive().
        */
       control_data_bits_per_vertex =
          nir->info.gs.uses_end_primitive ? 1 : 0;
    }

    control_data_header_size_bits =
       nir->info.gs.vertices_out * control_data_bits_per_vertex;

    /* 1 HWORD = 32 bytes = 256 bits */
    prog_data->control_data_header_size_hwords =
       ALIGN(control_data_header_size_bits, 256) / 256;

    /* Compute the output vertex size.
     *
     * From the Ivy Bridge PRM, Vol2 Part1 7.2.1.1 STATE_GS - Output Vertex
     * Size (p168):
     *
     *     [0,62] indicating [1,63] 16B units
     *
     *     Specifies the size of each vertex stored in the GS output entry
     *     (following any Control Header data) as a number of 128-bit units
     *     (minus one).
     *
     *     Programming Restrictions: The vertex size must be programmed as a
     *     multiple of 32B units with the following exception: Rendering is
     *     disabled (as per SOL stage state) and the vertex size output by the
     *     GS thread is 16B.
     *
     *     If rendering is enabled (as per SOL state) the vertex size must be
     *     programmed as a multiple of 32B units. In other words, the only time
     *     software can program a vertex size with an odd number of 16B units
     *     is when rendering is disabled.
     *
     * Note: B=bytes in the above text.
     *
     * It doesn't seem worth the extra trouble to optimize the case where the
     * vertex size is 16B (especially since this would require special-casing
     * the GEN assembly that writes to the URB).  So we just set the vertex
     * size to a multiple of 32B (2 vec4's) in all cases.
     *
     * The maximum output vertex size is 62*16 = 992 bytes (31 hwords).  We
     * budget that as follows:
     *
     *   512 bytes for varyings (a varying component is 4 bytes and
     *             gl_MaxGeometryOutputComponents = 128)
     *    16 bytes overhead for VARYING_SLOT_PSIZ (each varying slot is 16
     *             bytes)
     *    16 bytes overhead for gl_Position (we allocate it a slot in the VUE
     *             even if it's not used)
     *    32 bytes overhead for gl_ClipDistance (we allocate it 2 VUE slots
     *             whenever clip planes are enabled, even if the shader doesn't
     *             write to gl_ClipDistance)
     *    16 bytes overhead since the VUE size must be a multiple of 32 bytes
     *             (see below)--this causes up to 1 VUE slot to be wasted
     *   400 bytes available for varying packing overhead
     *
     * Worst-case varying packing overhead is 3/4 of a varying slot (12 bytes)
     * per interpolation type, so this is plenty.
     *
     */
    unsigned output_vertex_size_bytes = prog_data->base.vue_map.num_slots * 16;
    assert(output_vertex_size_bytes <= GFX7_MAX_GS_OUTPUT_VERTEX_SIZE_BYTES);
    prog_data->output_vertex_size_hwords =
       ALIGN(output_vertex_size_bytes, 32) / 32;

    /* Compute URB entry size.  The maximum allowed URB entry size is 32k.
     * That divides up as follows:
     *
     *     64 bytes for the control data header (cut indices or StreamID bits)
     *   4096 bytes for varyings (a varying component is 4 bytes and
     *              gl_MaxGeometryTotalOutputComponents = 1024)
     *   4096 bytes overhead for VARYING_SLOT_PSIZ (each varying slot is 16
     *              bytes/vertex and gl_MaxGeometryOutputVertices is 256)
     *   4096 bytes overhead for gl_Position (we allocate it a slot in the VUE
     *              even if it's not used)
     *   8192 bytes overhead for gl_ClipDistance (we allocate it 2 VUE slots
     *              whenever clip planes are enabled, even if the shader doesn't
     *              write to gl_ClipDistance)
     *   4096 bytes overhead since the VUE size must be a multiple of 32
     *              bytes (see above)--this causes up to 1 VUE slot to be wasted
     *   8128 bytes available for varying packing overhead
     *
     * Worst-case varying packing overhead is 3/4 of a varying slot per
     * interpolation type, which works out to 3072 bytes, so this would allow
     * us to accommodate 2 interpolation types without any danger of running
     * out of URB space.
     *
     * In practice, the risk of running out of URB space is very small, since
     * the above figures are all worst-case, and most of them scale with the
     * number of output vertices.  So we'll just calculate the amount of space
     * we need, and if it's too large, fail to compile.
     *
     * The above is for gfx7+ where we have a single URB entry that will hold
     * all the output.
     */
    unsigned output_size_bytes =
       prog_data->output_vertex_size_hwords * 32 * nir->info.gs.vertices_out;
    output_size_bytes += 32 * prog_data->control_data_header_size_hwords;

    /* Broadwell stores "Vertex Count" as a full 8 DWord (32 byte) URB output,
     * which comes before the control header.
     */
    output_size_bytes += 32;

    /* Shaders can technically set max_vertices = 0, at which point we
     * may have a URB size of 0 bytes.  Nothing good can come from that,
     * so enforce a minimum size.
     */
    if (output_size_bytes == 0)
       output_size_bytes = 1;

    unsigned max_output_size_bytes = GFX7_MAX_GS_URB_ENTRY_SIZE_BYTES;
    if (output_size_bytes > max_output_size_bytes)
       return NULL;


    /* URB entry sizes are stored as a multiple of 64 bytes in gfx7+. */
    prog_data->base.urb_entry_size = ALIGN(output_size_bytes, 64) / 64;

    assert(nir->info.gs.output_primitive < ARRAY_SIZE(gl_prim_to_hw_prim));
    prog_data->output_topology =
       gl_prim_to_hw_prim[nir->info.gs.output_primitive];

    prog_data->vertices_in = nir->info.gs.vertices_in;

    /* GS inputs are read from the VUE 256 bits (2 vec4's) at a time, so we
     * need to program a URB read length of ceiling(num_slots / 2).
     */
    prog_data->base.urb_read_length = (input_vue_map.num_slots + 1) / 2;

    /* Now that prog_data setup is done, we are ready to actually compile the
     * program.
     */
    if (unlikely(debug_enabled)) {
       fprintf(stderr, "GS Input ");
       brw_print_vue_map(stderr, &input_vue_map, MESA_SHADER_GEOMETRY);
       fprintf(stderr, "GS Output ");
       brw_print_vue_map(stderr, &prog_data->base.vue_map, MESA_SHADER_GEOMETRY);
    }

    fs_visitor v(compiler, &params->base, &key->base, &prog_data->base.base,
                 nir, dispatch_width,
                 params->base.stats != NULL, debug_enabled);
    v.gs.control_data_bits_per_vertex = control_data_bits_per_vertex;
    v.gs.control_data_header_size_bits = control_data_header_size_bits;
    if (run_gs(v)) {
       prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8;

       assert(v.payload().num_regs % reg_unit(compiler->devinfo) == 0);
       prog_data->base.base.dispatch_grf_start_reg =
          v.payload().num_regs / reg_unit(compiler->devinfo);
       prog_data->base.base.grf_used = v.grf_used;

       brw_generator g(compiler, &params->base,
                      &prog_data->base.base, MESA_SHADER_GEOMETRY);
       if (unlikely(debug_enabled)) {
          const char *label =
             nir->info.label ? nir->info.label : "unnamed";
          char *name = ralloc_asprintf(params->base.mem_ctx,
                                       "%s geometry shader %s",
                                       label, nir->info.name);
          g.enable_debug(name);
       }
       g.generate_code(v.cfg, v.dispatch_width, v.shader_stats,
                       v.performance_analysis.require(), params->base.stats);
       g.add_const_data(nir->constant_data, nir->constant_data_size);
       return g.get_assembly();
    }

    params->base.error_str = ralloc_strdup(params->base.mem_ctx, v.fail_msg);

    return NULL;
 }
	/*
	* Copyright © 2013 Intel Corporation
	* SPDX-License-Identifier: MIT
	*/

	#include "brw_eu.h"
	#include "brw_fs.h"
	#include "brw_builder.h"
	#include "brw_generator.h"
	#include "brw_prim.h"
	#include "brw_nir.h"
	#include "brw_private.h"
	#include "dev/intel_debug.h"

	using namespace brw;

	static const GLuint gl_prim_to_hw_prim[MESA_PRIM_TRIANGLE_STRIP_ADJACENCY+1] = {
	[MESA_PRIM_POINTS] =_3DPRIM_POINTLIST,
	[MESA_PRIM_LINES] = _3DPRIM_LINELIST,
	[MESA_PRIM_LINE_LOOP] = _3DPRIM_LINELOOP,
	[MESA_PRIM_LINE_STRIP] = _3DPRIM_LINESTRIP,
	[MESA_PRIM_TRIANGLES] = _3DPRIM_TRILIST,
	[MESA_PRIM_TRIANGLE_STRIP] = _3DPRIM_TRISTRIP,
	[MESA_PRIM_TRIANGLE_FAN] = _3DPRIM_TRIFAN,
	[MESA_PRIM_QUADS] = _3DPRIM_QUADLIST,
	[MESA_PRIM_QUAD_STRIP] = _3DPRIM_QUADSTRIP,
	[MESA_PRIM_POLYGON] = _3DPRIM_POLYGON,
	[MESA_PRIM_LINES_ADJACENCY] = _3DPRIM_LINELIST_ADJ,
	[MESA_PRIM_LINE_STRIP_ADJACENCY] = _3DPRIM_LINESTRIP_ADJ,
	[MESA_PRIM_TRIANGLES_ADJACENCY] = _3DPRIM_TRILIST_ADJ,
	[MESA_PRIM_TRIANGLE_STRIP_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
	};

	static void
	brw_emit_gs_thread_end(fs_visitor &s)
	{
	assert(s.stage == MESA_SHADER_GEOMETRY);

	struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(s.prog_data);

	if (s.gs.control_data_header_size_bits > 0) {
	s.emit_gs_control_data_bits(s.final_gs_vertex_count);
	}

	const brw_builder abld = brw_builder(&s).at_end().annotate("thread end");
	brw_inst *inst;

	if (gs_prog_data->static_vertex_count != -1) {
	/* Try and tag the last URB write with EOT instead of emitting a whole
	* separate write just to finish the thread.
	*/
	if (s.mark_last_urb_write_with_eot())
	return;

	brw_reg srcs[URB_LOGICAL_NUM_SRCS];
	srcs[URB_LOGICAL_SRC_HANDLE] = s.gs_payload().urb_handles;
	srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(0);
	inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
	srcs, ARRAY_SIZE(srcs));
	} else {
	brw_reg srcs[URB_LOGICAL_NUM_SRCS];
	srcs[URB_LOGICAL_SRC_HANDLE] = s.gs_payload().urb_handles;
	srcs[URB_LOGICAL_SRC_DATA] = s.final_gs_vertex_count;
	srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(1);
	inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
	srcs, ARRAY_SIZE(srcs));
	}
	inst->eot = true;
	inst->offset = 0;
	}

	static void
	brw_assign_gs_urb_setup(fs_visitor &s)
	{
	assert(s.stage == MESA_SHADER_GEOMETRY);

	struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(s.prog_data);

	s.first_non_payload_grf +=
	8 * vue_prog_data->urb_read_length * s.nir->info.gs.vertices_in;

	foreach_block_and_inst(block, brw_inst, inst, s.cfg) {
	/* Rewrite all ATTR file references to GRFs. */
	s.convert_attr_sources_to_hw_regs(inst);
	}
	}

	static bool
	run_gs(fs_visitor &s)
	{
	assert(s.stage == MESA_SHADER_GEOMETRY);

	s.payload_ = new gs_thread_payload(s);

	const brw_builder bld = brw_builder(&s).at_end();

	s.final_gs_vertex_count = bld.vgrf(BRW_TYPE_UD);

	if (s.gs.control_data_header_size_bits > 0) {
	/* Create a VGRF to store accumulated control data bits. */
	s.control_data_bits = bld.vgrf(BRW_TYPE_UD);

	/* If we're outputting more than 32 control data bits, then EmitVertex()
	* will set control_data_bits to 0 after emitting the first vertex.
	* Otherwise, we need to initialize it to 0 here.
	*/
	if (s.gs.control_data_header_size_bits <= 32) {
	const brw_builder abld = bld.annotate("initialize control data bits");
	abld.MOV(s.control_data_bits, brw_imm_ud(0u));
	}
	}

	brw_from_nir(&s);

	brw_emit_gs_thread_end(s);

	if (s.failed)
	return false;

	brw_calculate_cfg(s);

	brw_optimize(s);

	s.assign_curb_setup();
	brw_assign_gs_urb_setup(s);

	brw_lower_3src_null_dest(s);
	brw_workaround_memory_fence_before_eot(s);
	brw_workaround_emit_dummy_mov_instruction(s);

	brw_allocate_registers(s, true /* allow_spilling */);

	brw_workaround_source_arf_before_eot(s);

	return !s.failed;
	}

	extern "C" const unsigned *
	brw_compile_gs(const struct brw_compiler *compiler,
	struct brw_compile_gs_params *params)
	{
	nir_shader *nir = params->base.nir;
	const struct brw_gs_prog_key *key = params->key;
	struct brw_gs_prog_data *prog_data = params->prog_data;
	const unsigned dispatch_width = brw_geometry_stage_dispatch_width(compiler->devinfo);

	struct intel_vue_map input_vue_map = {0};

	unsigned control_data_bits_per_vertex = 0;
	unsigned control_data_header_size_bits = 0;

	const bool debug_enabled = brw_should_print_shader(nir, DEBUG_GS);

	prog_data->base.base.stage = MESA_SHADER_GEOMETRY;
	prog_data->base.base.ray_queries = nir->info.ray_queries;
	prog_data->base.base.total_scratch = 0;

	/* The GLSL linker will have already matched up GS inputs and the outputs
	* of prior stages. The driver does extend VS outputs in some cases, but
	* only for legacy OpenGL or Gfx4-5 hardware, neither of which offer
	* geometry shader support. So we can safely ignore that.
	*
	* For SSO pipelines, we use a fixed VUE map layout based on variable
	* locations, so we can rely on rendezvous-by-location making this work.
	*/
	GLbitfield64 inputs_read = nir->info.inputs_read;
	brw_compute_vue_map(compiler->devinfo,
	&input_vue_map, inputs_read,
	nir->info.separate_shader, 1);

	brw_nir_apply_key(nir, compiler, &key->base, dispatch_width);
	brw_nir_lower_vue_inputs(nir, &input_vue_map);
	brw_nir_lower_vue_outputs(nir);
	brw_postprocess_nir(nir, compiler, debug_enabled,
	key->base.robust_flags);

	prog_data->base.clip_distance_mask =
	((1 << nir->info.clip_distance_array_size) - 1);
	prog_data->base.cull_distance_mask =
	((1 << nir->info.cull_distance_array_size) - 1) <<
	nir->info.clip_distance_array_size;

	prog_data->include_primitive_id =
	BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);

	prog_data->invocations = nir->info.gs.invocations;

	nir_gs_count_vertices_and_primitives(
	nir, &prog_data->static_vertex_count, nullptr, nullptr, 1u);

	if (nir->info.gs.output_primitive == MESA_PRIM_POINTS) {
	/* When the output type is points, the geometry shader may output data
	* to multiple streams, and EndPrimitive() has no effect. So we
	* configure the hardware to interpret the control data as stream ID.
	*/
	prog_data->control_data_format = GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_SID;

	/* We only have to emit control bits if we are using non-zero streams */
	if (nir->info.gs.active_stream_mask != (1 << 0))
	control_data_bits_per_vertex = 2;
	else
	control_data_bits_per_vertex = 0;
	} else {
	/* When the output type is triangle_strip or line_strip, EndPrimitive()
	* may be used to terminate the current strip and start a new one
	* (similar to primitive restart), and outputting data to multiple
	* streams is not supported. So we configure the hardware to interpret
	* the control data as EndPrimitive information (a.k.a. "cut bits").
	*/
	prog_data->control_data_format = GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT;

	/* We only need to output control data if the shader actually calls
	* EndPrimitive().
	*/
	control_data_bits_per_vertex =
	nir->info.gs.uses_end_primitive ? 1 : 0;
	}

	control_data_header_size_bits =
	nir->info.gs.vertices_out * control_data_bits_per_vertex;

	/* 1 HWORD = 32 bytes = 256 bits */
	prog_data->control_data_header_size_hwords =
	ALIGN(control_data_header_size_bits, 256) / 256;

	/* Compute the output vertex size.
	*
	* From the Ivy Bridge PRM, Vol2 Part1 7.2.1.1 STATE_GS - Output Vertex
	* Size (p168):
	*
	* [0,62] indicating [1,63] 16B units
	*
	* Specifies the size of each vertex stored in the GS output entry
	* (following any Control Header data) as a number of 128-bit units
	* (minus one).
	*
	* Programming Restrictions: The vertex size must be programmed as a
	* multiple of 32B units with the following exception: Rendering is
	* disabled (as per SOL stage state) and the vertex size output by the
	* GS thread is 16B.
	*
	* If rendering is enabled (as per SOL state) the vertex size must be
	* programmed as a multiple of 32B units. In other words, the only time
	* software can program a vertex size with an odd number of 16B units
	* is when rendering is disabled.
	*
	* Note: B=bytes in the above text.
	*
	* It doesn't seem worth the extra trouble to optimize the case where the
	* vertex size is 16B (especially since this would require special-casing
	* the GEN assembly that writes to the URB). So we just set the vertex
	* size to a multiple of 32B (2 vec4's) in all cases.
	*
	* The maximum output vertex size is 62*16 = 992 bytes (31 hwords). We
	* budget that as follows:
	*
	* 512 bytes for varyings (a varying component is 4 bytes and
	* gl_MaxGeometryOutputComponents = 128)
	* 16 bytes overhead for VARYING_SLOT_PSIZ (each varying slot is 16
	* bytes)
	* 16 bytes overhead for gl_Position (we allocate it a slot in the VUE
	* even if it's not used)
	* 32 bytes overhead for gl_ClipDistance (we allocate it 2 VUE slots
	* whenever clip planes are enabled, even if the shader doesn't
	* write to gl_ClipDistance)
	* 16 bytes overhead since the VUE size must be a multiple of 32 bytes
	* (see below)--this causes up to 1 VUE slot to be wasted
	* 400 bytes available for varying packing overhead
	*
	* Worst-case varying packing overhead is 3/4 of a varying slot (12 bytes)
	* per interpolation type, so this is plenty.
	*
	*/
	unsigned output_vertex_size_bytes = prog_data->base.vue_map.num_slots * 16;
	assert(output_vertex_size_bytes <= GFX7_MAX_GS_OUTPUT_VERTEX_SIZE_BYTES);
	prog_data->output_vertex_size_hwords =
	ALIGN(output_vertex_size_bytes, 32) / 32;

	/* Compute URB entry size. The maximum allowed URB entry size is 32k.
	* That divides up as follows:
	*
	* 64 bytes for the control data header (cut indices or StreamID bits)
	* 4096 bytes for varyings (a varying component is 4 bytes and
	* gl_MaxGeometryTotalOutputComponents = 1024)
	* 4096 bytes overhead for VARYING_SLOT_PSIZ (each varying slot is 16
	* bytes/vertex and gl_MaxGeometryOutputVertices is 256)
	* 4096 bytes overhead for gl_Position (we allocate it a slot in the VUE
	* even if it's not used)
	* 8192 bytes overhead for gl_ClipDistance (we allocate it 2 VUE slots
	* whenever clip planes are enabled, even if the shader doesn't
	* write to gl_ClipDistance)
	* 4096 bytes overhead since the VUE size must be a multiple of 32
	* bytes (see above)--this causes up to 1 VUE slot to be wasted
	* 8128 bytes available for varying packing overhead
	*
	* Worst-case varying packing overhead is 3/4 of a varying slot per
	* interpolation type, which works out to 3072 bytes, so this would allow
	* us to accommodate 2 interpolation types without any danger of running
	* out of URB space.
	*
	* In practice, the risk of running out of URB space is very small, since
	* the above figures are all worst-case, and most of them scale with the
	* number of output vertices. So we'll just calculate the amount of space
	* we need, and if it's too large, fail to compile.
	*
	* The above is for gfx7+ where we have a single URB entry that will hold
	* all the output.
	*/
	unsigned output_size_bytes =
	prog_data->output_vertex_size_hwords * 32 * nir->info.gs.vertices_out;
	output_size_bytes += 32 * prog_data->control_data_header_size_hwords;

	/* Broadwell stores "Vertex Count" as a full 8 DWord (32 byte) URB output,
	* which comes before the control header.
	*/
	output_size_bytes += 32;

	/* Shaders can technically set max_vertices = 0, at which point we
	* may have a URB size of 0 bytes. Nothing good can come from that,
	* so enforce a minimum size.
	*/
	if (output_size_bytes == 0)
	output_size_bytes = 1;

	unsigned max_output_size_bytes = GFX7_MAX_GS_URB_ENTRY_SIZE_BYTES;
	if (output_size_bytes > max_output_size_bytes)
	return NULL;


	/* URB entry sizes are stored as a multiple of 64 bytes in gfx7+. */
	prog_data->base.urb_entry_size = ALIGN(output_size_bytes, 64) / 64;

	assert(nir->info.gs.output_primitive < ARRAY_SIZE(gl_prim_to_hw_prim));
	prog_data->output_topology =
	gl_prim_to_hw_prim[nir->info.gs.output_primitive];

	prog_data->vertices_in = nir->info.gs.vertices_in;

	/* GS inputs are read from the VUE 256 bits (2 vec4's) at a time, so we
	* need to program a URB read length of ceiling(num_slots / 2).
	*/
	prog_data->base.urb_read_length = (input_vue_map.num_slots + 1) / 2;

	/* Now that prog_data setup is done, we are ready to actually compile the
	* program.
	*/
	if (unlikely(debug_enabled)) {
	fprintf(stderr, "GS Input ");
	brw_print_vue_map(stderr, &input_vue_map, MESA_SHADER_GEOMETRY);
	fprintf(stderr, "GS Output ");
	brw_print_vue_map(stderr, &prog_data->base.vue_map, MESA_SHADER_GEOMETRY);
	}

	fs_visitor v(compiler, &params->base, &key->base, &prog_data->base.base,
	nir, dispatch_width,
	params->base.stats != NULL, debug_enabled);
	v.gs.control_data_bits_per_vertex = control_data_bits_per_vertex;
	v.gs.control_data_header_size_bits = control_data_header_size_bits;
	if (run_gs(v)) {
	prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8;

	assert(v.payload().num_regs % reg_unit(compiler->devinfo) == 0);
	prog_data->base.base.dispatch_grf_start_reg =
	v.payload().num_regs / reg_unit(compiler->devinfo);
	prog_data->base.base.grf_used = v.grf_used;

	brw_generator g(compiler, &params->base,
	&prog_data->base.base, MESA_SHADER_GEOMETRY);
	if (unlikely(debug_enabled)) {
	const char *label =
	nir->info.label ? nir->info.label : "unnamed";
	char *name = ralloc_asprintf(params->base.mem_ctx,
	"%s geometry shader %s",
	label, nir->info.name);
	g.enable_debug(name);
	}
	g.generate_code(v.cfg, v.dispatch_width, v.shader_stats,
	v.performance_analysis.require(), params->base.stats);
	g.add_const_data(nir->constant_data, nir->constant_data_size);
	return g.get_assembly();
	}

	params->base.error_str = ralloc_strdup(params->base.mem_ctx, v.fail_msg);

	return NULL;
	}