/* src/gallium/drivers/radeonsi/si_shader.h - platform/external/mesa3d - Git at Google */

 /*
  * Copyright 2012 Advanced Micro Devices, Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
  * to deal in the Software without restriction, including without limitation
  * on the rights to use, copy, modify, merge, publish, distribute, sub
  * license, and/or sell copies of the Software, and to permit persons to whom
  * the Software is furnished to do so, subject to the following conditions:
  *
  * The above copyright notice and this permission notice (including the next
  * paragraph) shall be included in all copies or substantial portions of the
  * Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  *
  * Authors:
  *	Tom Stellard <thomas.stellard@amd.com>
  *	Michel Dänzer <michel.daenzer@amd.com>
  *      Christian König <christian.koenig@amd.com>
  */

 /* How linking shader inputs and outputs between vertex, tessellation, and
  * geometry shaders works.
  *
  * Inputs and outputs between shaders are stored in a buffer. This buffer
  * lives in LDS (typical case for tessellation), but it can also live
  * in memory (ESGS). Each input or output has a fixed location within a vertex.
  * The highest used input or output determines the stride between vertices.
  *
  * Since GS and tessellation are only possible in the OpenGL core profile,
  * only these semantics are valid for per-vertex data:
  *
  *   Name             Location
  *
  *   POSITION         0
  *   PSIZE            1
  *   CLIPDIST0..1     2..3
  *   CULLDIST0..1     (not implemented)
  *   GENERIC0..31     4..35
  *
  * For example, a shader only writing GENERIC0 has the output stride of 5.
  *
  * Only these semantics are valid for per-patch data:
  *
  *   Name             Location
  *
  *   TESSOUTER        0
  *   TESSINNER        1
  *   PATCH0..29       2..31
  *
  * That's how independent shaders agree on input and output locations.
  * The si_shader_io_get_unique_index function assigns the locations.
  *
  * For tessellation, other required information for calculating the input and
  * output addresses like the vertex stride, the patch stride, and the offsets
  * where per-vertex and per-patch data start, is passed to the shader via
  * user data SGPRs. The offsets and strides are calculated at draw time and
  * aren't available at compile time.
  */

 #ifndef SI_SHADER_H
 #define SI_SHADER_H

 #include <llvm-c/Core.h> /* LLVMModuleRef */
 #include <llvm-c/TargetMachine.h>
 #include "tgsi/tgsi_scan.h"
 #include "util/u_queue.h"
 #include "si_state.h"

 /* Forward declarations: only the struct tags are introduced here.
  * NOTE(review): struct radeon_shader_binary is embedded by value in
  * struct si_shader and struct si_shader_part below, so its full definition
  * has to come from one of the included headers - confirm.
  */
 struct radeon_shader_binary;
 struct radeon_shader_reloc;

 /* Number of entries in si_shader_info::vs_output_param_offset. */
 #define SI_MAX_VS_OUTPUTS	40

 /* SGPR user data indices
  *
  * The index ranges overlap on purpose: after the resource entries that all
  * groups build on, each stage-specific group restarts at an earlier
  * enumerator (SI_NUM_RESOURCE_SGPRS or SI_ES_NUM_USER_SGPR). Every
  * *_NUM_USER_SGPR enumerator directly follows the last index of its group,
  * so it equals the number of indices that group uses.
  */
 enum {
 	SI_SGPR_RW_BUFFERS,  /* rings (& stream-out, VS only) */
 	/* Each resource entry has an _HI twin; presumably the two together hold
 	 * one 64-bit address - TODO confirm. */
 	SI_SGPR_RW_BUFFERS_HI,
 	SI_SGPR_CONST_BUFFERS,
 	SI_SGPR_CONST_BUFFERS_HI,
 	SI_SGPR_SAMPLERS,  /* images & sampler states interleaved */
 	SI_SGPR_SAMPLERS_HI,
 	SI_SGPR_IMAGES,
 	SI_SGPR_IMAGES_HI,
 	SI_SGPR_SHADER_BUFFERS,
 	SI_SGPR_SHADER_BUFFERS_HI,
 	SI_NUM_RESOURCE_SGPRS, /* count of the entries above; base of the groups below */

 	/* all VS variants */
 	SI_SGPR_VERTEX_BUFFERS	= SI_NUM_RESOURCE_SGPRS,
 	SI_SGPR_VERTEX_BUFFERS_HI,
 	SI_SGPR_BASE_VERTEX,
 	SI_SGPR_START_INSTANCE,
 	SI_SGPR_DRAWID,
 	SI_ES_NUM_USER_SGPR,

 	/* hw VS only */
 	SI_SGPR_VS_STATE_BITS	= SI_ES_NUM_USER_SGPR,
 	SI_VS_NUM_USER_SGPR,

 	/* hw LS only (same index as SI_SGPR_VS_STATE_BITS) */
 	SI_SGPR_LS_OUT_LAYOUT	= SI_ES_NUM_USER_SGPR,
 	SI_LS_NUM_USER_SGPR,

 	/* both TCS and TES */
 	SI_SGPR_TCS_OFFCHIP_LAYOUT = SI_NUM_RESOURCE_SGPRS,
 	SI_TES_NUM_USER_SGPR,

 	/* TCS only */
 	SI_SGPR_TCS_OUT_OFFSETS = SI_TES_NUM_USER_SGPR,
 	SI_SGPR_TCS_OUT_LAYOUT,
 	SI_SGPR_TCS_IN_LAYOUT,
 	SI_TCS_NUM_USER_SGPR,

 	/* GS limits: GS uses only the resource entries; the GS copy shader
 	 * count ends right after the RW_BUFFERS pair. */
 	SI_GS_NUM_USER_SGPR = SI_NUM_RESOURCE_SGPRS,
 	SI_GSCOPY_NUM_USER_SGPR = SI_SGPR_RW_BUFFERS_HI + 1,

 	/* PS only */
 	SI_SGPR_ALPHA_REF	= SI_NUM_RESOURCE_SGPRS,
 	SI_PS_NUM_USER_SGPR,

 	/* CS only: GRID_SIZE and BLOCK_SIZE each reserve 3 consecutive indices
 	 * (see the +3 steps). */
 	SI_SGPR_GRID_SIZE = SI_NUM_RESOURCE_SGPRS,
 	SI_SGPR_BLOCK_SIZE = SI_SGPR_GRID_SIZE + 3,
 	SI_CS_NUM_USER_SGPR = SI_SGPR_BLOCK_SIZE + 3
 };

 /* LLVM function parameter indices
  *
  * As with the SGPR indices, the per-stage groups overlap: each group
  * restarts at SI_NUM_RESOURCE_PARAMS, so an index is only meaningful
  * together with the shader stage it is used for.
  */
 enum {
 	SI_PARAM_RW_BUFFERS,
 	SI_PARAM_CONST_BUFFERS,
 	SI_PARAM_SAMPLERS,
 	SI_PARAM_IMAGES,
 	SI_PARAM_SHADER_BUFFERS,
 	SI_NUM_RESOURCE_PARAMS, /* count of the entries above */

 	/* VS only parameters */
 	SI_PARAM_VERTEX_BUFFERS	= SI_NUM_RESOURCE_PARAMS,
 	SI_PARAM_BASE_VERTEX,
 	SI_PARAM_START_INSTANCE,
 	SI_PARAM_DRAWID,
 	/* [0] = clamp vertex color, VS as VS only */
 	SI_PARAM_VS_STATE_BITS,
 	/* same value as TCS_IN_LAYOUT, VS as LS only.
 	 * Shares its index with SI_PARAM_VS_STATE_BITS (both are DRAWID + 1). */
 	SI_PARAM_LS_OUT_LAYOUT = SI_PARAM_DRAWID + 1,
 	/* the other VS parameters are assigned dynamically */

 	/* Layout of TCS outputs in the offchip buffer
 	 *   [0:8] = the number of patches per threadgroup.
 	 *   [9:15] = the number of output vertices per patch.
 	 *   [16:31] = the offset of per patch attributes in the buffer in bytes.
 	 */
 	SI_PARAM_TCS_OFFCHIP_LAYOUT = SI_NUM_RESOURCE_PARAMS, /* for TCS & TES */

 	/* TCS only parameters. */

 	/* Offsets where TCS outputs and TCS patch outputs live in LDS:
 	 *   [0:15] = TCS output patch0 offset / 16, max = NUM_PATCHES * 32 * 32
 	 *   [16:31] = TCS output patch0 offset for per-patch / 16, max = NUM_PATCHES*32*32 + 32*32
 	 */
 	SI_PARAM_TCS_OUT_OFFSETS,

 	/* Layout of TCS outputs / TES inputs:
 	 *   [0:12] = stride between output patches in dwords, num_outputs * num_vertices * 4, max = 32*32*4
 	 *   [13:20] = stride between output vertices in dwords = num_inputs * 4, max = 32*4
 	 *   [26:31] = gl_PatchVerticesIn, max = 32
 	 */
 	SI_PARAM_TCS_OUT_LAYOUT,

 	/* Layout of LS outputs / TCS inputs
 	 *   [0:12] = stride between patches in dwords = num_inputs * num_vertices * 4, max = 32*32*4
 	 *   [13:20] = stride between vertices in dwords = num_inputs * 4, max = 32*4
 	 */
 	SI_PARAM_TCS_IN_LAYOUT,

 	SI_PARAM_TCS_OC_LDS,
 	SI_PARAM_TESS_FACTOR_OFFSET,
 	SI_PARAM_PATCH_ID,
 	SI_PARAM_REL_IDS,

 	/* GS only parameters */
 	SI_PARAM_GS2VS_OFFSET = SI_NUM_RESOURCE_PARAMS,
 	SI_PARAM_GS_WAVE_ID,
 	SI_PARAM_VTX0_OFFSET,
 	SI_PARAM_VTX1_OFFSET,
 	/* PRIMITIVE_ID sits between the VTX1 and VTX2 offsets; keep this order. */
 	SI_PARAM_PRIMITIVE_ID,
 	SI_PARAM_VTX2_OFFSET,
 	SI_PARAM_VTX3_OFFSET,
 	SI_PARAM_VTX4_OFFSET,
 	SI_PARAM_VTX5_OFFSET,
 	SI_PARAM_GS_INSTANCE_ID,

 	/* PS only parameters */
 	SI_PARAM_ALPHA_REF = SI_NUM_RESOURCE_PARAMS,
 	SI_PARAM_PRIM_MASK,
 	SI_PARAM_PERSP_SAMPLE,
 	SI_PARAM_PERSP_CENTER,
 	SI_PARAM_PERSP_CENTROID,
 	SI_PARAM_PERSP_PULL_MODEL,
 	SI_PARAM_LINEAR_SAMPLE,
 	SI_PARAM_LINEAR_CENTER,
 	SI_PARAM_LINEAR_CENTROID,
 	SI_PARAM_LINE_STIPPLE_TEX,
 	SI_PARAM_POS_X_FLOAT,
 	SI_PARAM_POS_Y_FLOAT,
 	SI_PARAM_POS_Z_FLOAT,
 	SI_PARAM_POS_W_FLOAT,
 	SI_PARAM_FRONT_FACE,
 	SI_PARAM_ANCILLARY,
 	SI_PARAM_SAMPLE_COVERAGE,
 	SI_PARAM_POS_FIXED_PT,

 	/* CS only parameters */
 	SI_PARAM_GRID_SIZE = SI_NUM_RESOURCE_PARAMS,
 	SI_PARAM_BLOCK_SIZE,
 	SI_PARAM_BLOCK_ID,
 	SI_PARAM_THREAD_ID,

 	/* Derived from the last PS parameter: +1 turns that index into a
 	 * count, and the remaining +8 is the COLOR room noted below. */
 	SI_NUM_PARAMS = SI_PARAM_POS_FIXED_PT + 9, /* +8 for COLOR[0..1] */
 };

 /* SI-specific system values.
  *
  * Numbering starts at TGSI_SEMANTIC_COUNT, so these values lie outside the
  * range of the TGSI_SEMANTIC_* names (assuming TGSI_SEMANTIC_COUNT is one past
  * the last of them - TODO confirm against the TGSI headers).
  */
 enum {
 	TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI = TGSI_SEMANTIC_COUNT,
 	TGSI_SEMANTIC_DEFAULT_TESSINNER_SI,
 };

 /* For VS shader key fix_fetch.
  *
  * si_shader_key.mono.vs.fix_fetch stores one nibble per input, so every
  * value in this enum has to stay below 16 (the list currently ends at 11).
  */
 enum {
 	SI_FIX_FETCH_NONE = 0,
 	SI_FIX_FETCH_A2_SNORM,
 	SI_FIX_FETCH_A2_SSCALED,
 	SI_FIX_FETCH_A2_SINT,
 	SI_FIX_FETCH_RGBA_32_UNORM,
 	SI_FIX_FETCH_RGBX_32_UNORM,
 	SI_FIX_FETCH_RGBA_32_SNORM,
 	SI_FIX_FETCH_RGBX_32_SNORM,
 	SI_FIX_FETCH_RGBA_32_USCALED,
 	SI_FIX_FETCH_RGBA_32_SSCALED,
 	SI_FIX_FETCH_RGBA_32_FIXED,
 	SI_FIX_FETCH_RGBX_32_FIXED,
 };

 struct si_shader;

 /* State of the context creating the shader object.
  *
  * Embedded by value in both si_shader_selector and si_shader.
  */
 struct si_compiler_ctx_state {
 	/* Should only be used by si_init_shader_selector_async and
 	 * si_build_shader_variant if thread_index == -1 (non-threaded). */
 	LLVMTargetMachineRef		tm;

 	/* Used if thread_index == -1 or if debug.async is true. */
 	struct pipe_debug_callback	debug;

 	/* Used for creating the log string for gallium/ddebug. */
 	bool				is_debug_context;
 };

 /* A shader selector is a gallium CSO and contains shader variants and
  * binaries for one TGSI program. This can be shared by multiple contexts.
  */
 struct si_shader_selector {
 	struct si_screen	*screen;
 	struct util_queue_fence ready;
 	struct si_compiler_ctx_state compiler_ctx_state;

 	/* NOTE(review): presumably protects the variant list below, since the
 	 * selector can be shared by multiple contexts - confirm at the lock sites. */
 	pipe_mutex		mutex;
 	/* Variants are chained through si_shader::next_variant. */
 	struct si_shader	*first_variant; /* immutable after the first variant */
 	struct si_shader	*last_variant; /* mutable */

 	/* The compiled TGSI shader expecting a prolog and/or epilog (not
 	 * uploaded to a buffer).
 	 */
 	struct si_shader	*main_shader_part;

 	struct si_shader	*gs_copy_shader;

 	struct tgsi_token       *tokens;
 	struct pipe_stream_output_info  so;
 	struct tgsi_shader_info		info;

 	/* PIPE_SHADER_[VERTEX|FRAGMENT|...] */
 	unsigned	type;

 	/* GS parameters. */
 	unsigned	esgs_itemsize;
 	unsigned	gs_input_verts_per_prim;
 	unsigned	gs_output_prim;
 	unsigned	gs_max_out_vertices;
 	unsigned	gs_num_invocations;
 	unsigned	max_gs_stream; /* count - 1 */
 	unsigned	gsvs_vertex_size;
 	unsigned	max_gsvs_emit_size;

 	/* PS parameters. */
 	unsigned	color_attr_index[2];
 	unsigned	db_shader_control;
 	/* Set 0xf or 0x0 (4 bits) per each written output.
 	 * ANDed with spi_shader_col_format.
 	 */
 	unsigned	colors_written_4bit;

 	/* CS parameters */
 	unsigned local_size;

 	/* Bitmasks indexed by the values the two unique-index functions
 	 * (si_shader_io_get_unique_index / ..._index2) return. */
 	uint64_t	outputs_written;	/* "get_unique_index" bits */
 	uint32_t	patch_outputs_written;	/* "get_unique_index" bits */
 	uint32_t	outputs_written2;	/* "get_unique_index2" bits */

 	uint64_t	inputs_read;		/* "get_unique_index" bits */
 	uint32_t	inputs_read2;		/* "get_unique_index2" bits */
 };

 /* Valid shader configurations:
  *
  * API shaders       VS | TCS | TES | GS |pass| PS
  * are compiled as:     |     |     |    |thru|
  *                      |     |     |    |    |
  * Only VS & PS:     VS | --  | --  | -- | -- | PS
  * With GS:          ES | --  | --  | GS | VS | PS
  * With Tessel.:     LS | HS  | VS  | -- | -- | PS
  * With both:        LS | HS  | ES  | GS | VS | PS
  */

 /* Common VS bits between the shader key and the prolog key.
  * Holds one instance divisor per vertex buffer slot.
  */
 struct si_vs_prolog_bits {
 	unsigned	instance_divisors[SI_NUM_VERTEX_BUFFERS];
 };

 /* Common VS bits between the shader key and the epilog key.
  * Also reused as the TES epilog bits in si_shader_key.
  */
 struct si_vs_epilog_bits {
 	unsigned	export_prim_id:1; /* when PS needs it and GS is disabled */
 };

 /* Common TCS bits between the shader key and the epilog key. */
 struct si_tcs_epilog_bits {
 	/* 3-bit field, i.e. values 0..7. NOTE(review): presumably a primitive
 	 * type enum for the tessellation mode - confirm at the writer. */
 	unsigned	prim_mode:3;
 };

 /* GS bits shared by the shader key (part.gs.prolog) and the part key
  * (gs_prolog.states).
  */
 struct si_gs_prolog_bits {
 	unsigned	tri_strip_adj_fix:1;
 };

 /* Common PS bits between the shader key and the prolog key.
  * Nine single-bit flags.
  */
 struct si_ps_prolog_bits {
 	unsigned	color_two_side:1;
 	unsigned	flatshade_colors:1;
 	unsigned	poly_stipple:1;
 	unsigned	force_persp_sample_interp:1;
 	unsigned	force_linear_sample_interp:1;
 	unsigned	force_persp_center_interp:1;
 	unsigned	force_linear_center_interp:1;
 	unsigned	bc_optimize_for_persp:1;
 	unsigned	bc_optimize_for_linear:1;
 };

 /* Common PS bits between the shader key and the epilog key. */
 struct si_ps_epilog_bits {
 	/* Full-width word; si_shader_selector::colors_written_4bit is
 	 * documented as being ANDed with this value. */
 	unsigned	spi_shader_col_format;
 	unsigned	color_is_int8:8;
 	unsigned	last_cbuf:3;
 	unsigned	alpha_func:3;
 	unsigned	alpha_to_one:1;
 	unsigned	poly_line_smoothing:1;
 	unsigned	clamp_color:1;
 };

 /* Key describing one shader part (a prolog or an epilog). Only the member
  * matching the kind of part is meaningful; each member starts with the
  * "states" bits it shares with si_shader_key.
  */
 union si_shader_part_key {
 	struct {
 		struct si_vs_prolog_bits states;
 		unsigned	num_input_sgprs:5;
 		unsigned	last_input:4;
 	} vs_prolog;
 	struct {
 		struct si_vs_epilog_bits states;
 		unsigned	prim_id_param_offset:5;
 	} vs_epilog;
 	struct {
 		struct si_tcs_epilog_bits states;
 	} tcs_epilog;
 	struct {
 		struct si_gs_prolog_bits states;
 	} gs_prolog;
 	struct {
 		struct si_ps_prolog_bits states;
 		unsigned	num_input_sgprs:5;
 		unsigned	num_input_vgprs:5;
 		/* Color interpolation and two-side color selection. */
 		unsigned	colors_read:8; /* color input components read */
 		unsigned	num_interp_inputs:5; /* BCOLOR is at this location */
 		unsigned	face_vgpr_index:5;
 		unsigned	wqm:1;
 		char		color_attr_index[2];
 		/* NOTE(review): -1 is stored in a plain char, whose signedness is
 		 * implementation-defined; comparisons against -1 depend on it. */
 		char		color_interp_vgpr_index[2]; /* -1 == constant */
 	} ps_prolog;
 	struct {
 		struct si_ps_epilog_bits states;
 		unsigned	colors_written:8;
 		unsigned	writes_z:1;
 		unsigned	writes_stencil:1;
 		unsigned	writes_samplemask:1;
 	} ps_epilog;
 };

 /* Key identifying one shader variant; stored by value in si_shader::key.
  * It has four sections: per-stage prolog/epilog bits, the ES/LS selection
  * bits, flags for monolithic compilation, and optimization flags.
  */
 struct si_shader_key {
 	/* Prolog and epilog flags. */
 	union {
 		struct {
 			struct si_ps_prolog_bits prolog;
 			struct si_ps_epilog_bits epilog;
 		} ps;
 		struct {
 			struct si_vs_prolog_bits prolog;
 			struct si_vs_epilog_bits epilog;
 		} vs;
 		struct {
 			struct si_tcs_epilog_bits epilog;
 		} tcs; /* tessellation control shader */
 		struct {
 			struct si_vs_epilog_bits epilog; /* same as VS */
 		} tes; /* tessellation evaluation shader */
 		struct {
 			struct si_gs_prolog_bits prolog;
 		} gs;
 	} part;

 	/* These two are initially set according to the NEXT_SHADER property,
 	 * or guessed if the property doesn't seem correct.
 	 */
 	unsigned as_es:1; /* export shader */
 	unsigned as_ls:1; /* local shader */

 	/* Flags for monolithic compilation only. */
 	union {
 		struct {
 			/* One nibble for every input: SI_FIX_FETCH_* enums. */
 			uint64_t	fix_fetch;
 		} vs;
 		struct {
 			uint64_t	inputs_to_copy; /* for fixed-func TCS */
 		} tcs;
 	} mono;

 	/* Optimization flags for asynchronous compilation only. */
 	union {
 		struct {
 			uint64_t	kill_outputs; /* "get_unique_index" bits */
 			uint32_t	kill_outputs2; /* "get_unique_index2" bits */
 			unsigned	clip_disable:1;
 		} hw_vs; /* HW VS (it can be VS, TES, GS) */
 	} opt;
 };

 /* Per-binary configuration values. One instance is stored next to each
  * radeon_shader_binary (in si_shader and si_shader_part), and
  * si_compile_llvm() / si_shader_binary_read_config() take a pointer to it.
  * NOTE(review): units of the size fields (bytes vs. dwords) are not visible
  * here, except for scratch_bytes_per_wave - confirm before doing arithmetic.
  */
 struct si_shader_config {
 	unsigned			num_sgprs;
 	unsigned			num_vgprs;
 	unsigned			spilled_sgprs;
 	unsigned			spilled_vgprs;
 	unsigned			private_mem_vgprs;
 	unsigned			lds_size;
 	unsigned			spi_ps_input_ena;
 	unsigned			spi_ps_input_addr;
 	unsigned			float_mode;
 	unsigned			scratch_bytes_per_wave;
 	unsigned			rsrc1;
 	unsigned			rsrc2;
 };

 /* Encodings for an exported parameter slot. Values 0..31 are offsets, 64..67
  * select a default value, and 255 marks an undefined slot; all of them fit
  * in one byte. NOTE(review): presumably these are the values stored in
  * si_shader_info::vs_output_param_offset - confirm.
  */
 enum {
 	/* SPI_PS_INPUT_CNTL_i.OFFSET[0:4] */
 	EXP_PARAM_OFFSET_0 = 0,
 	EXP_PARAM_OFFSET_31 = 31,
 	/* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL[0:1] */
 	EXP_PARAM_DEFAULT_VAL_0000 = 64,
 	EXP_PARAM_DEFAULT_VAL_0001,
 	EXP_PARAM_DEFAULT_VAL_1110,
 	EXP_PARAM_DEFAULT_VAL_1111,
 	EXP_PARAM_UNDEFINED = 255,
 };

 /* GCN-specific shader info. All fields are byte-sized. */
 struct si_shader_info {
 	ubyte			vs_output_param_offset[SI_MAX_VS_OUTPUTS];
 	ubyte			num_input_sgprs;
 	ubyte			num_input_vgprs;
 	/* NOTE(review): plain char; if a negative sentinel is stored here, the
 	 * result depends on the platform's char signedness - confirm. */
 	char			face_vgpr_index;
 	bool			uses_instanceid;
 	ubyte			nr_pos_exports;
 	ubyte			nr_param_exports;
 };

 /* One shader variant (or shader part container): the key it was built for,
  * links to its selector and sibling variants, optional prolog/epilog parts,
  * and the resulting binary with its configuration.
  */
 struct si_shader {
 	struct si_compiler_ctx_state	compiler_ctx_state;

 	struct si_shader_selector	*selector;
 	/* Link for the list headed by si_shader_selector::first_variant. */
 	struct si_shader		*next_variant;

 	struct si_shader_part		*prolog;
 	struct si_shader_part		*epilog;

 	struct si_pm4_state		*pm4;
 	struct r600_resource		*bo;
 	struct r600_resource		*scratch_bo;
 	struct si_shader_key		key;
 	struct util_queue_fence		optimized_ready;
 	bool				compilation_failed;
 	bool				is_monolithic;
 	bool				is_optimized;
 	bool				is_binary_shared;
 	bool				is_gs_copy_shader;

 	/* The following data is all that's needed for binary shaders. */
 	struct radeon_shader_binary	binary;
 	struct si_shader_config		config;
 	struct si_shader_info		info;

 	/* Shader key + LLVM IR + disassembly + statistics.
 	 * Generated for debug contexts only.
 	 */
 	char				*shader_log;
 	size_t				shader_log_size;
 };

 /* A compiled prolog or epilog, identified by its part key. Parts form a
  * singly linked list through "next".
  */
 struct si_shader_part {
 	struct si_shader_part *next;
 	union si_shader_part_key key;
 	struct radeon_shader_binary binary;
 	struct si_shader_config config;
 };

 /* si_shader.c
  *
  * NOTE(review): the int-returning functions do not state their return
  * convention in this header; check si_shader.c before assuming 0 == success.
  * FILE, bool, uint64_t and struct si_context are not declared by this header
  * itself, so they presumably arrive through the includes above - confirm.
  */

 /* Returns a si_shader for the given GS selector (presumably the GS copy
  * shader; ownership of the result is not visible here).
  */
 struct si_shader *
 si_generate_gs_copy_shader(struct si_screen *sscreen,
 			   LLVMTargetMachineRef tm,
 			   struct si_shader_selector *gs_selector,
 			   struct pipe_debug_callback *debug);
 /* Compiles the TGSI program of "shader"; is_monolithic selects the
  * monolithic form (see the "mono" flags in si_shader_key).
  */
 int si_compile_tgsi_shader(struct si_screen *sscreen,
 			   LLVMTargetMachineRef tm,
 			   struct si_shader *shader,
 			   bool is_monolithic,
 			   struct pipe_debug_callback *debug);
 int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 		     struct si_shader *shader,
 		     struct pipe_debug_callback *debug);
 /* Takes an LLVM module and fills *binary and *conf. */
 int si_compile_llvm(struct si_screen *sscreen,
 		    struct radeon_shader_binary *binary,
 		    struct si_shader_config *conf,
 		    LLVMTargetMachineRef tm,
 		    LLVMModuleRef mod,
 		    struct pipe_debug_callback *debug,
 		    unsigned processor,
 		    const char *name);
 void si_shader_destroy(struct si_shader *shader);
 /* Map a semantic name + index to the fixed location described in the comment
  * at the top of this file; the *2 variant feeds the separate
  * "get_unique_index2" bitmasks.
  */
 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index);
 unsigned si_shader_io_get_unique_index2(unsigned name, unsigned index);
 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader);
 /* Writes a dump of "shader" to f and/or the debug callback. */
 void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
 		    struct pipe_debug_callback *debug, unsigned processor,
 		    FILE *f, bool check_debug_option);
 /* Adjusts *lds_size in place. */
 void si_multiwave_lds_size_workaround(struct si_screen *sscreen,
 				      unsigned *lds_size);
 void si_shader_apply_scratch_relocs(struct si_context *sctx,
 			struct si_shader *shader,
 			struct si_shader_config *config,
 			uint64_t scratch_va);
 void si_shader_binary_read_config(struct radeon_shader_binary *binary,
 				  struct si_shader_config *conf,
 				  unsigned symbol_offset);
 unsigned si_get_spi_shader_z_format(bool writes_z, bool writes_stencil,
 				    bool writes_samplemask);

 #endif
	/*
	* Copyright 2012 Advanced Micro Devices, Inc.
	*
	* Permission is hereby granted, free of charge, to any person obtaining a
	* copy of this software and associated documentation files (the "Software"),
	* to deal in the Software without restriction, including without limitation
	* on the rights to use, copy, modify, merge, publish, distribute, sub
	* license, and/or sell copies of the Software, and to permit persons to whom
	* the Software is furnished to do so, subject to the following conditions:
	*
	* The above copyright notice and this permission notice (including the next
	* paragraph) shall be included in all copies or substantial portions of the
	* Software.
	*
	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
	* THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
	* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
	* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
	* USE OR OTHER DEALINGS IN THE SOFTWARE.
	*
	* Authors:
	* Tom Stellard <thomas.stellard@amd.com>
	* Michel Dänzer <michel.daenzer@amd.com>
	* Christian König <christian.koenig@amd.com>
	*/

	/* How linking shader inputs and outputs between vertex, tessellation, and
	* geometry shaders works.
	*
	* Inputs and outputs between shaders are stored in a buffer. This buffer
	* lives in LDS (typical case for tessellation), but it can also live
	* in memory (ESGS). Each input or output has a fixed location within a vertex.
	* The highest used input or output determines the stride between vertices.
	*
	* Since GS and tessellation are only possible in the OpenGL core profile,
	* only these semantics are valid for per-vertex data:
	*
	*   Name             Location
	*
	*   POSITION         0
	*   PSIZE            1
	*   CLIPDIST0..1     2..3
	*   CULLDIST0..1     (not implemented)
	*   GENERIC0..31     4..35
	*
	* For example, a shader only writing GENERIC0 has the output stride of 5.
	*
	* Only these semantics are valid for per-patch data:
	*
	*   Name             Location
	*
	*   TESSOUTER        0
	*   TESSINNER        1
	*   PATCH0..29       2..31
	*
	* That's how independent shaders agree on input and output locations.
	* The si_shader_io_get_unique_index function assigns the locations.
	*
	* For tessellation, other required information for calculating the input and
	* output addresses like the vertex stride, the patch stride, and the offsets
	* where per-vertex and per-patch data start, is passed to the shader via
	* user data SGPRs. The offsets and strides are calculated at draw time and
	* aren't available at compile time.
	*/

	#ifndef SI_SHADER_H
	#define SI_SHADER_H

	#include <llvm-c/Core.h> /* LLVMModuleRef */
	#include <llvm-c/TargetMachine.h>
	#include "tgsi/tgsi_scan.h"
	#include "util/u_queue.h"
	#include "si_state.h"

	/* Forward declarations: only the struct tags are introduced here.
	* NOTE(review): struct radeon_shader_binary is embedded by value in
	* struct si_shader and struct si_shader_part below, so its full definition
	* has to come from one of the included headers - confirm.
	*/
	struct radeon_shader_binary;
	struct radeon_shader_reloc;

	/* Number of entries in si_shader_info::vs_output_param_offset. */
	#define SI_MAX_VS_OUTPUTS 40

	/* SGPR user data indices
	*
	* The index ranges overlap on purpose: after the resource entries that all
	* groups build on, each stage-specific group restarts at an earlier
	* enumerator (SI_NUM_RESOURCE_SGPRS or SI_ES_NUM_USER_SGPR). Every
	* *_NUM_USER_SGPR enumerator directly follows the last index of its group,
	* so it equals the number of indices that group uses.
	*/
	enum {
	SI_SGPR_RW_BUFFERS, /* rings (& stream-out, VS only) */
	/* Each resource entry has an _HI twin; presumably the two together hold
	* one 64-bit address - TODO confirm. */
	SI_SGPR_RW_BUFFERS_HI,
	SI_SGPR_CONST_BUFFERS,
	SI_SGPR_CONST_BUFFERS_HI,
	SI_SGPR_SAMPLERS, /* images & sampler states interleaved */
	SI_SGPR_SAMPLERS_HI,
	SI_SGPR_IMAGES,
	SI_SGPR_IMAGES_HI,
	SI_SGPR_SHADER_BUFFERS,
	SI_SGPR_SHADER_BUFFERS_HI,
	SI_NUM_RESOURCE_SGPRS, /* count of the entries above; base of the groups below */

	/* all VS variants */
	SI_SGPR_VERTEX_BUFFERS = SI_NUM_RESOURCE_SGPRS,
	SI_SGPR_VERTEX_BUFFERS_HI,
	SI_SGPR_BASE_VERTEX,
	SI_SGPR_START_INSTANCE,
	SI_SGPR_DRAWID,
	SI_ES_NUM_USER_SGPR,

	/* hw VS only */
	SI_SGPR_VS_STATE_BITS = SI_ES_NUM_USER_SGPR,
	SI_VS_NUM_USER_SGPR,

	/* hw LS only (same index as SI_SGPR_VS_STATE_BITS) */
	SI_SGPR_LS_OUT_LAYOUT = SI_ES_NUM_USER_SGPR,
	SI_LS_NUM_USER_SGPR,

	/* both TCS and TES */
	SI_SGPR_TCS_OFFCHIP_LAYOUT = SI_NUM_RESOURCE_SGPRS,
	SI_TES_NUM_USER_SGPR,

	/* TCS only */
	SI_SGPR_TCS_OUT_OFFSETS = SI_TES_NUM_USER_SGPR,
	SI_SGPR_TCS_OUT_LAYOUT,
	SI_SGPR_TCS_IN_LAYOUT,
	SI_TCS_NUM_USER_SGPR,

	/* GS limits: GS uses only the resource entries; the GS copy shader
	* count ends right after the RW_BUFFERS pair. */
	SI_GS_NUM_USER_SGPR = SI_NUM_RESOURCE_SGPRS,
	SI_GSCOPY_NUM_USER_SGPR = SI_SGPR_RW_BUFFERS_HI + 1,

	/* PS only */
	SI_SGPR_ALPHA_REF = SI_NUM_RESOURCE_SGPRS,
	SI_PS_NUM_USER_SGPR,

	/* CS only: GRID_SIZE and BLOCK_SIZE each reserve 3 consecutive indices
	* (see the +3 steps). */
	SI_SGPR_GRID_SIZE = SI_NUM_RESOURCE_SGPRS,
	SI_SGPR_BLOCK_SIZE = SI_SGPR_GRID_SIZE + 3,
	SI_CS_NUM_USER_SGPR = SI_SGPR_BLOCK_SIZE + 3
	};

	/* LLVM function parameter indices
	*
	* As with the SGPR indices, the per-stage groups overlap: each group
	* restarts at SI_NUM_RESOURCE_PARAMS, so an index is only meaningful
	* together with the shader stage it is used for.
	*/
	enum {
	SI_PARAM_RW_BUFFERS,
	SI_PARAM_CONST_BUFFERS,
	SI_PARAM_SAMPLERS,
	SI_PARAM_IMAGES,
	SI_PARAM_SHADER_BUFFERS,
	SI_NUM_RESOURCE_PARAMS, /* count of the entries above */

	/* VS only parameters */
	SI_PARAM_VERTEX_BUFFERS = SI_NUM_RESOURCE_PARAMS,
	SI_PARAM_BASE_VERTEX,
	SI_PARAM_START_INSTANCE,
	SI_PARAM_DRAWID,
	/* [0] = clamp vertex color, VS as VS only */
	SI_PARAM_VS_STATE_BITS,
	/* same value as TCS_IN_LAYOUT, VS as LS only.
	* Shares its index with SI_PARAM_VS_STATE_BITS (both are DRAWID + 1). */
	SI_PARAM_LS_OUT_LAYOUT = SI_PARAM_DRAWID + 1,
	/* the other VS parameters are assigned dynamically */

	/* Layout of TCS outputs in the offchip buffer
	* [0:8] = the number of patches per threadgroup.
	* [9:15] = the number of output vertices per patch.
	* [16:31] = the offset of per patch attributes in the buffer in bytes.
	*/
	SI_PARAM_TCS_OFFCHIP_LAYOUT = SI_NUM_RESOURCE_PARAMS, /* for TCS & TES */

	/* TCS only parameters. */

	/* Offsets where TCS outputs and TCS patch outputs live in LDS:
	* [0:15] = TCS output patch0 offset / 16, max = NUM_PATCHES * 32 * 32
	* [16:31] = TCS output patch0 offset for per-patch / 16, max = NUM_PATCHES*32*32 + 32*32
	*/
	SI_PARAM_TCS_OUT_OFFSETS,

	/* Layout of TCS outputs / TES inputs:
	* [0:12] = stride between output patches in dwords, num_outputs * num_vertices * 4, max = 32*32*4
	* [13:20] = stride between output vertices in dwords = num_inputs * 4, max = 32*4
	* [26:31] = gl_PatchVerticesIn, max = 32
	*/
	SI_PARAM_TCS_OUT_LAYOUT,

	/* Layout of LS outputs / TCS inputs
	* [0:12] = stride between patches in dwords = num_inputs * num_vertices * 4, max = 32*32*4
	* [13:20] = stride between vertices in dwords = num_inputs * 4, max = 32*4
	*/
	SI_PARAM_TCS_IN_LAYOUT,

	SI_PARAM_TCS_OC_LDS,
	SI_PARAM_TESS_FACTOR_OFFSET,
	SI_PARAM_PATCH_ID,
	SI_PARAM_REL_IDS,

	/* GS only parameters */
	SI_PARAM_GS2VS_OFFSET = SI_NUM_RESOURCE_PARAMS,
	SI_PARAM_GS_WAVE_ID,
	SI_PARAM_VTX0_OFFSET,
	SI_PARAM_VTX1_OFFSET,
	/* PRIMITIVE_ID sits between the VTX1 and VTX2 offsets; keep this order. */
	SI_PARAM_PRIMITIVE_ID,
	SI_PARAM_VTX2_OFFSET,
	SI_PARAM_VTX3_OFFSET,
	SI_PARAM_VTX4_OFFSET,
	SI_PARAM_VTX5_OFFSET,
	SI_PARAM_GS_INSTANCE_ID,

	/* PS only parameters */
	SI_PARAM_ALPHA_REF = SI_NUM_RESOURCE_PARAMS,
	SI_PARAM_PRIM_MASK,
	SI_PARAM_PERSP_SAMPLE,
	SI_PARAM_PERSP_CENTER,
	SI_PARAM_PERSP_CENTROID,
	SI_PARAM_PERSP_PULL_MODEL,
	SI_PARAM_LINEAR_SAMPLE,
	SI_PARAM_LINEAR_CENTER,
	SI_PARAM_LINEAR_CENTROID,
	SI_PARAM_LINE_STIPPLE_TEX,
	SI_PARAM_POS_X_FLOAT,
	SI_PARAM_POS_Y_FLOAT,
	SI_PARAM_POS_Z_FLOAT,
	SI_PARAM_POS_W_FLOAT,
	SI_PARAM_FRONT_FACE,
	SI_PARAM_ANCILLARY,
	SI_PARAM_SAMPLE_COVERAGE,
	SI_PARAM_POS_FIXED_PT,

	/* CS only parameters */
	SI_PARAM_GRID_SIZE = SI_NUM_RESOURCE_PARAMS,
	SI_PARAM_BLOCK_SIZE,
	SI_PARAM_BLOCK_ID,
	SI_PARAM_THREAD_ID,

	/* Derived from the last PS parameter: +1 turns that index into a
	* count, and the remaining +8 is the COLOR room noted below. */
	SI_NUM_PARAMS = SI_PARAM_POS_FIXED_PT + 9, /* +8 for COLOR[0..1] */
	};

	/* SI-specific system values.
	*
	* Numbering starts at TGSI_SEMANTIC_COUNT, so these values lie outside the
	* range of the TGSI_SEMANTIC_* names (assuming TGSI_SEMANTIC_COUNT is one past
	* the last of them - TODO confirm against the TGSI headers).
	*/
	enum {
	TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI = TGSI_SEMANTIC_COUNT,
	TGSI_SEMANTIC_DEFAULT_TESSINNER_SI,
	};

	/* For VS shader key fix_fetch.
	*
	* si_shader_key.mono.vs.fix_fetch stores one nibble per input, so every
	* value in this enum has to stay below 16 (the list currently ends at 11).
	*/
	enum {
	SI_FIX_FETCH_NONE = 0,
	SI_FIX_FETCH_A2_SNORM,
	SI_FIX_FETCH_A2_SSCALED,
	SI_FIX_FETCH_A2_SINT,
	SI_FIX_FETCH_RGBA_32_UNORM,
	SI_FIX_FETCH_RGBX_32_UNORM,
	SI_FIX_FETCH_RGBA_32_SNORM,
	SI_FIX_FETCH_RGBX_32_SNORM,
	SI_FIX_FETCH_RGBA_32_USCALED,
	SI_FIX_FETCH_RGBA_32_SSCALED,
	SI_FIX_FETCH_RGBA_32_FIXED,
	SI_FIX_FETCH_RGBX_32_FIXED,
	};

	struct si_shader;

	/* State of the context creating the shader object.
	*
	* Embedded by value in both si_shader_selector and si_shader.
	*/
	struct si_compiler_ctx_state {
	/* Should only be used by si_init_shader_selector_async and
	* si_build_shader_variant if thread_index == -1 (non-threaded). */
	LLVMTargetMachineRef tm;

	/* Used if thread_index == -1 or if debug.async is true. */
	struct pipe_debug_callback debug;

	/* Used for creating the log string for gallium/ddebug. */
	bool is_debug_context;
	};

	/* A shader selector is a gallium CSO and contains shader variants and
	* binaries for one TGSI program. This can be shared by multiple contexts.
	*/
	struct si_shader_selector {
	struct si_screen *screen;
	struct util_queue_fence ready;
	struct si_compiler_ctx_state compiler_ctx_state;

	pipe_mutex mutex;
	struct si_shader first_variant; / immutable after the first variant */
	struct si_shader last_variant; / mutable */

	/* The compiled TGSI shader expecting a prolog and/or epilog (not
	* uploaded to a buffer).
	*/
	struct si_shader *main_shader_part;

	struct si_shader *gs_copy_shader;

	struct tgsi_token *tokens;
	struct pipe_stream_output_info so;
	struct tgsi_shader_info info;

	/* PIPE_SHADER_[VERTEX\|FRAGMENT\|...] */
	unsigned type;

	/* GS parameters. */
	unsigned esgs_itemsize;
	unsigned gs_input_verts_per_prim;
	unsigned gs_output_prim;
	unsigned gs_max_out_vertices;
	unsigned gs_num_invocations;
	unsigned max_gs_stream; /* count - 1 */
	unsigned gsvs_vertex_size;
	unsigned max_gsvs_emit_size;

	/* PS parameters. */
	unsigned color_attr_index[2];
	unsigned db_shader_control;
	/* Set 0xf or 0x0 (4 bits) per each written output.
	* ANDed with spi_shader_col_format.
	*/
	unsigned colors_written_4bit;

	/* CS parameters */
	unsigned local_size;

	uint64_t outputs_written; /* "get_unique_index" bits */
	uint32_t patch_outputs_written; /* "get_unique_index" bits */
	uint32_t outputs_written2; /* "get_unique_index2" bits */

	uint64_t inputs_read; /* "get_unique_index" bits */
	uint32_t inputs_read2; /* "get_unique_index2" bits */
	};

	/* Valid shader configurations:
	*
	* API shaders       VS | TCS | TES | GS |pass| PS
	* are compiled as:     |     |     |    |thru|
	*                      |     |     |    |    |
	* Only VS & PS:     VS | --  | --  | -- | -- | PS
	* With GS:          ES | --  | --  | GS | VS | PS
	* With Tessel.:     LS | HS  | VS  | -- | -- | PS
	* With both:        LS | HS  | ES  | GS | VS | PS
	*/

	/* Common VS bits between the shader key and the prolog key.
	* Holds one instance divisor per vertex buffer slot.
	*/
	struct si_vs_prolog_bits {
	unsigned instance_divisors[SI_NUM_VERTEX_BUFFERS];
	};

	/* Common VS bits between the shader key and the epilog key.
	* Also reused as the TES epilog bits in si_shader_key.
	*/
	struct si_vs_epilog_bits {
	unsigned export_prim_id:1; /* when PS needs it and GS is disabled */
	};

	/* Common TCS bits between the shader key and the epilog key. */
	struct si_tcs_epilog_bits {
	/* 3-bit field, i.e. values 0..7. NOTE(review): presumably a primitive
	* type enum for the tessellation mode - confirm at the writer. */
	unsigned prim_mode:3;
	};

	/* GS bits shared by the shader key (part.gs.prolog) and the part key
	* (gs_prolog.states).
	*/
	struct si_gs_prolog_bits {
	unsigned tri_strip_adj_fix:1;
	};

	/* Common PS bits between the shader key and the prolog key.
	* Nine single-bit flags.
	*/
	struct si_ps_prolog_bits {
	unsigned color_two_side:1;
	unsigned flatshade_colors:1;
	unsigned poly_stipple:1;
	unsigned force_persp_sample_interp:1;
	unsigned force_linear_sample_interp:1;
	unsigned force_persp_center_interp:1;
	unsigned force_linear_center_interp:1;
	unsigned bc_optimize_for_persp:1;
	unsigned bc_optimize_for_linear:1;
	};

	/* Common PS bits between the shader key and the epilog key. */
	struct si_ps_epilog_bits {
	/* Full-width word; si_shader_selector::colors_written_4bit is
	* documented as being ANDed with this value. */
	unsigned spi_shader_col_format;
	unsigned color_is_int8:8;
	unsigned last_cbuf:3;
	unsigned alpha_func:3;
	unsigned alpha_to_one:1;
	unsigned poly_line_smoothing:1;
	unsigned clamp_color:1;
	};

	/* Key describing one shader part (a prolog or an epilog). Only the member
	* matching the kind of part is meaningful; each member starts with the
	* "states" bits it shares with si_shader_key.
	*/
	union si_shader_part_key {
	struct {
	struct si_vs_prolog_bits states;
	unsigned num_input_sgprs:5;
	unsigned last_input:4;
	} vs_prolog;
	struct {
	struct si_vs_epilog_bits states;
	unsigned prim_id_param_offset:5;
	} vs_epilog;
	struct {
	struct si_tcs_epilog_bits states;
	} tcs_epilog;
	struct {
	struct si_gs_prolog_bits states;
	} gs_prolog;
	struct {
	struct si_ps_prolog_bits states;
	unsigned num_input_sgprs:5;
	unsigned num_input_vgprs:5;
	/* Color interpolation and two-side color selection. */
	unsigned colors_read:8; /* color input components read */
	unsigned num_interp_inputs:5; /* BCOLOR is at this location */
	unsigned face_vgpr_index:5;
	unsigned wqm:1;
	char color_attr_index[2];
	/* NOTE(review): -1 is stored in a plain char, whose signedness is
	* implementation-defined; comparisons against -1 depend on it. */
	char color_interp_vgpr_index[2]; /* -1 == constant */
	} ps_prolog;
	struct {
	struct si_ps_epilog_bits states;
	unsigned colors_written:8;
	unsigned writes_z:1;
	unsigned writes_stencil:1;
	unsigned writes_samplemask:1;
	} ps_epilog;
	};

	/* Key identifying one shader variant; stored by value in si_shader::key.
	* It has four sections: per-stage prolog/epilog bits, the ES/LS selection
	* bits, flags for monolithic compilation, and optimization flags.
	*/
	struct si_shader_key {
	/* Prolog and epilog flags. */
	union {
	struct {
	struct si_ps_prolog_bits prolog;
	struct si_ps_epilog_bits epilog;
	} ps;
	struct {
	struct si_vs_prolog_bits prolog;
	struct si_vs_epilog_bits epilog;
	} vs;
	struct {
	struct si_tcs_epilog_bits epilog;
	} tcs; /* tessellation control shader */
	struct {
	struct si_vs_epilog_bits epilog; /* same as VS */
	} tes; /* tessellation evaluation shader */
	struct {
	struct si_gs_prolog_bits prolog;
	} gs;
	} part;

	/* These two are initially set according to the NEXT_SHADER property,
	* or guessed if the property doesn't seem correct.
	*/
	unsigned as_es:1; /* export shader */
	unsigned as_ls:1; /* local shader */

	/* Flags for monolithic compilation only. */
	union {
	struct {
	/* One nibble for every input: SI_FIX_FETCH_* enums. */
	uint64_t fix_fetch;
	} vs;
	struct {
	uint64_t inputs_to_copy; /* for fixed-func TCS */
	} tcs;
	} mono;

	/* Optimization flags for asynchronous compilation only. */
	union {
	struct {
	uint64_t kill_outputs; /* "get_unique_index" bits */
	uint32_t kill_outputs2; /* "get_unique_index2" bits */
	unsigned clip_disable:1;
	} hw_vs; /* HW VS (it can be VS, TES, GS) */
	} opt;
	};

	/* Per-binary configuration values. One instance is stored next to each
	* radeon_shader_binary (in si_shader and si_shader_part), and
	* si_compile_llvm() / si_shader_binary_read_config() take a pointer to it.
	* NOTE(review): units of the size fields (bytes vs. dwords) are not visible
	* here, except for scratch_bytes_per_wave - confirm before doing arithmetic.
	*/
	struct si_shader_config {
	unsigned num_sgprs;
	unsigned num_vgprs;
	unsigned spilled_sgprs;
	unsigned spilled_vgprs;
	unsigned private_mem_vgprs;
	unsigned lds_size;
	unsigned spi_ps_input_ena;
	unsigned spi_ps_input_addr;
	unsigned float_mode;
	unsigned scratch_bytes_per_wave;
	unsigned rsrc1;
	unsigned rsrc2;
	};

	/* Encodings for an exported parameter slot. Values 0..31 are offsets, 64..67
	* select a default value, and 255 marks an undefined slot; all of them fit
	* in one byte. NOTE(review): presumably these are the values stored in
	* si_shader_info::vs_output_param_offset - confirm.
	*/
	enum {
	/* SPI_PS_INPUT_CNTL_i.OFFSET[0:4] */
	EXP_PARAM_OFFSET_0 = 0,
	EXP_PARAM_OFFSET_31 = 31,
	/* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL[0:1] */
	EXP_PARAM_DEFAULT_VAL_0000 = 64,
	EXP_PARAM_DEFAULT_VAL_0001,
	EXP_PARAM_DEFAULT_VAL_1110,
	EXP_PARAM_DEFAULT_VAL_1111,
	EXP_PARAM_UNDEFINED = 255,
	};

	/* GCN-specific shader info. All fields are byte-sized. */
	struct si_shader_info {
	ubyte vs_output_param_offset[SI_MAX_VS_OUTPUTS];
	ubyte num_input_sgprs;
	ubyte num_input_vgprs;
	/* NOTE(review): plain char; if a negative sentinel is stored here, the
	* result depends on the platform's char signedness - confirm. */
	char face_vgpr_index;
	bool uses_instanceid;
	ubyte nr_pos_exports;
	ubyte nr_param_exports;
	};

	/* One shader variant (or shader part container): the key it was built for,
	* links to its selector and sibling variants, optional prolog/epilog parts,
	* and the resulting binary with its configuration.
	*/
	struct si_shader {
	struct si_compiler_ctx_state compiler_ctx_state;

	struct si_shader_selector *selector;
	/* Link to the next variant of the same selector. */
	struct si_shader *next_variant;

	struct si_shader_part *prolog;
	struct si_shader_part *epilog;

	struct si_pm4_state *pm4;
	struct r600_resource *bo;
	struct r600_resource *scratch_bo;
	struct si_shader_key key;
	struct util_queue_fence optimized_ready;
	bool compilation_failed;
	bool is_monolithic;
	bool is_optimized;
	bool is_binary_shared;
	bool is_gs_copy_shader;

	/* The following data is all that's needed for binary shaders. */
	struct radeon_shader_binary binary;
	struct si_shader_config config;
	struct si_shader_info info;

	/* Shader key + LLVM IR + disassembly + statistics.
	* Generated for debug contexts only.
	*/
	char *shader_log;
	size_t shader_log_size;
	};

	/* A compiled prolog or epilog, identified by its part key. Parts form a
	* singly linked list through "next".
	*/
	struct si_shader_part {
	struct si_shader_part *next;
	union si_shader_part_key key;
	struct radeon_shader_binary binary;
	struct si_shader_config config;
	};

	/* si_shader.c
	 *
	 * NOTE(review): the int-returning functions do not state their return
	 * convention in this header; check si_shader.c before assuming
	 * 0 == success. FILE, bool, uint64_t and struct si_context are not
	 * declared by this header itself, so they presumably arrive through the
	 * includes above - confirm.
	 */

	/* Returns a si_shader for the given GS selector (presumably the GS copy
	 * shader; ownership of the result is not visible here).
	 */
	struct si_shader *
	si_generate_gs_copy_shader(struct si_screen *sscreen,
				   LLVMTargetMachineRef tm,
				   struct si_shader_selector *gs_selector,
				   struct pipe_debug_callback *debug);
	/* Compiles the TGSI program of "shader"; is_monolithic selects the
	 * monolithic form (see the "mono" flags in si_shader_key).
	 */
	int si_compile_tgsi_shader(struct si_screen *sscreen,
				   LLVMTargetMachineRef tm,
				   struct si_shader *shader,
				   bool is_monolithic,
				   struct pipe_debug_callback *debug);
	int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
			     struct si_shader *shader,
			     struct pipe_debug_callback *debug);
	/* Takes an LLVM module and fills *binary and *conf. */
	int si_compile_llvm(struct si_screen *sscreen,
			    struct radeon_shader_binary *binary,
			    struct si_shader_config *conf,
			    LLVMTargetMachineRef tm,
			    LLVMModuleRef mod,
			    struct pipe_debug_callback *debug,
			    unsigned processor,
			    const char *name);
	void si_shader_destroy(struct si_shader *shader);
	/* Map a semantic name + index to a fixed input/output location; the *2
	 * variant feeds the separate "get_unique_index2" bitmasks.
	 */
	unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index);
	unsigned si_shader_io_get_unique_index2(unsigned name, unsigned index);
	/* sscreen and shader are passed by pointer, like in every other
	 * declaration in this list. */
	int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader);
	/* Writes a dump of "shader" to f and/or the debug callback. */
	void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
			    struct pipe_debug_callback *debug, unsigned processor,
			    FILE *f, bool check_debug_option);
	/* Adjusts *lds_size in place. */
	void si_multiwave_lds_size_workaround(struct si_screen *sscreen,
					      unsigned *lds_size);
	void si_shader_apply_scratch_relocs(struct si_context *sctx,
					    struct si_shader *shader,
					    struct si_shader_config *config,
					    uint64_t scratch_va);
	void si_shader_binary_read_config(struct radeon_shader_binary *binary,
					  struct si_shader_config *conf,
					  unsigned symbol_offset);
	unsigned si_get_spi_shader_z_format(bool writes_z, bool writes_stencil,
					    bool writes_samplemask);

	#endif