X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fdrivers%2Fradeonsi%2Fsi_shader.h;h=14230b8207307b3ee242cd766366cf0a13d5676a;hb=e334e104d0fe8a9704a51ad897cdae34006273da;hp=c38e7f560b6aeb2669ed0baa1abefe03bc40ca76;hpb=69a687189e0381d0ff8c2f079698b8adfbb0a7b1;p=mesa.git diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index c38e7f560b6..14230b82073 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -1,5 +1,6 @@ /* * Copyright 2012 Advanced Micro Devices, Inc. + * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -19,11 +20,73 @@ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/* The compiler middle-end architecture: Explaining (non-)monolithic shaders + * ------------------------------------------------------------------------- + * + * Typically, there is one-to-one correspondence between API and HW shaders, + * that is, for every API shader, there is exactly one shader binary in + * the driver. + * + * The problem with that is that we also have to emulate some API states + * (e.g. alpha-test, and many others) in shaders too. The two obvious ways + * to deal with it are: + * - each shader has multiple variants for each combination of emulated states, + * and the variants are compiled on demand, possibly relying on a shader + * cache for good performance + * - patch shaders at the binary level + * + * This driver uses something completely different. The emulated states are + * usually implemented at the beginning or end of shaders. Therefore, we can + * split the shader into 3 parts: + * - prolog part (shader code dependent on states) + * - main part (the API shader) + * - epilog part (shader code dependent on states) + * + * Each part is compiled as a separate shader and the final binaries are + * concatenated. This type of shader is called non-monolithic, because it + * consists of multiple independent binaries. Creating a new shader variant + * is therefore only a concatenation of shader parts (binaries) and doesn't + * involve any compilation. The main shader parts are the only parts that are + * compiled when applications create shader objects. The prolog and epilog + * parts are compiled on the first use and saved, so that their binaries can + * be reused by many other shaders. + * + * One of the roles of the prolog part is to compute vertex buffer addresses + * for vertex shaders. A few of the roles of the epilog part are color buffer + * format conversions in pixel shaders that we have to do manually, and write + * tessellation factors in tessellation control shaders. The prolog and epilog + * have many other important responsibilities in various shader stages. + * They don't just "emulate legacy stuff". + * + * Monolithic shaders are shaders where the parts are combined before LLVM + * compilation, and the whole thing is compiled and optimized as one unit with + * one binary on the output. The result is the same as the non-monolithic + * shader, but the final code can be better, because LLVM can optimize across + * all shader parts. Monolithic shaders aren't usually used except for these + * special cases: + * + * 1) Some rarely-used states require modification of the main shader part + * itself, and in such cases, only the monolithic shader variant is + * compiled, and that's always done on the first use. + * + * 2) When we do cross-stage optimizations for separate shader objects and + * e.g. eliminate unused shader varyings, the resulting optimized shader + * variants are always compiled as monolithic shaders, and always + * asynchronously (i.e. not stalling ongoing rendering). We call them + * "optimized monolithic" shaders. The important property here is that + * the non-monolithic unoptimized shader variant is always available for use + * when the asynchronous compilation of the optimized shader is not done + * yet. * - * Authors: - * Tom Stellard - * Michel Dänzer - * Christian König + * Starting with GFX9 chips, some shader stages are merged, and the number of + * shader parts per shader increased. The complete new list of shader parts is: + * - 1st shader: prolog part + * - 1st shader: main part + * - 2nd shader: prolog part + * - 2nd shader: main part + * - 2nd shader: epilog part */ /* How linking shader inputs and outputs between vertex, tessellation, and @@ -71,131 +134,111 @@ #include /* LLVMModuleRef */ #include #include "tgsi/tgsi_scan.h" +#include "util/u_inlines.h" #include "util/u_queue.h" -#include "si_state.h" -struct ac_shader_binary; +#include "ac_binary.h" +#include "ac_llvm_build.h" +#include "ac_llvm_util.h" +#include + +struct nir_shader; +struct si_shader; +struct si_context; + +#define SI_MAX_ATTRIBS 16 #define SI_MAX_VS_OUTPUTS 40 +/* Shader IO unique indices are supported for TGSI_SEMANTIC_GENERIC with an + * index smaller than this. + */ +#define SI_MAX_IO_GENERIC 43 + /* SGPR user data indices */ enum { SI_SGPR_RW_BUFFERS, /* rings (& stream-out, VS only) */ +#if !HAVE_32BIT_POINTERS SI_SGPR_RW_BUFFERS_HI, - SI_SGPR_CONST_BUFFERS, - SI_SGPR_CONST_BUFFERS_HI, - SI_SGPR_SAMPLERS, /* images & sampler states interleaved */ - SI_SGPR_SAMPLERS_HI, - SI_SGPR_IMAGES, - SI_SGPR_IMAGES_HI, - SI_SGPR_SHADER_BUFFERS, - SI_SGPR_SHADER_BUFFERS_HI, +#endif + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES, +#if !HAVE_32BIT_POINTERS + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES_HI, +#endif + SI_SGPR_CONST_AND_SHADER_BUFFERS, /* or just a constant buffer 0 pointer */ +#if !HAVE_32BIT_POINTERS + SI_SGPR_CONST_AND_SHADER_BUFFERS_HI, +#endif + SI_SGPR_SAMPLERS_AND_IMAGES, +#if !HAVE_32BIT_POINTERS + SI_SGPR_SAMPLERS_AND_IMAGES_HI, +#endif SI_NUM_RESOURCE_SGPRS, + /* API VS, TES without GS, GS copy shader */ + SI_SGPR_VS_STATE_BITS = SI_NUM_RESOURCE_SGPRS, + SI_NUM_VS_STATE_RESOURCE_SGPRS, + /* all VS variants */ - SI_SGPR_VERTEX_BUFFERS = SI_NUM_RESOURCE_SGPRS, - SI_SGPR_VERTEX_BUFFERS_HI, - SI_SGPR_BASE_VERTEX, + SI_SGPR_BASE_VERTEX = SI_NUM_VS_STATE_RESOURCE_SGPRS, SI_SGPR_START_INSTANCE, SI_SGPR_DRAWID, - SI_ES_NUM_USER_SGPR, - - /* hw VS only */ - SI_SGPR_VS_STATE_BITS = SI_ES_NUM_USER_SGPR, SI_VS_NUM_USER_SGPR, - /* hw LS only */ - SI_SGPR_LS_OUT_LAYOUT = SI_ES_NUM_USER_SGPR, - SI_LS_NUM_USER_SGPR, + SI_SGPR_VS_BLIT_DATA = SI_SGPR_CONST_AND_SHADER_BUFFERS, - /* both TCS and TES */ - SI_SGPR_TCS_OFFCHIP_LAYOUT = SI_NUM_RESOURCE_SGPRS, + /* TES */ + SI_SGPR_TES_OFFCHIP_LAYOUT = SI_NUM_VS_STATE_RESOURCE_SGPRS, + SI_SGPR_TES_OFFCHIP_ADDR, SI_TES_NUM_USER_SGPR, - /* TCS only */ - SI_SGPR_TCS_OUT_OFFSETS = SI_TES_NUM_USER_SGPR, - SI_SGPR_TCS_OUT_LAYOUT, - SI_SGPR_TCS_IN_LAYOUT, - SI_TCS_NUM_USER_SGPR, + /* GFX6-8: TCS only */ + GFX6_SGPR_TCS_OFFCHIP_LAYOUT = SI_NUM_RESOURCE_SGPRS, + GFX6_SGPR_TCS_OUT_OFFSETS, + GFX6_SGPR_TCS_OUT_LAYOUT, + GFX6_SGPR_TCS_IN_LAYOUT, + GFX6_TCS_NUM_USER_SGPR, + + /* GFX9: Merged shaders. */ +#if HAVE_32BIT_POINTERS + /* 2ND_CONST_AND_SHADER_BUFFERS is set in USER_DATA_ADDR_LO (SGPR0). */ + /* 2ND_SAMPLERS_AND_IMAGES is set in USER_DATA_ADDR_HI (SGPR1). */ + GFX9_MERGED_NUM_USER_SGPR = SI_VS_NUM_USER_SGPR, +#else + /* 2ND_CONST_AND_SHADER_BUFFERS is set in USER_DATA_ADDR_LO/HI (SGPR[0:1]). */ + GFX9_SGPR_2ND_SAMPLERS_AND_IMAGES = SI_VS_NUM_USER_SGPR, + GFX9_SGPR_2ND_SAMPLERS_AND_IMAGES_HI, + GFX9_MERGED_NUM_USER_SGPR, +#endif + + /* GFX9: Merged LS-HS (VS-TCS) only. */ + GFX9_SGPR_TCS_OFFCHIP_LAYOUT = GFX9_MERGED_NUM_USER_SGPR, + GFX9_SGPR_TCS_OUT_OFFSETS, + GFX9_SGPR_TCS_OUT_LAYOUT, +#if !HAVE_32BIT_POINTERS + GFX9_SGPR_align_for_vb_pointer, +#endif + GFX9_TCS_NUM_USER_SGPR, /* GS limits */ - SI_GS_NUM_USER_SGPR = SI_NUM_RESOURCE_SGPRS, - SI_GSCOPY_NUM_USER_SGPR = SI_SGPR_RW_BUFFERS_HI + 1, + GFX6_GS_NUM_USER_SGPR = SI_NUM_RESOURCE_SGPRS, +#if HAVE_32BIT_POINTERS + GFX9_VSGS_NUM_USER_SGPR = SI_VS_NUM_USER_SGPR, + GFX9_TESGS_NUM_USER_SGPR = SI_TES_NUM_USER_SGPR, +#else + GFX9_VSGS_NUM_USER_SGPR = GFX9_MERGED_NUM_USER_SGPR, + GFX9_TESGS_NUM_USER_SGPR = GFX9_MERGED_NUM_USER_SGPR, +#endif + SI_GSCOPY_NUM_USER_SGPR = SI_NUM_VS_STATE_RESOURCE_SGPRS, /* PS only */ SI_SGPR_ALPHA_REF = SI_NUM_RESOURCE_SGPRS, SI_PS_NUM_USER_SGPR, - - /* CS only */ - SI_SGPR_GRID_SIZE = SI_NUM_RESOURCE_SGPRS, - SI_SGPR_BLOCK_SIZE = SI_SGPR_GRID_SIZE + 3, - SI_CS_NUM_USER_SGPR = SI_SGPR_BLOCK_SIZE + 3 }; /* LLVM function parameter indices */ enum { - SI_PARAM_RW_BUFFERS, - SI_PARAM_CONST_BUFFERS, - SI_PARAM_SAMPLERS, - SI_PARAM_IMAGES, - SI_PARAM_SHADER_BUFFERS, - SI_NUM_RESOURCE_PARAMS, - - /* VS only parameters */ - SI_PARAM_VERTEX_BUFFERS = SI_NUM_RESOURCE_PARAMS, - SI_PARAM_BASE_VERTEX, - SI_PARAM_START_INSTANCE, - SI_PARAM_DRAWID, - /* [0] = clamp vertex color, VS as VS only */ - SI_PARAM_VS_STATE_BITS, - /* same value as TCS_IN_LAYOUT, VS as LS only */ - SI_PARAM_LS_OUT_LAYOUT = SI_PARAM_DRAWID + 1, - /* the other VS parameters are assigned dynamically */ - - /* Layout of TCS outputs in the offchip buffer - * [0:8] = the number of patches per threadgroup. - * [9:15] = the number of output vertices per patch. - * [16:31] = the offset of per patch attributes in the buffer in bytes. - */ - SI_PARAM_TCS_OFFCHIP_LAYOUT = SI_NUM_RESOURCE_PARAMS, /* for TCS & TES */ - - /* TCS only parameters. */ - - /* Offsets where TCS outputs and TCS patch outputs live in LDS: - * [0:15] = TCS output patch0 offset / 16, max = NUM_PATCHES * 32 * 32 - * [16:31] = TCS output patch0 offset for per-patch / 16, max = NUM_PATCHES*32*32* + 32*32 - */ - SI_PARAM_TCS_OUT_OFFSETS, - - /* Layout of TCS outputs / TES inputs: - * [0:12] = stride between output patches in dwords, num_outputs * num_vertices * 4, max = 32*32*4 - * [13:20] = stride between output vertices in dwords = num_inputs * 4, max = 32*4 - * [26:31] = gl_PatchVerticesIn, max = 32 - */ - SI_PARAM_TCS_OUT_LAYOUT, - - /* Layout of LS outputs / TCS inputs - * [0:12] = stride between patches in dwords = num_inputs * num_vertices * 4, max = 32*32*4 - * [13:20] = stride between vertices in dwords = num_inputs * 4, max = 32*4 - */ - SI_PARAM_TCS_IN_LAYOUT, - - SI_PARAM_TCS_OC_LDS, - SI_PARAM_TESS_FACTOR_OFFSET, - SI_PARAM_PATCH_ID, - SI_PARAM_REL_IDS, - - /* GS only parameters */ - SI_PARAM_GS2VS_OFFSET = SI_NUM_RESOURCE_PARAMS, - SI_PARAM_GS_WAVE_ID, - SI_PARAM_VTX0_OFFSET, - SI_PARAM_VTX1_OFFSET, - SI_PARAM_PRIMITIVE_ID, - SI_PARAM_VTX2_OFFSET, - SI_PARAM_VTX3_OFFSET, - SI_PARAM_VTX4_OFFSET, - SI_PARAM_VTX5_OFFSET, - SI_PARAM_GS_INSTANCE_ID, + SI_NUM_RESOURCE_PARAMS = 4, /* PS only parameters */ SI_PARAM_ALPHA_REF = SI_NUM_RESOURCE_PARAMS, @@ -217,19 +260,44 @@ enum { SI_PARAM_SAMPLE_COVERAGE, SI_PARAM_POS_FIXED_PT, - /* CS only parameters */ - SI_PARAM_GRID_SIZE = SI_NUM_RESOURCE_PARAMS, - SI_PARAM_BLOCK_SIZE, - SI_PARAM_BLOCK_ID, - SI_PARAM_THREAD_ID, - SI_NUM_PARAMS = SI_PARAM_POS_FIXED_PT + 9, /* +8 for COLOR[0..1] */ }; +/* Fields of driver-defined VS state SGPR. */ +/* Clamp vertex color output (only used in VS as VS). */ +#define S_VS_STATE_CLAMP_VERTEX_COLOR(x) (((unsigned)(x) & 0x1) << 0) +#define C_VS_STATE_CLAMP_VERTEX_COLOR 0xFFFFFFFE +#define S_VS_STATE_INDEXED(x) (((unsigned)(x) & 0x1) << 1) +#define C_VS_STATE_INDEXED 0xFFFFFFFD +#define S_VS_STATE_LS_OUT_PATCH_SIZE(x) (((unsigned)(x) & 0x1FFF) << 8) +#define C_VS_STATE_LS_OUT_PATCH_SIZE 0xFFE000FF +#define S_VS_STATE_LS_OUT_VERTEX_SIZE(x) (((unsigned)(x) & 0xFF) << 24) +#define C_VS_STATE_LS_OUT_VERTEX_SIZE 0x00FFFFFF + /* SI-specific system values. */ enum { + /* Values from set_tess_state. */ TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI = TGSI_SEMANTIC_COUNT, TGSI_SEMANTIC_DEFAULT_TESSINNER_SI, + + /* Up to 4 dwords in user SGPRs for compute shaders. */ + TGSI_SEMANTIC_CS_USER_DATA, +}; + +enum { + /* Use a property enum that CS wouldn't use. */ + TGSI_PROPERTY_CS_LOCAL_SIZE = TGSI_PROPERTY_FS_COORD_ORIGIN, + + /* The number of used user data dwords in the range [1, 4]. */ + TGSI_PROPERTY_CS_USER_DATA_DWORDS = TGSI_PROPERTY_FS_COORD_PIXEL_CENTER, + + /* Use a property enum that VS wouldn't use. */ + TGSI_PROPERTY_VS_BLIT_SGPRS = TGSI_PROPERTY_FS_COORD_ORIGIN, + + /* These represent the number of SGPRs the shader uses. */ + SI_VS_BLIT_SGPRS_POS = 3, + SI_VS_BLIT_SGPRS_POS_COLOR = 7, + SI_VS_BLIT_SGPRS_POS_TEXCOORD = 9, }; /* For VS shader key fix_fetch. */ @@ -261,7 +329,7 @@ struct si_shader; struct si_compiler_ctx_state { /* Should only be used by si_init_shader_selector_async and * si_build_shader_variant if thread_index == -1 (non-threaded). */ - LLVMTargetMachineRef tm; + struct ac_llvm_compiler *compiler; /* Used if thread_index == -1 or if debug.async is true. */ struct pipe_debug_callback debug; @@ -274,11 +342,12 @@ struct si_compiler_ctx_state { * binaries for one TGSI program. This can be shared by multiple contexts. */ struct si_shader_selector { + struct pipe_reference reference; struct si_screen *screen; struct util_queue_fence ready; struct si_compiler_ctx_state compiler_ctx_state; - pipe_mutex mutex; + mtx_t mutex; struct si_shader *first_variant; /* immutable after the first variant */ struct si_shader *last_variant; /* mutable */ @@ -292,14 +361,24 @@ struct si_shader_selector { struct si_shader *gs_copy_shader; struct tgsi_token *tokens; + struct nir_shader *nir; struct pipe_stream_output_info so; struct tgsi_shader_info info; + struct tgsi_tessctrl_info tcs_info; /* PIPE_SHADER_[VERTEX|FRAGMENT|...] */ unsigned type; + bool vs_needs_prolog; + bool force_correct_derivs_after_kill; + unsigned pa_cl_vs_out_cntl; + ubyte clipdist_mask; + ubyte culldist_mask; + + /* ES parameters. */ + unsigned esgs_itemsize; /* vertex stride */ + unsigned lshs_vertex_stride; /* GS parameters. */ - unsigned esgs_itemsize; unsigned gs_input_verts_per_prim; unsigned gs_output_prim; unsigned gs_max_out_vertices; @@ -307,6 +386,7 @@ struct si_shader_selector { unsigned max_gs_stream; /* count - 1 */ unsigned gsvs_vertex_size; unsigned max_gsvs_emit_size; + unsigned enabled_streamout_buffer_mask; /* PS parameters. */ unsigned color_attr_index[2]; @@ -316,15 +396,15 @@ struct si_shader_selector { */ unsigned colors_written_4bit; - /* CS parameters */ - unsigned local_size; - + uint64_t outputs_written_before_ps; /* "get_unique_index" bits */ uint64_t outputs_written; /* "get_unique_index" bits */ - uint32_t patch_outputs_written; /* "get_unique_index" bits */ - uint32_t outputs_written2; /* "get_unique_index2" bits */ + uint32_t patch_outputs_written; /* "get_unique_index_patch" bits */ uint64_t inputs_read; /* "get_unique_index" bits */ - uint32_t inputs_read2; /* "get_unique_index2" bits */ + + /* bitmasks of used descriptor slots */ + uint32_t active_const_and_shader_buffers; + uint64_t active_samplers_and_images; }; /* Valid shader configurations: @@ -332,30 +412,45 @@ struct si_shader_selector { * API shaders VS | TCS | TES | GS |pass| PS * are compiled as: | | | |thru| * | | | | | - * Only VS & PS: VS | -- | -- | -- | -- | PS - * With GS: ES | -- | -- | GS | VS | PS - * With Tessel.: LS | HS | VS | -- | -- | PS - * With both: LS | HS | ES | GS | VS | PS + * Only VS & PS: VS | | | | | PS + * GFX6 - with GS: ES | | | GS | VS | PS + * - with tess: LS | HS | VS | | | PS + * - with both: LS | HS | ES | GS | VS | PS + * GFX9 - with GS: -> | | | GS | VS | PS + * - with tess: -> | HS | VS | | | PS + * - with both: -> | HS | -> | GS | VS | PS + * + * -> = merged with the next stage + */ + +/* Use the byte alignment for all following structure members for optimal + * shader key memory footprint. */ +#pragma pack(push, 1) /* Common VS bits between the shader key and the prolog key. */ struct si_vs_prolog_bits { - unsigned instance_divisors[SI_MAX_ATTRIBS]; -}; - -/* Common VS bits between the shader key and the epilog key. */ -struct si_vs_epilog_bits { - unsigned export_prim_id:1; /* when PS needs it and GS is disabled */ + /* - If neither "is_one" nor "is_fetched" has a bit set, the instance + * divisor is 0. + * - If "is_one" has a bit set, the instance divisor is 1. + * - If "is_fetched" has a bit set, the instance divisor will be loaded + * from the constant buffer. + */ + uint16_t instance_divisor_is_one; /* bitmask of inputs */ + uint16_t instance_divisor_is_fetched; /* bitmask of inputs */ + unsigned ls_vgpr_fix:1; }; /* Common TCS bits between the shader key and the epilog key. */ struct si_tcs_epilog_bits { unsigned prim_mode:3; + unsigned invoc0_tess_factors_are_def:1; unsigned tes_reads_tess_factors:1; }; struct si_gs_prolog_bits { unsigned tri_strip_adj_fix:1; + unsigned gfx9_prev_is_vs:1; }; /* Common PS bits between the shader key and the prolog key. */ @@ -369,6 +464,7 @@ struct si_ps_prolog_bits { unsigned force_linear_center_interp:1; unsigned bc_optimize_for_persp:1; unsigned bc_optimize_for_linear:1; + unsigned samplemask_log_ps_iter:3; }; /* Common PS bits between the shader key and the epilog key. */ @@ -386,30 +482,35 @@ struct si_ps_epilog_bits { union si_shader_part_key { struct { struct si_vs_prolog_bits states; - unsigned num_input_sgprs:5; + unsigned num_input_sgprs:6; + /* For merged stages such as LS-HS, HS input VGPRs are first. */ + unsigned num_merged_next_stage_vgprs:3; unsigned last_input:4; + unsigned as_ls:1; + unsigned as_es:1; + /* Prologs for monolithic shaders shouldn't set EXEC. */ + unsigned is_monolithic:1; } vs_prolog; - struct { - struct si_vs_epilog_bits states; - unsigned prim_id_param_offset:5; - } vs_epilog; struct { struct si_tcs_epilog_bits states; } tcs_epilog; struct { struct si_gs_prolog_bits states; + /* Prologs of monolithic shaders shouldn't set EXEC. */ + unsigned is_monolithic:1; } gs_prolog; struct { struct si_ps_prolog_bits states; - unsigned num_input_sgprs:5; + unsigned num_input_sgprs:6; unsigned num_input_vgprs:5; /* Color interpolation and two-side color selection. */ unsigned colors_read:8; /* color input components read */ unsigned num_interp_inputs:5; /* BCOLOR is at this location */ unsigned face_vgpr_index:5; + unsigned ancillary_vgpr_index:5; unsigned wqm:1; char color_attr_index[2]; - char color_interp_vgpr_index[2]; /* -1 == constant */ + signed char color_interp_vgpr_index[2]; /* -1 == constant */ } ps_prolog; struct { struct si_ps_epilog_bits states; @@ -425,15 +526,15 @@ struct si_shader_key { union { struct { struct si_vs_prolog_bits prolog; - struct si_vs_epilog_bits epilog; } vs; struct { + struct si_vs_prolog_bits ls_prolog; /* for merged LS-HS */ + struct si_shader_selector *ls; /* for merged LS-HS */ struct si_tcs_epilog_bits epilog; } tcs; /* tessellation control shader */ struct { - struct si_vs_epilog_bits epilog; /* same as VS */ - } tes; /* tessellation evaluation shader */ - struct { + struct si_vs_prolog_bits vs_prolog; /* for merged ES-GS */ + struct si_shader_selector *es; /* for merged ES-GS */ struct si_gs_prolog_bits prolog; } gs; struct { @@ -449,26 +550,42 @@ struct si_shader_key { unsigned as_ls:1; /* local shader, which precedes TCS */ /* Flags for monolithic compilation only. */ - union { - struct { - /* One byte for every input: SI_FIX_FETCH_* enums. */ - uint8_t fix_fetch[SI_MAX_ATTRIBS]; - } vs; - struct { - uint64_t inputs_to_copy; /* for fixed-func TCS */ - } tcs; + struct { + /* One byte for every input: SI_FIX_FETCH_* enums. */ + uint8_t vs_fix_fetch[SI_MAX_ATTRIBS]; + + union { + uint64_t ff_tcs_inputs_to_copy; /* for fixed-func TCS */ + /* When PS needs PrimID and GS is disabled. */ + unsigned vs_export_prim_id:1; + struct { + unsigned interpolate_at_sample_force_center:1; + unsigned fbfetch_msaa; + unsigned fbfetch_is_1D; + unsigned fbfetch_layered; + } ps; + } u; } mono; /* Optimization flags for asynchronous compilation only. */ - union { - struct { - uint64_t kill_outputs; /* "get_unique_index" bits */ - uint32_t kill_outputs2; /* "get_unique_index2" bits */ - unsigned clip_disable:1; - } hw_vs; /* HW VS (it can be VS, TES, GS) */ + struct { + /* For HW VS (it can be VS, TES, GS) */ + uint64_t kill_outputs; /* "get_unique_index" bits */ + unsigned clip_disable:1; + + /* For shaders where monolithic variants have better code. + * + * This is a flag that has no effect on code generation, + * but forces monolithic shaders to be used as soon as + * possible, because it's in the "opt" group. + */ + unsigned prefer_mono:1; } opt; }; +/* Restore the pack alignment to default. */ +#pragma pack(pop) + struct si_shader_config { unsigned num_sgprs; unsigned num_vgprs; @@ -476,6 +593,7 @@ struct si_shader_config { unsigned spilled_vgprs; unsigned private_mem_vgprs; unsigned lds_size; + unsigned max_simd_waves; unsigned spi_ps_input_ena; unsigned spi_ps_input_addr; unsigned float_mode; @@ -484,24 +602,13 @@ struct si_shader_config { unsigned rsrc2; }; -enum { - /* SPI_PS_INPUT_CNTL_i.OFFSET[0:4] */ - EXP_PARAM_OFFSET_0 = 0, - EXP_PARAM_OFFSET_31 = 31, - /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL[0:1] */ - EXP_PARAM_DEFAULT_VAL_0000 = 64, - EXP_PARAM_DEFAULT_VAL_0001, - EXP_PARAM_DEFAULT_VAL_1110, - EXP_PARAM_DEFAULT_VAL_1111, - EXP_PARAM_UNDEFINED = 255, -}; - /* GCN-specific shader info. */ struct si_shader_info { ubyte vs_output_param_offset[SI_MAX_VS_OUTPUTS]; ubyte num_input_sgprs; ubyte num_input_vgprs; - char face_vgpr_index; + signed char face_vgpr_index; + signed char ancillary_vgpr_index; bool uses_instanceid; ubyte nr_pos_exports; ubyte nr_param_exports; @@ -511,16 +618,19 @@ struct si_shader { struct si_compiler_ctx_state compiler_ctx_state; struct si_shader_selector *selector; + struct si_shader_selector *previous_stage_sel; /* for refcounting */ struct si_shader *next_variant; struct si_shader_part *prolog; + struct si_shader *previous_stage; /* for GFX9 */ + struct si_shader_part *prolog2; struct si_shader_part *epilog; struct si_pm4_state *pm4; struct r600_resource *bo; struct r600_resource *scratch_bo; struct si_shader_key key; - struct util_queue_fence optimized_ready; + struct util_queue_fence ready; bool compilation_failed; bool is_monolithic; bool is_optimized; @@ -549,44 +659,42 @@ struct si_shader_part { /* si_shader.c */ struct si_shader * si_generate_gs_copy_shader(struct si_screen *sscreen, - LLVMTargetMachineRef tm, + struct ac_llvm_compiler *compiler, struct si_shader_selector *gs_selector, struct pipe_debug_callback *debug); int si_compile_tgsi_shader(struct si_screen *sscreen, - LLVMTargetMachineRef tm, + struct ac_llvm_compiler *compiler, struct si_shader *shader, - bool is_monolithic, struct pipe_debug_callback *debug); -int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, +int si_shader_create(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, struct si_shader *shader, struct pipe_debug_callback *debug); -int si_compile_llvm(struct si_screen *sscreen, - struct ac_shader_binary *binary, - struct si_shader_config *conf, - LLVMTargetMachineRef tm, - LLVMModuleRef mod, - struct pipe_debug_callback *debug, - unsigned processor, - const char *name); void si_shader_destroy(struct si_shader *shader); -unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index); -unsigned si_shader_io_get_unique_index2(unsigned name, unsigned index); +unsigned si_shader_io_get_unique_index_patch(unsigned semantic_name, unsigned index); +unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index, + unsigned is_varying); int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader); -void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader, +void si_shader_dump(struct si_screen *sscreen, const struct si_shader *shader, struct pipe_debug_callback *debug, unsigned processor, FILE *f, bool check_debug_option); +void si_shader_dump_stats_for_shader_db(const struct si_shader *shader, + struct pipe_debug_callback *debug); void si_multiwave_lds_size_workaround(struct si_screen *sscreen, unsigned *lds_size); -void si_shader_apply_scratch_relocs(struct si_context *sctx, - struct si_shader *shader, - struct si_shader_config *config, - uint64_t scratch_va); +void si_shader_apply_scratch_relocs(struct si_shader *shader, + uint64_t scratch_va); void si_shader_binary_read_config(struct ac_shader_binary *binary, struct si_shader_config *conf, unsigned symbol_offset); -unsigned si_get_spi_shader_z_format(bool writes_z, bool writes_stencil, - bool writes_samplemask); -const char *si_get_shader_name(struct si_shader *shader, unsigned processor); +const char *si_get_shader_name(const struct si_shader *shader, unsigned processor); + +/* si_shader_nir.c */ +void si_nir_scan_shader(const struct nir_shader *nir, + struct tgsi_shader_info *info); +void si_nir_scan_tess_ctrl(const struct nir_shader *nir, + const struct tgsi_shader_info *info, + struct tgsi_tessctrl_info *out); +void si_lower_nir(struct si_shader_selector *sel); /* Inline helpers. */ @@ -602,4 +710,30 @@ si_get_main_shader_part(struct si_shader_selector *sel, return &sel->main_shader_part; } +static inline bool +si_shader_uses_bindless_samplers(struct si_shader_selector *selector) +{ + return selector ? selector->info.uses_bindless_samplers : false; +} + +static inline bool +si_shader_uses_bindless_images(struct si_shader_selector *selector) +{ + return selector ? selector->info.uses_bindless_images : false; +} + +void si_destroy_shader_selector(struct si_context *sctx, + struct si_shader_selector *sel); + +static inline void +si_shader_selector_reference(struct si_context *sctx, + struct si_shader_selector **dst, + struct si_shader_selector *src) +{ + if (pipe_reference(&(*dst)->reference, &src->reference)) + si_destroy_shader_selector(sctx, *dst); + + *dst = src; +} + #endif