src/gallium/drivers/radeonsi/si_state_shaders.c

   1 /*
   2  * Copyright 2012 Advanced Micro Devices, Inc.
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * on the rights to use, copy, modify, merge, publish, distribute, sub
   8  * license, and/or sell copies of the Software, and to permit persons to whom
   9  * the Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  18  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
  19  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  20  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  21  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  22  *
  23  * Authors:
  24  *      Christian König <christian.koenig@amd.com>
  25  *      Marek Olšák <maraeo@gmail.com>
  26  */
  27
  28 #include "si_pipe.h"
  29 #include "si_shader.h"
  30 #include "sid.h"
  31 #include "radeon/r600_cs.h"
  32
  33 #include "tgsi/tgsi_parse.h"
  34 #include "tgsi/tgsi_ureg.h"
  35 #include "util/u_memory.h"
  36 #include "util/u_prim.h"
  37 #include "util/u_simple_shaders.h"
  38
  39 static void si_set_tesseval_regs(struct si_shader *shader,
  40                                  struct si_pm4_state *pm4)
  41 {
  42         struct tgsi_shader_info *info = &shader->selector->info;
  43         unsigned tes_prim_mode = info->properties[TGSI_PROPERTY_TES_PRIM_MODE];
  44         unsigned tes_spacing = info->properties[TGSI_PROPERTY_TES_SPACING];
  45         bool tes_vertex_order_cw = info->properties[TGSI_PROPERTY_TES_VERTEX_ORDER_CW];
  46         bool tes_point_mode = info->properties[TGSI_PROPERTY_TES_POINT_MODE];
  47         unsigned type, partitioning, topology;
  48
  49         switch (tes_prim_mode) {
  50         case PIPE_PRIM_LINES:
  51                 type = V_028B6C_TESS_ISOLINE;
  52                 break;
  53         case PIPE_PRIM_TRIANGLES:
  54                 type = V_028B6C_TESS_TRIANGLE;
  55                 break;
  56         case PIPE_PRIM_QUADS:
  57                 type = V_028B6C_TESS_QUAD;
  58                 break;
  59         default:
  60                 assert(0);
  61                 return;
  62         }
  63
  64         switch (tes_spacing) {
  65         case PIPE_TESS_SPACING_FRACTIONAL_ODD:
  66                 partitioning = V_028B6C_PART_FRAC_ODD;
  67                 break;
  68         case PIPE_TESS_SPACING_FRACTIONAL_EVEN:
  69                 partitioning = V_028B6C_PART_FRAC_EVEN;
  70                 break;
  71         case PIPE_TESS_SPACING_EQUAL:
  72                 partitioning = V_028B6C_PART_INTEGER;
  73                 break;
  74         default:
  75                 assert(0);
  76                 return;
  77         }
  78
  79         if (tes_point_mode)
  80                 topology = V_028B6C_OUTPUT_POINT;
  81         else if (tes_prim_mode == PIPE_PRIM_LINES)
  82                 topology = V_028B6C_OUTPUT_LINE;
  83         else if (tes_vertex_order_cw)
  84                 /* for some reason, this must be the other way around */
  85                 topology = V_028B6C_OUTPUT_TRIANGLE_CCW;
  86         else
  87                 topology = V_028B6C_OUTPUT_TRIANGLE_CW;
  88
  89         si_pm4_set_reg(pm4, R_028B6C_VGT_TF_PARAM,
  90                        S_028B6C_TYPE(type) |
  91                        S_028B6C_PARTITIONING(partitioning) |
  92                        S_028B6C_TOPOLOGY(topology));
  93 }
  94
  95 static void si_shader_ls(struct si_shader *shader)
  96 {
  97         struct si_pm4_state *pm4;
  98         unsigned num_sgprs, num_user_sgprs;
  99         unsigned vgpr_comp_cnt;
 100         uint64_t va;
 101
 102         pm4 = shader->pm4 = CALLOC_STRUCT(si_pm4_state);
 103         if (!pm4)
 104                 return;
 105
 106         va = shader->bo->gpu_address;
 107         si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_USER_SHADER);
 108
 109         /* We need at least 2 components for LS.
 110          * VGPR0-3: (VertexID, RelAutoindex, ???, InstanceID). */
 111         vgpr_comp_cnt = shader->uses_instanceid ? 3 : 1;
 112
 113         num_user_sgprs = SI_LS_NUM_USER_SGPR;
 114         num_sgprs = shader->config.num_sgprs;
 115         if (num_user_sgprs > num_sgprs) {
 116                 /* Last 2 reserved SGPRs are used for VCC */
 117                 num_sgprs = num_user_sgprs + 2;
 118         }
 119         assert(num_sgprs <= 104);
 120
 121         si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
 122         si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, va >> 40);
 123
 124         shader->config.rsrc1 = S_00B528_VGPRS((shader->config.num_vgprs - 1) / 4) |
 125                            S_00B528_SGPRS((num_sgprs - 1) / 8) |
 126                            S_00B528_VGPR_COMP_CNT(vgpr_comp_cnt) |
 127                            S_00B528_DX10_CLAMP(shader->dx10_clamp_mode);
 128         shader->config.rsrc2 = S_00B52C_USER_SGPR(num_user_sgprs) |
 129                            S_00B52C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);
 130 }
 131
 132 static void si_shader_hs(struct si_shader *shader)
 133 {
 134         struct si_pm4_state *pm4;
 135         unsigned num_sgprs, num_user_sgprs;
 136         uint64_t va;
 137
 138         pm4 = shader->pm4 = CALLOC_STRUCT(si_pm4_state);
 139         if (!pm4)
 140                 return;
 141
 142         va = shader->bo->gpu_address;
 143         si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_USER_SHADER);
 144
 145         num_user_sgprs = SI_TCS_NUM_USER_SGPR;
 146         num_sgprs = shader->config.num_sgprs;
 147         /* One SGPR after user SGPRs is pre-loaded with tessellation factor
 148          * buffer offset. */
 149         if ((num_user_sgprs + 1) > num_sgprs) {
 150                 /* Last 2 reserved SGPRs are used for VCC */
 151                 num_sgprs = num_user_sgprs + 1 + 2;
 152         }
 153         assert(num_sgprs <= 104);
 154
 155         si_pm4_set_reg(pm4, R_00B420_SPI_SHADER_PGM_LO_HS, va >> 8);
 156         si_pm4_set_reg(pm4, R_00B424_SPI_SHADER_PGM_HI_HS, va >> 40);
 157         si_pm4_set_reg(pm4, R_00B428_SPI_SHADER_PGM_RSRC1_HS,
 158                        S_00B428_VGPRS((shader->config.num_vgprs - 1) / 4) |
 159                        S_00B428_SGPRS((num_sgprs - 1) / 8) |
 160                        S_00B428_DX10_CLAMP(shader->dx10_clamp_mode));
 161         si_pm4_set_reg(pm4, R_00B42C_SPI_SHADER_PGM_RSRC2_HS,
 162                        S_00B42C_USER_SGPR(num_user_sgprs) |
 163                        S_00B42C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
 164 }
 165
 166 static void si_shader_es(struct si_shader *shader)
 167 {
 168         struct si_pm4_state *pm4;
 169         unsigned num_sgprs, num_user_sgprs;
 170         unsigned vgpr_comp_cnt;
 171         uint64_t va;
 172
 173         pm4 = shader->pm4 = CALLOC_STRUCT(si_pm4_state);
 174
 175         if (!pm4)
 176                 return;
 177
 178         va = shader->bo->gpu_address;
 179         si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_USER_SHADER);
 180
 181         if (shader->selector->type == PIPE_SHADER_VERTEX) {
 182                 vgpr_comp_cnt = shader->uses_instanceid ? 3 : 0;
 183                 num_user_sgprs = SI_ES_NUM_USER_SGPR;
 184         } else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) {
 185                 vgpr_comp_cnt = 3; /* all components are needed for TES */
 186                 num_user_sgprs = SI_TES_NUM_USER_SGPR;
 187         } else
 188                 unreachable("invalid shader selector type");
 189
 190         num_sgprs = shader->config.num_sgprs;
 191         /* One SGPR after user SGPRs is pre-loaded with es2gs_offset */
 192         if ((num_user_sgprs + 1) > num_sgprs) {
 193                 /* Last 2 reserved SGPRs are used for VCC */
 194                 num_sgprs = num_user_sgprs + 1 + 2;
 195         }
 196         assert(num_sgprs <= 104);
 197
 198         si_pm4_set_reg(pm4, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
 199                        shader->selector->esgs_itemsize / 4);
 200         si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
 201         si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, va >> 40);
 202         si_pm4_set_reg(pm4, R_00B328_SPI_SHADER_PGM_RSRC1_ES,
 203                        S_00B328_VGPRS((shader->config.num_vgprs - 1) / 4) |
 204                        S_00B328_SGPRS((num_sgprs - 1) / 8) |
 205                        S_00B328_VGPR_COMP_CNT(vgpr_comp_cnt) |
 206                        S_00B328_DX10_CLAMP(shader->dx10_clamp_mode));
 207         si_pm4_set_reg(pm4, R_00B32C_SPI_SHADER_PGM_RSRC2_ES,
 208                        S_00B32C_USER_SGPR(num_user_sgprs) |
 209                        S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
 210
 211         if (shader->selector->type == PIPE_SHADER_TESS_EVAL)
 212                 si_set_tesseval_regs(shader, pm4);
 213 }
 214
 215 static void si_shader_gs(struct si_shader *shader)
 216 {
 217         unsigned gs_vert_itemsize = shader->selector->gsvs_vertex_size;
 218         unsigned gs_max_vert_out = shader->selector->gs_max_out_vertices;
 219         unsigned gsvs_itemsize = shader->selector->max_gsvs_emit_size >> 2;
 220         unsigned gs_num_invocations = shader->selector->gs_num_invocations;
 221         unsigned cut_mode;
 222         struct si_pm4_state *pm4;
 223         unsigned num_sgprs, num_user_sgprs;
 224         uint64_t va;
 225         unsigned max_stream = shader->selector->max_gs_stream;
 226
 227         /* The GSVS_RING_ITEMSIZE register takes 15 bits */
 228         assert(gsvs_itemsize < (1 << 15));
 229
 230         pm4 = shader->pm4 = CALLOC_STRUCT(si_pm4_state);
 231
 232         if (!pm4)
 233                 return;
 234
 235         if (gs_max_vert_out <= 128) {
 236                 cut_mode = V_028A40_GS_CUT_128;
 237         } else if (gs_max_vert_out <= 256) {
 238                 cut_mode = V_028A40_GS_CUT_256;
 239         } else if (gs_max_vert_out <= 512) {
 240                 cut_mode = V_028A40_GS_CUT_512;
 241         } else {
 242                 assert(gs_max_vert_out <= 1024);
 243                 cut_mode = V_028A40_GS_CUT_1024;
 244         }
 245
 246         si_pm4_set_reg(pm4, R_028A40_VGT_GS_MODE,
 247                        S_028A40_MODE(V_028A40_GS_SCENARIO_G) |
 248                        S_028A40_CUT_MODE(cut_mode)|
 249                        S_028A40_ES_WRITE_OPTIMIZE(1) |
 250                        S_028A40_GS_WRITE_OPTIMIZE(1));
 251
 252         si_pm4_set_reg(pm4, R_028A60_VGT_GSVS_RING_OFFSET_1, gsvs_itemsize);
 253         si_pm4_set_reg(pm4, R_028A64_VGT_GSVS_RING_OFFSET_2, gsvs_itemsize * ((max_stream >= 2) ? 2 : 1));
 254         si_pm4_set_reg(pm4, R_028A68_VGT_GSVS_RING_OFFSET_3, gsvs_itemsize * ((max_stream >= 3) ? 3 : 1));
 255
 256         si_pm4_set_reg(pm4, R_028AB0_VGT_GSVS_RING_ITEMSIZE, gsvs_itemsize * (max_stream + 1));
 257
 258         si_pm4_set_reg(pm4, R_028B38_VGT_GS_MAX_VERT_OUT, gs_max_vert_out);
 259
 260         si_pm4_set_reg(pm4, R_028B5C_VGT_GS_VERT_ITEMSIZE, gs_vert_itemsize >> 2);
 261         si_pm4_set_reg(pm4, R_028B60_VGT_GS_VERT_ITEMSIZE_1, (max_stream >= 1) ? gs_vert_itemsize >> 2 : 0);
 262         si_pm4_set_reg(pm4, R_028B64_VGT_GS_VERT_ITEMSIZE_2, (max_stream >= 2) ? gs_vert_itemsize >> 2 : 0);
 263         si_pm4_set_reg(pm4, R_028B68_VGT_GS_VERT_ITEMSIZE_3, (max_stream >= 3) ? gs_vert_itemsize >> 2 : 0);
 264
 265         si_pm4_set_reg(pm4, R_028B90_VGT_GS_INSTANCE_CNT,
 266                        S_028B90_CNT(MIN2(gs_num_invocations, 127)) |
 267                        S_028B90_ENABLE(gs_num_invocations > 0));
 268
 269         va = shader->bo->gpu_address;
 270         si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_USER_SHADER);
 271         si_pm4_set_reg(pm4, R_00B220_SPI_SHADER_PGM_LO_GS, va >> 8);
 272         si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_HI_GS, va >> 40);
 273
 274         num_user_sgprs = SI_GS_NUM_USER_SGPR;
 275         num_sgprs = shader->config.num_sgprs;
 276         /* Two SGPRs after user SGPRs are pre-loaded with gs2vs_offset, gs_wave_id */
 277         if ((num_user_sgprs + 2) > num_sgprs) {
 278                 /* Last 2 reserved SGPRs are used for VCC */
 279                 num_sgprs = num_user_sgprs + 2 + 2;
 280         }
 281         assert(num_sgprs <= 104);
 282
 283         si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
 284                        S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) |
 285                        S_00B228_SGPRS((num_sgprs - 1) / 8) |
 286                        S_00B228_DX10_CLAMP(shader->dx10_clamp_mode));
 287         si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
 288                        S_00B22C_USER_SGPR(num_user_sgprs) |
 289                        S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
 290 }
 291
 292 static void si_shader_vs(struct si_shader *shader)
 293 {
 294         struct si_pm4_state *pm4;
 295         unsigned num_sgprs, num_user_sgprs;
 296         unsigned nparams, vgpr_comp_cnt;
 297         uint64_t va;
 298         unsigned window_space =
 299            shader->selector->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
 300         bool enable_prim_id = si_vs_exports_prim_id(shader);
 301
 302         pm4 = shader->pm4 = CALLOC_STRUCT(si_pm4_state);
 303
 304         if (!pm4)
 305                 return;
 306
 307         /* If this is the GS copy shader, the GS state writes this register.
 308          * Otherwise, the VS state writes it.
 309          */
 310         if (!shader->is_gs_copy_shader) {
 311                 si_pm4_set_reg(pm4, R_028A40_VGT_GS_MODE,
 312                                S_028A40_MODE(enable_prim_id ? V_028A40_GS_SCENARIO_A : 0));
 313                 si_pm4_set_reg(pm4, R_028A84_VGT_PRIMITIVEID_EN, enable_prim_id);
 314         } else
 315                 si_pm4_set_reg(pm4, R_028A84_VGT_PRIMITIVEID_EN, 0);
 316
 317         va = shader->bo->gpu_address;
 318         si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_USER_SHADER);
 319
 320         if (shader->is_gs_copy_shader) {
 321                 vgpr_comp_cnt = 0; /* only VertexID is needed for GS-COPY. */
 322                 num_user_sgprs = SI_GSCOPY_NUM_USER_SGPR;
 323         } else if (shader->selector->type == PIPE_SHADER_VERTEX) {
 324                 vgpr_comp_cnt = shader->uses_instanceid ? 3 : (enable_prim_id ? 2 : 0);
 325                 num_user_sgprs = SI_VS_NUM_USER_SGPR;
 326         } else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) {
 327                 vgpr_comp_cnt = 3; /* all components are needed for TES */
 328                 num_user_sgprs = SI_TES_NUM_USER_SGPR;
 329         } else
 330                 unreachable("invalid shader selector type");
 331
 332         num_sgprs = shader->config.num_sgprs;
 333         if (num_user_sgprs > num_sgprs) {
 334                 /* Last 2 reserved SGPRs are used for VCC */
 335                 num_sgprs = num_user_sgprs + 2;
 336         }
 337         assert(num_sgprs <= 104);
 338
 339         /* VS is required to export at least one param. */
 340         nparams = MAX2(shader->nr_param_exports, 1);
 341         si_pm4_set_reg(pm4, R_0286C4_SPI_VS_OUT_CONFIG,
 342                        S_0286C4_VS_EXPORT_COUNT(nparams - 1));
 343
 344         si_pm4_set_reg(pm4, R_02870C_SPI_SHADER_POS_FORMAT,
 345                        S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) |
 346                        S_02870C_POS1_EXPORT_FORMAT(shader->nr_pos_exports > 1 ?
 347                                                    V_02870C_SPI_SHADER_4COMP :
 348                                                    V_02870C_SPI_SHADER_NONE) |
 349                        S_02870C_POS2_EXPORT_FORMAT(shader->nr_pos_exports > 2 ?
 350                                                    V_02870C_SPI_SHADER_4COMP :
 351                                                    V_02870C_SPI_SHADER_NONE) |
 352                        S_02870C_POS3_EXPORT_FORMAT(shader->nr_pos_exports > 3 ?
 353                                                    V_02870C_SPI_SHADER_4COMP :
 354                                                    V_02870C_SPI_SHADER_NONE));
 355
 356         si_pm4_set_reg(pm4, R_00B120_SPI_SHADER_PGM_LO_VS, va >> 8);
 357         si_pm4_set_reg(pm4, R_00B124_SPI_SHADER_PGM_HI_VS, va >> 40);
 358         si_pm4_set_reg(pm4, R_00B128_SPI_SHADER_PGM_RSRC1_VS,
 359                        S_00B128_VGPRS((shader->config.num_vgprs - 1) / 4) |
 360                        S_00B128_SGPRS((num_sgprs - 1) / 8) |
 361                        S_00B128_VGPR_COMP_CNT(vgpr_comp_cnt) |
 362                        S_00B128_DX10_CLAMP(shader->dx10_clamp_mode));
 363         si_pm4_set_reg(pm4, R_00B12C_SPI_SHADER_PGM_RSRC2_VS,
 364                        S_00B12C_USER_SGPR(num_user_sgprs) |
 365                        S_00B12C_SO_BASE0_EN(!!shader->selector->so.stride[0]) |
 366                        S_00B12C_SO_BASE1_EN(!!shader->selector->so.stride[1]) |
 367                        S_00B12C_SO_BASE2_EN(!!shader->selector->so.stride[2]) |
 368                        S_00B12C_SO_BASE3_EN(!!shader->selector->so.stride[3]) |
 369                        S_00B12C_SO_EN(!!shader->selector->so.num_outputs) |
 370                        S_00B12C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
 371         if (window_space)
 372                 si_pm4_set_reg(pm4, R_028818_PA_CL_VTE_CNTL,
 373                                S_028818_VTX_XY_FMT(1) | S_028818_VTX_Z_FMT(1));
 374         else
 375                 si_pm4_set_reg(pm4, R_028818_PA_CL_VTE_CNTL,
 376                                S_028818_VTX_W0_FMT(1) |
 377                                S_028818_VPORT_X_SCALE_ENA(1) | S_028818_VPORT_X_OFFSET_ENA(1) |
 378                                S_028818_VPORT_Y_SCALE_ENA(1) | S_028818_VPORT_Y_OFFSET_ENA(1) |
 379                                S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1));
 380
 381         if (shader->selector->type == PIPE_SHADER_TESS_EVAL)
 382                 si_set_tesseval_regs(shader, pm4);
 383 }
 384
 385 static unsigned si_get_spi_shader_col_format(struct si_shader *shader)
 386 {
 387         unsigned value = shader->key.ps.spi_shader_col_format;
 388         unsigned i, num_targets = (util_last_bit(value) + 3) / 4;
 389
 390         /* If the i-th target format is set, all previous target formats must
 391          * be non-zero to avoid hangs.
 392          */
 393         for (i = 0; i < num_targets; i++)
 394                 if (!(value & (0xf << (i * 4))))
 395                         value |= V_028714_SPI_SHADER_32_R << (i * 4);
 396
 397         return value;
 398 }
 399
 400 static unsigned si_get_cb_shader_mask(unsigned spi_shader_col_format)
 401 {
 402         unsigned i, cb_shader_mask = 0;
 403
 404         for (i = 0; i < 8; i++) {
 405                 switch ((spi_shader_col_format >> (i * 4)) & 0xf) {
 406                 case V_028714_SPI_SHADER_ZERO:
 407                         break;
 408                 case V_028714_SPI_SHADER_32_R:
 409                         cb_shader_mask |= 0x1 << (i * 4);
 410                         break;
 411                 case V_028714_SPI_SHADER_32_GR:
 412                         cb_shader_mask |= 0x3 << (i * 4);
 413                         break;
 414                 case V_028714_SPI_SHADER_32_AR:
 415                         cb_shader_mask |= 0x9 << (i * 4);
 416                         break;
 417                 case V_028714_SPI_SHADER_FP16_ABGR:
 418                 case V_028714_SPI_SHADER_UNORM16_ABGR:
 419                 case V_028714_SPI_SHADER_SNORM16_ABGR:
 420                 case V_028714_SPI_SHADER_UINT16_ABGR:
 421                 case V_028714_SPI_SHADER_SINT16_ABGR:
 422                 case V_028714_SPI_SHADER_32_ABGR:
 423                         cb_shader_mask |= 0xf << (i * 4);
 424                         break;
 425                 default:
 426                         assert(0);
 427                 }
 428         }
 429         return cb_shader_mask;
 430 }
 431
 432 static void si_shader_ps(struct si_shader *shader)
 433 {
 434         struct tgsi_shader_info *info = &shader->selector->info;
 435         struct si_pm4_state *pm4;
 436         unsigned spi_ps_in_control, spi_shader_col_format, cb_shader_mask;
 437         unsigned num_sgprs, num_user_sgprs;
 438         unsigned spi_baryc_cntl = S_0286E0_FRONT_FACE_ALL_BITS(1);
 439         uint64_t va;
 440         bool has_centroid;
 441
 442         pm4 = shader->pm4 = CALLOC_STRUCT(si_pm4_state);
 443
 444         if (!pm4)
 445                 return;
 446
 447         /* SPI_BARYC_CNTL.POS_FLOAT_LOCATION
 448          * Possible vaules:
 449          * 0 -> Position = pixel center
 450          * 1 -> Position = pixel centroid
 451          * 2 -> Position = at sample position
 452          *
 453          * From GLSL 4.5 specification, section 7.1:
 454          *   "The variable gl_FragCoord is available as an input variable from
 455          *    within fragment shaders and it holds the window relative coordinates
 456          *    (x, y, z, 1/w) values for the fragment. If multi-sampling, this
 457          *    value can be for any location within the pixel, or one of the
 458          *    fragment samples. The use of centroid does not further restrict
 459          *    this value to be inside the current primitive."
 460          *
 461          * Meaning that centroid has no effect and we can return anything within
 462          * the pixel. Thus, return the value at sample position, because that's
 463          * the most accurate one shaders can get.
 464          */
 465         spi_baryc_cntl |= S_0286E0_POS_FLOAT_LOCATION(2);
 466
 467         if (info->properties[TGSI_PROPERTY_FS_COORD_PIXEL_CENTER] ==
 468             TGSI_FS_COORD_PIXEL_CENTER_INTEGER)
 469                 spi_baryc_cntl |= S_0286E0_POS_FLOAT_ULC(1);
 470
 471         spi_shader_col_format = si_get_spi_shader_col_format(shader);
 472         cb_shader_mask = si_get_cb_shader_mask(spi_shader_col_format);
 473
 474         /* This must be non-zero for alpha-test/kill to work.
 475          * The hardware ignores the EXEC mask if no export memory is allocated.
 476          * Don't add this to CB_SHADER_MASK.
 477          */
 478         if (!spi_shader_col_format &&
 479             !info->writes_z && !info->writes_stencil && !info->writes_samplemask &&
 480             (shader->selector->info.uses_kill ||
 481              shader->key.ps.alpha_func != PIPE_FUNC_ALWAYS))
 482                 spi_shader_col_format = V_028714_SPI_SHADER_32_R;
 483
 484         /* Set interpolation controls. */
 485         has_centroid = G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena) ||
 486                        G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena);
 487
 488         spi_ps_in_control = S_0286D8_NUM_INTERP(shader->nparam) |
 489                             S_0286D8_BC_OPTIMIZE_DISABLE(has_centroid);
 490
 491         /* Set registers. */
 492         si_pm4_set_reg(pm4, R_0286E0_SPI_BARYC_CNTL, spi_baryc_cntl);
 493         si_pm4_set_reg(pm4, R_0286D8_SPI_PS_IN_CONTROL, spi_ps_in_control);
 494
 495         si_pm4_set_reg(pm4, R_028710_SPI_SHADER_Z_FORMAT,
 496                        info->writes_samplemask ? V_028710_SPI_SHADER_32_ABGR :
 497                        info->writes_stencil ? V_028710_SPI_SHADER_32_GR :
 498                        info->writes_z ? V_028710_SPI_SHADER_32_R :
 499                        V_028710_SPI_SHADER_ZERO);
 500
 501         si_pm4_set_reg(pm4, R_028714_SPI_SHADER_COL_FORMAT, spi_shader_col_format);
 502         si_pm4_set_reg(pm4, R_02823C_CB_SHADER_MASK, cb_shader_mask);
 503
 504         va = shader->bo->gpu_address;
 505         si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_USER_SHADER);
 506         si_pm4_set_reg(pm4, R_00B020_SPI_SHADER_PGM_LO_PS, va >> 8);
 507         si_pm4_set_reg(pm4, R_00B024_SPI_SHADER_PGM_HI_PS, va >> 40);
 508
 509         num_user_sgprs = SI_PS_NUM_USER_SGPR;
 510         num_sgprs = shader->config.num_sgprs;
 511         /* One SGPR after user SGPRs is pre-loaded with {prim_mask, lds_offset} */
 512         if ((num_user_sgprs + 1) > num_sgprs) {
 513                 /* Last 2 reserved SGPRs are used for VCC */
 514                 num_sgprs = num_user_sgprs + 1 + 2;
 515         }
 516         assert(num_sgprs <= 104);
 517
 518         si_pm4_set_reg(pm4, R_00B028_SPI_SHADER_PGM_RSRC1_PS,
 519                        S_00B028_VGPRS((shader->config.num_vgprs - 1) / 4) |
 520                        S_00B028_SGPRS((num_sgprs - 1) / 8) |
 521                        S_00B028_DX10_CLAMP(shader->dx10_clamp_mode));
 522         si_pm4_set_reg(pm4, R_00B02C_SPI_SHADER_PGM_RSRC2_PS,
 523                        S_00B02C_EXTRA_LDS_SIZE(shader->config.lds_size) |
 524                        S_00B02C_USER_SGPR(num_user_sgprs) |
 525                        S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
 526 }
 527
 528 static void si_shader_init_pm4_state(struct si_shader *shader)
 529 {
 530
 531         if (shader->pm4)
 532                 si_pm4_free_state_simple(shader->pm4);
 533
 534         switch (shader->selector->type) {
 535         case PIPE_SHADER_VERTEX:
 536                 if (shader->key.vs.as_ls)
 537                         si_shader_ls(shader);
 538                 else if (shader->key.vs.as_es)
 539                         si_shader_es(shader);
 540                 else
 541                         si_shader_vs(shader);
 542                 break;
 543         case PIPE_SHADER_TESS_CTRL:
 544                 si_shader_hs(shader);
 545                 break;
 546         case PIPE_SHADER_TESS_EVAL:
 547                 if (shader->key.tes.as_es)
 548                         si_shader_es(shader);
 549                 else
 550                         si_shader_vs(shader);
 551                 break;
 552         case PIPE_SHADER_GEOMETRY:
 553                 si_shader_gs(shader);
 554                 si_shader_vs(shader->gs_copy_shader);
 555                 break;
 556         case PIPE_SHADER_FRAGMENT:
 557                 si_shader_ps(shader);
 558                 break;
 559         default:
 560                 assert(0);
 561         }
 562 }
 563
 564 static unsigned si_get_alpha_test_func(struct si_context *sctx)
 565 {
 566         /* Alpha-test should be disabled if colorbuffer 0 is integer. */
 567         if (sctx->queued.named.dsa &&
 568             !sctx->framebuffer.cb0_is_integer)
 569                 return sctx->queued.named.dsa->alpha_func;
 570
 571         return PIPE_FUNC_ALWAYS;
 572 }
 573
 574 /* Compute the key for the hw shader variant */
 575 static inline void si_shader_selector_key(struct pipe_context *ctx,
 576                                           struct si_shader_selector *sel,
 577                                           union si_shader_key *key)
 578 {
 579         struct si_context *sctx = (struct si_context *)ctx;
 580         unsigned i;
 581
 582         memset(key, 0, sizeof(*key));
 583
 584         switch (sel->type) {
 585         case PIPE_SHADER_VERTEX:
 586                 if (sctx->vertex_elements)
 587                         for (i = 0; i < sctx->vertex_elements->count; ++i)
 588                                 key->vs.instance_divisors[i] =
 589                                         sctx->vertex_elements->elements[i].instance_divisor;
 590
 591                 if (sctx->tes_shader.cso)
 592                         key->vs.as_ls = 1;
 593                 else if (sctx->gs_shader.cso)
 594                         key->vs.as_es = 1;
 595
 596                 if (!sctx->gs_shader.cso && sctx->ps_shader.cso &&
 597                     sctx->ps_shader.cso->info.uses_primid)
 598                         key->vs.export_prim_id = 1;
 599                 break;
 600         case PIPE_SHADER_TESS_CTRL:
 601                 key->tcs.prim_mode =
 602                         sctx->tes_shader.cso->info.properties[TGSI_PROPERTY_TES_PRIM_MODE];
 603                 break;
 604         case PIPE_SHADER_TESS_EVAL:
 605                 if (sctx->gs_shader.cso)
 606                         key->tes.as_es = 1;
 607                 else if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid)
 608                         key->tes.export_prim_id = 1;
 609                 break;
 610         case PIPE_SHADER_GEOMETRY:
 611                 break;
 612         case PIPE_SHADER_FRAGMENT: {
 613                 struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
 614                 struct si_state_blend *blend = sctx->queued.named.blend;
 615
 616                 if (sel->info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] &&
 617                     sel->info.colors_written == 0x1)
 618                         key->ps.last_cbuf = MAX2(sctx->framebuffer.state.nr_cbufs, 1) - 1;
 619
 620                 key->ps.spi_shader_col_format = sctx->framebuffer.spi_shader_col_format;
 621
 622                 /* If alpha-to-coverage is enabled, we have to export alpha
 623                  * even if there is no color buffer.
 624                  */
 625                 if (!(key->ps.spi_shader_col_format & 0xf) &&
 626                     blend && blend->alpha_to_coverage)
 627                         key->ps.spi_shader_col_format |= V_028710_SPI_SHADER_FP16_ABGR;
 628
 629                 if (rs) {
 630                         bool is_poly = (sctx->current_rast_prim >= PIPE_PRIM_TRIANGLES &&
 631                                         sctx->current_rast_prim <= PIPE_PRIM_POLYGON) ||
 632                                        sctx->current_rast_prim >= PIPE_PRIM_TRIANGLES_ADJACENCY;
 633                         bool is_line = !is_poly && sctx->current_rast_prim != PIPE_PRIM_POINTS;
 634
 635                         key->ps.color_two_side = rs->two_side;
 636
 637                         if (sctx->queued.named.blend) {
 638                                 key->ps.alpha_to_one = sctx->queued.named.blend->alpha_to_one &&
 639                                                        rs->multisample_enable &&
 640                                                        !sctx->framebuffer.cb0_is_integer;
 641                         }
 642
 643                         key->ps.poly_stipple = rs->poly_stipple_enable && is_poly;
 644                         key->ps.poly_line_smoothing = ((is_poly && rs->poly_smooth) ||
 645                                                        (is_line && rs->line_smooth)) &&
 646                                                       sctx->framebuffer.nr_samples <= 1;
 647                         key->ps.clamp_color = rs->clamp_fragment_color;
 648                 }
 649
 650                 key->ps.alpha_func = si_get_alpha_test_func(sctx);
 651                 break;
 652         }
 653         default:
 654                 assert(0);
 655         }
 656 }
 657
 658 /* Select the hw shader variant depending on the current state. */
 659 static int si_shader_select(struct pipe_context *ctx,
 660                             struct si_shader_ctx_state *state)
 661 {
 662         struct si_context *sctx = (struct si_context *)ctx;
 663         struct si_shader_selector *sel = state->cso;
 664         struct si_shader *current = state->current;
 665         union si_shader_key key;
 666         struct si_shader *iter, *shader = NULL;
 667         int r;
 668
 669         si_shader_selector_key(ctx, sel, &key);
 670
 671         /* Check if we don't need to change anything.
 672          * This path is also used for most shaders that don't need multiple
 673          * variants, it will cost just a computation of the key and this
 674          * test. */
 675         if (likely(current && memcmp(&current->key, &key, sizeof(key)) == 0))
 676                 return 0;
 677
 678         pipe_mutex_lock(sel->mutex);
 679
 680         /* Find the shader variant. */
 681         for (iter = sel->first_variant; iter; iter = iter->next_variant) {
 682                 /* Don't check the "current" shader. We checked it above. */
 683                 if (current != iter &&
 684                     memcmp(&iter->key, &key, sizeof(key)) == 0) {
 685                         state->current = iter;
 686                         pipe_mutex_unlock(sel->mutex);
 687                         return 0;
 688                 }
 689         }
 690
 691         /* Build a new shader. */
 692         shader = CALLOC_STRUCT(si_shader);
 693         if (!shader) {
 694                 pipe_mutex_unlock(sel->mutex);
 695                 return -ENOMEM;
 696         }
 697         shader->selector = sel;
 698         shader->key = key;
 699
 700         r = si_shader_create(sctx->screen, sctx->tm, shader, &sctx->b.debug);
 701         if (unlikely(r)) {
 702                 R600_ERR("Failed to build shader variant (type=%u) %d\n",
 703                          sel->type, r);
 704                 FREE(shader);
 705                 pipe_mutex_unlock(sel->mutex);
 706                 return r;
 707         }
 708         si_shader_init_pm4_state(shader);
 709
 710         if (!sel->last_variant) {
 711                 sel->first_variant = shader;
 712                 sel->last_variant = shader;
 713         } else {
 714                 sel->last_variant->next_variant = shader;
 715                 sel->last_variant = shader;
 716         }
 717         state->current = shader;
 718         pipe_mutex_unlock(sel->mutex);
 719         return 0;
 720 }
 721
 722 static void *si_create_shader_selector(struct pipe_context *ctx,
 723                                        const struct pipe_shader_state *state)
 724 {
 725         struct si_screen *sscreen = (struct si_screen *)ctx->screen;
 726         struct si_shader_selector *sel = CALLOC_STRUCT(si_shader_selector);
 727         int i;
 728
 729         if (!sel)
 730                 return NULL;
 731
 732         sel->tokens = tgsi_dup_tokens(state->tokens);
 733         if (!sel->tokens) {
 734                 FREE(sel);
 735                 return NULL;
 736         }
 737
 738         sel->so = state->stream_output;
 739         tgsi_scan_shader(state->tokens, &sel->info);
 740         sel->type = util_pipe_shader_from_tgsi_processor(sel->info.processor);
 741         p_atomic_inc(&sscreen->b.num_shaders_created);
 742
 743         /* First set which opcode uses which (i,j) pair. */
 744         if (sel->info.uses_persp_opcode_interp_centroid)
 745                 sel->info.uses_persp_centroid = true;
 746
 747         if (sel->info.uses_linear_opcode_interp_centroid)
 748                 sel->info.uses_linear_centroid = true;
 749
 750         if (sel->info.uses_persp_opcode_interp_offset ||
 751             sel->info.uses_persp_opcode_interp_sample)
 752                 sel->info.uses_persp_center = true;
 753
 754         if (sel->info.uses_linear_opcode_interp_offset ||
 755             sel->info.uses_linear_opcode_interp_sample)
 756                 sel->info.uses_linear_center = true;
 757
 758         /* Determine if the shader has to use a conditional assignment when
 759          * emulating force_persample_interp.
 760          */
 761         sel->forces_persample_interp_for_persp =
 762                 sel->info.uses_persp_center +
 763                 sel->info.uses_persp_centroid +
 764                 sel->info.uses_persp_sample >= 2;
 765
 766         sel->forces_persample_interp_for_linear =
 767                 sel->info.uses_linear_center +
 768                 sel->info.uses_linear_centroid +
 769                 sel->info.uses_linear_sample >= 2;
 770
 771         switch (sel->type) {
 772         case PIPE_SHADER_GEOMETRY:
 773                 sel->gs_output_prim =
 774                         sel->info.properties[TGSI_PROPERTY_GS_OUTPUT_PRIM];
 775                 sel->gs_max_out_vertices =
 776                         sel->info.properties[TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES];
 777                 sel->gs_num_invocations =
 778                         sel->info.properties[TGSI_PROPERTY_GS_INVOCATIONS];
 779                 sel->gsvs_vertex_size = sel->info.num_outputs * 16;
 780                 sel->max_gsvs_emit_size = sel->gsvs_vertex_size *
 781                                           sel->gs_max_out_vertices;
 782
 783                 sel->max_gs_stream = 0;
 784                 for (i = 0; i < sel->so.num_outputs; i++)
 785                         sel->max_gs_stream = MAX2(sel->max_gs_stream,
 786                                                   sel->so.output[i].stream);
 787
 788                 sel->gs_input_verts_per_prim =
 789                         u_vertices_per_prim(sel->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM]);
 790                 break;
 791
 792         case PIPE_SHADER_VERTEX:
 793         case PIPE_SHADER_TESS_CTRL:
 794         case PIPE_SHADER_TESS_EVAL:
 795                 for (i = 0; i < sel->info.num_outputs; i++) {
 796                         unsigned name = sel->info.output_semantic_name[i];
 797                         unsigned index = sel->info.output_semantic_index[i];
 798
 799                         switch (name) {
 800                         case TGSI_SEMANTIC_TESSINNER:
 801                         case TGSI_SEMANTIC_TESSOUTER:
 802                         case TGSI_SEMANTIC_PATCH:
 803                                 sel->patch_outputs_written |=
 804                                         1llu << si_shader_io_get_unique_index(name, index);
 805                                 break;
 806                         default:
 807                                 sel->outputs_written |=
 808                                         1llu << si_shader_io_get_unique_index(name, index);
 809                         }
 810                 }
 811                 sel->esgs_itemsize = util_last_bit64(sel->outputs_written) * 16;
 812                 break;
 813         }
 814
 815         /* DB_SHADER_CONTROL */
 816         sel->db_shader_control =
 817                 S_02880C_Z_EXPORT_ENABLE(sel->info.writes_z) |
 818                 S_02880C_STENCIL_TEST_VAL_EXPORT_ENABLE(sel->info.writes_stencil) |
 819                 S_02880C_MASK_EXPORT_ENABLE(sel->info.writes_samplemask) |
 820                 S_02880C_KILL_ENABLE(sel->info.uses_kill);
 821
 822         switch (sel->info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT]) {
 823         case TGSI_FS_DEPTH_LAYOUT_GREATER:
 824                 sel->db_shader_control |=
 825                         S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_GREATER_THAN_Z);
 826                 break;
 827         case TGSI_FS_DEPTH_LAYOUT_LESS:
 828                 sel->db_shader_control |=
 829                         S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_LESS_THAN_Z);
 830                 break;
 831         }
 832
 833         /* Pre-compilation. */
 834         if (sscreen->b.debug_flags & DBG_PRECOMPILE) {
 835                 struct si_shader_ctx_state state = {sel};
 836
 837                 if (si_shader_select(ctx, &state)) {
 838                         fprintf(stderr, "radeonsi: can't create a shader\n");
 839                         tgsi_free_tokens(sel->tokens);
 840                         FREE(sel);
 841                         return NULL;
 842                 }
 843         }
 844
 845         pipe_mutex_init(sel->mutex);
 846         return sel;
 847 }
 848
 849 /**
 850  * Normally, we only emit 1 viewport and 1 scissor if no shader is using
 851  * the VIEWPORT_INDEX output, and emitting the other viewports and scissors
 852  * is delayed. When a shader with VIEWPORT_INDEX appears, this should be
 853  * called to emit the rest.
 854  */
 855 static void si_update_viewports_and_scissors(struct si_context *sctx)
 856 {
 857         struct tgsi_shader_info *info = si_get_vs_info(sctx);
 858
 859         if (!info || !info->writes_viewport_index)
 860                 return;
 861
 862         if (sctx->scissors.dirty_mask)
 863             si_mark_atom_dirty(sctx, &sctx->scissors.atom);
 864         if (sctx->viewports.dirty_mask)
 865             si_mark_atom_dirty(sctx, &sctx->viewports.atom);
 866 }
 867
 868 static void si_bind_vs_shader(struct pipe_context *ctx, void *state)
 869 {
 870         struct si_context *sctx = (struct si_context *)ctx;
 871         struct si_shader_selector *sel = state;
 872
 873         if (sctx->vs_shader.cso == sel)
 874                 return;
 875
 876         sctx->vs_shader.cso = sel;
 877         sctx->vs_shader.current = sel ? sel->first_variant : NULL;
 878         si_mark_atom_dirty(sctx, &sctx->clip_regs);
 879         si_update_viewports_and_scissors(sctx);
 880 }
 881
 882 static void si_bind_gs_shader(struct pipe_context *ctx, void *state)
 883 {
 884         struct si_context *sctx = (struct si_context *)ctx;
 885         struct si_shader_selector *sel = state;
 886         bool enable_changed = !!sctx->gs_shader.cso != !!sel;
 887
 888         if (sctx->gs_shader.cso == sel)
 889                 return;
 890
 891         sctx->gs_shader.cso = sel;
 892         sctx->gs_shader.current = sel ? sel->first_variant : NULL;
 893         si_mark_atom_dirty(sctx, &sctx->clip_regs);
 894         sctx->last_rast_prim = -1; /* reset this so that it gets updated */
 895
 896         if (enable_changed)
 897                 si_shader_change_notify(sctx);
 898         si_update_viewports_and_scissors(sctx);
 899 }
 900
 901 static void si_bind_tcs_shader(struct pipe_context *ctx, void *state)
 902 {
 903         struct si_context *sctx = (struct si_context *)ctx;
 904         struct si_shader_selector *sel = state;
 905         bool enable_changed = !!sctx->tcs_shader.cso != !!sel;
 906
 907         if (sctx->tcs_shader.cso == sel)
 908                 return;
 909
 910         sctx->tcs_shader.cso = sel;
 911         sctx->tcs_shader.current = sel ? sel->first_variant : NULL;
 912
 913         if (enable_changed)
 914                 sctx->last_tcs = NULL; /* invalidate derived tess state */
 915 }
 916
 917 static void si_bind_tes_shader(struct pipe_context *ctx, void *state)
 918 {
 919         struct si_context *sctx = (struct si_context *)ctx;
 920         struct si_shader_selector *sel = state;
 921         bool enable_changed = !!sctx->tes_shader.cso != !!sel;
 922
 923         if (sctx->tes_shader.cso == sel)
 924                 return;
 925
 926         sctx->tes_shader.cso = sel;
 927         sctx->tes_shader.current = sel ? sel->first_variant : NULL;
 928         si_mark_atom_dirty(sctx, &sctx->clip_regs);
 929         sctx->last_rast_prim = -1; /* reset this so that it gets updated */
 930
 931         if (enable_changed) {
 932                 si_shader_change_notify(sctx);
 933                 sctx->last_tes_sh_base = -1; /* invalidate derived tess state */
 934         }
 935         si_update_viewports_and_scissors(sctx);
 936 }
 937
 938 static void si_bind_ps_shader(struct pipe_context *ctx, void *state)
 939 {
 940         struct si_context *sctx = (struct si_context *)ctx;
 941         struct si_shader_selector *sel = state;
 942
 943         /* skip if supplied shader is one already in use */
 944         if (sctx->ps_shader.cso == sel)
 945                 return;
 946
 947         sctx->ps_shader.cso = sel;
 948         sctx->ps_shader.current = sel ? sel->first_variant : NULL;
 949         si_mark_atom_dirty(sctx, &sctx->cb_target_mask);
 950 }
 951
 952 static void si_delete_shader_selector(struct pipe_context *ctx, void *state)
 953 {
 954         struct si_context *sctx = (struct si_context *)ctx;
 955         struct si_shader_selector *sel = (struct si_shader_selector *)state;
 956         struct si_shader *p = sel->first_variant, *c;
 957         struct si_shader_ctx_state *current_shader[SI_NUM_SHADERS] = {
 958                 [PIPE_SHADER_VERTEX] = &sctx->vs_shader,
 959                 [PIPE_SHADER_TESS_CTRL] = &sctx->tcs_shader,
 960                 [PIPE_SHADER_TESS_EVAL] = &sctx->tes_shader,
 961                 [PIPE_SHADER_GEOMETRY] = &sctx->gs_shader,
 962                 [PIPE_SHADER_FRAGMENT] = &sctx->ps_shader,
 963         };
 964
 965         if (current_shader[sel->type]->cso == sel) {
 966                 current_shader[sel->type]->cso = NULL;
 967                 current_shader[sel->type]->current = NULL;
 968         }
 969
 970         while (p) {
 971                 c = p->next_variant;
 972                 switch (sel->type) {
 973                 case PIPE_SHADER_VERTEX:
 974                         if (p->key.vs.as_ls)
 975                                 si_pm4_delete_state(sctx, ls, p->pm4);
 976                         else if (p->key.vs.as_es)
 977                                 si_pm4_delete_state(sctx, es, p->pm4);
 978                         else
 979                                 si_pm4_delete_state(sctx, vs, p->pm4);
 980                         break;
 981                 case PIPE_SHADER_TESS_CTRL:
 982                         si_pm4_delete_state(sctx, hs, p->pm4);
 983                         break;
 984                 case PIPE_SHADER_TESS_EVAL:
 985                         if (p->key.tes.as_es)
 986                                 si_pm4_delete_state(sctx, es, p->pm4);
 987                         else
 988                                 si_pm4_delete_state(sctx, vs, p->pm4);
 989                         break;
 990                 case PIPE_SHADER_GEOMETRY:
 991                         si_pm4_delete_state(sctx, gs, p->pm4);
 992                         si_pm4_delete_state(sctx, vs, p->gs_copy_shader->pm4);
 993                         break;
 994                 case PIPE_SHADER_FRAGMENT:
 995                         si_pm4_delete_state(sctx, ps, p->pm4);
 996                         break;
 997                 }
 998
 999                 si_shader_destroy(p);
1000                 free(p);
1001                 p = c;
1002         }
1003
1004         pipe_mutex_destroy(sel->mutex);
1005         free(sel->tokens);
1006         free(sel);
1007 }
1008
1009 static void si_emit_spi_map(struct si_context *sctx, struct r600_atom *atom)
1010 {
1011         struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
1012         struct si_shader *ps = sctx->ps_shader.current;
1013         struct si_shader *vs = si_get_vs_state(sctx);
1014         struct tgsi_shader_info *psinfo;
1015         struct tgsi_shader_info *vsinfo = &vs->selector->info;
1016         unsigned i, j, tmp, num_written = 0;
1017
1018         if (!ps || !ps->nparam)
1019                 return;
1020
1021         psinfo = &ps->selector->info;
1022
1023         radeon_set_context_reg_seq(cs, R_028644_SPI_PS_INPUT_CNTL_0, ps->nparam);
1024
1025         for (i = 0; i < psinfo->num_inputs; i++) {
1026                 unsigned name = psinfo->input_semantic_name[i];
1027                 unsigned index = psinfo->input_semantic_index[i];
1028                 unsigned interpolate = psinfo->input_interpolate[i];
1029                 unsigned param_offset = ps->ps_input_param_offset[i];
1030 bcolor:
1031                 tmp = 0;
1032
1033                 if (interpolate == TGSI_INTERPOLATE_CONSTANT ||
1034                     (interpolate == TGSI_INTERPOLATE_COLOR && sctx->flatshade))
1035                         tmp |= S_028644_FLAT_SHADE(1);
1036
1037                 if (name == TGSI_SEMANTIC_PCOORD ||
1038                     (name == TGSI_SEMANTIC_TEXCOORD &&
1039                      sctx->sprite_coord_enable & (1 << index))) {
1040                         tmp |= S_028644_PT_SPRITE_TEX(1);
1041                 }
1042
1043                 for (j = 0; j < vsinfo->num_outputs; j++) {
1044                         if (name == vsinfo->output_semantic_name[j] &&
1045                             index == vsinfo->output_semantic_index[j]) {
1046                                 tmp |= S_028644_OFFSET(vs->vs_output_param_offset[j]);
1047                                 break;
1048                         }
1049                 }
1050
1051                 if (name == TGSI_SEMANTIC_PRIMID)
1052                         /* PrimID is written after the last output. */
1053                         tmp |= S_028644_OFFSET(vs->vs_output_param_offset[vsinfo->num_outputs]);
1054                 else if (j == vsinfo->num_outputs && !G_028644_PT_SPRITE_TEX(tmp)) {
1055                         /* No corresponding output found, load defaults into input.
1056                          * Don't set any other bits.
1057                          * (FLAT_SHADE=1 completely changes behavior) */
1058                         tmp = S_028644_OFFSET(0x20);
1059                 }
1060
1061                 assert(param_offset == num_written);
1062                 radeon_emit(cs, tmp);
1063                 num_written++;
1064
1065                 if (name == TGSI_SEMANTIC_COLOR &&
1066                     ps->key.ps.color_two_side) {
1067                         name = TGSI_SEMANTIC_BCOLOR;
1068                         param_offset++;
1069                         goto bcolor;
1070                 }
1071         }
1072         assert(ps->nparam == num_written);
1073 }
1074
1075 static void si_emit_spi_ps_input(struct si_context *sctx, struct r600_atom *atom)
1076 {
1077         struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
1078         struct si_shader *ps = sctx->ps_shader.current;
1079         unsigned input_ena;
1080
1081         if (!ps)
1082                 return;
1083
1084         input_ena = ps->config.spi_ps_input_ena;
1085
1086         /* we need to enable at least one of them, otherwise we hang the GPU */
1087         assert(G_0286CC_PERSP_SAMPLE_ENA(input_ena) ||
1088             G_0286CC_PERSP_CENTER_ENA(input_ena) ||
1089             G_0286CC_PERSP_CENTROID_ENA(input_ena) ||
1090             G_0286CC_PERSP_PULL_MODEL_ENA(input_ena) ||
1091             G_0286CC_LINEAR_SAMPLE_ENA(input_ena) ||
1092             G_0286CC_LINEAR_CENTER_ENA(input_ena) ||
1093             G_0286CC_LINEAR_CENTROID_ENA(input_ena) ||
1094             G_0286CC_LINE_STIPPLE_TEX_ENA(input_ena));
1095
1096         if (sctx->force_persample_interp) {
1097                 unsigned num_persp = G_0286CC_PERSP_SAMPLE_ENA(input_ena) +
1098                                      G_0286CC_PERSP_CENTER_ENA(input_ena) +
1099                                      G_0286CC_PERSP_CENTROID_ENA(input_ena);
1100                 unsigned num_linear = G_0286CC_LINEAR_SAMPLE_ENA(input_ena) +
1101                                       G_0286CC_LINEAR_CENTER_ENA(input_ena) +
1102                                       G_0286CC_LINEAR_CENTROID_ENA(input_ena);
1103
1104                 /* If only one set of (i,j) coordinates is used, we can disable
1105                  * CENTER/CENTROID, enable SAMPLE and it will load SAMPLE coordinates
1106                  * where CENTER/CENTROID are expected, effectively forcing per-sample
1107                  * interpolation.
1108                  */
1109                 if (num_persp == 1) {
1110                         input_ena &= C_0286CC_PERSP_CENTER_ENA;
1111                         input_ena &= C_0286CC_PERSP_CENTROID_ENA;
1112                         input_ena |= G_0286CC_PERSP_SAMPLE_ENA(1);
1113                 }
1114                 if (num_linear == 1) {
1115                         input_ena &= C_0286CC_LINEAR_CENTER_ENA;
1116                         input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
1117                         input_ena |= G_0286CC_LINEAR_SAMPLE_ENA(1);
1118                 }
1119
1120                 /* If at least 2 sets of coordinates are used, we can't use this
1121                  * trick and have to select SAMPLE using a conditional assignment
1122                  * in the shader with "force_persample_interp" being a shader constant.
1123                  */
1124         }
1125
1126         radeon_set_context_reg_seq(cs, R_0286CC_SPI_PS_INPUT_ENA, 2);
1127         radeon_emit(cs, input_ena);
1128         radeon_emit(cs, input_ena);
1129
1130         if (ps->selector->forces_persample_interp_for_persp ||
1131             ps->selector->forces_persample_interp_for_linear)
1132                 radeon_set_sh_reg(cs, R_00B030_SPI_SHADER_USER_DATA_PS_0 +
1133                                       SI_SGPR_PS_STATE_BITS * 4,
1134                                   sctx->force_persample_interp);
1135 }
1136
1137 /**
1138  * Writing CONFIG or UCONFIG VGT registers requires VGT_FLUSH before that.
1139  */
1140 static void si_init_config_add_vgt_flush(struct si_context *sctx)
1141 {
1142         if (sctx->init_config_has_vgt_flush)
1143                 return;
1144
1145         /* VGT_FLUSH is required even if VGT is idle. It resets VGT pointers. */
1146         si_pm4_cmd_begin(sctx->init_config, PKT3_EVENT_WRITE);
1147         si_pm4_cmd_add(sctx->init_config, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
1148         si_pm4_cmd_end(sctx->init_config, false);
1149         sctx->init_config_has_vgt_flush = true;
1150 }
1151
1152 /* Initialize state related to ESGS / GSVS ring buffers */
1153 static bool si_update_gs_ring_buffers(struct si_context *sctx)
1154 {
1155         struct si_shader_selector *es =
1156                 sctx->tes_shader.cso ? sctx->tes_shader.cso : sctx->vs_shader.cso;
1157         struct si_shader_selector *gs = sctx->gs_shader.cso;
1158         struct si_pm4_state *pm4;
1159
1160         /* Chip constants. */
1161         unsigned num_se = sctx->screen->b.info.max_se;
1162         unsigned wave_size = 64;
1163         unsigned max_gs_waves = 32 * num_se; /* max 32 per SE on GCN */
1164         unsigned gs_vertex_reuse = 16 * num_se; /* GS_VERTEX_REUSE register (per SE) */
1165         unsigned alignment = 256 * num_se;
1166         /* The maximum size is 63.999 MB per SE. */
1167         unsigned max_size = ((unsigned)(63.999 * 1024 * 1024) & ~255) * num_se;
1168
1169         /* Calculate the minimum size. */
1170         unsigned min_esgs_ring_size = align(es->esgs_itemsize * gs_vertex_reuse *
1171                                             wave_size, alignment);
1172
1173         /* These are recommended sizes, not minimum sizes. */
1174         unsigned esgs_ring_size = max_gs_waves * 2 * wave_size *
1175                                   es->esgs_itemsize * gs->gs_input_verts_per_prim;
1176         unsigned gsvs_ring_size = max_gs_waves * 2 * wave_size *
1177                                   gs->max_gsvs_emit_size * (gs->max_gs_stream + 1);
1178
1179         min_esgs_ring_size = align(min_esgs_ring_size, alignment);
1180         esgs_ring_size = align(esgs_ring_size, alignment);
1181         gsvs_ring_size = align(gsvs_ring_size, alignment);
1182
1183         esgs_ring_size = CLAMP(esgs_ring_size, min_esgs_ring_size, max_size);
1184         gsvs_ring_size = MIN2(gsvs_ring_size, max_size);
1185
1186         /* Some rings don't have to be allocated if shaders don't use them.
1187          * (e.g. no varyings between ES and GS or GS and VS)
1188          */
1189         bool update_esgs = esgs_ring_size &&
1190                            (!sctx->esgs_ring ||
1191                             sctx->esgs_ring->width0 < esgs_ring_size);
1192         bool update_gsvs = gsvs_ring_size &&
1193                            (!sctx->gsvs_ring ||
1194                             sctx->gsvs_ring->width0 < gsvs_ring_size);
1195
1196         if (!update_esgs && !update_gsvs)
1197                 return true;
1198
1199         if (update_esgs) {
1200                 pipe_resource_reference(&sctx->esgs_ring, NULL);
1201                 sctx->esgs_ring = pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM,
1202                                                      PIPE_USAGE_DEFAULT,
1203                                                      esgs_ring_size);
1204                 if (!sctx->esgs_ring)
1205                         return false;
1206         }
1207
1208         if (update_gsvs) {
1209                 pipe_resource_reference(&sctx->gsvs_ring, NULL);
1210                 sctx->gsvs_ring = pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM,
1211                                                      PIPE_USAGE_DEFAULT,
1212                                                      gsvs_ring_size);
1213                 if (!sctx->gsvs_ring)
1214                         return false;
1215         }
1216
1217         /* Create the "init_config_gs_rings" state. */
1218         pm4 = CALLOC_STRUCT(si_pm4_state);
1219         if (!pm4)
1220                 return false;
1221
1222         if (sctx->b.chip_class >= CIK) {
1223                 if (sctx->esgs_ring)
1224                         si_pm4_set_reg(pm4, R_030900_VGT_ESGS_RING_SIZE,
1225                                        sctx->esgs_ring->width0 / 256);
1226                 if (sctx->gsvs_ring)
1227                         si_pm4_set_reg(pm4, R_030904_VGT_GSVS_RING_SIZE,
1228                                        sctx->gsvs_ring->width0 / 256);
1229         } else {
1230                 if (sctx->esgs_ring)
1231                         si_pm4_set_reg(pm4, R_0088C8_VGT_ESGS_RING_SIZE,
1232                                        sctx->esgs_ring->width0 / 256);
1233                 if (sctx->gsvs_ring)
1234                         si_pm4_set_reg(pm4, R_0088CC_VGT_GSVS_RING_SIZE,
1235                                        sctx->gsvs_ring->width0 / 256);
1236         }
1237
1238         /* Set the state. */
1239         if (sctx->init_config_gs_rings)
1240                 si_pm4_free_state(sctx, sctx->init_config_gs_rings, ~0);
1241         sctx->init_config_gs_rings = pm4;
1242
1243         if (!sctx->init_config_has_vgt_flush) {
1244                 si_init_config_add_vgt_flush(sctx);
1245                 si_pm4_upload_indirect_buffer(sctx, sctx->init_config);
1246         }
1247
1248         /* Flush the context to re-emit both init_config states. */
1249         sctx->b.initial_gfx_cs_size = 0; /* force flush */
1250         si_context_gfx_flush(sctx, RADEON_FLUSH_ASYNC, NULL);
1251
1252         /* Set ring bindings. */
1253         if (sctx->esgs_ring) {
1254                 si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_VERTEX, SI_RING_ESGS,
1255                                    sctx->esgs_ring, 0, sctx->esgs_ring->width0,
1256                                    true, true, 4, 64, 0);
1257                 si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_ESGS,
1258                                    sctx->esgs_ring, 0, sctx->esgs_ring->width0,
1259                                    false, false, 0, 0, 0);
1260         }
1261         if (sctx->gsvs_ring)
1262                 si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_VERTEX, SI_RING_GSVS,
1263                                    sctx->gsvs_ring, 0, sctx->gsvs_ring->width0,
1264                                    false, false, 0, 0, 0);
1265         return true;
1266 }
1267
1268 static void si_update_gsvs_ring_bindings(struct si_context *sctx)
1269 {
1270         unsigned gsvs_itemsize = sctx->gs_shader.cso->max_gsvs_emit_size;
1271         uint64_t offset;
1272
1273         if (!sctx->gsvs_ring || gsvs_itemsize == sctx->last_gsvs_itemsize)
1274                 return;
1275
1276         sctx->last_gsvs_itemsize = gsvs_itemsize;
1277
1278         si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_GSVS,
1279                            sctx->gsvs_ring, gsvs_itemsize,
1280                            64, true, true, 4, 16, 0);
1281
1282         offset = gsvs_itemsize * 64;
1283         si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_GSVS_1,
1284                            sctx->gsvs_ring, gsvs_itemsize,
1285                            64, true, true, 4, 16, offset);
1286
1287         offset = (gsvs_itemsize * 2) * 64;
1288         si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_GSVS_2,
1289                            sctx->gsvs_ring, gsvs_itemsize,
1290                            64, true, true, 4, 16, offset);
1291
1292         offset = (gsvs_itemsize * 3) * 64;
1293         si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_GSVS_3,
1294                            sctx->gsvs_ring, gsvs_itemsize,
1295                            64, true, true, 4, 16, offset);
1296 }
1297
1298 /**
1299  * @returns 1 if \p sel has been updated to use a new scratch buffer
1300  *          0 if not
1301  *          < 0 if there was a failure
1302  */
1303 static int si_update_scratch_buffer(struct si_context *sctx,
1304                                     struct si_shader *shader)
1305 {
1306         uint64_t scratch_va = sctx->scratch_buffer->gpu_address;
1307         int r;
1308
1309         if (!shader)
1310                 return 0;
1311
1312         /* This shader doesn't need a scratch buffer */
1313         if (shader->config.scratch_bytes_per_wave == 0)
1314                 return 0;
1315
1316         /* This shader is already configured to use the current
1317          * scratch buffer. */
1318         if (shader->scratch_bo == sctx->scratch_buffer)
1319                 return 0;
1320
1321         assert(sctx->scratch_buffer);
1322
1323         si_shader_apply_scratch_relocs(sctx, shader, scratch_va);
1324
1325         /* Replace the shader bo with a new bo that has the relocs applied. */
1326         r = si_shader_binary_upload(sctx->screen, shader);
1327         if (r)
1328                 return r;
1329
1330         /* Update the shader state to use the new shader bo. */
1331         si_shader_init_pm4_state(shader);
1332
1333         r600_resource_reference(&shader->scratch_bo, sctx->scratch_buffer);
1334
1335         return 1;
1336 }
1337
1338 static unsigned si_get_current_scratch_buffer_size(struct si_context *sctx)
1339 {
1340         return sctx->scratch_buffer ? sctx->scratch_buffer->b.b.width0 : 0;
1341 }
1342
1343 static unsigned si_get_scratch_buffer_bytes_per_wave(struct si_shader *shader)
1344 {
1345         return shader ? shader->config.scratch_bytes_per_wave : 0;
1346 }
1347
1348 static unsigned si_get_max_scratch_bytes_per_wave(struct si_context *sctx)
1349 {
1350         unsigned bytes = 0;
1351
1352         bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->ps_shader.current));
1353         bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->gs_shader.current));
1354         bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->vs_shader.current));
1355         bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->tcs_shader.current));
1356         bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->tes_shader.current));
1357         return bytes;
1358 }
1359
1360 static bool si_update_spi_tmpring_size(struct si_context *sctx)
1361 {
1362         unsigned current_scratch_buffer_size =
1363                 si_get_current_scratch_buffer_size(sctx);
1364         unsigned scratch_bytes_per_wave =
1365                 si_get_max_scratch_bytes_per_wave(sctx);
1366         unsigned scratch_needed_size = scratch_bytes_per_wave *
1367                 sctx->scratch_waves;
1368         unsigned spi_tmpring_size;
1369         int r;
1370
1371         if (scratch_needed_size > 0) {
1372                 if (scratch_needed_size > current_scratch_buffer_size) {
1373                         /* Create a bigger scratch buffer */
1374                         pipe_resource_reference(
1375                                         (struct pipe_resource**)&sctx->scratch_buffer,
1376                                         NULL);
1377
1378                         sctx->scratch_buffer =
1379                                         si_resource_create_custom(&sctx->screen->b.b,
1380                                         PIPE_USAGE_DEFAULT, scratch_needed_size);
1381                         if (!sctx->scratch_buffer)
1382                                 return false;
1383                         sctx->emit_scratch_reloc = true;
1384                 }
1385
1386                 /* Update the shaders, so they are using the latest scratch.  The
1387                  * scratch buffer may have been changed since these shaders were
1388                  * last used, so we still need to try to update them, even if
1389                  * they require scratch buffers smaller than the current size.
1390                  */
1391                 r = si_update_scratch_buffer(sctx, sctx->ps_shader.current);
1392                 if (r < 0)
1393                         return false;
1394                 if (r == 1)
1395                         si_pm4_bind_state(sctx, ps, sctx->ps_shader.current->pm4);
1396
1397                 r = si_update_scratch_buffer(sctx, sctx->gs_shader.current);
1398                 if (r < 0)
1399                         return false;
1400                 if (r == 1)
1401                         si_pm4_bind_state(sctx, gs, sctx->gs_shader.current->pm4);
1402
1403                 r = si_update_scratch_buffer(sctx, sctx->tcs_shader.current);
1404                 if (r < 0)
1405                         return false;
1406                 if (r == 1)
1407                         si_pm4_bind_state(sctx, hs, sctx->tcs_shader.current->pm4);
1408
1409                 /* VS can be bound as LS, ES, or VS. */
1410                 r = si_update_scratch_buffer(sctx, sctx->vs_shader.current);
1411                 if (r < 0)
1412                         return false;
1413                 if (r == 1) {
1414                         if (sctx->tes_shader.current)
1415                                 si_pm4_bind_state(sctx, ls, sctx->vs_shader.current->pm4);
1416                         else if (sctx->gs_shader.current)
1417                                 si_pm4_bind_state(sctx, es, sctx->vs_shader.current->pm4);
1418                         else
1419                                 si_pm4_bind_state(sctx, vs, sctx->vs_shader.current->pm4);
1420                 }
1421
1422                 /* TES can be bound as ES or VS. */
1423                 r = si_update_scratch_buffer(sctx, sctx->tes_shader.current);
1424                 if (r < 0)
1425                         return false;
1426                 if (r == 1) {
1427                         if (sctx->gs_shader.current)
1428                                 si_pm4_bind_state(sctx, es, sctx->tes_shader.current->pm4);
1429                         else
1430                                 si_pm4_bind_state(sctx, vs, sctx->tes_shader.current->pm4);
1431                 }
1432         }
1433
1434         /* The LLVM shader backend should be reporting aligned scratch_sizes. */
1435         assert((scratch_needed_size & ~0x3FF) == scratch_needed_size &&
1436                 "scratch size should already be aligned correctly.");
1437
1438         spi_tmpring_size = S_0286E8_WAVES(sctx->scratch_waves) |
1439                            S_0286E8_WAVESIZE(scratch_bytes_per_wave >> 10);
1440         if (spi_tmpring_size != sctx->spi_tmpring_size) {
1441                 sctx->spi_tmpring_size = spi_tmpring_size;
1442                 sctx->emit_scratch_reloc = true;
1443         }
1444         return true;
1445 }
1446
1447 static void si_init_tess_factor_ring(struct si_context *sctx)
1448 {
1449         assert(!sctx->tf_ring);
1450
1451         sctx->tf_ring = pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM,
1452                                            PIPE_USAGE_DEFAULT,
1453                                            32768 * sctx->screen->b.info.max_se);
1454         if (!sctx->tf_ring)
1455                 return;
1456
1457         assert(((sctx->tf_ring->width0 / 4) & C_030938_SIZE) == 0);
1458
1459         si_init_config_add_vgt_flush(sctx);
1460
1461         /* Append these registers to the init config state. */
1462         if (sctx->b.chip_class >= CIK) {
1463                 si_pm4_set_reg(sctx->init_config, R_030938_VGT_TF_RING_SIZE,
1464                                S_030938_SIZE(sctx->tf_ring->width0 / 4));
1465                 si_pm4_set_reg(sctx->init_config, R_030940_VGT_TF_MEMORY_BASE,
1466                                r600_resource(sctx->tf_ring)->gpu_address >> 8);
1467         } else {
1468                 si_pm4_set_reg(sctx->init_config, R_008988_VGT_TF_RING_SIZE,
1469                                S_008988_SIZE(sctx->tf_ring->width0 / 4));
1470                 si_pm4_set_reg(sctx->init_config, R_0089B8_VGT_TF_MEMORY_BASE,
1471                                r600_resource(sctx->tf_ring)->gpu_address >> 8);
1472         }
1473
1474         /* Flush the context to re-emit the init_config state.
1475          * This is done only once in a lifetime of a context.
1476          */
1477         si_pm4_upload_indirect_buffer(sctx, sctx->init_config);
1478         sctx->b.initial_gfx_cs_size = 0; /* force flush */
1479         si_context_gfx_flush(sctx, RADEON_FLUSH_ASYNC, NULL);
1480
1481         si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_TESS_CTRL,
1482                            SI_RING_TESS_FACTOR, sctx->tf_ring, 0,
1483                            sctx->tf_ring->width0, false, false, 0, 0, 0);
1484 }
1485
1486 /**
1487  * This is used when TCS is NULL in the VS->TCS->TES chain. In this case,
1488  * VS passes its outputs to TES directly, so the fixed-function shader only
1489  * has to write TESSOUTER and TESSINNER.
1490  */
1491 static void si_generate_fixed_func_tcs(struct si_context *sctx)
1492 {
1493         struct ureg_src const0, const1;
1494         struct ureg_dst tessouter, tessinner;
1495         struct ureg_program *ureg = ureg_create(TGSI_PROCESSOR_TESS_CTRL);
1496
1497         if (!ureg)
1498                 return; /* if we get here, we're screwed */
1499
1500         assert(!sctx->fixed_func_tcs_shader.cso);
1501
1502         ureg_DECL_constant2D(ureg, 0, 1, SI_DRIVER_STATE_CONST_BUF);
1503         const0 = ureg_src_dimension(ureg_src_register(TGSI_FILE_CONSTANT, 0),
1504                                     SI_DRIVER_STATE_CONST_BUF);
1505         const1 = ureg_src_dimension(ureg_src_register(TGSI_FILE_CONSTANT, 1),
1506                                     SI_DRIVER_STATE_CONST_BUF);
1507
1508         tessouter = ureg_DECL_output(ureg, TGSI_SEMANTIC_TESSOUTER, 0);
1509         tessinner = ureg_DECL_output(ureg, TGSI_SEMANTIC_TESSINNER, 0);
1510
1511         ureg_MOV(ureg, tessouter, const0);
1512         ureg_MOV(ureg, tessinner, const1);
1513         ureg_END(ureg);
1514
1515         sctx->fixed_func_tcs_shader.cso =
1516                 ureg_create_shader_and_destroy(ureg, &sctx->b.b);
1517 }
1518
1519 static void si_update_vgt_shader_config(struct si_context *sctx)
1520 {
1521         /* Calculate the index of the config.
1522          * 0 = VS, 1 = VS+GS, 2 = VS+Tess, 3 = VS+Tess+GS */
1523         unsigned index = 2*!!sctx->tes_shader.cso + !!sctx->gs_shader.cso;
1524         struct si_pm4_state **pm4 = &sctx->vgt_shader_config[index];
1525
1526         if (!*pm4) {
1527                 uint32_t stages = 0;
1528
1529                 *pm4 = CALLOC_STRUCT(si_pm4_state);
1530
1531                 if (sctx->tes_shader.cso) {
1532                         stages |= S_028B54_LS_EN(V_028B54_LS_STAGE_ON) |
1533                                   S_028B54_HS_EN(1);
1534
1535                         if (sctx->gs_shader.cso)
1536                                 stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS) |
1537                                           S_028B54_GS_EN(1) |
1538                                           S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER);
1539                         else
1540                                 stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_DS);
1541                 } else if (sctx->gs_shader.cso) {
1542                         stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL) |
1543                                   S_028B54_GS_EN(1) |
1544                                   S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER);
1545                 }
1546
1547                 si_pm4_set_reg(*pm4, R_028B54_VGT_SHADER_STAGES_EN, stages);
1548         }
1549         si_pm4_bind_state(sctx, vgt_shader_config, *pm4);
1550 }
1551
1552 static void si_update_so(struct si_context *sctx, struct si_shader_selector *shader)
1553 {
1554         struct pipe_stream_output_info *so = &shader->so;
1555         uint32_t enabled_stream_buffers_mask = 0;
1556         int i;
1557
1558         for (i = 0; i < so->num_outputs; i++)
1559                 enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer) << (so->output[i].stream * 4);
1560         sctx->b.streamout.enabled_stream_buffers_mask = enabled_stream_buffers_mask;
1561         sctx->b.streamout.stride_in_dw = shader->so.stride;
1562 }
1563
1564 bool si_update_shaders(struct si_context *sctx)
1565 {
1566         struct pipe_context *ctx = (struct pipe_context*)sctx;
1567         struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
1568         int r;
1569
1570         /* Update stages before GS. */
1571         if (sctx->tes_shader.cso) {
1572                 if (!sctx->tf_ring) {
1573                         si_init_tess_factor_ring(sctx);
1574                         if (!sctx->tf_ring)
1575                                 return false;
1576                 }
1577
1578                 /* VS as LS */
1579                 r = si_shader_select(ctx, &sctx->vs_shader);
1580                 if (r)
1581                         return false;
1582                 si_pm4_bind_state(sctx, ls, sctx->vs_shader.current->pm4);
1583
1584                 if (sctx->tcs_shader.cso) {
1585                         r = si_shader_select(ctx, &sctx->tcs_shader);
1586                         if (r)
1587                                 return false;
1588                         si_pm4_bind_state(sctx, hs, sctx->tcs_shader.current->pm4);
1589                 } else {
1590                         if (!sctx->fixed_func_tcs_shader.cso) {
1591                                 si_generate_fixed_func_tcs(sctx);
1592                                 if (!sctx->fixed_func_tcs_shader.cso)
1593                                         return false;
1594                         }
1595
1596                         r = si_shader_select(ctx, &sctx->fixed_func_tcs_shader);
1597                         if (r)
1598                                 return false;
1599                         si_pm4_bind_state(sctx, hs,
1600                                           sctx->fixed_func_tcs_shader.current->pm4);
1601                 }
1602
1603                 r = si_shader_select(ctx, &sctx->tes_shader);
1604                 if (r)
1605                         return false;
1606
1607                 if (sctx->gs_shader.cso) {
1608                         /* TES as ES */
1609                         si_pm4_bind_state(sctx, es, sctx->tes_shader.current->pm4);
1610                 } else {
1611                         /* TES as VS */
1612                         si_pm4_bind_state(sctx, vs, sctx->tes_shader.current->pm4);
1613                         si_update_so(sctx, sctx->tes_shader.cso);
1614                 }
1615         } else if (sctx->gs_shader.cso) {
1616                 /* VS as ES */
1617                 r = si_shader_select(ctx, &sctx->vs_shader);
1618                 if (r)
1619                         return false;
1620                 si_pm4_bind_state(sctx, es, sctx->vs_shader.current->pm4);
1621         } else {
1622                 /* VS as VS */
1623                 r = si_shader_select(ctx, &sctx->vs_shader);
1624                 if (r)
1625                         return false;
1626                 si_pm4_bind_state(sctx, vs, sctx->vs_shader.current->pm4);
1627                 si_update_so(sctx, sctx->vs_shader.cso);
1628         }
1629
1630         /* Update GS. */
1631         if (sctx->gs_shader.cso) {
1632                 r = si_shader_select(ctx, &sctx->gs_shader);
1633                 if (r)
1634                         return false;
1635                 si_pm4_bind_state(sctx, gs, sctx->gs_shader.current->pm4);
1636                 si_pm4_bind_state(sctx, vs, sctx->gs_shader.current->gs_copy_shader->pm4);
1637                 si_update_so(sctx, sctx->gs_shader.cso);
1638
1639                 if (!si_update_gs_ring_buffers(sctx))
1640                         return false;
1641
1642                 si_update_gsvs_ring_bindings(sctx);
1643         } else {
1644                 si_pm4_bind_state(sctx, gs, NULL);
1645                 si_pm4_bind_state(sctx, es, NULL);
1646         }
1647
1648         si_update_vgt_shader_config(sctx);
1649
1650         if (sctx->ps_shader.cso) {
1651                 unsigned db_shader_control =
1652                         sctx->ps_shader.cso->db_shader_control |
1653                         S_02880C_KILL_ENABLE(si_get_alpha_test_func(sctx) != PIPE_FUNC_ALWAYS);
1654
1655                 r = si_shader_select(ctx, &sctx->ps_shader);
1656                 if (r)
1657                         return false;
1658                 si_pm4_bind_state(sctx, ps, sctx->ps_shader.current->pm4);
1659
1660                 if (si_pm4_state_changed(sctx, ps) || si_pm4_state_changed(sctx, vs) ||
1661                     sctx->sprite_coord_enable != rs->sprite_coord_enable ||
1662                     sctx->flatshade != rs->flatshade) {
1663                         sctx->sprite_coord_enable = rs->sprite_coord_enable;
1664                         sctx->flatshade = rs->flatshade;
1665                         si_mark_atom_dirty(sctx, &sctx->spi_map);
1666                 }
1667
1668                 if (si_pm4_state_changed(sctx, ps) ||
1669                     sctx->force_persample_interp != rs->force_persample_interp) {
1670                         sctx->force_persample_interp = rs->force_persample_interp;
1671                         si_mark_atom_dirty(sctx, &sctx->spi_ps_input);
1672                 }
1673
1674                 if (sctx->ps_db_shader_control != db_shader_control) {
1675                         sctx->ps_db_shader_control = db_shader_control;
1676                         si_mark_atom_dirty(sctx, &sctx->db_render_state);
1677                 }
1678
1679                 if (sctx->smoothing_enabled != sctx->ps_shader.current->key.ps.poly_line_smoothing) {
1680                         sctx->smoothing_enabled = sctx->ps_shader.current->key.ps.poly_line_smoothing;
1681                         si_mark_atom_dirty(sctx, &sctx->msaa_config);
1682
1683                         if (sctx->b.chip_class == SI)
1684                                 si_mark_atom_dirty(sctx, &sctx->db_render_state);
1685                 }
1686         }
1687
1688         if (si_pm4_state_changed(sctx, ls) ||
1689             si_pm4_state_changed(sctx, hs) ||
1690             si_pm4_state_changed(sctx, es) ||
1691             si_pm4_state_changed(sctx, gs) ||
1692             si_pm4_state_changed(sctx, vs) ||
1693             si_pm4_state_changed(sctx, ps)) {
1694                 if (!si_update_spi_tmpring_size(sctx))
1695                         return false;
1696         }
1697         return true;
1698 }
1699
1700 void si_init_shader_functions(struct si_context *sctx)
1701 {
1702         si_init_atom(sctx, &sctx->spi_map, &sctx->atoms.s.spi_map, si_emit_spi_map);
1703         si_init_atom(sctx, &sctx->spi_ps_input, &sctx->atoms.s.spi_ps_input, si_emit_spi_ps_input);
1704
1705         sctx->b.b.create_vs_state = si_create_shader_selector;
1706         sctx->b.b.create_tcs_state = si_create_shader_selector;
1707         sctx->b.b.create_tes_state = si_create_shader_selector;
1708         sctx->b.b.create_gs_state = si_create_shader_selector;
1709         sctx->b.b.create_fs_state = si_create_shader_selector;
1710
1711         sctx->b.b.bind_vs_state = si_bind_vs_shader;
1712         sctx->b.b.bind_tcs_state = si_bind_tcs_shader;
1713         sctx->b.b.bind_tes_state = si_bind_tes_shader;
1714         sctx->b.b.bind_gs_state = si_bind_gs_shader;
1715         sctx->b.b.bind_fs_state = si_bind_ps_shader;
1716
1717         sctx->b.b.delete_vs_state = si_delete_shader_selector;
1718         sctx->b.b.delete_tcs_state = si_delete_shader_selector;
1719         sctx->b.b.delete_tes_state = si_delete_shader_selector;
1720         sctx->b.b.delete_gs_state = si_delete_shader_selector;
1721         sctx->b.b.delete_fs_state = si_delete_shader_selector;
1722 }