src/gallium/drivers/radeonsi/si_shader.c

   1 /*
   2  * Copyright 2012 Advanced Micro Devices, Inc.
   3  * All Rights Reserved.
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * on the rights to use, copy, modify, merge, publish, distribute, sub
   9  * license, and/or sell copies of the Software, and to permit persons to whom
  10  * the Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice (including the next
  13  * paragraph) shall be included in all copies or substantial portions of the
  14  * Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
  20  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  21  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  22  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  */
  24
  25 #include <llvm/Config/llvm-config.h>
  26
  27 #include "util/u_memory.h"
  28 #include "tgsi/tgsi_strings.h"
  29 #include "tgsi/tgsi_from_mesa.h"
  30
  31 #include "ac_exp_param.h"
  32 #include "ac_shader_util.h"
  33 #include "ac_rtld.h"
  34 #include "ac_llvm_util.h"
  35 #include "si_shader_internal.h"
  36 #include "si_pipe.h"
  37 #include "sid.h"
  38
  39 #include "compiler/nir/nir.h"
  40 #include "compiler/nir/nir_serialize.h"
  41
  42 static const char scratch_rsrc_dword0_symbol[] =
  43         "SCRATCH_RSRC_DWORD0";
  44
  45 static const char scratch_rsrc_dword1_symbol[] =
  46         "SCRATCH_RSRC_DWORD1";
  47
  48 static void si_llvm_emit_barrier(struct si_shader_context *ctx);
  49
  50 static void si_dump_shader_key(const struct si_shader *shader, FILE *f);
  51
  52 static void si_build_vs_prolog_function(struct si_shader_context *ctx,
  53                                         union si_shader_part_key *key);
  54 static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
  55                                          union si_shader_part_key *key);
  56 static void si_build_ps_prolog_function(struct si_shader_context *ctx,
  57                                         union si_shader_part_key *key);
  58 static void si_build_ps_epilog_function(struct si_shader_context *ctx,
  59                                         union si_shader_part_key *key);
  60 static void si_fix_resource_usage(struct si_screen *sscreen,
  61                                   struct si_shader *shader);
  62
  63 /* Ideally pass the sample mask input to the PS epilog as v14, which
  64  * is its usual location, so that the shader doesn't have to add v_mov.
  65  */
  66 #define PS_EPILOG_SAMPLEMASK_MIN_LOC 14
  67
  68 static bool llvm_type_is_64bit(struct si_shader_context *ctx,
  69                                LLVMTypeRef type)
  70 {
  71         if (type == ctx->ac.i64 || type == ctx->ac.f64)
  72                 return true;
  73
  74         return false;
  75 }
  76
  77 /** Whether the shader runs as a combination of multiple API shaders */
  78 static bool is_multi_part_shader(struct si_shader_context *ctx)
  79 {
  80         if (ctx->screen->info.chip_class <= GFX8)
  81                 return false;
  82
  83         return ctx->shader->key.as_ls ||
  84                ctx->shader->key.as_es ||
  85                ctx->type == PIPE_SHADER_TESS_CTRL ||
  86                ctx->type == PIPE_SHADER_GEOMETRY;
  87 }
  88
  89 /** Whether the shader runs on a merged HW stage (LSHS or ESGS) */
  90 static bool is_merged_shader(struct si_shader_context *ctx)
  91 {
  92         return ctx->shader->key.as_ngg || is_multi_part_shader(ctx);
  93 }
  94
  95 /**
  96  * Returns a unique index for a per-patch semantic name and index. The index
  97  * must be less than 32, so that a 32-bit bitmask of used inputs or outputs
  98  * can be calculated.
  99  */
 100 unsigned si_shader_io_get_unique_index_patch(unsigned semantic_name, unsigned index)
 101 {
 102         switch (semantic_name) {
 103         case TGSI_SEMANTIC_TESSOUTER:
 104                 return 0;
 105         case TGSI_SEMANTIC_TESSINNER:
 106                 return 1;
 107         case TGSI_SEMANTIC_PATCH:
 108                 assert(index < 30);
 109                 return 2 + index;
 110
 111         default:
 112                 assert(!"invalid semantic name");
 113                 return 0;
 114         }
 115 }
 116
 117 /**
 118  * Returns a unique index for a semantic name and index. The index must be
 119  * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
 120  * calculated.
 121  */
 122 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index,
 123                                        unsigned is_varying)
 124 {
 125         switch (semantic_name) {
 126         case TGSI_SEMANTIC_POSITION:
 127                 return 0;
 128         case TGSI_SEMANTIC_GENERIC:
 129                 /* Since some shader stages use the the highest used IO index
 130                  * to determine the size to allocate for inputs/outputs
 131                  * (in LDS, tess and GS rings). GENERIC should be placed right
 132                  * after POSITION to make that size as small as possible.
 133                  */
 134                 if (index < SI_MAX_IO_GENERIC)
 135                         return 1 + index;
 136
 137                 assert(!"invalid generic index");
 138                 return 0;
 139         case TGSI_SEMANTIC_FOG:
 140                 return SI_MAX_IO_GENERIC + 1;
 141         case TGSI_SEMANTIC_COLOR:
 142                 assert(index < 2);
 143                 return SI_MAX_IO_GENERIC + 2 + index;
 144         case TGSI_SEMANTIC_BCOLOR:
 145                 assert(index < 2);
 146                 /* If it's a varying, COLOR and BCOLOR alias. */
 147                 if (is_varying)
 148                         return SI_MAX_IO_GENERIC + 2 + index;
 149                 else
 150                         return SI_MAX_IO_GENERIC + 4 + index;
 151         case TGSI_SEMANTIC_TEXCOORD:
 152                 assert(index < 8);
 153                 return SI_MAX_IO_GENERIC + 6 + index;
 154
 155         /* These are rarely used between LS and HS or ES and GS. */
 156         case TGSI_SEMANTIC_CLIPDIST:
 157                 assert(index < 2);
 158                 return SI_MAX_IO_GENERIC + 6 + 8 + index;
 159         case TGSI_SEMANTIC_CLIPVERTEX:
 160                 return SI_MAX_IO_GENERIC + 6 + 8 + 2;
 161         case TGSI_SEMANTIC_PSIZE:
 162                 return SI_MAX_IO_GENERIC + 6 + 8 + 3;
 163
 164         /* These can't be written by LS, HS, and ES. */
 165         case TGSI_SEMANTIC_LAYER:
 166                 return SI_MAX_IO_GENERIC + 6 + 8 + 4;
 167         case TGSI_SEMANTIC_VIEWPORT_INDEX:
 168                 return SI_MAX_IO_GENERIC + 6 + 8 + 5;
 169         case TGSI_SEMANTIC_PRIMID:
 170                 STATIC_ASSERT(SI_MAX_IO_GENERIC + 6 + 8 + 6 <= 63);
 171                 return SI_MAX_IO_GENERIC + 6 + 8 + 6;
 172         default:
 173                 fprintf(stderr, "invalid semantic name = %u\n", semantic_name);
 174                 assert(!"invalid semantic name");
 175                 return 0;
 176         }
 177 }
 178
 179 /**
 180  * Get the value of a shader input parameter and extract a bitfield.
 181  */
 182 static LLVMValueRef unpack_llvm_param(struct si_shader_context *ctx,
 183                                       LLVMValueRef value, unsigned rshift,
 184                                       unsigned bitwidth)
 185 {
 186         if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
 187                 value = ac_to_integer(&ctx->ac, value);
 188
 189         if (rshift)
 190                 value = LLVMBuildLShr(ctx->ac.builder, value,
 191                                       LLVMConstInt(ctx->i32, rshift, 0), "");
 192
 193         if (rshift + bitwidth < 32) {
 194                 unsigned mask = (1 << bitwidth) - 1;
 195                 value = LLVMBuildAnd(ctx->ac.builder, value,
 196                                      LLVMConstInt(ctx->i32, mask, 0), "");
 197         }
 198
 199         return value;
 200 }
 201
 202 LLVMValueRef si_unpack_param(struct si_shader_context *ctx,
 203                              struct ac_arg param, unsigned rshift,
 204                              unsigned bitwidth)
 205 {
 206         LLVMValueRef value = ac_get_arg(&ctx->ac, param);
 207
 208         return unpack_llvm_param(ctx, value, rshift, bitwidth);
 209 }
 210
 211 static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx)
 212 {
 213         switch (ctx->type) {
 214         case PIPE_SHADER_TESS_CTRL:
 215                 return si_unpack_param(ctx, ctx->args.tcs_rel_ids, 0, 8);
 216
 217         case PIPE_SHADER_TESS_EVAL:
 218                 return ac_get_arg(&ctx->ac, ctx->tes_rel_patch_id);
 219
 220         default:
 221                 assert(0);
 222                 return NULL;
 223         }
 224 }
 225
 226 /* Tessellation shaders pass outputs to the next shader using LDS.
 227  *
 228  * LS outputs = TCS inputs
 229  * TCS outputs = TES inputs
 230  *
 231  * The LDS layout is:
 232  * - TCS inputs for patch 0
 233  * - TCS inputs for patch 1
 234  * - TCS inputs for patch 2             = get_tcs_in_current_patch_offset (if RelPatchID==2)
 235  * - ...
 236  * - TCS outputs for patch 0            = get_tcs_out_patch0_offset
 237  * - Per-patch TCS outputs for patch 0  = get_tcs_out_patch0_patch_data_offset
 238  * - TCS outputs for patch 1
 239  * - Per-patch TCS outputs for patch 1
 240  * - TCS outputs for patch 2            = get_tcs_out_current_patch_offset (if RelPatchID==2)
 241  * - Per-patch TCS outputs for patch 2  = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
 242  * - ...
 243  *
 244  * All three shaders VS(LS), TCS, TES share the same LDS space.
 245  */
 246
 247 static LLVMValueRef
 248 get_tcs_in_patch_stride(struct si_shader_context *ctx)
 249 {
 250         return si_unpack_param(ctx, ctx->vs_state_bits, 8, 13);
 251 }
 252
 253 static unsigned get_tcs_out_vertex_dw_stride_constant(struct si_shader_context *ctx)
 254 {
 255         assert(ctx->type == PIPE_SHADER_TESS_CTRL);
 256
 257         if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy)
 258                 return util_last_bit64(ctx->shader->key.mono.u.ff_tcs_inputs_to_copy) * 4;
 259
 260         return util_last_bit64(ctx->shader->selector->outputs_written) * 4;
 261 }
 262
 263 static LLVMValueRef get_tcs_out_vertex_dw_stride(struct si_shader_context *ctx)
 264 {
 265         unsigned stride = get_tcs_out_vertex_dw_stride_constant(ctx);
 266
 267         return LLVMConstInt(ctx->i32, stride, 0);
 268 }
 269
 270 static LLVMValueRef get_tcs_out_patch_stride(struct si_shader_context *ctx)
 271 {
 272         if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy)
 273                 return si_unpack_param(ctx, ctx->tcs_out_lds_layout, 0, 13);
 274
 275         const struct tgsi_shader_info *info = &ctx->shader->selector->info;
 276         unsigned tcs_out_vertices = info->properties[TGSI_PROPERTY_TCS_VERTICES_OUT];
 277         unsigned vertex_dw_stride = get_tcs_out_vertex_dw_stride_constant(ctx);
 278         unsigned num_patch_outputs = util_last_bit64(ctx->shader->selector->patch_outputs_written);
 279         unsigned patch_dw_stride = tcs_out_vertices * vertex_dw_stride +
 280                                    num_patch_outputs * 4;
 281         return LLVMConstInt(ctx->i32, patch_dw_stride, 0);
 282 }
 283
 284 static LLVMValueRef
 285 get_tcs_out_patch0_offset(struct si_shader_context *ctx)
 286 {
 287         return LLVMBuildMul(ctx->ac.builder,
 288                             si_unpack_param(ctx, ctx->tcs_out_lds_offsets, 0, 16),
 289                             LLVMConstInt(ctx->i32, 4, 0), "");
 290 }
 291
 292 static LLVMValueRef
 293 get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
 294 {
 295         return LLVMBuildMul(ctx->ac.builder,
 296                             si_unpack_param(ctx, ctx->tcs_out_lds_offsets, 16, 16),
 297                             LLVMConstInt(ctx->i32, 4, 0), "");
 298 }
 299
 300 static LLVMValueRef
 301 get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
 302 {
 303         LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
 304         LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
 305
 306         return LLVMBuildMul(ctx->ac.builder, patch_stride, rel_patch_id, "");
 307 }
 308
 309 static LLVMValueRef
 310 get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
 311 {
 312         LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
 313         LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
 314         LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
 315
 316         return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id, patch0_offset);
 317 }
 318
 319 static LLVMValueRef
 320 get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
 321 {
 322         LLVMValueRef patch0_patch_data_offset =
 323                 get_tcs_out_patch0_patch_data_offset(ctx);
 324         LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
 325         LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
 326
 327         return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id, patch0_patch_data_offset);
 328 }
 329
 330 static LLVMValueRef get_num_tcs_out_vertices(struct si_shader_context *ctx)
 331 {
 332         unsigned tcs_out_vertices =
 333                 ctx->shader->selector ?
 334                 ctx->shader->selector->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT] : 0;
 335
 336         /* If !tcs_out_vertices, it's either the fixed-func TCS or the TCS epilog. */
 337         if (ctx->type == PIPE_SHADER_TESS_CTRL && tcs_out_vertices)
 338                 return LLVMConstInt(ctx->i32, tcs_out_vertices, 0);
 339
 340         return si_unpack_param(ctx, ctx->tcs_offchip_layout, 6, 6);
 341 }
 342
 343 static LLVMValueRef get_tcs_in_vertex_dw_stride(struct si_shader_context *ctx)
 344 {
 345         unsigned stride;
 346
 347         switch (ctx->type) {
 348         case PIPE_SHADER_VERTEX:
 349                 stride = ctx->shader->selector->lshs_vertex_stride / 4;
 350                 return LLVMConstInt(ctx->i32, stride, 0);
 351
 352         case PIPE_SHADER_TESS_CTRL:
 353                 if (ctx->screen->info.chip_class >= GFX9 &&
 354                     ctx->shader->is_monolithic) {
 355                         stride = ctx->shader->key.part.tcs.ls->lshs_vertex_stride / 4;
 356                         return LLVMConstInt(ctx->i32, stride, 0);
 357                 }
 358                 return si_unpack_param(ctx, ctx->vs_state_bits, 24, 8);
 359
 360         default:
 361                 assert(0);
 362                 return NULL;
 363         }
 364 }
 365
 366 static LLVMValueRef unpack_sint16(struct si_shader_context *ctx,
 367                                  LLVMValueRef i32, unsigned index)
 368 {
 369         assert(index <= 1);
 370
 371         if (index == 1)
 372                 return LLVMBuildAShr(ctx->ac.builder, i32,
 373                                      LLVMConstInt(ctx->i32, 16, 0), "");
 374
 375         return LLVMBuildSExt(ctx->ac.builder,
 376                              LLVMBuildTrunc(ctx->ac.builder, i32,
 377                                             ctx->ac.i16, ""),
 378                              ctx->i32, "");
 379 }
 380
 381 void si_llvm_load_input_vs(
 382         struct si_shader_context *ctx,
 383         unsigned input_index,
 384         LLVMValueRef out[4])
 385 {
 386         const struct tgsi_shader_info *info = &ctx->shader->selector->info;
 387         unsigned vs_blit_property = info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD];
 388
 389         if (vs_blit_property) {
 390                 LLVMValueRef vertex_id = ctx->abi.vertex_id;
 391                 LLVMValueRef sel_x1 = LLVMBuildICmp(ctx->ac.builder,
 392                                                     LLVMIntULE, vertex_id,
 393                                                     ctx->i32_1, "");
 394                 /* Use LLVMIntNE, because we have 3 vertices and only
 395                  * the middle one should use y2.
 396                  */
 397                 LLVMValueRef sel_y1 = LLVMBuildICmp(ctx->ac.builder,
 398                                                     LLVMIntNE, vertex_id,
 399                                                     ctx->i32_1, "");
 400
 401                 unsigned param_vs_blit_inputs = ctx->vs_blit_inputs.arg_index;
 402                 if (input_index == 0) {
 403                         /* Position: */
 404                         LLVMValueRef x1y1 = LLVMGetParam(ctx->main_fn,
 405                                                          param_vs_blit_inputs);
 406                         LLVMValueRef x2y2 = LLVMGetParam(ctx->main_fn,
 407                                                          param_vs_blit_inputs + 1);
 408
 409                         LLVMValueRef x1 = unpack_sint16(ctx, x1y1, 0);
 410                         LLVMValueRef y1 = unpack_sint16(ctx, x1y1, 1);
 411                         LLVMValueRef x2 = unpack_sint16(ctx, x2y2, 0);
 412                         LLVMValueRef y2 = unpack_sint16(ctx, x2y2, 1);
 413
 414                         LLVMValueRef x = LLVMBuildSelect(ctx->ac.builder, sel_x1,
 415                                                          x1, x2, "");
 416                         LLVMValueRef y = LLVMBuildSelect(ctx->ac.builder, sel_y1,
 417                                                          y1, y2, "");
 418
 419                         out[0] = LLVMBuildSIToFP(ctx->ac.builder, x, ctx->f32, "");
 420                         out[1] = LLVMBuildSIToFP(ctx->ac.builder, y, ctx->f32, "");
 421                         out[2] = LLVMGetParam(ctx->main_fn,
 422                                               param_vs_blit_inputs + 2);
 423                         out[3] = ctx->ac.f32_1;
 424                         return;
 425                 }
 426
 427                 /* Color or texture coordinates: */
 428                 assert(input_index == 1);
 429
 430                 if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
 431                         for (int i = 0; i < 4; i++) {
 432                                 out[i] = LLVMGetParam(ctx->main_fn,
 433                                                       param_vs_blit_inputs + 3 + i);
 434                         }
 435                 } else {
 436                         assert(vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD);
 437                         LLVMValueRef x1 = LLVMGetParam(ctx->main_fn,
 438                                                        param_vs_blit_inputs + 3);
 439                         LLVMValueRef y1 = LLVMGetParam(ctx->main_fn,
 440                                                        param_vs_blit_inputs + 4);
 441                         LLVMValueRef x2 = LLVMGetParam(ctx->main_fn,
 442                                                        param_vs_blit_inputs + 5);
 443                         LLVMValueRef y2 = LLVMGetParam(ctx->main_fn,
 444                                                        param_vs_blit_inputs + 6);
 445
 446                         out[0] = LLVMBuildSelect(ctx->ac.builder, sel_x1,
 447                                                  x1, x2, "");
 448                         out[1] = LLVMBuildSelect(ctx->ac.builder, sel_y1,
 449                                                  y1, y2, "");
 450                         out[2] = LLVMGetParam(ctx->main_fn,
 451                                               param_vs_blit_inputs + 7);
 452                         out[3] = LLVMGetParam(ctx->main_fn,
 453                                               param_vs_blit_inputs + 8);
 454                 }
 455                 return;
 456         }
 457
 458         unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs;
 459         union si_vs_fix_fetch fix_fetch;
 460         LLVMValueRef vb_desc;
 461         LLVMValueRef vertex_index;
 462         LLVMValueRef tmp;
 463
 464         if (input_index < num_vbos_in_user_sgprs) {
 465                 vb_desc = ac_get_arg(&ctx->ac, ctx->vb_descriptors[input_index]);
 466         } else {
 467                 unsigned index= input_index - num_vbos_in_user_sgprs;
 468                 vb_desc = ac_build_load_to_sgpr(&ctx->ac,
 469                                                 ac_get_arg(&ctx->ac, ctx->vertex_buffers),
 470                                                 LLVMConstInt(ctx->i32, index, 0));
 471         }
 472
 473         vertex_index = LLVMGetParam(ctx->main_fn,
 474                                     ctx->vertex_index0.arg_index +
 475                                     input_index);
 476
 477         /* Use the open-coded implementation for all loads of doubles and
 478          * of dword-sized data that needs fixups. We need to insert conversion
 479          * code anyway, and the amd/common code does it for us.
 480          *
 481          * Note: On LLVM <= 8, we can only open-code formats with
 482          * channel size >= 4 bytes.
 483          */
 484         bool opencode = ctx->shader->key.mono.vs_fetch_opencode & (1 << input_index);
 485         fix_fetch.bits = ctx->shader->key.mono.vs_fix_fetch[input_index].bits;
 486         if (opencode ||
 487             (fix_fetch.u.log_size == 3 && fix_fetch.u.format == AC_FETCH_FORMAT_FLOAT) ||
 488             (fix_fetch.u.log_size == 2)) {
 489                 tmp = ac_build_opencoded_load_format(
 490                                 &ctx->ac, fix_fetch.u.log_size, fix_fetch.u.num_channels_m1 + 1,
 491                                 fix_fetch.u.format, fix_fetch.u.reverse, !opencode,
 492                                 vb_desc, vertex_index, ctx->ac.i32_0, ctx->ac.i32_0, 0, true);
 493                 for (unsigned i = 0; i < 4; ++i)
 494                         out[i] = LLVMBuildExtractElement(ctx->ac.builder, tmp, LLVMConstInt(ctx->i32, i, false), "");
 495                 return;
 496         }
 497
 498         /* Do multiple loads for special formats. */
 499         unsigned required_channels = util_last_bit(info->input_usage_mask[input_index]);
 500         LLVMValueRef fetches[4];
 501         unsigned num_fetches;
 502         unsigned fetch_stride;
 503         unsigned channels_per_fetch;
 504
 505         if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2) {
 506                 num_fetches = MIN2(required_channels, 3);
 507                 fetch_stride = 1 << fix_fetch.u.log_size;
 508                 channels_per_fetch = 1;
 509         } else {
 510                 num_fetches = 1;
 511                 fetch_stride = 0;
 512                 channels_per_fetch = required_channels;
 513         }
 514
 515         for (unsigned i = 0; i < num_fetches; ++i) {
 516                 LLVMValueRef voffset = LLVMConstInt(ctx->i32, fetch_stride * i, 0);
 517                 fetches[i] = ac_build_buffer_load_format(&ctx->ac, vb_desc, vertex_index, voffset,
 518                                                          channels_per_fetch, 0, true);
 519         }
 520
 521         if (num_fetches == 1 && channels_per_fetch > 1) {
 522                 LLVMValueRef fetch = fetches[0];
 523                 for (unsigned i = 0; i < channels_per_fetch; ++i) {
 524                         tmp = LLVMConstInt(ctx->i32, i, false);
 525                         fetches[i] = LLVMBuildExtractElement(
 526                                 ctx->ac.builder, fetch, tmp, "");
 527                 }
 528                 num_fetches = channels_per_fetch;
 529                 channels_per_fetch = 1;
 530         }
 531
 532         for (unsigned i = num_fetches; i < 4; ++i)
 533                 fetches[i] = LLVMGetUndef(ctx->f32);
 534
 535         if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2 &&
 536             required_channels == 4) {
 537                 if (fix_fetch.u.format == AC_FETCH_FORMAT_UINT || fix_fetch.u.format == AC_FETCH_FORMAT_SINT)
 538                         fetches[3] = ctx->ac.i32_1;
 539                 else
 540                         fetches[3] = ctx->ac.f32_1;
 541         } else if (fix_fetch.u.log_size == 3 &&
 542                    (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ||
 543                     fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED ||
 544                     fix_fetch.u.format == AC_FETCH_FORMAT_SINT) &&
 545                    required_channels == 4) {
 546                 /* For 2_10_10_10, the hardware returns an unsigned value;
 547                  * convert it to a signed one.
 548                  */
 549                 LLVMValueRef tmp = fetches[3];
 550                 LLVMValueRef c30 = LLVMConstInt(ctx->i32, 30, 0);
 551
 552                 /* First, recover the sign-extended signed integer value. */
 553                 if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED)
 554                         tmp = LLVMBuildFPToUI(ctx->ac.builder, tmp, ctx->i32, "");
 555                 else
 556                         tmp = ac_to_integer(&ctx->ac, tmp);
 557
 558                 /* For the integer-like cases, do a natural sign extension.
 559                  *
 560                  * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
 561                  * and happen to contain 0, 1, 2, 3 as the two LSBs of the
 562                  * exponent.
 563                  */
 564                 tmp = LLVMBuildShl(ctx->ac.builder, tmp,
 565                                    fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ?
 566                                    LLVMConstInt(ctx->i32, 7, 0) : c30, "");
 567                 tmp = LLVMBuildAShr(ctx->ac.builder, tmp, c30, "");
 568
 569                 /* Convert back to the right type. */
 570                 if (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM) {
 571                         LLVMValueRef clamp;
 572                         LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
 573                         tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->f32, "");
 574                         clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, tmp, neg_one, "");
 575                         tmp = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, tmp, "");
 576                 } else if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED) {
 577                         tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->f32, "");
 578                 }
 579
 580                 fetches[3] = tmp;
 581         }
 582
 583         for (unsigned i = 0; i < 4; ++i)
 584                 out[i] = ac_to_float(&ctx->ac, fetches[i]);
 585 }
 586
 587 LLVMValueRef si_get_primitive_id(struct si_shader_context *ctx,
 588                                  unsigned swizzle)
 589 {
 590         if (swizzle > 0)
 591                 return ctx->i32_0;
 592
 593         switch (ctx->type) {
 594         case PIPE_SHADER_VERTEX:
 595                 return ac_get_arg(&ctx->ac, ctx->vs_prim_id);
 596         case PIPE_SHADER_TESS_CTRL:
 597                 return ac_get_arg(&ctx->ac, ctx->args.tcs_patch_id);
 598         case PIPE_SHADER_TESS_EVAL:
 599                 return ac_get_arg(&ctx->ac, ctx->args.tes_patch_id);
 600         case PIPE_SHADER_GEOMETRY:
 601                 return ac_get_arg(&ctx->ac, ctx->args.gs_prim_id);
 602         default:
 603                 assert(0);
 604                 return ctx->i32_0;
 605         }
 606 }
 607
 608 static LLVMValueRef get_dw_address_from_generic_indices(struct si_shader_context *ctx,
 609                                                         LLVMValueRef vertex_dw_stride,
 610                                                         LLVMValueRef base_addr,
 611                                                         LLVMValueRef vertex_index,
 612                                                         LLVMValueRef param_index,
 613                                                         ubyte name, ubyte index)
 614 {
 615         if (vertex_dw_stride) {
 616                 base_addr = ac_build_imad(&ctx->ac, vertex_index,
 617                                           vertex_dw_stride, base_addr);
 618         }
 619
 620         if (param_index) {
 621                 base_addr = ac_build_imad(&ctx->ac, param_index,
 622                                           LLVMConstInt(ctx->i32, 4, 0), base_addr);
 623         }
 624
 625         int param = name == TGSI_SEMANTIC_PATCH ||
 626                     name == TGSI_SEMANTIC_TESSINNER ||
 627                     name == TGSI_SEMANTIC_TESSOUTER ?
 628                 si_shader_io_get_unique_index_patch(name, index) :
 629                 si_shader_io_get_unique_index(name, index, false);
 630
 631         /* Add the base address of the element. */
 632         return LLVMBuildAdd(ctx->ac.builder, base_addr,
 633                             LLVMConstInt(ctx->i32, param * 4, 0), "");
 634 }
 635
 636 /* The offchip buffer layout for TCS->TES is
 637  *
 638  * - attribute 0 of patch 0 vertex 0
 639  * - attribute 0 of patch 0 vertex 1
 640  * - attribute 0 of patch 0 vertex 2
 641  *   ...
 642  * - attribute 0 of patch 1 vertex 0
 643  * - attribute 0 of patch 1 vertex 1
 644  *   ...
 645  * - attribute 1 of patch 0 vertex 0
 646  * - attribute 1 of patch 0 vertex 1
 647  *   ...
 648  * - per patch attribute 0 of patch 0
 649  * - per patch attribute 0 of patch 1
 650  *   ...
 651  *
 652  * Note that every attribute has 4 components.
 653  */
 654 static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
 655                                                LLVMValueRef rel_patch_id,
 656                                                LLVMValueRef vertex_index,
 657                                                LLVMValueRef param_index)
 658 {
 659         LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
 660         LLVMValueRef param_stride, constant16;
 661
 662         vertices_per_patch = get_num_tcs_out_vertices(ctx);
 663         num_patches = si_unpack_param(ctx, ctx->tcs_offchip_layout, 0, 6);
 664         total_vertices = LLVMBuildMul(ctx->ac.builder, vertices_per_patch,
 665                                       num_patches, "");
 666
 667         constant16 = LLVMConstInt(ctx->i32, 16, 0);
 668         if (vertex_index) {
 669                 base_addr = ac_build_imad(&ctx->ac, rel_patch_id,
 670                                           vertices_per_patch, vertex_index);
 671                 param_stride = total_vertices;
 672         } else {
 673                 base_addr = rel_patch_id;
 674                 param_stride = num_patches;
 675         }
 676
 677         base_addr = ac_build_imad(&ctx->ac, param_index, param_stride, base_addr);
 678         base_addr = LLVMBuildMul(ctx->ac.builder, base_addr, constant16, "");
 679
 680         if (!vertex_index) {
 681                 LLVMValueRef patch_data_offset =
 682                            si_unpack_param(ctx, ctx->tcs_offchip_layout, 12, 20);
 683
 684                 base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr,
 685                                          patch_data_offset, "");
 686         }
 687         return base_addr;
 688 }
 689
 690 /* This is a generic helper that can be shared by the NIR and TGSI backends */
 691 static LLVMValueRef get_tcs_tes_buffer_address_from_generic_indices(
 692                                         struct si_shader_context *ctx,
 693                                         LLVMValueRef vertex_index,
 694                                         LLVMValueRef param_index,
 695                                         ubyte name, ubyte index)
 696 {
 697         unsigned param_index_base;
 698
 699         param_index_base = name == TGSI_SEMANTIC_PATCH ||
 700                            name == TGSI_SEMANTIC_TESSINNER ||
 701                            name == TGSI_SEMANTIC_TESSOUTER ?
 702                 si_shader_io_get_unique_index_patch(name, index) :
 703                 si_shader_io_get_unique_index(name, index, false);
 704
 705         if (param_index) {
 706                 param_index = LLVMBuildAdd(ctx->ac.builder, param_index,
 707                                            LLVMConstInt(ctx->i32, param_index_base, 0),
 708                                            "");
 709         } else {
 710                 param_index = LLVMConstInt(ctx->i32, param_index_base, 0);
 711         }
 712
 713         return get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx),
 714                                           vertex_index, param_index);
 715 }
 716
 717 static LLVMValueRef si_build_gather_64bit(struct si_shader_context *ctx,
 718                                           LLVMTypeRef type,
 719                                           LLVMValueRef val1,
 720                                           LLVMValueRef val2)
 721 {
 722         LLVMValueRef values[2] = {
 723                 ac_to_integer(&ctx->ac, val1),
 724                 ac_to_integer(&ctx->ac, val2),
 725         };
 726         LLVMValueRef result = ac_build_gather_values(&ctx->ac, values, 2);
 727         return LLVMBuildBitCast(ctx->ac.builder, result, type, "");
 728 }
 729
 730 static LLVMValueRef buffer_load(struct si_shader_context *ctx,
 731                                 LLVMTypeRef type, unsigned swizzle,
 732                                 LLVMValueRef buffer, LLVMValueRef offset,
 733                                 LLVMValueRef base, bool can_speculate)
 734 {
 735         LLVMValueRef value, value2;
 736         LLVMTypeRef vec_type = LLVMVectorType(type, 4);
 737
 738         if (swizzle == ~0) {
 739                 value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
 740                                              0, ac_glc, can_speculate, false);
 741
 742                 return LLVMBuildBitCast(ctx->ac.builder, value, vec_type, "");
 743         }
 744
 745         if (!llvm_type_is_64bit(ctx, type)) {
 746                 value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
 747                                              0, ac_glc, can_speculate, false);
 748
 749                 value = LLVMBuildBitCast(ctx->ac.builder, value, vec_type, "");
 750                 return LLVMBuildExtractElement(ctx->ac.builder, value,
 751                                     LLVMConstInt(ctx->i32, swizzle, 0), "");
 752         }
 753
 754         value = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
 755                                   swizzle * 4, ac_glc, can_speculate, false);
 756
 757         value2 = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
 758                                    swizzle * 4 + 4, ac_glc, can_speculate, false);
 759
 760         return si_build_gather_64bit(ctx, type, value, value2);
 761 }
 762
 763 /**
 764  * Load from LSHS LDS storage.
 765  *
 766  * \param type          output value type
 767  * \param swizzle       offset (typically 0..3); it can be ~0, which loads a vec4
 768  * \param dw_addr       address in dwords
 769  */
 770 static LLVMValueRef lshs_lds_load(struct si_shader_context *ctx,
 771                                   LLVMTypeRef type, unsigned swizzle,
 772                                   LLVMValueRef dw_addr)
 773 {
 774         LLVMValueRef value;
 775
 776         if (swizzle == ~0) {
 777                 LLVMValueRef values[4];
 778
 779                 for (unsigned chan = 0; chan < 4; chan++)
 780                         values[chan] = lshs_lds_load(ctx, type, chan, dw_addr);
 781
 782                 return ac_build_gather_values(&ctx->ac, values, 4);
 783         }
 784
 785         /* Split 64-bit loads. */
 786         if (llvm_type_is_64bit(ctx, type)) {
 787                 LLVMValueRef lo, hi;
 788
 789                 lo = lshs_lds_load(ctx, ctx->i32, swizzle, dw_addr);
 790                 hi = lshs_lds_load(ctx, ctx->i32, swizzle + 1, dw_addr);
 791                 return si_build_gather_64bit(ctx, type, lo, hi);
 792         }
 793
 794         dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
 795                                LLVMConstInt(ctx->i32, swizzle, 0), "");
 796
 797         value = ac_lds_load(&ctx->ac, dw_addr);
 798
 799         return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
 800 }
 801
 802 /**
 803  * Store to LSHS LDS storage.
 804  *
 805  * \param swizzle       offset (typically 0..3)
 806  * \param dw_addr       address in dwords
 807  * \param value         value to store
 808  */
 809 static void lshs_lds_store(struct si_shader_context *ctx,
 810                       unsigned dw_offset_imm, LLVMValueRef dw_addr,
 811                       LLVMValueRef value)
 812 {
 813         dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
 814                                LLVMConstInt(ctx->i32, dw_offset_imm, 0), "");
 815
 816         ac_lds_store(&ctx->ac, dw_addr, value);
 817 }
 818
/* Selects which ring get_tess_ring_descriptor() builds a descriptor for. */
enum si_tess_ring {
        /* Address comes from the tcs_out_lds_layout argument (masked), plus
         * screen->tess_offchip_ring_size. */
        TCS_FACTOR_RING,
        /* Address comes from the tcs_out_lds_layout argument (masked). */
        TESS_OFFCHIP_RING_TCS,
        /* Address comes from the tes_offchip_addr argument, unmodified. */
        TESS_OFFCHIP_RING_TES,
};
 824
 825 static LLVMValueRef get_tess_ring_descriptor(struct si_shader_context *ctx,
 826                                              enum si_tess_ring ring)
 827 {
 828         LLVMBuilderRef builder = ctx->ac.builder;
 829         LLVMValueRef addr = ac_get_arg(&ctx->ac,
 830                                        ring == TESS_OFFCHIP_RING_TES ?
 831                                        ctx->tes_offchip_addr :
 832                                        ctx->tcs_out_lds_layout);
 833
 834         /* TCS only receives high 13 bits of the address. */
 835         if (ring == TESS_OFFCHIP_RING_TCS || ring == TCS_FACTOR_RING) {
 836                 addr = LLVMBuildAnd(builder, addr,
 837                                     LLVMConstInt(ctx->i32, 0xfff80000, 0), "");
 838         }
 839
 840         if (ring == TCS_FACTOR_RING) {
 841                 unsigned tf_offset = ctx->screen->tess_offchip_ring_size;
 842                 addr = LLVMBuildAdd(builder, addr,
 843                                     LLVMConstInt(ctx->i32, tf_offset, 0), "");
 844         }
 845
 846         uint32_t rsrc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
 847                          S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
 848                          S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
 849                          S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
 850
 851         if (ctx->screen->info.chip_class >= GFX10)
 852                 rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
 853                          S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
 854                          S_008F0C_RESOURCE_LEVEL(1);
 855         else
 856                 rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
 857                          S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
 858
 859         LLVMValueRef desc[4];
 860         desc[0] = addr;
 861         desc[1] = LLVMConstInt(ctx->i32,
 862                                S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0);
 863         desc[2] = LLVMConstInt(ctx->i32, 0xffffffff, 0);
 864         desc[3] = LLVMConstInt(ctx->i32, rsrc3, false);
 865
 866         return ac_build_gather_values(&ctx->ac, desc, 4);
 867 }
 868
 869 static LLVMValueRef si_nir_load_tcs_varyings(struct ac_shader_abi *abi,
 870                                              LLVMTypeRef type,
 871                                              LLVMValueRef vertex_index,
 872                                              LLVMValueRef param_index,
 873                                              unsigned const_index,
 874                                              unsigned location,
 875                                              unsigned driver_location,
 876                                              unsigned component,
 877                                              unsigned num_components,
 878                                              bool is_patch,
 879                                              bool is_compact,
 880                                              bool load_input)
 881 {
 882         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
 883         struct tgsi_shader_info *info = &ctx->shader->selector->info;
 884         LLVMValueRef dw_addr, stride;
 885         ubyte name, index;
 886
 887         driver_location = driver_location / 4;
 888
 889         if (load_input) {
 890                 name = info->input_semantic_name[driver_location];
 891                 index = info->input_semantic_index[driver_location];
 892         } else {
 893                 name = info->output_semantic_name[driver_location];
 894                 index = info->output_semantic_index[driver_location];
 895         }
 896
 897         assert((name == TGSI_SEMANTIC_PATCH ||
 898                 name == TGSI_SEMANTIC_TESSINNER ||
 899                 name == TGSI_SEMANTIC_TESSOUTER) == is_patch);
 900
 901         if (load_input) {
 902                 stride = get_tcs_in_vertex_dw_stride(ctx);
 903                 dw_addr = get_tcs_in_current_patch_offset(ctx);
 904         } else {
 905                 if (is_patch) {
 906                         stride = NULL;
 907                         dw_addr = get_tcs_out_current_patch_data_offset(ctx);
 908                 } else {
 909                         stride = get_tcs_out_vertex_dw_stride(ctx);
 910                         dw_addr = get_tcs_out_current_patch_offset(ctx);
 911                 }
 912         }
 913
 914         if (!param_index) {
 915                 param_index = LLVMConstInt(ctx->i32, const_index, 0);
 916         }
 917
 918         dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr,
 919                                                       vertex_index, param_index,
 920                                                       name, index);
 921
 922         LLVMValueRef value[4];
 923         for (unsigned i = 0; i < num_components; i++) {
 924                 unsigned offset = i;
 925                 if (llvm_type_is_64bit(ctx, type))
 926                         offset *= 2;
 927
 928                 offset += component;
 929                 value[i + component] = lshs_lds_load(ctx, type, offset, dw_addr);
 930         }
 931
 932         return ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
 933 }
 934
 935 LLVMValueRef si_nir_load_input_tes(struct ac_shader_abi *abi,
 936                                    LLVMTypeRef type,
 937                                    LLVMValueRef vertex_index,
 938                                    LLVMValueRef param_index,
 939                                    unsigned const_index,
 940                                    unsigned location,
 941                                    unsigned driver_location,
 942                                    unsigned component,
 943                                    unsigned num_components,
 944                                    bool is_patch,
 945                                    bool is_compact,
 946                                    bool load_input)
 947 {
 948         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
 949         struct tgsi_shader_info *info = &ctx->shader->selector->info;
 950         LLVMValueRef base, addr;
 951
 952         driver_location = driver_location / 4;
 953         ubyte name = info->input_semantic_name[driver_location];
 954         ubyte index = info->input_semantic_index[driver_location];
 955
 956         assert((name == TGSI_SEMANTIC_PATCH ||
 957                 name == TGSI_SEMANTIC_TESSINNER ||
 958                 name == TGSI_SEMANTIC_TESSOUTER) == is_patch);
 959
 960         base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset);
 961
 962         if (!param_index) {
 963                 param_index = LLVMConstInt(ctx->i32, const_index, 0);
 964         }
 965
 966         addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index,
 967                                                                param_index,
 968                                                                name, index);
 969
 970         /* TODO: This will generate rather ordinary llvm code, although it
 971          * should be easy for the optimiser to fix up. In future we might want
 972          * to refactor buffer_load(), but for now this maximises code sharing
 973          * between the NIR and TGSI backends.
 974          */
 975         LLVMValueRef value[4];
 976         for (unsigned i = 0; i < num_components; i++) {
 977                 unsigned offset = i;
 978                 if (llvm_type_is_64bit(ctx, type)) {
 979                         offset *= 2;
 980                         if (offset == 4) {
 981                                 ubyte name = info->input_semantic_name[driver_location + 1];
 982                                 ubyte index = info->input_semantic_index[driver_location + 1];
 983                                 addr = get_tcs_tes_buffer_address_from_generic_indices(ctx,
 984                                                                                        vertex_index,
 985                                                                                        param_index,
 986                                                                                        name, index);
 987                         }
 988
 989                         offset = offset % 4;
 990                 }
 991
 992                 offset += component;
 993                 value[i + component] = buffer_load(ctx, type, offset,
 994                                                    ctx->tess_offchip_ring, base, addr, true);
 995         }
 996
 997         return ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
 998 }
 999
1000 static void si_nir_store_output_tcs(struct ac_shader_abi *abi,
1001                                     const struct nir_variable *var,
1002                                     LLVMValueRef vertex_index,
1003                                     LLVMValueRef param_index,
1004                                     unsigned const_index,
1005                                     LLVMValueRef src,
1006                                     unsigned writemask)
1007 {
1008         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1009         struct tgsi_shader_info *info = &ctx->shader->selector->info;
1010         const unsigned component = var->data.location_frac;
1011         unsigned driver_location = var->data.driver_location;
1012         LLVMValueRef dw_addr, stride;
1013         LLVMValueRef buffer, base, addr;
1014         LLVMValueRef values[8];
1015         bool skip_lds_store;
1016         bool is_tess_factor = false, is_tess_inner = false;
1017
1018         driver_location = driver_location / 4;
1019         ubyte name = info->output_semantic_name[driver_location];
1020         ubyte index = info->output_semantic_index[driver_location];
1021
1022         bool is_const = !param_index;
1023         if (!param_index)
1024                 param_index = LLVMConstInt(ctx->i32, const_index, 0);
1025
1026         const bool is_patch = var->data.patch ||
1027                               var->data.location == VARYING_SLOT_TESS_LEVEL_INNER ||
1028                               var->data.location == VARYING_SLOT_TESS_LEVEL_OUTER;
1029
1030         assert((name == TGSI_SEMANTIC_PATCH ||
1031                 name == TGSI_SEMANTIC_TESSINNER ||
1032                 name == TGSI_SEMANTIC_TESSOUTER) == is_patch);
1033
1034         if (!is_patch) {
1035                 stride = get_tcs_out_vertex_dw_stride(ctx);
1036                 dw_addr = get_tcs_out_current_patch_offset(ctx);
1037                 dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr,
1038                                                               vertex_index, param_index,
1039                                                               name, index);
1040
1041                 skip_lds_store = !info->reads_pervertex_outputs;
1042         } else {
1043                 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
1044                 dw_addr = get_dw_address_from_generic_indices(ctx, NULL, dw_addr,
1045                                                               vertex_index, param_index,
1046                                                               name, index);
1047
1048                 skip_lds_store = !info->reads_perpatch_outputs;
1049
1050                 if (is_const && const_index == 0) {
1051                         int name = info->output_semantic_name[driver_location];
1052
1053                         /* Always write tess factors into LDS for the TCS epilog. */
1054                         if (name == TGSI_SEMANTIC_TESSINNER ||
1055                             name == TGSI_SEMANTIC_TESSOUTER) {
1056                                 /* The epilog doesn't read LDS if invocation 0 defines tess factors. */
1057                                 skip_lds_store = !info->reads_tessfactor_outputs &&
1058                                                  ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs;
1059                                 is_tess_factor = true;
1060                                 is_tess_inner = name == TGSI_SEMANTIC_TESSINNER;
1061                         }
1062                 }
1063         }
1064
1065         buffer = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS);
1066
1067         base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset);
1068
1069         addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index,
1070                                                                param_index, name, index);
1071
1072         for (unsigned chan = component; chan < 8; chan++) {
1073                 if (!(writemask & (1 << chan)))
1074                         continue;
1075                 LLVMValueRef value = ac_llvm_extract_elem(&ctx->ac, src, chan - component);
1076
1077                 unsigned buffer_store_offset = chan % 4;
1078                 if (chan == 4) {
1079                         ubyte name = info->output_semantic_name[driver_location + 1];
1080                         ubyte index = info->output_semantic_index[driver_location + 1];
1081                         addr = get_tcs_tes_buffer_address_from_generic_indices(ctx,
1082                                                                                vertex_index,
1083                                                                                param_index,
1084                                                                                name, index);
1085                 }
1086
1087                 /* Skip LDS stores if there is no LDS read of this output. */
1088                 if (!skip_lds_store)
1089                         lshs_lds_store(ctx, chan, dw_addr, value);
1090
1091                 value = ac_to_integer(&ctx->ac, value);
1092                 values[chan] = value;
1093
1094                 if (writemask != 0xF && !is_tess_factor) {
1095                         ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1,
1096                                                     addr, base,
1097                                                     4 * buffer_store_offset,
1098                                                     ac_glc);
1099                 }
1100
1101                 /* Write tess factors into VGPRs for the epilog. */
1102                 if (is_tess_factor &&
1103                     ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs) {
1104                         if (!is_tess_inner) {
1105                                 LLVMBuildStore(ctx->ac.builder, value, /* outer */
1106                                                ctx->invoc0_tess_factors[chan]);
1107                         } else if (chan < 2) {
1108                                 LLVMBuildStore(ctx->ac.builder, value, /* inner */
1109                                                ctx->invoc0_tess_factors[4 + chan]);
1110                         }
1111                 }
1112         }
1113
1114         if (writemask == 0xF && !is_tess_factor) {
1115                 LLVMValueRef value = ac_build_gather_values(&ctx->ac,
1116                                                             values, 4);
1117                 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, addr,
1118                                             base, 0, ac_glc);
1119         }
1120 }
1121
1122 static LLVMValueRef si_llvm_load_input_gs(struct ac_shader_abi *abi,
1123                                           unsigned input_index,
1124                                           unsigned vtx_offset_param,
1125                                           LLVMTypeRef type,
1126                                           unsigned swizzle)
1127 {
1128         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1129         struct si_shader *shader = ctx->shader;
1130         LLVMValueRef vtx_offset, soffset;
1131         struct tgsi_shader_info *info = &shader->selector->info;
1132         unsigned semantic_name = info->input_semantic_name[input_index];
1133         unsigned semantic_index = info->input_semantic_index[input_index];
1134         unsigned param;
1135         LLVMValueRef value;
1136
1137         param = si_shader_io_get_unique_index(semantic_name, semantic_index, false);
1138
1139         /* GFX9 has the ESGS ring in LDS. */
1140         if (ctx->screen->info.chip_class >= GFX9) {
1141                 unsigned index = vtx_offset_param;
1142
1143                 switch (index / 2) {
1144                 case 0:
1145                         vtx_offset = si_unpack_param(ctx, ctx->gs_vtx01_offset,
1146                                                      index % 2 ? 16 : 0, 16);
1147                         break;
1148                 case 1:
1149                         vtx_offset = si_unpack_param(ctx, ctx->gs_vtx23_offset,
1150                                                      index % 2 ? 16 : 0, 16);
1151                         break;
1152                 case 2:
1153                         vtx_offset = si_unpack_param(ctx, ctx->gs_vtx45_offset,
1154                                                      index % 2 ? 16 : 0, 16);
1155                         break;
1156                 default:
1157                         assert(0);
1158                         return NULL;
1159                 }
1160
1161                 unsigned offset = param * 4 + swizzle;
1162                 vtx_offset = LLVMBuildAdd(ctx->ac.builder, vtx_offset,
1163                                           LLVMConstInt(ctx->i32, offset, false), "");
1164
1165                 LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->esgs_ring, vtx_offset);
1166                 LLVMValueRef value = LLVMBuildLoad(ctx->ac.builder, ptr, "");
1167                 if (llvm_type_is_64bit(ctx, type)) {
1168                         ptr = LLVMBuildGEP(ctx->ac.builder, ptr,
1169                                            &ctx->ac.i32_1, 1, "");
1170                         LLVMValueRef values[2] = {
1171                                 value,
1172                                 LLVMBuildLoad(ctx->ac.builder, ptr, "")
1173                         };
1174                         value = ac_build_gather_values(&ctx->ac, values, 2);
1175                 }
1176                 return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
1177         }
1178
1179         /* GFX6: input load from the ESGS ring in memory. */
1180         if (swizzle == ~0) {
1181                 LLVMValueRef values[4];
1182                 unsigned chan;
1183                 for (chan = 0; chan < 4; chan++) {
1184                         values[chan] = si_llvm_load_input_gs(abi, input_index, vtx_offset_param,
1185                                                              type, chan);
1186                 }
1187                 return ac_build_gather_values(&ctx->ac, values, 4);
1188         }
1189
1190         /* Get the vertex offset parameter on GFX6. */
1191         LLVMValueRef gs_vtx_offset = ac_get_arg(&ctx->ac,
1192                                                 ctx->gs_vtx_offset[vtx_offset_param]);
1193
1194         vtx_offset = LLVMBuildMul(ctx->ac.builder, gs_vtx_offset,
1195                                   LLVMConstInt(ctx->i32, 4, 0), "");
1196
1197         soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle) * 256, 0);
1198
1199         value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->i32_0,
1200                                      vtx_offset, soffset, 0, ac_glc, true, false);
1201         if (llvm_type_is_64bit(ctx, type)) {
1202                 LLVMValueRef value2;
1203                 soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle + 1) * 256, 0);
1204
1205                 value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1,
1206                                               ctx->i32_0, vtx_offset, soffset,
1207                                               0, ac_glc, true, false);
1208                 return si_build_gather_64bit(ctx, type, value, value2);
1209         }
1210         return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
1211 }
1212
1213 static LLVMValueRef si_nir_load_input_gs(struct ac_shader_abi *abi,
1214                                          unsigned location,
1215                                          unsigned driver_location,
1216                                          unsigned component,
1217                                          unsigned num_components,
1218                                          unsigned vertex_index,
1219                                          unsigned const_index,
1220                                          LLVMTypeRef type)
1221 {
1222         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1223
1224         LLVMValueRef value[4];
1225         for (unsigned i = 0; i < num_components; i++) {
1226                 unsigned offset = i;
1227                 if (llvm_type_is_64bit(ctx, type))
1228                         offset *= 2;
1229
1230                 offset += component;
1231                 value[i + component] = si_llvm_load_input_gs(&ctx->abi, driver_location  / 4 + const_index,
1232                                                              vertex_index, type, offset);
1233         }
1234
1235         return ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
1236 }
1237
1238 static LLVMValueRef si_build_fs_interp(struct si_shader_context *ctx,
1239                                        unsigned attr_index, unsigned chan,
1240                                        LLVMValueRef prim_mask,
1241                                        LLVMValueRef i, LLVMValueRef j)
1242 {
1243         if (i || j) {
1244                 return ac_build_fs_interp(&ctx->ac,
1245                                           LLVMConstInt(ctx->i32, chan, 0),
1246                                           LLVMConstInt(ctx->i32, attr_index, 0),
1247                                           prim_mask, i, j);
1248         }
1249         return ac_build_fs_interp_mov(&ctx->ac,
1250                                       LLVMConstInt(ctx->i32, 2, 0), /* P0 */
1251                                       LLVMConstInt(ctx->i32, chan, 0),
1252                                       LLVMConstInt(ctx->i32, attr_index, 0),
1253                                       prim_mask);
1254 }
1255
1256 /**
1257  * Interpolate a fragment shader input.
1258  *
1259  * @param ctx           context
1260  * @param input_index           index of the input in hardware
1261  * @param semantic_name         TGSI_SEMANTIC_*
1262  * @param semantic_index        semantic index
1263  * @param num_interp_inputs     number of all interpolated inputs (= BCOLOR offset)
1264  * @param colors_read_mask      color components read (4 bits for each color, 8 bits in total)
1265  * @param interp_param          interpolation weights (i,j)
1266  * @param prim_mask             SI_PARAM_PRIM_MASK
1267  * @param face                  SI_PARAM_FRONT_FACE
1268  * @param result                the return value (4 components)
1269  */
1270 static void interp_fs_color(struct si_shader_context *ctx,
1271                             unsigned input_index,
1272                             unsigned semantic_index,
1273                             unsigned num_interp_inputs,
1274                             unsigned colors_read_mask,
1275                             LLVMValueRef interp_param,
1276                             LLVMValueRef prim_mask,
1277                             LLVMValueRef face,
1278                             LLVMValueRef result[4])
1279 {
1280         LLVMValueRef i = NULL, j = NULL;
1281         unsigned chan;
1282
1283         /* fs.constant returns the param from the middle vertex, so it's not
1284          * really useful for flat shading. It's meant to be used for custom
1285          * interpolation (but the intrinsic can't fetch from the other two
1286          * vertices).
1287          *
1288          * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
1289          * to do the right thing. The only reason we use fs.constant is that
1290          * fs.interp cannot be used on integers, because they can be equal
1291          * to NaN.
1292          *
1293          * When interp is false we will use fs.constant or for newer llvm,
1294          * amdgcn.interp.mov.
1295          */
1296         bool interp = interp_param != NULL;
1297
1298         if (interp) {
1299                 interp_param = LLVMBuildBitCast(ctx->ac.builder, interp_param,
1300                                                 LLVMVectorType(ctx->f32, 2), "");
1301
1302                 i = LLVMBuildExtractElement(ctx->ac.builder, interp_param,
1303                                                 ctx->i32_0, "");
1304                 j = LLVMBuildExtractElement(ctx->ac.builder, interp_param,
1305                                                 ctx->i32_1, "");
1306         }
1307
1308         if (ctx->shader->key.part.ps.prolog.color_two_side) {
1309                 LLVMValueRef is_face_positive;
1310
1311                 /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
1312                  * otherwise it's at offset "num_inputs".
1313                  */
1314                 unsigned back_attr_offset = num_interp_inputs;
1315                 if (semantic_index == 1 && colors_read_mask & 0xf)
1316                         back_attr_offset += 1;
1317
1318                 is_face_positive = LLVMBuildICmp(ctx->ac.builder, LLVMIntNE,
1319                                                  face, ctx->i32_0, "");
1320
1321                 for (chan = 0; chan < 4; chan++) {
1322                         LLVMValueRef front, back;
1323
1324                         front = si_build_fs_interp(ctx,
1325                                                    input_index, chan,
1326                                                    prim_mask, i, j);
1327                         back = si_build_fs_interp(ctx,
1328                                                   back_attr_offset, chan,
1329                                                   prim_mask, i, j);
1330
1331                         result[chan] = LLVMBuildSelect(ctx->ac.builder,
1332                                                 is_face_positive,
1333                                                 front,
1334                                                 back,
1335                                                 "");
1336                 }
1337         } else {
1338                 for (chan = 0; chan < 4; chan++) {
1339                         result[chan] = si_build_fs_interp(ctx,
1340                                                           input_index, chan,
1341                                                           prim_mask, i, j);
1342                 }
1343         }
1344 }
1345
1346 LLVMValueRef si_get_sample_id(struct si_shader_context *ctx)
1347 {
1348         return si_unpack_param(ctx, ctx->args.ancillary, 8, 4);
1349 }
1350
1351 static LLVMValueRef get_base_vertex(struct ac_shader_abi *abi)
1352 {
1353         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1354
1355         /* For non-indexed draws, the base vertex set by the driver
1356          * (for direct draws) or the CP (for indirect draws) is the
1357          * first vertex ID, but GLSL expects 0 to be returned.
1358          */
1359         LLVMValueRef vs_state = ac_get_arg(&ctx->ac,
1360                                            ctx->vs_state_bits);
1361         LLVMValueRef indexed;
1362
1363         indexed = LLVMBuildLShr(ctx->ac.builder, vs_state, ctx->i32_1, "");
1364         indexed = LLVMBuildTrunc(ctx->ac.builder, indexed, ctx->i1, "");
1365
1366         return LLVMBuildSelect(ctx->ac.builder, indexed,
1367                                ac_get_arg(&ctx->ac, ctx->args.base_vertex),
1368                                ctx->i32_0, "");
1369 }
1370
1371 static LLVMValueRef get_block_size(struct ac_shader_abi *abi)
1372 {
1373         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1374
1375         LLVMValueRef values[3];
1376         LLVMValueRef result;
1377         unsigned i;
1378         unsigned *properties = ctx->shader->selector->info.properties;
1379
1380         if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) {
1381                 unsigned sizes[3] = {
1382                         properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
1383                         properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
1384                         properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
1385                 };
1386
1387                 for (i = 0; i < 3; ++i)
1388                         values[i] = LLVMConstInt(ctx->i32, sizes[i], 0);
1389
1390                 result = ac_build_gather_values(&ctx->ac, values, 3);
1391         } else {
1392                 result = ac_get_arg(&ctx->ac, ctx->block_size);
1393         }
1394
1395         return result;
1396 }
1397
1398 /**
1399  * Load a dword from a constant buffer.
1400  */
1401 static LLVMValueRef buffer_load_const(struct si_shader_context *ctx,
1402                                       LLVMValueRef resource,
1403                                       LLVMValueRef offset)
1404 {
1405         return ac_build_buffer_load(&ctx->ac, resource, 1, NULL, offset, NULL,
1406                                     0, 0, true, true);
1407 }
1408
1409 static LLVMValueRef load_sample_position(struct ac_shader_abi *abi, LLVMValueRef sample_id)
1410 {
1411         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1412         LLVMValueRef desc = ac_get_arg(&ctx->ac, ctx->rw_buffers);
1413         LLVMValueRef buf_index = LLVMConstInt(ctx->i32, SI_PS_CONST_SAMPLE_POSITIONS, 0);
1414         LLVMValueRef resource = ac_build_load_to_sgpr(&ctx->ac, desc, buf_index);
1415
1416         /* offset = sample_id * 8  (8 = 2 floats containing samplepos.xy) */
1417         LLVMValueRef offset0 = LLVMBuildMul(ctx->ac.builder, sample_id, LLVMConstInt(ctx->i32, 8, 0), "");
1418         LLVMValueRef offset1 = LLVMBuildAdd(ctx->ac.builder, offset0, LLVMConstInt(ctx->i32, 4, 0), "");
1419
1420         LLVMValueRef pos[4] = {
1421                 buffer_load_const(ctx, resource, offset0),
1422                 buffer_load_const(ctx, resource, offset1),
1423                 LLVMConstReal(ctx->f32, 0),
1424                 LLVMConstReal(ctx->f32, 0)
1425         };
1426
1427         return ac_build_gather_values(&ctx->ac, pos, 4);
1428 }
1429
1430 static LLVMValueRef load_sample_mask_in(struct ac_shader_abi *abi)
1431 {
1432         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1433         return ac_to_integer(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args.sample_coverage));
1434 }
1435
1436 static LLVMValueRef si_load_tess_coord(struct ac_shader_abi *abi)
1437 {
1438         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1439         LLVMValueRef coord[4] = {
1440                 ac_get_arg(&ctx->ac, ctx->tes_u),
1441                 ac_get_arg(&ctx->ac, ctx->tes_v),
1442                 ctx->ac.f32_0,
1443                 ctx->ac.f32_0
1444         };
1445
1446         /* For triangles, the vector should be (u, v, 1-u-v). */
1447         if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
1448             PIPE_PRIM_TRIANGLES) {
1449                 coord[2] = LLVMBuildFSub(ctx->ac.builder, ctx->ac.f32_1,
1450                                          LLVMBuildFAdd(ctx->ac.builder,
1451                                                        coord[0], coord[1], ""), "");
1452         }
1453         return ac_build_gather_values(&ctx->ac, coord, 4);
1454 }
1455
1456 static LLVMValueRef load_tess_level(struct si_shader_context *ctx,
1457                                     unsigned semantic_name)
1458 {
1459         LLVMValueRef base, addr;
1460
1461         int param = si_shader_io_get_unique_index_patch(semantic_name, 0);
1462
1463         base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset);
1464         addr = get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), NULL,
1465                                           LLVMConstInt(ctx->i32, param, 0));
1466
1467         return buffer_load(ctx, ctx->f32,
1468                            ~0, ctx->tess_offchip_ring, base, addr, true);
1469
1470 }
1471
1472 static LLVMValueRef load_tess_level_default(struct si_shader_context *ctx,
1473                                             unsigned semantic_name)
1474 {
1475         LLVMValueRef buf, slot, val[4];
1476         int i, offset;
1477
1478         slot = LLVMConstInt(ctx->i32, SI_HS_CONST_DEFAULT_TESS_LEVELS, 0);
1479         buf = ac_get_arg(&ctx->ac, ctx->rw_buffers);
1480         buf = ac_build_load_to_sgpr(&ctx->ac, buf, slot);
1481         offset = semantic_name == TGSI_SEMANTIC_TESS_DEFAULT_INNER_LEVEL ? 4 : 0;
1482
1483         for (i = 0; i < 4; i++)
1484                 val[i] = buffer_load_const(ctx, buf,
1485                                            LLVMConstInt(ctx->i32, (offset + i) * 4, 0));
1486         return ac_build_gather_values(&ctx->ac, val, 4);
1487 }
1488
1489 static LLVMValueRef si_load_tess_level(struct ac_shader_abi *abi,
1490                                        unsigned varying_id,
1491                                        bool load_default_state)
1492 {
1493         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1494         unsigned semantic_name;
1495
1496         if (load_default_state) {
1497                 switch (varying_id) {
1498                 case VARYING_SLOT_TESS_LEVEL_INNER:
1499                         semantic_name = TGSI_SEMANTIC_TESS_DEFAULT_INNER_LEVEL;
1500                         break;
1501                 case VARYING_SLOT_TESS_LEVEL_OUTER:
1502                         semantic_name = TGSI_SEMANTIC_TESS_DEFAULT_OUTER_LEVEL;
1503                         break;
1504                 default:
1505                         unreachable("unknown tess level");
1506                 }
1507                 return load_tess_level_default(ctx, semantic_name);
1508         }
1509
1510         switch (varying_id) {
1511         case VARYING_SLOT_TESS_LEVEL_INNER:
1512                 semantic_name = TGSI_SEMANTIC_TESSINNER;
1513                 break;
1514         case VARYING_SLOT_TESS_LEVEL_OUTER:
1515                 semantic_name = TGSI_SEMANTIC_TESSOUTER;
1516                 break;
1517         default:
1518                 unreachable("unknown tess level");
1519         }
1520
1521         return load_tess_level(ctx, semantic_name);
1522
1523 }
1524
1525 static LLVMValueRef si_load_patch_vertices_in(struct ac_shader_abi *abi)
1526 {
1527         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1528         if (ctx->type == PIPE_SHADER_TESS_CTRL)
1529                 return si_unpack_param(ctx, ctx->tcs_out_lds_layout, 13, 6);
1530         else if (ctx->type == PIPE_SHADER_TESS_EVAL)
1531                 return get_num_tcs_out_vertices(ctx);
1532         else
1533                 unreachable("invalid shader stage for TGSI_SEMANTIC_VERTICESIN");
1534 }
1535
1536 void si_declare_compute_memory(struct si_shader_context *ctx)
1537 {
1538         struct si_shader_selector *sel = ctx->shader->selector;
1539         unsigned lds_size = sel->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE];
1540
1541         LLVMTypeRef i8p = LLVMPointerType(ctx->i8, AC_ADDR_SPACE_LDS);
1542         LLVMValueRef var;
1543
1544         assert(!ctx->ac.lds);
1545
1546         var = LLVMAddGlobalInAddressSpace(ctx->ac.module,
1547                                           LLVMArrayType(ctx->i8, lds_size),
1548                                           "compute_lds",
1549                                           AC_ADDR_SPACE_LDS);
1550         LLVMSetAlignment(var, 64 * 1024);
1551
1552         ctx->ac.lds = LLVMBuildBitCast(ctx->ac.builder, var, i8p, "");
1553 }
1554
1555 static LLVMValueRef load_const_buffer_desc_fast_path(struct si_shader_context *ctx)
1556 {
1557         LLVMValueRef ptr =
1558                 ac_get_arg(&ctx->ac, ctx->const_and_shader_buffers);
1559         struct si_shader_selector *sel = ctx->shader->selector;
1560
1561         /* Do the bounds checking with a descriptor, because
1562          * doing computation and manual bounds checking of 64-bit
1563          * addresses generates horrible VALU code with very high
1564          * VGPR usage and very low SIMD occupancy.
1565          */
1566         ptr = LLVMBuildPtrToInt(ctx->ac.builder, ptr, ctx->ac.intptr, "");
1567
1568         LLVMValueRef desc0, desc1;
1569         desc0 = ptr;
1570         desc1 = LLVMConstInt(ctx->i32,
1571                              S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0);
1572
1573         uint32_t rsrc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
1574                          S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
1575                          S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
1576                          S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
1577
1578         if (ctx->screen->info.chip_class >= GFX10)
1579                 rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
1580                          S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
1581                          S_008F0C_RESOURCE_LEVEL(1);
1582         else
1583                 rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
1584                          S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
1585
1586         LLVMValueRef desc_elems[] = {
1587                 desc0,
1588                 desc1,
1589                 LLVMConstInt(ctx->i32, (sel->info.const_file_max[0] + 1) * 16, 0),
1590                 LLVMConstInt(ctx->i32, rsrc3, false)
1591         };
1592
1593         return ac_build_gather_values(&ctx->ac, desc_elems, 4);
1594 }
1595
1596 static LLVMValueRef load_ubo(struct ac_shader_abi *abi, LLVMValueRef index)
1597 {
1598         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1599         struct si_shader_selector *sel = ctx->shader->selector;
1600
1601         LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->const_and_shader_buffers);
1602
1603         if (sel->info.const_buffers_declared == 1 &&
1604             sel->info.shader_buffers_declared == 0) {
1605                 return load_const_buffer_desc_fast_path(ctx);
1606         }
1607
1608         index = si_llvm_bound_index(ctx, index, ctx->num_const_buffers);
1609         index = LLVMBuildAdd(ctx->ac.builder, index,
1610                              LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS, 0), "");
1611
1612         return ac_build_load_to_sgpr(&ctx->ac, ptr, index);
1613 }
1614
1615 static LLVMValueRef
1616 load_ssbo(struct ac_shader_abi *abi, LLVMValueRef index, bool write)
1617 {
1618         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1619         LLVMValueRef rsrc_ptr = ac_get_arg(&ctx->ac,
1620                                            ctx->const_and_shader_buffers);
1621
1622         index = si_llvm_bound_index(ctx, index, ctx->num_shader_buffers);
1623         index = LLVMBuildSub(ctx->ac.builder,
1624                              LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS - 1, 0),
1625                              index, "");
1626
1627         return ac_build_load_to_sgpr(&ctx->ac, rsrc_ptr, index);
1628 }
1629
1630 /* Initialize arguments for the shader export intrinsic */
1631 static void si_llvm_init_export_args(struct si_shader_context *ctx,
1632                                      LLVMValueRef *values,
1633                                      unsigned target,
1634                                      struct ac_export_args *args)
1635 {
1636         LLVMValueRef f32undef = LLVMGetUndef(ctx->ac.f32);
1637         unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR;
1638         unsigned chan;
1639         bool is_int8, is_int10;
1640
1641         /* Default is 0xf. Adjusted below depending on the format. */
1642         args->enabled_channels = 0xf; /* writemask */
1643
1644         /* Specify whether the EXEC mask represents the valid mask */
1645         args->valid_mask = 0;
1646
1647         /* Specify whether this is the last export */
1648         args->done = 0;
1649
1650         /* Specify the target we are exporting */
1651         args->target = target;
1652
1653         if (ctx->type == PIPE_SHADER_FRAGMENT) {
1654                 const struct si_shader_key *key = &ctx->shader->key;
1655                 unsigned col_formats = key->part.ps.epilog.spi_shader_col_format;
1656                 int cbuf = target - V_008DFC_SQ_EXP_MRT;
1657
1658                 assert(cbuf >= 0 && cbuf < 8);
1659                 spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
1660                 is_int8 = (key->part.ps.epilog.color_is_int8 >> cbuf) & 0x1;
1661                 is_int10 = (key->part.ps.epilog.color_is_int10 >> cbuf) & 0x1;
1662         }
1663
1664         args->compr = false;
1665         args->out[0] = f32undef;
1666         args->out[1] = f32undef;
1667         args->out[2] = f32undef;
1668         args->out[3] = f32undef;
1669
1670         LLVMValueRef (*packf)(struct ac_llvm_context *ctx, LLVMValueRef args[2]) = NULL;
1671         LLVMValueRef (*packi)(struct ac_llvm_context *ctx, LLVMValueRef args[2],
1672                               unsigned bits, bool hi) = NULL;
1673
1674         switch (spi_shader_col_format) {
1675         case V_028714_SPI_SHADER_ZERO:
1676                 args->enabled_channels = 0; /* writemask */
1677                 args->target = V_008DFC_SQ_EXP_NULL;
1678                 break;
1679
1680         case V_028714_SPI_SHADER_32_R:
1681                 args->enabled_channels = 1; /* writemask */
1682                 args->out[0] = values[0];
1683                 break;
1684
1685         case V_028714_SPI_SHADER_32_GR:
1686                 args->enabled_channels = 0x3; /* writemask */
1687                 args->out[0] = values[0];
1688                 args->out[1] = values[1];
1689                 break;
1690
1691         case V_028714_SPI_SHADER_32_AR:
1692                 if (ctx->screen->info.chip_class >= GFX10) {
1693                         args->enabled_channels = 0x3; /* writemask */
1694                         args->out[0] = values[0];
1695                         args->out[1] = values[3];
1696                 } else {
1697                         args->enabled_channels = 0x9; /* writemask */
1698                         args->out[0] = values[0];
1699                         args->out[3] = values[3];
1700                 }
1701                 break;
1702
1703         case V_028714_SPI_SHADER_FP16_ABGR:
1704                 packf = ac_build_cvt_pkrtz_f16;
1705                 break;
1706
1707         case V_028714_SPI_SHADER_UNORM16_ABGR:
1708                 packf = ac_build_cvt_pknorm_u16;
1709                 break;
1710
1711         case V_028714_SPI_SHADER_SNORM16_ABGR:
1712                 packf = ac_build_cvt_pknorm_i16;
1713                 break;
1714
1715         case V_028714_SPI_SHADER_UINT16_ABGR:
1716                 packi = ac_build_cvt_pk_u16;
1717                 break;
1718
1719         case V_028714_SPI_SHADER_SINT16_ABGR:
1720                 packi = ac_build_cvt_pk_i16;
1721                 break;
1722
1723         case V_028714_SPI_SHADER_32_ABGR:
1724                 memcpy(&args->out[0], values, sizeof(values[0]) * 4);
1725                 break;
1726         }
1727
1728         /* Pack f16 or norm_i16/u16. */
1729         if (packf) {
1730                 for (chan = 0; chan < 2; chan++) {
1731                         LLVMValueRef pack_args[2] = {
1732                                 values[2 * chan],
1733                                 values[2 * chan + 1]
1734                         };
1735                         LLVMValueRef packed;
1736
1737                         packed = packf(&ctx->ac, pack_args);
1738                         args->out[chan] = ac_to_float(&ctx->ac, packed);
1739                 }
1740                 args->compr = 1; /* COMPR flag */
1741         }
1742         /* Pack i16/u16. */
1743         if (packi) {
1744                 for (chan = 0; chan < 2; chan++) {
1745                         LLVMValueRef pack_args[2] = {
1746                                 ac_to_integer(&ctx->ac, values[2 * chan]),
1747                                 ac_to_integer(&ctx->ac, values[2 * chan + 1])
1748                         };
1749                         LLVMValueRef packed;
1750
1751                         packed = packi(&ctx->ac, pack_args,
1752                                        is_int8 ? 8 : is_int10 ? 10 : 16,
1753                                        chan == 1);
1754                         args->out[chan] = ac_to_float(&ctx->ac, packed);
1755                 }
1756                 args->compr = 1; /* COMPR flag */
1757         }
1758 }
1759
1760 static void si_alpha_test(struct si_shader_context *ctx, LLVMValueRef alpha)
1761 {
1762         if (ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_NEVER) {
1763                 static LLVMRealPredicate cond_map[PIPE_FUNC_ALWAYS + 1] = {
1764                         [PIPE_FUNC_LESS] = LLVMRealOLT,
1765                         [PIPE_FUNC_EQUAL] = LLVMRealOEQ,
1766                         [PIPE_FUNC_LEQUAL] = LLVMRealOLE,
1767                         [PIPE_FUNC_GREATER] = LLVMRealOGT,
1768                         [PIPE_FUNC_NOTEQUAL] = LLVMRealONE,
1769                         [PIPE_FUNC_GEQUAL] = LLVMRealOGE,
1770                 };
1771                 LLVMRealPredicate cond = cond_map[ctx->shader->key.part.ps.epilog.alpha_func];
1772                 assert(cond);
1773
1774                 LLVMValueRef alpha_ref = LLVMGetParam(ctx->main_fn,
1775                                 SI_PARAM_ALPHA_REF);
1776                 LLVMValueRef alpha_pass =
1777                         LLVMBuildFCmp(ctx->ac.builder, cond, alpha, alpha_ref, "");
1778                 ac_build_kill_if_false(&ctx->ac, alpha_pass);
1779         } else {
1780                 ac_build_kill_if_false(&ctx->ac, ctx->i1false);
1781         }
1782 }
1783
1784 static LLVMValueRef si_scale_alpha_by_sample_mask(struct si_shader_context *ctx,
1785                                                   LLVMValueRef alpha,
1786                                                   unsigned samplemask_param)
1787 {
1788         LLVMValueRef coverage;
1789
1790         /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
1791         coverage = LLVMGetParam(ctx->main_fn,
1792                                 samplemask_param);
1793         coverage = ac_to_integer(&ctx->ac, coverage);
1794
1795         coverage = ac_build_intrinsic(&ctx->ac, "llvm.ctpop.i32",
1796                                    ctx->i32,
1797                                    &coverage, 1, AC_FUNC_ATTR_READNONE);
1798
1799         coverage = LLVMBuildUIToFP(ctx->ac.builder, coverage,
1800                                    ctx->f32, "");
1801
1802         coverage = LLVMBuildFMul(ctx->ac.builder, coverage,
1803                                  LLVMConstReal(ctx->f32,
1804                                         1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
1805
1806         return LLVMBuildFMul(ctx->ac.builder, alpha, coverage, "");
1807 }
1808
1809 static void si_llvm_emit_clipvertex(struct si_shader_context *ctx,
1810                                     struct ac_export_args *pos, LLVMValueRef *out_elts)
1811 {
1812         unsigned reg_index;
1813         unsigned chan;
1814         unsigned const_chan;
1815         LLVMValueRef base_elt;
1816         LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
1817         LLVMValueRef constbuf_index = LLVMConstInt(ctx->i32,
1818                                                    SI_VS_CONST_CLIP_PLANES, 0);
1819         LLVMValueRef const_resource = ac_build_load_to_sgpr(&ctx->ac, ptr, constbuf_index);
1820
1821         for (reg_index = 0; reg_index < 2; reg_index ++) {
1822                 struct ac_export_args *args = &pos[2 + reg_index];
1823
1824                 args->out[0] =
1825                 args->out[1] =
1826                 args->out[2] =
1827                 args->out[3] = LLVMConstReal(ctx->f32, 0.0f);
1828
1829                 /* Compute dot products of position and user clip plane vectors */
1830                 for (chan = 0; chan < 4; chan++) {
1831                         for (const_chan = 0; const_chan < 4; const_chan++) {
1832                                 LLVMValueRef addr =
1833                                         LLVMConstInt(ctx->i32, ((reg_index * 4 + chan) * 4 +
1834                                                                 const_chan) * 4, 0);
1835                                 base_elt = buffer_load_const(ctx, const_resource,
1836                                                              addr);
1837                                 args->out[chan] = ac_build_fmad(&ctx->ac, base_elt,
1838                                                                 out_elts[const_chan], args->out[chan]);
1839                         }
1840                 }
1841
1842                 args->enabled_channels = 0xf;
1843                 args->valid_mask = 0;
1844                 args->done = 0;
1845                 args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index;
1846                 args->compr = 0;
1847         }
1848 }
1849
1850 static void si_dump_streamout(struct pipe_stream_output_info *so)
1851 {
1852         unsigned i;
1853
1854         if (so->num_outputs)
1855                 fprintf(stderr, "STREAMOUT\n");
1856
1857         for (i = 0; i < so->num_outputs; i++) {
1858                 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
1859                                 so->output[i].start_component;
1860                 fprintf(stderr, "  %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
1861                         i, so->output[i].output_buffer,
1862                         so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
1863                         so->output[i].register_index,
1864                         mask & 1 ? "x" : "",
1865                         mask & 2 ? "y" : "",
1866                         mask & 4 ? "z" : "",
1867                         mask & 8 ? "w" : "");
1868         }
1869 }
1870
1871 void si_emit_streamout_output(struct si_shader_context *ctx,
1872                               LLVMValueRef const *so_buffers,
1873                               LLVMValueRef const *so_write_offsets,
1874                               struct pipe_stream_output *stream_out,
1875                               struct si_shader_output_values *shader_out)
1876 {
1877         unsigned buf_idx = stream_out->output_buffer;
1878         unsigned start = stream_out->start_component;
1879         unsigned num_comps = stream_out->num_components;
1880         LLVMValueRef out[4];
1881
1882         assert(num_comps && num_comps <= 4);
1883         if (!num_comps || num_comps > 4)
1884                 return;
1885
1886         /* Load the output as int. */
1887         for (int j = 0; j < num_comps; j++) {
1888                 assert(stream_out->stream == shader_out->vertex_stream[start + j]);
1889
1890                 out[j] = ac_to_integer(&ctx->ac, shader_out->values[start + j]);
1891         }
1892
1893         /* Pack the output. */
1894         LLVMValueRef vdata = NULL;
1895
1896         switch (num_comps) {
1897         case 1: /* as i32 */
1898                 vdata = out[0];
1899                 break;
1900         case 2: /* as v2i32 */
1901         case 3: /* as v3i32 */
1902                 if (ac_has_vec3_support(ctx->screen->info.chip_class, false)) {
1903                         vdata = ac_build_gather_values(&ctx->ac, out, num_comps);
1904                         break;
1905                 }
1906                 /* as v4i32 (aligned to 4) */
1907                 out[3] = LLVMGetUndef(ctx->i32);
1908                 /* fall through */
1909         case 4: /* as v4i32 */
1910                 vdata = ac_build_gather_values(&ctx->ac, out, util_next_power_of_two(num_comps));
1911                 break;
1912         }
1913
1914         ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx],
1915                                     vdata, num_comps,
1916                                     so_write_offsets[buf_idx],
1917                                     ctx->i32_0,
1918                                     stream_out->dst_offset * 4, ac_glc | ac_slc);
1919 }
1920
1921 /**
1922  * Write streamout data to buffers for vertex stream @p stream (different
1923  * vertex streams can occur for GS copy shaders).
1924  */
1925 static void si_llvm_emit_streamout(struct si_shader_context *ctx,
1926                                    struct si_shader_output_values *outputs,
1927                                    unsigned noutput, unsigned stream)
1928 {
1929         struct si_shader_selector *sel = ctx->shader->selector;
1930         struct pipe_stream_output_info *so = &sel->so;
1931         LLVMBuilderRef builder = ctx->ac.builder;
1932         int i;
1933
1934         /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
1935         LLVMValueRef so_vtx_count =
1936                 si_unpack_param(ctx, ctx->streamout_config, 16, 7);
1937
1938         LLVMValueRef tid = ac_get_thread_id(&ctx->ac);
1939
1940         /* can_emit = tid < so_vtx_count; */
1941         LLVMValueRef can_emit =
1942                 LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");
1943
1944         /* Emit the streamout code conditionally. This actually avoids
1945          * out-of-bounds buffer access. The hw tells us via the SGPR
1946          * (so_vtx_count) which threads are allowed to emit streamout data. */
1947         ac_build_ifcc(&ctx->ac, can_emit, 6501);
1948         {
1949                 /* The buffer offset is computed as follows:
1950                  *   ByteOffset = streamout_offset[buffer_id]*4 +
1951                  *                (streamout_write_index + thread_id)*stride[buffer_id] +
1952                  *                attrib_offset
1953                  */
1954
1955                 LLVMValueRef so_write_index =
1956                         ac_get_arg(&ctx->ac,
1957                                    ctx->streamout_write_index);
1958
1959                 /* Compute (streamout_write_index + thread_id). */
1960                 so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");
1961
1962                 /* Load the descriptor and compute the write offset for each
1963                  * enabled buffer. */
1964                 LLVMValueRef so_write_offset[4] = {};
1965                 LLVMValueRef so_buffers[4];
1966                 LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac,
1967                                                   ctx->rw_buffers);
1968
1969                 for (i = 0; i < 4; i++) {
1970                         if (!so->stride[i])
1971                                 continue;
1972
1973                         LLVMValueRef offset = LLVMConstInt(ctx->i32,
1974                                                            SI_VS_STREAMOUT_BUF0 + i, 0);
1975
1976                         so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
1977
1978                         LLVMValueRef so_offset = ac_get_arg(&ctx->ac,
1979                                                             ctx->streamout_offset[i]);
1980                         so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), "");
1981
1982                         so_write_offset[i] = ac_build_imad(&ctx->ac, so_write_index,
1983                                                            LLVMConstInt(ctx->i32, so->stride[i]*4, 0),
1984                                                            so_offset);
1985                 }
1986
1987                 /* Write streamout data. */
1988                 for (i = 0; i < so->num_outputs; i++) {
1989                         unsigned reg = so->output[i].register_index;
1990
1991                         if (reg >= noutput)
1992                                 continue;
1993
1994                         if (stream != so->output[i].stream)
1995                                 continue;
1996
1997                         si_emit_streamout_output(ctx, so_buffers, so_write_offset,
1998                                                  &so->output[i], &outputs[reg]);
1999                 }
2000         }
2001         ac_build_endif(&ctx->ac, 6501);
2002 }
2003
2004 static void si_export_param(struct si_shader_context *ctx, unsigned index,
2005                             LLVMValueRef *values)
2006 {
2007         struct ac_export_args args;
2008
2009         si_llvm_init_export_args(ctx, values,
2010                                  V_008DFC_SQ_EXP_PARAM + index, &args);
2011         ac_build_export(&ctx->ac, &args);
2012 }
2013
2014 static void si_build_param_exports(struct si_shader_context *ctx,
2015                                    struct si_shader_output_values *outputs,
2016                                    unsigned noutput)
2017 {
2018         struct si_shader *shader = ctx->shader;
2019         unsigned param_count = 0;
2020
2021         for (unsigned i = 0; i < noutput; i++) {
2022                 unsigned semantic_name = outputs[i].semantic_name;
2023                 unsigned semantic_index = outputs[i].semantic_index;
2024
2025                 if (outputs[i].vertex_stream[0] != 0 &&
2026                     outputs[i].vertex_stream[1] != 0 &&
2027                     outputs[i].vertex_stream[2] != 0 &&
2028                     outputs[i].vertex_stream[3] != 0)
2029                         continue;
2030
2031                 switch (semantic_name) {
2032                 case TGSI_SEMANTIC_LAYER:
2033                 case TGSI_SEMANTIC_VIEWPORT_INDEX:
2034                 case TGSI_SEMANTIC_CLIPDIST:
2035                 case TGSI_SEMANTIC_COLOR:
2036                 case TGSI_SEMANTIC_BCOLOR:
2037                 case TGSI_SEMANTIC_PRIMID:
2038                 case TGSI_SEMANTIC_FOG:
2039                 case TGSI_SEMANTIC_TEXCOORD:
2040                 case TGSI_SEMANTIC_GENERIC:
2041                         break;
2042                 default:
2043                         continue;
2044                 }
2045
2046                 if ((semantic_name != TGSI_SEMANTIC_GENERIC ||
2047                      semantic_index < SI_MAX_IO_GENERIC) &&
2048                     shader->key.opt.kill_outputs &
2049                     (1ull << si_shader_io_get_unique_index(semantic_name,
2050                                                            semantic_index, true)))
2051                         continue;
2052
2053                 si_export_param(ctx, param_count, outputs[i].values);
2054
2055                 assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
2056                 shader->info.vs_output_param_offset[i] = param_count++;
2057         }
2058
2059         shader->info.nr_param_exports = param_count;
2060 }
2061
2062 /**
2063  * Vertex color clamping.
2064  *
2065  * This uses a state constant loaded in a user data SGPR and
2066  * an IF statement is added that clamps all colors if the constant
2067  * is true.
2068  */
2069 static void si_vertex_color_clamping(struct si_shader_context *ctx,
2070                                      struct si_shader_output_values *outputs,
2071                                      unsigned noutput)
2072 {
2073         LLVMValueRef addr[SI_MAX_VS_OUTPUTS][4];
2074         bool has_colors = false;
2075
2076         /* Store original colors to alloca variables. */
2077         for (unsigned i = 0; i < noutput; i++) {
2078                 if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR &&
2079                     outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR)
2080                         continue;
2081
2082                 for (unsigned j = 0; j < 4; j++) {
2083                         addr[i][j] = ac_build_alloca_undef(&ctx->ac, ctx->f32, "");
2084                         LLVMBuildStore(ctx->ac.builder, outputs[i].values[j], addr[i][j]);
2085                 }
2086                 has_colors = true;
2087         }
2088
2089         if (!has_colors)
2090                 return;
2091
2092         /* The state is in the first bit of the user SGPR. */
2093         LLVMValueRef cond = ac_get_arg(&ctx->ac, ctx->vs_state_bits);
2094         cond = LLVMBuildTrunc(ctx->ac.builder, cond, ctx->i1, "");
2095
2096         ac_build_ifcc(&ctx->ac, cond, 6502);
2097
2098         /* Store clamped colors to alloca variables within the conditional block. */
2099         for (unsigned i = 0; i < noutput; i++) {
2100                 if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR &&
2101                     outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR)
2102                         continue;
2103
2104                 for (unsigned j = 0; j < 4; j++) {
2105                         LLVMBuildStore(ctx->ac.builder,
2106                                        ac_build_clamp(&ctx->ac, outputs[i].values[j]),
2107                                        addr[i][j]);
2108                 }
2109         }
2110         ac_build_endif(&ctx->ac, 6502);
2111
2112         /* Load clamped colors */
2113         for (unsigned i = 0; i < noutput; i++) {
2114                 if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR &&
2115                     outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR)
2116                         continue;
2117
2118                 for (unsigned j = 0; j < 4; j++) {
2119                         outputs[i].values[j] =
2120                                 LLVMBuildLoad(ctx->ac.builder, addr[i][j], "");
2121                 }
2122         }
2123 }
2124
2125 /* Generate export instructions for hardware VS shader stage or NGG GS stage
2126  * (position and parameter data only).
2127  */
2128 void si_llvm_export_vs(struct si_shader_context *ctx,
2129                        struct si_shader_output_values *outputs,
2130                        unsigned noutput)
2131 {
2132         struct si_shader *shader = ctx->shader;
2133         struct ac_export_args pos_args[4] = {};
2134         LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
2135         unsigned pos_idx;
2136         int i;
2137
2138         si_vertex_color_clamping(ctx, outputs, noutput);
2139
2140         /* Build position exports. */
2141         for (i = 0; i < noutput; i++) {
2142                 switch (outputs[i].semantic_name) {
2143                 case TGSI_SEMANTIC_POSITION:
2144                         si_llvm_init_export_args(ctx, outputs[i].values,
2145                                                  V_008DFC_SQ_EXP_POS, &pos_args[0]);
2146                         break;
2147                 case TGSI_SEMANTIC_PSIZE:
2148                         psize_value = outputs[i].values[0];
2149                         break;
2150                 case TGSI_SEMANTIC_LAYER:
2151                         layer_value = outputs[i].values[0];
2152                         break;
2153                 case TGSI_SEMANTIC_VIEWPORT_INDEX:
2154                         viewport_index_value = outputs[i].values[0];
2155                         break;
2156                 case TGSI_SEMANTIC_EDGEFLAG:
2157                         edgeflag_value = outputs[i].values[0];
2158                         break;
2159                 case TGSI_SEMANTIC_CLIPDIST:
2160                         if (!shader->key.opt.clip_disable) {
2161                                 unsigned index = 2 + outputs[i].semantic_index;
2162                                 si_llvm_init_export_args(ctx, outputs[i].values,
2163                                                          V_008DFC_SQ_EXP_POS + index,
2164                                                          &pos_args[index]);
2165                         }
2166                         break;
2167                 case TGSI_SEMANTIC_CLIPVERTEX:
2168                         if (!shader->key.opt.clip_disable) {
2169                                 si_llvm_emit_clipvertex(ctx, pos_args,
2170                                                         outputs[i].values);
2171                         }
2172                         break;
2173                 }
2174         }
2175
2176         /* We need to add the position output manually if it's missing. */
2177         if (!pos_args[0].out[0]) {
2178                 pos_args[0].enabled_channels = 0xf; /* writemask */
2179                 pos_args[0].valid_mask = 0; /* EXEC mask */
2180                 pos_args[0].done = 0; /* last export? */
2181                 pos_args[0].target = V_008DFC_SQ_EXP_POS;
2182                 pos_args[0].compr = 0; /* COMPR flag */
2183                 pos_args[0].out[0] = ctx->ac.f32_0; /* X */
2184                 pos_args[0].out[1] = ctx->ac.f32_0; /* Y */
2185                 pos_args[0].out[2] = ctx->ac.f32_0; /* Z */
2186                 pos_args[0].out[3] = ctx->ac.f32_1;  /* W */
2187         }
2188
2189         bool pos_writes_edgeflag = shader->selector->info.writes_edgeflag &&
2190                                    !shader->key.as_ngg;
2191
2192         /* Write the misc vector (point size, edgeflag, layer, viewport). */
2193         if (shader->selector->info.writes_psize ||
2194             pos_writes_edgeflag ||
2195             shader->selector->info.writes_viewport_index ||
2196             shader->selector->info.writes_layer) {
2197                 pos_args[1].enabled_channels = shader->selector->info.writes_psize |
2198                                                (pos_writes_edgeflag << 1) |
2199                                                (shader->selector->info.writes_layer << 2);
2200
2201                 pos_args[1].valid_mask = 0; /* EXEC mask */
2202                 pos_args[1].done = 0; /* last export? */
2203                 pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
2204                 pos_args[1].compr = 0; /* COMPR flag */
2205                 pos_args[1].out[0] = ctx->ac.f32_0; /* X */
2206                 pos_args[1].out[1] = ctx->ac.f32_0; /* Y */
2207                 pos_args[1].out[2] = ctx->ac.f32_0; /* Z */
2208                 pos_args[1].out[3] = ctx->ac.f32_0; /* W */
2209
2210                 if (shader->selector->info.writes_psize)
2211                         pos_args[1].out[0] = psize_value;
2212
2213                 if (pos_writes_edgeflag) {
2214                         /* The output is a float, but the hw expects an integer
2215                          * with the first bit containing the edge flag. */
2216                         edgeflag_value = LLVMBuildFPToUI(ctx->ac.builder,
2217                                                          edgeflag_value,
2218                                                          ctx->i32, "");
2219                         edgeflag_value = ac_build_umin(&ctx->ac,
2220                                                       edgeflag_value,
2221                                                       ctx->i32_1);
2222
2223                         /* The LLVM intrinsic expects a float. */
2224                         pos_args[1].out[1] = ac_to_float(&ctx->ac, edgeflag_value);
2225                 }
2226
2227                 if (ctx->screen->info.chip_class >= GFX9) {
2228                         /* GFX9 has the layer in out.z[10:0] and the viewport
2229                          * index in out.z[19:16].
2230                          */
2231                         if (shader->selector->info.writes_layer)
2232                                 pos_args[1].out[2] = layer_value;
2233
2234                         if (shader->selector->info.writes_viewport_index) {
2235                                 LLVMValueRef v = viewport_index_value;
2236
2237                                 v = ac_to_integer(&ctx->ac, v);
2238                                 v = LLVMBuildShl(ctx->ac.builder, v,
2239                                                  LLVMConstInt(ctx->i32, 16, 0), "");
2240                                 v = LLVMBuildOr(ctx->ac.builder, v,
2241                                                 ac_to_integer(&ctx->ac,  pos_args[1].out[2]), "");
2242                                 pos_args[1].out[2] = ac_to_float(&ctx->ac, v);
2243                                 pos_args[1].enabled_channels |= 1 << 2;
2244                         }
2245                 } else {
2246                         if (shader->selector->info.writes_layer)
2247                                 pos_args[1].out[2] = layer_value;
2248
2249                         if (shader->selector->info.writes_viewport_index) {
2250                                 pos_args[1].out[3] = viewport_index_value;
2251                                 pos_args[1].enabled_channels |= 1 << 3;
2252                         }
2253                 }
2254         }
2255
2256         for (i = 0; i < 4; i++)
2257                 if (pos_args[i].out[0])
2258                         shader->info.nr_pos_exports++;
2259
2260         /* Navi10-14 skip POS0 exports if EXEC=0 and DONE=0, causing a hang.
2261          * Setting valid_mask=1 prevents it and has no other effect.
2262          */
2263         if (ctx->screen->info.family == CHIP_NAVI10 ||
2264             ctx->screen->info.family == CHIP_NAVI12 ||
2265             ctx->screen->info.family == CHIP_NAVI14)
2266                 pos_args[0].valid_mask = 1;
2267
2268         pos_idx = 0;
2269         for (i = 0; i < 4; i++) {
2270                 if (!pos_args[i].out[0])
2271                         continue;
2272
2273                 /* Specify the target we are exporting */
2274                 pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;
2275
2276                 if (pos_idx == shader->info.nr_pos_exports)
2277                         /* Specify that this is the last export */
2278                         pos_args[i].done = 1;
2279
2280                 ac_build_export(&ctx->ac, &pos_args[i]);
2281         }
2282
2283         /* Build parameter exports. */
2284         si_build_param_exports(ctx, outputs, noutput);
2285 }
2286
2287 /**
2288  * Forward all outputs from the vertex shader to the TES. This is only used
2289  * for the fixed function TCS.
2290  */
2291 static void si_copy_tcs_inputs(struct si_shader_context *ctx)
2292 {
2293         LLVMValueRef invocation_id, buffer, buffer_offset;
2294         LLVMValueRef lds_vertex_stride, lds_base;
2295         uint64_t inputs;
2296
2297         invocation_id = si_unpack_param(ctx, ctx->args.tcs_rel_ids, 8, 5);
2298         buffer = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS);
2299         buffer_offset = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset);
2300
2301         lds_vertex_stride = get_tcs_in_vertex_dw_stride(ctx);
2302         lds_base = get_tcs_in_current_patch_offset(ctx);
2303         lds_base = ac_build_imad(&ctx->ac, invocation_id, lds_vertex_stride,
2304                                  lds_base);
2305
2306         inputs = ctx->shader->key.mono.u.ff_tcs_inputs_to_copy;
2307         while (inputs) {
2308                 unsigned i = u_bit_scan64(&inputs);
2309
2310                 LLVMValueRef lds_ptr = LLVMBuildAdd(ctx->ac.builder, lds_base,
2311                                             LLVMConstInt(ctx->i32, 4 * i, 0),
2312                                              "");
2313
2314                 LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx,
2315                                               get_rel_patch_id(ctx),
2316                                               invocation_id,
2317                                               LLVMConstInt(ctx->i32, i, 0));
2318
2319                 LLVMValueRef value = lshs_lds_load(ctx, ctx->ac.i32, ~0, lds_ptr);
2320
2321                 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buffer_addr,
2322                                             buffer_offset, 0, ac_glc);
2323         }
2324 }
2325
2326 static void si_write_tess_factors(struct si_shader_context *ctx,
2327                                   LLVMValueRef rel_patch_id,
2328                                   LLVMValueRef invocation_id,
2329                                   LLVMValueRef tcs_out_current_patch_data_offset,
2330                                   LLVMValueRef invoc0_tf_outer[4],
2331                                   LLVMValueRef invoc0_tf_inner[2])
2332 {
2333         struct si_shader *shader = ctx->shader;
2334         unsigned tess_inner_index, tess_outer_index;
2335         LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
2336         LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4];
2337         unsigned stride, outer_comps, inner_comps, i, offset;
2338
2339         /* Add a barrier before loading tess factors from LDS. */
2340         if (!shader->key.part.tcs.epilog.invoc0_tess_factors_are_def)
2341                 si_llvm_emit_barrier(ctx);
2342
2343         /* Do this only for invocation 0, because the tess levels are per-patch,
2344          * not per-vertex.
2345          *
2346          * This can't jump, because invocation 0 executes this. It should
2347          * at least mask out the loads and stores for other invocations.
2348          */
2349         ac_build_ifcc(&ctx->ac,
2350                       LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ,
2351                                     invocation_id, ctx->i32_0, ""), 6503);
2352
2353         /* Determine the layout of one tess factor element in the buffer. */
2354         switch (shader->key.part.tcs.epilog.prim_mode) {
2355         case PIPE_PRIM_LINES:
2356                 stride = 2; /* 2 dwords, 1 vec2 store */
2357                 outer_comps = 2;
2358                 inner_comps = 0;
2359                 break;
2360         case PIPE_PRIM_TRIANGLES:
2361                 stride = 4; /* 4 dwords, 1 vec4 store */
2362                 outer_comps = 3;
2363                 inner_comps = 1;
2364                 break;
2365         case PIPE_PRIM_QUADS:
2366                 stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
2367                 outer_comps = 4;
2368                 inner_comps = 2;
2369                 break;
2370         default:
2371                 assert(0);
2372                 return;
2373         }
2374
2375         for (i = 0; i < 4; i++) {
2376                 inner[i] = LLVMGetUndef(ctx->i32);
2377                 outer[i] = LLVMGetUndef(ctx->i32);
2378         }
2379
2380         if (shader->key.part.tcs.epilog.invoc0_tess_factors_are_def) {
2381                 /* Tess factors are in VGPRs. */
2382                 for (i = 0; i < outer_comps; i++)
2383                         outer[i] = out[i] = invoc0_tf_outer[i];
2384                 for (i = 0; i < inner_comps; i++)
2385                         inner[i] = out[outer_comps+i] = invoc0_tf_inner[i];
2386         } else {
2387                 /* Load tess_inner and tess_outer from LDS.
2388                  * Any invocation can write them, so we can't get them from a temporary.
2389                  */
2390                 tess_inner_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0);
2391                 tess_outer_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0);
2392
2393                 lds_base = tcs_out_current_patch_data_offset;
2394                 lds_inner = LLVMBuildAdd(ctx->ac.builder, lds_base,
2395                                          LLVMConstInt(ctx->i32,
2396                                                       tess_inner_index * 4, 0), "");
2397                 lds_outer = LLVMBuildAdd(ctx->ac.builder, lds_base,
2398                                          LLVMConstInt(ctx->i32,
2399                                                       tess_outer_index * 4, 0), "");
2400
2401                 for (i = 0; i < outer_comps; i++) {
2402                         outer[i] = out[i] =
2403                                 lshs_lds_load(ctx, ctx->ac.i32, i, lds_outer);
2404                 }
2405                 for (i = 0; i < inner_comps; i++) {
2406                         inner[i] = out[outer_comps+i] =
2407                                 lshs_lds_load(ctx, ctx->ac.i32, i, lds_inner);
2408                 }
2409         }
2410
2411         if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) {
2412                 /* For isolines, the hardware expects tess factors in the
2413                  * reverse order from what GLSL / TGSI specify.
2414                  */
2415                 LLVMValueRef tmp = out[0];
2416                 out[0] = out[1];
2417                 out[1] = tmp;
2418         }
2419
2420         /* Convert the outputs to vectors for stores. */
2421         vec0 = ac_build_gather_values(&ctx->ac, out, MIN2(stride, 4));
2422         vec1 = NULL;
2423
2424         if (stride > 4)
2425                 vec1 = ac_build_gather_values(&ctx->ac, out+4, stride - 4);
2426
2427         /* Get the buffer. */
2428         buffer = get_tess_ring_descriptor(ctx, TCS_FACTOR_RING);
2429
2430         /* Get the offset. */
2431         tf_base = ac_get_arg(&ctx->ac,
2432                              ctx->tcs_factor_offset);
2433         byteoffset = LLVMBuildMul(ctx->ac.builder, rel_patch_id,
2434                                   LLVMConstInt(ctx->i32, 4 * stride, 0), "");
2435
2436         ac_build_ifcc(&ctx->ac,
2437                       LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ,
2438                                     rel_patch_id, ctx->i32_0, ""), 6504);
2439
2440         /* Store the dynamic HS control word. */
2441         offset = 0;
2442         if (ctx->screen->info.chip_class <= GFX8) {
2443                 ac_build_buffer_store_dword(&ctx->ac, buffer,
2444                                             LLVMConstInt(ctx->i32, 0x80000000, 0),
2445                                             1, ctx->i32_0, tf_base,
2446                                             offset, ac_glc);
2447                 offset += 4;
2448         }
2449
2450         ac_build_endif(&ctx->ac, 6504);
2451
2452         /* Store the tessellation factors. */
2453         ac_build_buffer_store_dword(&ctx->ac, buffer, vec0,
2454                                     MIN2(stride, 4), byteoffset, tf_base,
2455                                     offset, ac_glc);
2456         offset += 16;
2457         if (vec1)
2458                 ac_build_buffer_store_dword(&ctx->ac, buffer, vec1,
2459                                             stride - 4, byteoffset, tf_base,
2460                                             offset, ac_glc);
2461
2462         /* Store the tess factors into the offchip buffer if TES reads them. */
2463         if (shader->key.part.tcs.epilog.tes_reads_tess_factors) {
2464                 LLVMValueRef buf, base, inner_vec, outer_vec, tf_outer_offset;
2465                 LLVMValueRef tf_inner_offset;
2466                 unsigned param_outer, param_inner;
2467
2468                 buf = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS);
2469                 base = ac_get_arg(&ctx->ac, ctx->tcs_offchip_offset);
2470
2471                 param_outer = si_shader_io_get_unique_index_patch(
2472                                       TGSI_SEMANTIC_TESSOUTER, 0);
2473                 tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
2474                                         LLVMConstInt(ctx->i32, param_outer, 0));
2475
2476                 unsigned outer_vec_size =
2477                         ac_has_vec3_support(ctx->screen->info.chip_class, false) ?
2478                                 outer_comps : util_next_power_of_two(outer_comps);
2479                 outer_vec = ac_build_gather_values(&ctx->ac, outer, outer_vec_size);
2480
2481                 ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec,
2482                                             outer_comps, tf_outer_offset,
2483                                             base, 0, ac_glc);
2484                 if (inner_comps) {
2485                         param_inner = si_shader_io_get_unique_index_patch(
2486                                               TGSI_SEMANTIC_TESSINNER, 0);
2487                         tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
2488                                         LLVMConstInt(ctx->i32, param_inner, 0));
2489
2490                         inner_vec = inner_comps == 1 ? inner[0] :
2491                                     ac_build_gather_values(&ctx->ac, inner, inner_comps);
2492                         ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec,
2493                                                     inner_comps, tf_inner_offset,
2494                                                     base, 0, ac_glc);
2495                 }
2496         }
2497
2498         ac_build_endif(&ctx->ac, 6503);
2499 }
2500
2501 static LLVMValueRef
2502 si_insert_input_ret(struct si_shader_context *ctx, LLVMValueRef ret,
2503                     struct ac_arg param, unsigned return_index)
2504 {
2505         return LLVMBuildInsertValue(ctx->ac.builder, ret,
2506                                     ac_get_arg(&ctx->ac, param),
2507                                     return_index, "");
2508 }
2509
2510 static LLVMValueRef
2511 si_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueRef ret,
2512                           struct ac_arg param, unsigned return_index)
2513 {
2514         LLVMBuilderRef builder = ctx->ac.builder;
2515         LLVMValueRef p = ac_get_arg(&ctx->ac, param);
2516
2517         return LLVMBuildInsertValue(builder, ret,
2518                                     ac_to_float(&ctx->ac, p),
2519                                     return_index, "");
2520 }
2521
2522 static LLVMValueRef
2523 si_insert_input_ptr(struct si_shader_context *ctx, LLVMValueRef ret,
2524                     struct ac_arg param, unsigned return_index)
2525 {
2526         LLVMBuilderRef builder = ctx->ac.builder;
2527         LLVMValueRef ptr = ac_get_arg(&ctx->ac, param);
2528         ptr = LLVMBuildPtrToInt(builder, ptr, ctx->i32, "");
2529         return LLVMBuildInsertValue(builder, ret, ptr, return_index, "");
2530 }
2531
2532 /* This only writes the tessellation factor levels. */
2533 static void si_llvm_emit_tcs_epilogue(struct ac_shader_abi *abi,
2534                                       unsigned max_outputs,
2535                                       LLVMValueRef *addrs)
2536 {
2537         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
2538         LLVMBuilderRef builder = ctx->ac.builder;
2539         LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;
2540
2541         si_copy_tcs_inputs(ctx);
2542
2543         rel_patch_id = get_rel_patch_id(ctx);
2544         invocation_id = si_unpack_param(ctx, ctx->args.tcs_rel_ids, 8, 5);
2545         tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);
2546
2547         if (ctx->screen->info.chip_class >= GFX9) {
2548                 LLVMBasicBlockRef blocks[2] = {
2549                         LLVMGetInsertBlock(builder),
2550                         ctx->merged_wrap_if_entry_block
2551                 };
2552                 LLVMValueRef values[2];
2553
2554                 ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
2555
2556                 values[0] = rel_patch_id;
2557                 values[1] = LLVMGetUndef(ctx->i32);
2558                 rel_patch_id = ac_build_phi(&ctx->ac, ctx->i32, 2, values, blocks);
2559
2560                 values[0] = tf_lds_offset;
2561                 values[1] = LLVMGetUndef(ctx->i32);
2562                 tf_lds_offset = ac_build_phi(&ctx->ac, ctx->i32, 2, values, blocks);
2563
2564                 values[0] = invocation_id;
2565                 values[1] = ctx->i32_1; /* cause the epilog to skip threads */
2566                 invocation_id = ac_build_phi(&ctx->ac, ctx->i32, 2, values, blocks);
2567         }
2568
2569         /* Return epilog parameters from this function. */
2570         LLVMValueRef ret = ctx->return_value;
2571         unsigned vgpr;
2572
2573         if (ctx->screen->info.chip_class >= GFX9) {
2574                 ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_layout,
2575                                           8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
2576                 ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_layout,
2577                                           8 + GFX9_SGPR_TCS_OUT_LAYOUT);
2578                 /* Tess offchip and tess factor offsets are at the beginning. */
2579                 ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset, 2);
2580                 ret = si_insert_input_ret(ctx, ret, ctx->tcs_factor_offset, 4);
2581                 vgpr = 8 + GFX9_SGPR_TCS_OUT_LAYOUT + 1;
2582         } else {
2583                 ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_layout,
2584                                           GFX6_SGPR_TCS_OFFCHIP_LAYOUT);
2585                 ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_layout,
2586                                           GFX6_SGPR_TCS_OUT_LAYOUT);
2587                 /* Tess offchip and tess factor offsets are after user SGPRs. */
2588                 ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset,
2589                                           GFX6_TCS_NUM_USER_SGPR);
2590                 ret = si_insert_input_ret(ctx, ret, ctx->tcs_factor_offset,
2591                                           GFX6_TCS_NUM_USER_SGPR + 1);
2592                 vgpr = GFX6_TCS_NUM_USER_SGPR + 2;
2593         }
2594
2595         /* VGPRs */
2596         rel_patch_id = ac_to_float(&ctx->ac, rel_patch_id);
2597         invocation_id = ac_to_float(&ctx->ac, invocation_id);
2598         tf_lds_offset = ac_to_float(&ctx->ac, tf_lds_offset);
2599
2600         /* Leave a hole corresponding to the two input VGPRs. This ensures that
2601          * the invocation_id output does not alias the tcs_rel_ids input,
2602          * which saves a V_MOV on gfx9.
2603          */
2604         vgpr += 2;
2605
2606         ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
2607         ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
2608
2609         if (ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs) {
2610                 vgpr++; /* skip the tess factor LDS offset */
2611                 for (unsigned i = 0; i < 6; i++) {
2612                         LLVMValueRef value =
2613                                 LLVMBuildLoad(builder, ctx->invoc0_tess_factors[i], "");
2614                         value = ac_to_float(&ctx->ac, value);
2615                         ret = LLVMBuildInsertValue(builder, ret, value, vgpr++, "");
2616                 }
2617         } else {
2618                 ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
2619         }
2620         ctx->return_value = ret;
2621 }
2622
2623 /* Pass TCS inputs from LS to TCS on GFX9. */
2624 static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx)
2625 {
2626         LLVMValueRef ret = ctx->return_value;
2627
2628         ret = si_insert_input_ptr(ctx, ret, ctx->other_const_and_shader_buffers, 0);
2629         ret = si_insert_input_ptr(ctx, ret, ctx->other_samplers_and_images, 1);
2630         ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset, 2);
2631         ret = si_insert_input_ret(ctx, ret, ctx->merged_wave_info, 3);
2632         ret = si_insert_input_ret(ctx, ret, ctx->tcs_factor_offset, 4);
2633         ret = si_insert_input_ret(ctx, ret, ctx->merged_scratch_offset, 5);
2634
2635         ret = si_insert_input_ptr(ctx, ret, ctx->rw_buffers,
2636                                   8 + SI_SGPR_RW_BUFFERS);
2637         ret = si_insert_input_ptr(ctx, ret,
2638                                   ctx->bindless_samplers_and_images,
2639                                   8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
2640
2641         ret = si_insert_input_ret(ctx, ret, ctx->vs_state_bits,
2642                                   8 + SI_SGPR_VS_STATE_BITS);
2643
2644         ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_layout,
2645                                   8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
2646         ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_offsets,
2647                                   8 + GFX9_SGPR_TCS_OUT_OFFSETS);
2648         ret = si_insert_input_ret(ctx, ret, ctx->tcs_out_lds_layout,
2649                                   8 + GFX9_SGPR_TCS_OUT_LAYOUT);
2650
2651         unsigned vgpr = 8 + GFX9_TCS_NUM_USER_SGPR;
2652         ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
2653                                    ac_to_float(&ctx->ac,
2654                                                ac_get_arg(&ctx->ac, ctx->args.tcs_patch_id)),
2655                                    vgpr++, "");
2656         ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
2657                                    ac_to_float(&ctx->ac,
2658                                                ac_get_arg(&ctx->ac, ctx->args.tcs_rel_ids)),
2659                                    vgpr++, "");
2660         ctx->return_value = ret;
2661 }
2662
2663 /* Pass GS inputs from ES to GS on GFX9. */
2664 static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
2665 {
2666         LLVMValueRef ret = ctx->return_value;
2667
2668         ret = si_insert_input_ptr(ctx, ret, ctx->other_const_and_shader_buffers, 0);
2669         ret = si_insert_input_ptr(ctx, ret, ctx->other_samplers_and_images, 1);
2670         if (ctx->shader->key.as_ngg)
2671                 ret = si_insert_input_ptr(ctx, ret, ctx->gs_tg_info, 2);
2672         else
2673                 ret = si_insert_input_ret(ctx, ret, ctx->gs2vs_offset, 2);
2674         ret = si_insert_input_ret(ctx, ret, ctx->merged_wave_info, 3);
2675         ret = si_insert_input_ret(ctx, ret, ctx->merged_scratch_offset, 5);
2676
2677         ret = si_insert_input_ptr(ctx, ret, ctx->rw_buffers,
2678                                   8 + SI_SGPR_RW_BUFFERS);
2679         ret = si_insert_input_ptr(ctx, ret,
2680                                   ctx->bindless_samplers_and_images,
2681                                   8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
2682         if (ctx->screen->use_ngg) {
2683                 ret = si_insert_input_ptr(ctx, ret, ctx->vs_state_bits,
2684                                           8 + SI_SGPR_VS_STATE_BITS);
2685         }
2686
2687         unsigned vgpr;
2688         if (ctx->type == PIPE_SHADER_VERTEX)
2689                 vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR;
2690         else
2691                 vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR;
2692
2693         ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx01_offset, vgpr++);
2694         ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx23_offset, vgpr++);
2695         ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_prim_id, vgpr++);
2696         ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_invocation_id, vgpr++);
2697         ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx45_offset, vgpr++);
2698         ctx->return_value = ret;
2699 }
2700
2701 static void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi,
2702                                      unsigned max_outputs,
2703                                      LLVMValueRef *addrs)
2704 {
2705         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
2706         struct si_shader *shader = ctx->shader;
2707         struct tgsi_shader_info *info = &shader->selector->info;
2708         unsigned i, chan;
2709         LLVMValueRef vertex_id = ac_get_arg(&ctx->ac, ctx->rel_auto_id);
2710         LLVMValueRef vertex_dw_stride = get_tcs_in_vertex_dw_stride(ctx);
2711         LLVMValueRef base_dw_addr = LLVMBuildMul(ctx->ac.builder, vertex_id,
2712                                                  vertex_dw_stride, "");
2713
2714         /* Write outputs to LDS. The next shader (TCS aka HS) will read
2715          * its inputs from it. */
2716         for (i = 0; i < info->num_outputs; i++) {
2717                 unsigned name = info->output_semantic_name[i];
2718                 unsigned index = info->output_semantic_index[i];
2719
2720                 /* The ARB_shader_viewport_layer_array spec contains the
2721                  * following issue:
2722                  *
2723                  *    2) What happens if gl_ViewportIndex or gl_Layer is
2724                  *    written in the vertex shader and a geometry shader is
2725                  *    present?
2726                  *
2727                  *    RESOLVED: The value written by the last vertex processing
2728                  *    stage is used. If the last vertex processing stage
2729                  *    (vertex, tessellation evaluation or geometry) does not
2730                  *    statically assign to gl_ViewportIndex or gl_Layer, index
2731                  *    or layer zero is assumed.
2732                  *
2733                  * So writes to those outputs in VS-as-LS are simply ignored.
2734                  */
2735                 if (name == TGSI_SEMANTIC_LAYER ||
2736                     name == TGSI_SEMANTIC_VIEWPORT_INDEX)
2737                         continue;
2738
2739                 int param = si_shader_io_get_unique_index(name, index, false);
2740                 LLVMValueRef dw_addr = LLVMBuildAdd(ctx->ac.builder, base_dw_addr,
2741                                         LLVMConstInt(ctx->i32, param * 4, 0), "");
2742
2743                 for (chan = 0; chan < 4; chan++) {
2744                         if (!(info->output_usagemask[i] & (1 << chan)))
2745                                 continue;
2746
2747                         lshs_lds_store(ctx, chan, dw_addr,
2748                                   LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""));
2749                 }
2750         }
2751
2752         if (ctx->screen->info.chip_class >= GFX9)
2753                 si_set_ls_return_value_for_tcs(ctx);
2754 }
2755
2756 static void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi,
2757                                      unsigned max_outputs,
2758                                      LLVMValueRef *addrs)
2759 {
2760         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
2761         struct si_shader *es = ctx->shader;
2762         struct tgsi_shader_info *info = &es->selector->info;
2763         LLVMValueRef lds_base = NULL;
2764         unsigned chan;
2765         int i;
2766
2767         if (ctx->screen->info.chip_class >= GFX9 && info->num_outputs) {
2768                 unsigned itemsize_dw = es->selector->esgs_itemsize / 4;
2769                 LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac);
2770                 LLVMValueRef wave_idx = si_unpack_param(ctx, ctx->merged_wave_info, 24, 4);
2771                 vertex_idx = LLVMBuildOr(ctx->ac.builder, vertex_idx,
2772                                          LLVMBuildMul(ctx->ac.builder, wave_idx,
2773                                                       LLVMConstInt(ctx->i32, ctx->ac.wave_size, false), ""), "");
2774                 lds_base = LLVMBuildMul(ctx->ac.builder, vertex_idx,
2775                                         LLVMConstInt(ctx->i32, itemsize_dw, 0), "");
2776         }
2777
2778         for (i = 0; i < info->num_outputs; i++) {
2779                 int param;
2780
2781                 if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
2782                     info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
2783                         continue;
2784
2785                 param = si_shader_io_get_unique_index(info->output_semantic_name[i],
2786                                                       info->output_semantic_index[i], false);
2787
2788                 for (chan = 0; chan < 4; chan++) {
2789                         if (!(info->output_usagemask[i] & (1 << chan)))
2790                                 continue;
2791
2792                         LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
2793                         out_val = ac_to_integer(&ctx->ac, out_val);
2794
2795                         /* GFX9 has the ESGS ring in LDS. */
2796                         if (ctx->screen->info.chip_class >= GFX9) {
2797                                 LLVMValueRef idx = LLVMConstInt(ctx->i32, param * 4 + chan, false);
2798                                 idx = LLVMBuildAdd(ctx->ac.builder, lds_base, idx, "");
2799                                 ac_build_indexed_store(&ctx->ac, ctx->esgs_ring, idx, out_val);
2800                                 continue;
2801                         }
2802
2803                         ac_build_buffer_store_dword(&ctx->ac,
2804                                                     ctx->esgs_ring,
2805                                                     out_val, 1, NULL,
2806                                                     ac_get_arg(&ctx->ac, ctx->es2gs_offset),
2807                                                     (4 * param + chan) * 4,
2808                                                     ac_glc | ac_slc | ac_swizzled);
2809                 }
2810         }
2811
2812         if (ctx->screen->info.chip_class >= GFX9)
2813                 si_set_es_return_value_for_gs(ctx);
2814 }
2815
2816 static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx)
2817 {
2818         if (ctx->screen->info.chip_class >= GFX9)
2819                 return si_unpack_param(ctx, ctx->merged_wave_info, 16, 8);
2820         else
2821                 return ac_get_arg(&ctx->ac, ctx->gs_wave_id);
2822 }
2823
2824 static void emit_gs_epilogue(struct si_shader_context *ctx)
2825 {
2826         if (ctx->shader->key.as_ngg) {
2827                 gfx10_ngg_gs_emit_epilogue(ctx);
2828                 return;
2829         }
2830
2831         if (ctx->screen->info.chip_class >= GFX10)
2832                 LLVMBuildFence(ctx->ac.builder, LLVMAtomicOrderingRelease, false, "");
2833
2834         ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE,
2835                          si_get_gs_wave_id(ctx));
2836
2837         if (ctx->screen->info.chip_class >= GFX9)
2838                 ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
2839 }
2840
2841 static void si_llvm_emit_gs_epilogue(struct ac_shader_abi *abi,
2842                                      unsigned max_outputs,
2843                                      LLVMValueRef *addrs)
2844 {
2845         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
2846         struct tgsi_shader_info UNUSED *info = &ctx->shader->selector->info;
2847
2848         assert(info->num_outputs <= max_outputs);
2849
2850         emit_gs_epilogue(ctx);
2851 }
2852
2853 static void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi,
2854                                      unsigned max_outputs,
2855                                      LLVMValueRef *addrs)
2856 {
2857         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
2858         struct tgsi_shader_info *info = &ctx->shader->selector->info;
2859         struct si_shader_output_values *outputs = NULL;
2860         int i,j;
2861
2862         assert(!ctx->shader->is_gs_copy_shader);
2863         assert(info->num_outputs <= max_outputs);
2864
2865         outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));
2866
2867         for (i = 0; i < info->num_outputs; i++) {
2868                 outputs[i].semantic_name = info->output_semantic_name[i];
2869                 outputs[i].semantic_index = info->output_semantic_index[i];
2870
2871                 for (j = 0; j < 4; j++) {
2872                         outputs[i].values[j] =
2873                                 LLVMBuildLoad(ctx->ac.builder,
2874                                               addrs[4 * i + j],
2875                                               "");
2876                         outputs[i].vertex_stream[j] =
2877                                 (info->output_streams[i] >> (2 * j)) & 3;
2878                 }
2879         }
2880
2881         if (!ctx->screen->use_ngg_streamout &&
2882             ctx->shader->selector->so.num_outputs)
2883                 si_llvm_emit_streamout(ctx, outputs, i, 0);
2884
2885         /* Export PrimitiveID. */
2886         if (ctx->shader->key.mono.u.vs_export_prim_id) {
2887                 outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID;
2888                 outputs[i].semantic_index = 0;
2889                 outputs[i].values[0] = ac_to_float(&ctx->ac, si_get_primitive_id(ctx, 0));
2890                 for (j = 1; j < 4; j++)
2891                         outputs[i].values[j] = LLVMConstReal(ctx->f32, 0);
2892
2893                 memset(outputs[i].vertex_stream, 0,
2894                        sizeof(outputs[i].vertex_stream));
2895                 i++;
2896         }
2897
2898         si_llvm_export_vs(ctx, outputs, i);
2899         FREE(outputs);
2900 }
2901
2902 static void si_llvm_emit_prim_discard_cs_epilogue(struct ac_shader_abi *abi,
2903                                                   unsigned max_outputs,
2904                                                   LLVMValueRef *addrs)
2905 {
2906         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
2907         struct tgsi_shader_info *info = &ctx->shader->selector->info;
2908         LLVMValueRef pos[4] = {};
2909
2910         assert(info->num_outputs <= max_outputs);
2911
2912         for (unsigned i = 0; i < info->num_outputs; i++) {
2913                 if (info->output_semantic_name[i] != TGSI_SEMANTIC_POSITION)
2914                         continue;
2915
2916                 for (unsigned chan = 0; chan < 4; chan++)
2917                         pos[chan] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
2918                 break;
2919         }
2920         assert(pos[0] != NULL);
2921
2922         /* Return the position output. */
2923         LLVMValueRef ret = ctx->return_value;
2924         for (unsigned chan = 0; chan < 4; chan++)
2925                 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, pos[chan], chan, "");
2926         ctx->return_value = ret;
2927 }
2928
2929 struct si_ps_exports {
2930         unsigned num;
2931         struct ac_export_args args[10];
2932 };
2933
2934 static void si_export_mrt_z(struct si_shader_context *ctx,
2935                             LLVMValueRef depth, LLVMValueRef stencil,
2936                             LLVMValueRef samplemask, struct si_ps_exports *exp)
2937 {
2938         struct ac_export_args args;
2939
2940         ac_export_mrt_z(&ctx->ac, depth, stencil, samplemask, &args);
2941
2942         memcpy(&exp->args[exp->num++], &args, sizeof(args));
2943 }
2944
2945 static void si_export_mrt_color(struct si_shader_context *ctx,
2946                                 LLVMValueRef *color, unsigned index,
2947                                 unsigned samplemask_param,
2948                                 bool is_last, struct si_ps_exports *exp)
2949 {
2950         int i;
2951
2952         /* Clamp color */
2953         if (ctx->shader->key.part.ps.epilog.clamp_color)
2954                 for (i = 0; i < 4; i++)
2955                         color[i] = ac_build_clamp(&ctx->ac, color[i]);
2956
2957         /* Alpha to one */
2958         if (ctx->shader->key.part.ps.epilog.alpha_to_one)
2959                 color[3] = ctx->ac.f32_1;
2960
2961         /* Alpha test */
2962         if (index == 0 &&
2963             ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
2964                 si_alpha_test(ctx, color[3]);
2965
2966         /* Line & polygon smoothing */
2967         if (ctx->shader->key.part.ps.epilog.poly_line_smoothing)
2968                 color[3] = si_scale_alpha_by_sample_mask(ctx, color[3],
2969                                                          samplemask_param);
2970
2971         /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
2972         if (ctx->shader->key.part.ps.epilog.last_cbuf > 0) {
2973                 struct ac_export_args args[8];
2974                 int c, last = -1;
2975
2976                 /* Get the export arguments, also find out what the last one is. */
2977                 for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
2978                         si_llvm_init_export_args(ctx, color,
2979                                                  V_008DFC_SQ_EXP_MRT + c, &args[c]);
2980                         if (args[c].enabled_channels)
2981                                 last = c;
2982                 }
2983
2984                 /* Emit all exports. */
2985                 for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
2986                         if (is_last && last == c) {
2987                                 args[c].valid_mask = 1; /* whether the EXEC mask is valid */
2988                                 args[c].done = 1; /* DONE bit */
2989                         } else if (!args[c].enabled_channels)
2990                                 continue; /* unnecessary NULL export */
2991
2992                         memcpy(&exp->args[exp->num++], &args[c], sizeof(args[c]));
2993                 }
2994         } else {
2995                 struct ac_export_args args;
2996
2997                 /* Export */
2998                 si_llvm_init_export_args(ctx, color, V_008DFC_SQ_EXP_MRT + index,
2999                                          &args);
3000                 if (is_last) {
3001                         args.valid_mask = 1; /* whether the EXEC mask is valid */
3002                         args.done = 1; /* DONE bit */
3003                 } else if (!args.enabled_channels)
3004                         return; /* unnecessary NULL export */
3005
3006                 memcpy(&exp->args[exp->num++], &args, sizeof(args));
3007         }
3008 }
3009
3010 static void si_emit_ps_exports(struct si_shader_context *ctx,
3011                                struct si_ps_exports *exp)
3012 {
3013         for (unsigned i = 0; i < exp->num; i++)
3014                 ac_build_export(&ctx->ac, &exp->args[i]);
3015 }
3016
3017 /**
3018  * Return PS outputs in this order:
3019  *
3020  * v[0:3] = color0.xyzw
3021  * v[4:7] = color1.xyzw
3022  * ...
3023  * vN+0 = Depth
3024  * vN+1 = Stencil
3025  * vN+2 = SampleMask
3026  * vN+3 = SampleMaskIn (used for OpenGL smoothing)
3027  *
3028  * The alpha-ref SGPR is returned via its original location.
3029  */
3030 static void si_llvm_return_fs_outputs(struct ac_shader_abi *abi,
3031                                       unsigned max_outputs,
3032                                       LLVMValueRef *addrs)
3033 {
3034         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
3035         struct si_shader *shader = ctx->shader;
3036         struct tgsi_shader_info *info = &shader->selector->info;
3037         LLVMBuilderRef builder = ctx->ac.builder;
3038         unsigned i, j, first_vgpr, vgpr;
3039
3040         LLVMValueRef color[8][4] = {};
3041         LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
3042         LLVMValueRef ret;
3043
3044         if (ctx->postponed_kill)
3045                 ac_build_kill_if_false(&ctx->ac, LLVMBuildLoad(builder, ctx->postponed_kill, ""));
3046
3047         /* Read the output values. */
3048         for (i = 0; i < info->num_outputs; i++) {
3049                 unsigned semantic_name = info->output_semantic_name[i];
3050                 unsigned semantic_index = info->output_semantic_index[i];
3051
3052                 switch (semantic_name) {
3053                 case TGSI_SEMANTIC_COLOR:
3054                         assert(semantic_index < 8);
3055                         for (j = 0; j < 4; j++) {
3056                                 LLVMValueRef ptr = addrs[4 * i + j];
3057                                 LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
3058                                 color[semantic_index][j] = result;
3059                         }
3060                         break;
3061                 case TGSI_SEMANTIC_POSITION:
3062                         depth = LLVMBuildLoad(builder,
3063                                               addrs[4 * i + 2], "");
3064                         break;
3065                 case TGSI_SEMANTIC_STENCIL:
3066                         stencil = LLVMBuildLoad(builder,
3067                                                 addrs[4 * i + 1], "");
3068                         break;
3069                 case TGSI_SEMANTIC_SAMPLEMASK:
3070                         samplemask = LLVMBuildLoad(builder,
3071                                                    addrs[4 * i + 0], "");
3072                         break;
3073                 default:
3074                         fprintf(stderr, "Warning: GFX6 unhandled fs output type:%d\n",
3075                                 semantic_name);
3076                 }
3077         }
3078
3079         /* Fill the return structure. */
3080         ret = ctx->return_value;
3081
3082         /* Set SGPRs. */
3083         ret = LLVMBuildInsertValue(builder, ret,
3084                                    ac_to_integer(&ctx->ac,
3085                                                  LLVMGetParam(ctx->main_fn,
3086                                                               SI_PARAM_ALPHA_REF)),
3087                                    SI_SGPR_ALPHA_REF, "");
3088
3089         /* Set VGPRs */
3090         first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
3091         for (i = 0; i < ARRAY_SIZE(color); i++) {
3092                 if (!color[i][0])
3093                         continue;
3094
3095                 for (j = 0; j < 4; j++)
3096                         ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
3097         }
3098         if (depth)
3099                 ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
3100         if (stencil)
3101                 ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
3102         if (samplemask)
3103                 ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");
3104
3105         /* Add the input sample mask for smoothing at the end. */
3106         if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
3107                 vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
3108         ret = LLVMBuildInsertValue(builder, ret,
3109                                    LLVMGetParam(ctx->main_fn,
3110                                                 SI_PARAM_SAMPLE_COVERAGE), vgpr++, "");
3111
3112         ctx->return_value = ret;
3113 }
3114
3115 /* Emit one vertex from the geometry shader */
3116 static void si_llvm_emit_vertex(struct ac_shader_abi *abi,
3117                                 unsigned stream,
3118                                 LLVMValueRef *addrs)
3119 {
3120         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
3121
3122         if (ctx->shader->key.as_ngg) {
3123                 gfx10_ngg_gs_emit_vertex(ctx, stream, addrs);
3124                 return;
3125         }
3126
3127         struct tgsi_shader_info *info = &ctx->shader->selector->info;
3128         struct si_shader *shader = ctx->shader;
3129         LLVMValueRef soffset = ac_get_arg(&ctx->ac, ctx->gs2vs_offset);
3130         LLVMValueRef gs_next_vertex;
3131         LLVMValueRef can_emit;
3132         unsigned chan, offset;
3133         int i;
3134
3135         /* Write vertex attribute values to GSVS ring */
3136         gs_next_vertex = LLVMBuildLoad(ctx->ac.builder,
3137                                        ctx->gs_next_vertex[stream],
3138                                        "");
3139
3140         /* If this thread has already emitted the declared maximum number of
3141          * vertices, skip the write: excessive vertex emissions are not
3142          * supposed to have any effect.
3143          *
3144          * If the shader has no writes to memory, kill it instead. This skips
3145          * further memory loads and may allow LLVM to skip to the end
3146          * altogether.
3147          */
3148         can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex,
3149                                  LLVMConstInt(ctx->i32,
3150                                               shader->selector->gs_max_out_vertices, 0), "");
3151
3152         bool use_kill = !info->writes_memory;
3153         if (use_kill) {
3154                 ac_build_kill_if_false(&ctx->ac, can_emit);
3155         } else {
3156                 ac_build_ifcc(&ctx->ac, can_emit, 6505);
3157         }
3158
3159         offset = 0;
3160         for (i = 0; i < info->num_outputs; i++) {
3161                 for (chan = 0; chan < 4; chan++) {
3162                         if (!(info->output_usagemask[i] & (1 << chan)) ||
3163                             ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
3164                                 continue;
3165
3166                         LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
3167                         LLVMValueRef voffset =
3168                                 LLVMConstInt(ctx->i32, offset *
3169                                              shader->selector->gs_max_out_vertices, 0);
3170                         offset++;
3171
3172                         voffset = LLVMBuildAdd(ctx->ac.builder, voffset, gs_next_vertex, "");
3173                         voffset = LLVMBuildMul(ctx->ac.builder, voffset,
3174                                                LLVMConstInt(ctx->i32, 4, 0), "");
3175
3176                         out_val = ac_to_integer(&ctx->ac, out_val);
3177
3178                         ac_build_buffer_store_dword(&ctx->ac,
3179                                                     ctx->gsvs_ring[stream],
3180                                                     out_val, 1,
3181                                                     voffset, soffset, 0,
3182                                                     ac_glc | ac_slc | ac_swizzled);
3183                 }
3184         }
3185
3186         gs_next_vertex = LLVMBuildAdd(ctx->ac.builder, gs_next_vertex, ctx->i32_1, "");
3187         LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex[stream]);
3188
3189         /* Signal vertex emission if vertex data was written. */
3190         if (offset) {
3191                 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
3192                                  si_get_gs_wave_id(ctx));
3193         }
3194
3195         if (!use_kill)
3196                 ac_build_endif(&ctx->ac, 6505);
3197 }
3198
3199 /* Cut one primitive from the geometry shader */
3200 static void si_llvm_emit_primitive(struct ac_shader_abi *abi,
3201                                    unsigned stream)
3202 {
3203         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
3204
3205         if (ctx->shader->key.as_ngg) {
3206                 LLVMBuildStore(ctx->ac.builder, ctx->ac.i32_0, ctx->gs_curprim_verts[stream]);
3207                 return;
3208         }
3209
3210         /* Signal primitive cut */
3211         ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
3212                          si_get_gs_wave_id(ctx));
3213 }
3214
3215 static void si_llvm_emit_barrier(struct si_shader_context *ctx)
3216 {
3217         /* GFX6 only (thanks to a hw bug workaround):
3218          * The real barrier instruction isn’t needed, because an entire patch
3219          * always fits into a single wave.
3220          */
3221         if (ctx->screen->info.chip_class == GFX6 &&
3222             ctx->type == PIPE_SHADER_TESS_CTRL) {
3223                 ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM | AC_WAIT_VLOAD | AC_WAIT_VSTORE);
3224                 return;
3225         }
3226
3227         ac_build_s_barrier(&ctx->ac);
3228 }
3229
3230 void si_create_function(struct si_shader_context *ctx,
3231                         const char *name,
3232                         LLVMTypeRef *returns, unsigned num_returns,
3233                         unsigned max_workgroup_size)
3234 {
3235         si_llvm_create_func(ctx, name, returns, num_returns);
3236         ctx->return_value = LLVMGetUndef(ctx->return_type);
3237
3238         if (ctx->screen->info.address32_hi) {
3239                 ac_llvm_add_target_dep_function_attr(ctx->main_fn,
3240                                                      "amdgpu-32bit-address-high-bits",
3241                                                      ctx->screen->info.address32_hi);
3242         }
3243
3244         LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
3245                                            "no-signed-zeros-fp-math",
3246                                            "true");
3247
3248         ac_llvm_set_workgroup_size(ctx->main_fn, max_workgroup_size);
3249 }
3250
3251 static void declare_streamout_params(struct si_shader_context *ctx,
3252                                      struct pipe_stream_output_info *so)
3253 {
3254         if (ctx->screen->use_ngg_streamout) {
3255                 if (ctx->type == PIPE_SHADER_TESS_EVAL)
3256                         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
3257                 return;
3258         }
3259
3260         /* Streamout SGPRs. */
3261         if (so->num_outputs) {
3262                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_config);
3263                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_write_index);
3264         } else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
3265                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
3266         }
3267
3268         /* A streamout buffer offset is loaded if the stride is non-zero. */
3269         for (int i = 0; i < 4; i++) {
3270                 if (!so->stride[i])
3271                         continue;
3272
3273                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_offset[i]);
3274         }
3275 }
3276
3277 static unsigned si_get_max_workgroup_size(const struct si_shader *shader)
3278 {
3279         switch (shader->selector->type) {
3280         case PIPE_SHADER_VERTEX:
3281         case PIPE_SHADER_TESS_EVAL:
3282                 return shader->key.as_ngg ? 128 : 0;
3283
3284         case PIPE_SHADER_TESS_CTRL:
3285                 /* Return this so that LLVM doesn't remove s_barrier
3286                  * instructions on chips where we use s_barrier. */
3287                 return shader->selector->screen->info.chip_class >= GFX7 ? 128 : 0;
3288
3289         case PIPE_SHADER_GEOMETRY:
3290                 return shader->selector->screen->info.chip_class >= GFX9 ? 128 : 0;
3291
3292         case PIPE_SHADER_COMPUTE:
3293                 break; /* see below */
3294
3295         default:
3296                 return 0;
3297         }
3298
3299         const unsigned *properties = shader->selector->info.properties;
3300         unsigned max_work_group_size =
3301                        properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
3302                        properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
3303                        properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
3304
3305         if (!max_work_group_size) {
3306                 /* This is a variable group size compute shader,
3307                  * compile it for the maximum possible group size.
3308                  */
3309                 max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
3310         }
3311         return max_work_group_size;
3312 }
3313
3314 static void declare_const_and_shader_buffers(struct si_shader_context *ctx,
3315                                              bool assign_params)
3316 {
3317         enum ac_arg_type const_shader_buf_type;
3318
3319         if (ctx->shader->selector->info.const_buffers_declared == 1 &&
3320             ctx->shader->selector->info.shader_buffers_declared == 0)
3321                 const_shader_buf_type = AC_ARG_CONST_FLOAT_PTR;
3322         else
3323                 const_shader_buf_type = AC_ARG_CONST_DESC_PTR;
3324
3325         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, const_shader_buf_type,
3326                    assign_params ? &ctx->const_and_shader_buffers :
3327                    &ctx->other_const_and_shader_buffers);
3328 }
3329
3330 static void declare_samplers_and_images(struct si_shader_context *ctx,
3331                                         bool assign_params)
3332 {
3333         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
3334                    assign_params ? &ctx->samplers_and_images :
3335                    &ctx->other_samplers_and_images);
3336 }
3337
/**
 * Declare the two per-stage descriptor pointers: const/shader buffers first,
 * then samplers/images. assign_params is forwarded to both helpers.
 */
static void declare_per_stage_desc_pointers(struct si_shader_context *ctx,
					    bool assign_params)
{
	/* The declaration order defines the SGPR order. */
	declare_const_and_shader_buffers(ctx, assign_params);
	declare_samplers_and_images(ctx, assign_params);
}
3344
3345 static void declare_global_desc_pointers(struct si_shader_context *ctx)
3346 {
3347         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR,
3348                    &ctx->rw_buffers);
3349         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
3350                    &ctx->bindless_samplers_and_images);
3351 }
3352
3353 static void declare_vs_specific_input_sgprs(struct si_shader_context *ctx)
3354 {
3355         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
3356         if (!ctx->shader->is_gs_copy_shader) {
3357                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.base_vertex);
3358                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.start_instance);
3359                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.draw_id);
3360         }
3361 }
3362
3363 static void declare_vb_descriptor_input_sgprs(struct si_shader_context *ctx)
3364 {
3365         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->vertex_buffers);
3366
3367         unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs;
3368         if (num_vbos_in_user_sgprs) {
3369                 unsigned user_sgprs = ctx->args.num_sgprs_used;
3370
3371                 if (is_merged_shader(ctx))
3372                         user_sgprs -= 8;
3373                 assert(user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST);
3374
3375                 /* Declare unused SGPRs to align VB descriptors to 4 SGPRs (hw requirement). */
3376                 for (unsigned i = user_sgprs; i < SI_SGPR_VS_VB_DESCRIPTOR_FIRST; i++)
3377                         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
3378
3379                 assert(num_vbos_in_user_sgprs <= ARRAY_SIZE(ctx->vb_descriptors));
3380                 for (unsigned i = 0; i < num_vbos_in_user_sgprs; i++)
3381                         ac_add_arg(&ctx->args, AC_ARG_SGPR, 4, AC_ARG_INT, &ctx->vb_descriptors[i]);
3382         }
3383 }
3384
3385 static void declare_vs_input_vgprs(struct si_shader_context *ctx,
3386                                    unsigned *num_prolog_vgprs)
3387 {
3388         struct si_shader *shader = ctx->shader;
3389
3390         ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.vertex_id);
3391         if (shader->key.as_ls) {
3392                 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->rel_auto_id);
3393                 if (ctx->screen->info.chip_class >= GFX10) {
3394                         ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
3395                         ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
3396                 } else {
3397                         ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
3398                         ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
3399                 }
3400         } else if (ctx->screen->info.chip_class >= GFX10) {
3401                 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
3402                 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
3403                            &ctx->vs_prim_id); /* user vgpr or PrimID (legacy) */
3404                 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
3405         } else {
3406                 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
3407                 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->vs_prim_id);
3408                 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
3409         }
3410
3411         if (!shader->is_gs_copy_shader) {
3412                 /* Vertex load indices. */
3413                 if (shader->selector->info.num_inputs) {
3414                         ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
3415                                    &ctx->vertex_index0);
3416                         for (unsigned i = 1; i < shader->selector->info.num_inputs; i++)
3417                                 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL);
3418                 }
3419                 *num_prolog_vgprs += shader->selector->info.num_inputs;
3420         }
3421 }
3422
3423 static void declare_vs_blit_inputs(struct si_shader_context *ctx,
3424                                    unsigned vs_blit_property)
3425 {
3426         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
3427                    &ctx->vs_blit_inputs); /* i16 x1, y1 */
3428         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* i16 x1, y1 */
3429         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* depth */
3430
3431         if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
3432                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color0 */
3433                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color1 */
3434                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color2 */
3435                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color3 */
3436         } else if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD) {
3437                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x1 */
3438                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y1 */
3439                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x2 */
3440                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y2 */
3441                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.z */
3442                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.w */
3443         }
3444 }
3445
3446 static void declare_tes_input_vgprs(struct si_shader_context *ctx)
3447 {
3448         ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->tes_u);
3449         ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->tes_v);
3450         ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->tes_rel_patch_id);
3451         ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tes_patch_id);
3452 }
3453
3454 enum {
3455         /* Convenient merged shader definitions. */
3456         SI_SHADER_MERGED_VERTEX_TESSCTRL = PIPE_SHADER_TYPES,
3457         SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY,
3458 };
3459
3460 static void add_arg_checked(struct ac_shader_args *args,
3461                             enum ac_arg_regfile file,
3462                             unsigned registers, enum ac_arg_type type,
3463                             struct ac_arg *arg,
3464                             unsigned idx)
3465 {
3466         assert(args->arg_count == idx);
3467         ac_add_arg(args, file, registers, type, arg);
3468 }
3469
3470 static void create_function(struct si_shader_context *ctx)
3471 {
             /* Declare the input arguments and the return types of the main
              * shader function for the shader in ctx, create the function with
              * si_create_function() under the name "main", and record the
              * resulting input SGPR/VGPR counts in shader->info.
              *
              * Arguments are appended one by one with ac_add_arg(); the
              * fragment shader path additionally asserts the resulting
              * positions through add_arg_checked(). The statements in the
              * switch below must therefore not be reordered.
              */
3472         struct si_shader *shader = ctx->shader;
3473         LLVMTypeRef returns[AC_MAX_ARGS];
3474         unsigned i, num_return_sgprs;
3475         unsigned num_returns = 0;
3476         unsigned num_prolog_vgprs = 0;
3477         unsigned type = ctx->type;
3478         unsigned vs_blit_property =
3479                 shader->selector->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD];
3480
             /* Start from an empty argument list. */
3481         memset(&ctx->args, 0, sizeof(ctx->args));
3482
3483         /* Set MERGED shaders. */
3484         if (ctx->screen->info.chip_class >= GFX9) {
3485                 if (shader->key.as_ls || type == PIPE_SHADER_TESS_CTRL)
3486                         type = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
3487                 else if (shader->key.as_es || shader->key.as_ngg || type == PIPE_SHADER_GEOMETRY)
3488                         type = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
3489         }
3490
             /* From here on "type" selects the argument layout, while
              * ctx->type still holds the original shader stage.
              */
3491         switch (type) {
3492         case PIPE_SHADER_VERTEX:
3493                 declare_global_desc_pointers(ctx);
3494
3495                 if (vs_blit_property) {
3496                         declare_vs_blit_inputs(ctx, vs_blit_property);
3497
3498                         /* VGPRs */
3499                         declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
                             /* Blit shaders skip the per-stage descriptor pointers
                              * and all remaining VS SGPRs declared below.
                              */
3500                         break;
3501                 }
3502
3503                 declare_per_stage_desc_pointers(ctx, true);
3504                 declare_vs_specific_input_sgprs(ctx);
3505                 if (!shader->is_gs_copy_shader)
3506                         declare_vb_descriptor_input_sgprs(ctx);
3507
3508                 if (shader->key.as_es) {
3509                         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
3510                                    &ctx->es2gs_offset);
3511                 } else if (shader->key.as_ls) {
3512                         /* no extra parameters */
3513                 } else {
3514                         /* The locations of the other parameters are assigned dynamically. */
3515                         declare_streamout_params(ctx, &shader->selector->so);
3516                 }
3517
3518                 /* VGPRs */
3519                 declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
3520
3521                 /* Return values */
3522                 if (shader->key.opt.vs_as_prim_discard_cs) {
3523                         for (i = 0; i < 4; i++)
3524                                 returns[num_returns++] = ctx->f32; /* VGPRs */
3525                 }
3526                 break;
3527
3528         case PIPE_SHADER_TESS_CTRL: /* GFX6-GFX8 */
3529                 declare_global_desc_pointers(ctx);
3530                 declare_per_stage_desc_pointers(ctx, true);
3531                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
3532                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets);
3533                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
3534                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
3535                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
3536                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_factor_offset);
3537
3538                 /* VGPRs */
3539                 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id);
3540                 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids);
3541
3542                 /* param_tcs_offchip_offset and param_tcs_factor_offset are
3543                  * placed after the user SGPRs.
3544                  */
3545                 for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
3546                         returns[num_returns++] = ctx->i32; /* SGPRs */
3547                 for (i = 0; i < 11; i++)
3548                         returns[num_returns++] = ctx->f32; /* VGPRs */
3549                 break;
3550
3551         case SI_SHADER_MERGED_VERTEX_TESSCTRL:
3552                 /* Merged stages have 8 system SGPRs at the beginning. */
3553                 /* SPI_SHADER_USER_DATA_ADDR_LO/HI_HS */
                     /* The per-stage descriptor pointers are declared twice, once
                      * per merged stage. The boolean selects whether a declaration
                      * is recorded in the primary ctx fields (the stage equal to
                      * ctx->type) or in the ctx->other_* fields.
                      */
3554                 declare_per_stage_desc_pointers(ctx,
3555                                                 ctx->type == PIPE_SHADER_TESS_CTRL);
3556                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
3557                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_wave_info);
3558                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_factor_offset);
3559                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_scratch_offset);
3560                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
3561                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
3562
3563                 declare_global_desc_pointers(ctx);
3564                 declare_per_stage_desc_pointers(ctx,
3565                                                 ctx->type == PIPE_SHADER_VERTEX);
3566                 declare_vs_specific_input_sgprs(ctx);
3567
3568                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
3569                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets);
3570                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
3571                 declare_vb_descriptor_input_sgprs(ctx);
3572
3573                 /* VGPRs (first TCS, then VS) */
3574                 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id);
3575                 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids);
3576
3577                 if (ctx->type == PIPE_SHADER_VERTEX) {
3578                         declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
3579
3580                         /* LS return values are inputs to the TCS main shader part. */
3581                         for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
3582                                 returns[num_returns++] = ctx->i32; /* SGPRs */
3583                         for (i = 0; i < 2; i++)
3584                                 returns[num_returns++] = ctx->f32; /* VGPRs */
3585                 } else {
3586                         /* TCS return values are inputs to the TCS epilog.
3587                          *
3588                          * param_tcs_offchip_offset, param_tcs_factor_offset,
3589                          * param_tcs_offchip_layout, and param_rw_buffers
3590                          * should be passed to the epilog.
3591                          */
3592                         for (i = 0; i <= 8 + GFX9_SGPR_TCS_OUT_LAYOUT; i++)
3593                                 returns[num_returns++] = ctx->i32; /* SGPRs */
3594                         for (i = 0; i < 11; i++)
3595                                 returns[num_returns++] = ctx->f32; /* VGPRs */
3596                 }
3597                 break;
3598
3599         case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
3600                 /* Merged stages have 8 system SGPRs at the beginning. */
3601                 /* SPI_SHADER_USER_DATA_ADDR_LO/HI_GS */
3602                 declare_per_stage_desc_pointers(ctx,
3603                                                 ctx->type == PIPE_SHADER_GEOMETRY);
3604
3605                 if (ctx->shader->key.as_ngg)
3606                         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs_tg_info);
3607                 else
3608                         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs2vs_offset);
3609
3610                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_wave_info);
3611                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
3612                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_scratch_offset);
3613                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused (SPI_SHADER_PGM_LO/HI_GS << 8) */
3614                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */
3615
3616                 declare_global_desc_pointers(ctx);
                     /* A blit vertex shader gets no second set of per-stage
                      * descriptor pointers; its blit SGPRs are declared below.
                      */
3617                 if (ctx->type != PIPE_SHADER_VERTEX || !vs_blit_property) {
3618                         declare_per_stage_desc_pointers(ctx,
3619                                                         (ctx->type == PIPE_SHADER_VERTEX ||
3620                                                          ctx->type == PIPE_SHADER_TESS_EVAL));
3621                 }
3622
3623                 if (ctx->type == PIPE_SHADER_VERTEX) {
3624                         if (vs_blit_property)
3625                                 declare_vs_blit_inputs(ctx, vs_blit_property);
3626                         else
3627                                 declare_vs_specific_input_sgprs(ctx);
3628                 } else {
3629                         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
3630                         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
3631                         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr);
3632                         /* Declare as many input SGPRs as the VS has. */
3633                 }
3634
3635                 if (ctx->type == PIPE_SHADER_VERTEX)
3636                         declare_vb_descriptor_input_sgprs(ctx);
3637
3638                 /* VGPRs (first GS, then VS/TES) */
3639                 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx01_offset);
3640                 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx23_offset);
3641                 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id);
3642                 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id);
3643                 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx45_offset);
3644
3645                 if (ctx->type == PIPE_SHADER_VERTEX) {
3646                         declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
3647                 } else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
3648                         declare_tes_input_vgprs(ctx);
3649                 }
3650
3651                 if (ctx->shader->key.as_es &&
3652                     (ctx->type == PIPE_SHADER_VERTEX ||
3653                      ctx->type == PIPE_SHADER_TESS_EVAL)) {
3654                         unsigned num_user_sgprs;
3655
3656                         if (ctx->type == PIPE_SHADER_VERTEX)
3657                                 num_user_sgprs = GFX9_VSGS_NUM_USER_SGPR;
3658                         else
3659                                 num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR;
3660
3661                         /* ES return values are inputs to GS. */
3662                         for (i = 0; i < 8 + num_user_sgprs; i++)
3663                                 returns[num_returns++] = ctx->i32; /* SGPRs */
3664                         for (i = 0; i < 5; i++)
3665                                 returns[num_returns++] = ctx->f32; /* VGPRs */
3666                 }
3667                 break;
3668
3669         case PIPE_SHADER_TESS_EVAL:
3670                 declare_global_desc_pointers(ctx);
3671                 declare_per_stage_desc_pointers(ctx, true);
3672                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
3673                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
3674                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr);
3675
3676                 if (shader->key.as_es) {
3677                         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
3678                         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
3679                         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->es2gs_offset);
3680                 } else {
3681                         declare_streamout_params(ctx, &shader->selector->so);
3682                         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
3683                 }
3684
3685                 /* VGPRs */
3686                 declare_tes_input_vgprs(ctx);
3687                 break;
3688
3689         case PIPE_SHADER_GEOMETRY:
3690                 declare_global_desc_pointers(ctx);
3691                 declare_per_stage_desc_pointers(ctx, true);
3692                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs2vs_offset);
3693                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs_wave_id);
3694
3695                 /* VGPRs */
3696                 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[0]);
3697                 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[1]);
3698                 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id);
3699                 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[2]);
3700                 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[3]);
3701                 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[4]);
3702                 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[5]);
3703                 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id);
3704                 break;
3705
3706         case PIPE_SHADER_FRAGMENT:
3707                 declare_global_desc_pointers(ctx);
3708                 declare_per_stage_desc_pointers(ctx, true);
                     /* From here on every argument is added through
                      * add_arg_checked(), which asserts that the argument lands
                      * at the index named by the SI_PARAM_* constant passed last.
                      */
3709                 add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL,
3710                                 SI_PARAM_ALPHA_REF);
3711                 add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
3712                                 &ctx->args.prim_mask, SI_PARAM_PRIM_MASK);
3713
3714                 add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_sample,
3715                                 SI_PARAM_PERSP_SAMPLE);
3716                 add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT,
3717                                 &ctx->args.persp_center, SI_PARAM_PERSP_CENTER);
3718                 add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT,
3719                                 &ctx->args.persp_centroid, SI_PARAM_PERSP_CENTROID);
3720                 add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT,
3721                                 NULL, SI_PARAM_PERSP_PULL_MODEL);
3722                 add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT,
3723                                 &ctx->args.linear_sample, SI_PARAM_LINEAR_SAMPLE);
3724                 add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT,
3725                                 &ctx->args.linear_center, SI_PARAM_LINEAR_CENTER);
3726                 add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT,
3727                                 &ctx->args.linear_centroid, SI_PARAM_LINEAR_CENTROID);
3728                 add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_FLOAT,
3729                                 NULL, SI_PARAM_LINE_STIPPLE_TEX);
3730                 add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT,
3731                                 &ctx->args.frag_pos[0], SI_PARAM_POS_X_FLOAT);
3732                 add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT,
3733                                 &ctx->args.frag_pos[1], SI_PARAM_POS_Y_FLOAT);
3734                 add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT,
3735                                 &ctx->args.frag_pos[2], SI_PARAM_POS_Z_FLOAT);
3736                 add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT,
3737                                 &ctx->args.frag_pos[3], SI_PARAM_POS_W_FLOAT);
                     /* Record the number of VGPRs used so far, i.e. before the
                      * front-face argument is added.
                      */
3738                 shader->info.face_vgpr_index = ctx->args.num_vgprs_used;
3739                 add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
3740                                 &ctx->args.front_face, SI_PARAM_FRONT_FACE);
                     /* Same for the ancillary argument. */
3741                 shader->info.ancillary_vgpr_index = ctx->args.num_vgprs_used;
3742                 add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
3743                                 &ctx->args.ancillary, SI_PARAM_ANCILLARY);
3744                 add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT,
3745                                 &ctx->args.sample_coverage, SI_PARAM_SAMPLE_COVERAGE);
3746                 add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
3747                                 &ctx->pos_fixed_pt, SI_PARAM_POS_FIXED_PT);
3748
3749                 /* Color inputs from the prolog. */
3750                 if (shader->selector->info.colors_read) {
3751                         unsigned num_color_elements =
3752                                 util_bitcount(shader->selector->info.colors_read);
3753
3754                         for (i = 0; i < num_color_elements; i++)
3755                                 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL);
3756
3757                         num_prolog_vgprs += num_color_elements;
3758                 }
3759
3760                 /* Outputs for the epilog. */
3761                 num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
3762                 num_returns =
3763                         num_return_sgprs +
3764                         util_bitcount(shader->selector->info.colors_written) * 4 +
3765                         shader->selector->info.writes_z +
3766                         shader->selector->info.writes_stencil +
3767                         shader->selector->info.writes_samplemask +
3768                         1 /* SampleMaskIn */;
3769
                     /* Make the return list at least long enough to contain the
                      * slot PS_EPILOG_SAMPLEMASK_MIN_LOC after the SGPRs.
                      */
3770                 num_returns = MAX2(num_returns,
3771                                    num_return_sgprs +
3772                                    PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
3773
                     /* The leading num_return_sgprs entries are i32, the rest f32. */
3774                 for (i = 0; i < num_return_sgprs; i++)
3775                         returns[i] = ctx->i32;
3776                 for (; i < num_returns; i++)
3777                         returns[i] = ctx->f32;
3778                 break;
3779
3780         case PIPE_SHADER_COMPUTE:
3781                 declare_global_desc_pointers(ctx);
3782                 declare_per_stage_desc_pointers(ctx, true);
3783                 if (shader->selector->info.uses_grid_size)
3784                         ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT,
3785                                    &ctx->args.num_work_groups);
                     /* The block size is passed in SGPRs only when the shader uses
                      * it and the fixed block width property is 0.
                      */
3786                 if (shader->selector->info.uses_block_size &&
3787                     shader->selector->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0)
3788                         ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT, &ctx->block_size);
3789
3790                 unsigned cs_user_data_dwords =
3791                         shader->selector->info.properties[TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD];
3792                 if (cs_user_data_dwords) {
3793                         ac_add_arg(&ctx->args, AC_ARG_SGPR, cs_user_data_dwords, AC_ARG_INT,
3794                                    &ctx->cs_user_data);
3795                 }
3796
3797                 /* Hardware SGPRs. */
3798                 for (i = 0; i < 3; i++) {
3799                         if (shader->selector->info.uses_block_id[i]) {
3800                                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
3801                                            &ctx->args.workgroup_ids[i]);
3802                         }
3803                 }
3804                 if (shader->selector->info.uses_subgroup_info)
3805                         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tg_size);
3806
3807                 /* Hardware VGPRs. */
3808                 ac_add_arg(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT,
3809                            &ctx->args.local_invocation_ids);
3810                 break;
3811         default:
3812                 assert(0 && "unimplemented shader");
3813                 return;
3814         }
3815
3816         si_create_function(ctx, "main", returns, num_returns,
3817                            si_get_max_workgroup_size(shader));
3818
3819         /* Reserve register locations for VGPR inputs the PS prolog may need. */
3820         if (ctx->type == PIPE_SHADER_FRAGMENT && !ctx->shader->is_monolithic) {
3821                 ac_llvm_add_target_dep_function_attr(ctx->main_fn,
3822                                                      "InitialPSInputAddr",
3823                                                      S_0286D0_PERSP_SAMPLE_ENA(1) |
3824                                                      S_0286D0_PERSP_CENTER_ENA(1) |
3825                                                      S_0286D0_PERSP_CENTROID_ENA(1) |
3826                                                      S_0286D0_LINEAR_SAMPLE_ENA(1) |
3827                                                      S_0286D0_LINEAR_CENTER_ENA(1) |
3828                                                      S_0286D0_LINEAR_CENTROID_ENA(1) |
3829                                                      S_0286D0_FRONT_FACE_ENA(1) |
3830                                                      S_0286D0_ANCILLARY_ENA(1) |
3831                                                      S_0286D0_POS_FIXED_PT_ENA(1));
3832         }
3833
3834         shader->info.num_input_sgprs = ctx->args.num_sgprs_used;
3835         shader->info.num_input_vgprs = ctx->args.num_vgprs_used;
3836
             /* VGPRs counted in num_prolog_vgprs (vertex load indices, PS color
              * inputs) were declared above but are excluded from
              * num_input_vgprs.
              */
3837         assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
3838         shader->info.num_input_vgprs -= num_prolog_vgprs;
3839
3840         if (shader->key.as_ls || ctx->type == PIPE_SHADER_TESS_CTRL) {
3841                 if (USE_LDS_SYMBOLS && LLVM_VERSION_MAJOR >= 9) {
3842                         /* The LSHS size is not known until draw time, so we append it
3843                          * at the end of whatever LDS use there may be in the rest of
3844                          * the shader (currently none, unless LLVM decides to do its
3845                          * own LDS-based lowering).
3846                          */
3847                         ctx->ac.lds = LLVMAddGlobalInAddressSpace(
3848                                 ctx->ac.module, LLVMArrayType(ctx->i32, 0),
3849                                 "__lds_end", AC_ADDR_SPACE_LDS);
3850                         LLVMSetAlignment(ctx->ac.lds, 256);
3851                 } else {
3852                         ac_declare_lds_as_pointer(&ctx->ac);
3853                 }
3854         }
3855
3856         /* Unlike radv, we override these arguments in the prolog, so to the
3857          * API shader they appear as normal arguments.
3858          */
3859         if (ctx->type == PIPE_SHADER_VERTEX) {
3860                 ctx->abi.vertex_id = ac_get_arg(&ctx->ac, ctx->args.vertex_id);
3861                 ctx->abi.instance_id = ac_get_arg(&ctx->ac, ctx->args.instance_id);
3862         } else if (ctx->type == PIPE_SHADER_FRAGMENT) {
3863                 ctx->abi.persp_centroid = ac_get_arg(&ctx->ac, ctx->args.persp_centroid);
3864                 ctx->abi.linear_centroid = ac_get_arg(&ctx->ac, ctx->args.linear_centroid);
3865         }
3866 }
3867
3868 /* Ensure that the esgs ring is declared.
3869  *
3870  * We declare it with 64KB alignment as a hint that the
3871  * pointer value will always be 0.
3872  */
3873 static void declare_esgs_ring(struct si_shader_context *ctx)
3874 {
3875         if (ctx->esgs_ring)
3876                 return;
3877
3878         assert(!LLVMGetNamedGlobal(ctx->ac.module, "esgs_ring"));
3879
3880         ctx->esgs_ring = LLVMAddGlobalInAddressSpace(
3881                 ctx->ac.module, LLVMArrayType(ctx->i32, 0),
3882                 "esgs_ring",
3883                 AC_ADDR_SPACE_LDS);
3884         LLVMSetLinkage(ctx->esgs_ring, LLVMExternalLinkage);
3885         LLVMSetAlignment(ctx->esgs_ring, 64 * 1024);
3886 }
3887
3888 /**
3889  * Load ESGS and GSVS ring buffer resource descriptors and save the variables
3890  * for later use.
3891  */
3892 static void preload_ring_buffers(struct si_shader_context *ctx)
3893 {
3894         LLVMBuilderRef builder = ctx->ac.builder;
3895
3896         LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
3897
3898         if (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY) {
3899                 if (ctx->screen->info.chip_class <= GFX8) {
3900                         unsigned ring =
3901                                 ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS
3902                                                                   : SI_ES_RING_ESGS;
3903                         LLVMValueRef offset = LLVMConstInt(ctx->i32, ring, 0);
3904
3905                         ctx->esgs_ring =
3906                                 ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
3907                 } else {
3908                         if (USE_LDS_SYMBOLS && LLVM_VERSION_MAJOR >= 9) {
3909                                 /* Declare the ESGS ring as an explicit LDS symbol. */
3910                                 declare_esgs_ring(ctx);
3911                         } else {
3912                                 ac_declare_lds_as_pointer(&ctx->ac);
3913                                 ctx->esgs_ring = ctx->ac.lds;
3914                         }
3915                 }
3916         }
3917
3918         if (ctx->shader->is_gs_copy_shader) {
3919                 LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);
3920
3921                 ctx->gsvs_ring[0] =
3922                         ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
3923         } else if (ctx->type == PIPE_SHADER_GEOMETRY) {
3924                 const struct si_shader_selector *sel = ctx->shader->selector;
3925                 LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);
3926                 LLVMValueRef base_ring;
3927
3928                 base_ring = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
3929
3930                 /* The conceptual layout of the GSVS ring is
3931                  *   v0c0 .. vLv0 v0c1 .. vLc1 ..
3932                  * but the real memory layout is swizzled across
3933                  * threads:
3934                  *   t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
3935                  *   t16v0c0 ..
3936                  * Override the buffer descriptor accordingly.
3937                  */
3938                 LLVMTypeRef v2i64 = LLVMVectorType(ctx->i64, 2);
3939                 uint64_t stream_offset = 0;
3940
3941                 for (unsigned stream = 0; stream < 4; ++stream) {
3942                         unsigned num_components;
3943                         unsigned stride;
3944                         unsigned num_records;
3945                         LLVMValueRef ring, tmp;
3946
3947                         num_components = sel->info.num_stream_output_components[stream];
3948                         if (!num_components)
3949                                 continue;
3950
3951                         stride = 4 * num_components * sel->gs_max_out_vertices;
3952
3953                         /* Limit on the stride field for <= GFX7. */
3954                         assert(stride < (1 << 14));
3955
3956                         num_records = ctx->ac.wave_size;
3957
3958                         ring = LLVMBuildBitCast(builder, base_ring, v2i64, "");
3959                         tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_0, "");
3960                         tmp = LLVMBuildAdd(builder, tmp,
3961                                            LLVMConstInt(ctx->i64,
3962                                                         stream_offset, 0), "");
3963                         stream_offset += stride * ctx->ac.wave_size;
3964
3965                         ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_0, "");
3966                         ring = LLVMBuildBitCast(builder, ring, ctx->v4i32, "");
3967                         tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_1, "");
3968                         tmp = LLVMBuildOr(builder, tmp,
3969                                 LLVMConstInt(ctx->i32,
3970                                              S_008F04_STRIDE(stride) |
3971                                              S_008F04_SWIZZLE_ENABLE(1), 0), "");
3972                         ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_1, "");
3973                         ring = LLVMBuildInsertElement(builder, ring,
3974                                         LLVMConstInt(ctx->i32, num_records, 0),
3975                                         LLVMConstInt(ctx->i32, 2, 0), "");
3976
3977                         uint32_t rsrc3 =
3978                                         S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
3979                                         S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
3980                                         S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
3981                                         S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
3982                                         S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
3983                                         S_008F0C_ADD_TID_ENABLE(1);
3984
3985                         if (ctx->ac.chip_class >= GFX10) {
3986                                 rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
3987                                          S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) |
3988                                          S_008F0C_RESOURCE_LEVEL(1);
3989                         } else {
3990                                 rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
3991                                          S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
3992                                          S_008F0C_ELEMENT_SIZE(1); /* element_size = 4 (bytes) */
3993                         }
3994
3995                         ring = LLVMBuildInsertElement(builder, ring,
3996                                 LLVMConstInt(ctx->i32, rsrc3, false),
3997                                 LLVMConstInt(ctx->i32, 3, 0), "");
3998
3999                         ctx->gsvs_ring[stream] = ring;
4000                 }
4001         } else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
4002                 ctx->tess_offchip_ring = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TES);
4003         }
4004 }
4005
4006 static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
4007                                          LLVMValueRef param_rw_buffers,
4008                                          struct ac_arg param_pos_fixed_pt)
4009 {
4010         LLVMBuilderRef builder = ctx->ac.builder;
4011         LLVMValueRef slot, desc, offset, row, bit, address[2];
4012
4013         /* Use the fixed-point gl_FragCoord input.
4014          * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
4015          * per coordinate to get the repeating effect.
4016          */
4017         address[0] = si_unpack_param(ctx, param_pos_fixed_pt, 0, 5);
4018         address[1] = si_unpack_param(ctx, param_pos_fixed_pt, 16, 5);
4019
4020         /* Load the buffer descriptor. */
4021         slot = LLVMConstInt(ctx->i32, SI_PS_CONST_POLY_STIPPLE, 0);
4022         desc = ac_build_load_to_sgpr(&ctx->ac, param_rw_buffers, slot);
4023
4024         /* The stipple pattern is 32x32, each row has 32 bits. */
4025         offset = LLVMBuildMul(builder, address[1],
4026                               LLVMConstInt(ctx->i32, 4, 0), "");
4027         row = buffer_load_const(ctx, desc, offset);
4028         row = ac_to_integer(&ctx->ac, row);
4029         bit = LLVMBuildLShr(builder, row, address[0], "");
4030         bit = LLVMBuildTrunc(builder, bit, ctx->i1, "");
4031         ac_build_kill_if_false(&ctx->ac, bit);
4032 }
4033
/* For the UMR disassembler. */
/* Dword pattern used as the end-of-code marker; it does not decode to a
 * valid instruction. */
#define DEBUGGER_END_OF_CODE_MARKER     0xbf9f0000 /* invalid instruction */
/* NOTE(review): presumably the number of marker dwords emitted after the
 * code; the users of this macro are not visible in this part of the file. */
#define DEBUGGER_NUM_MARKERS            5
4037
4038 static bool si_shader_binary_open(struct si_screen *screen,
4039                                   struct si_shader *shader,
4040                                   struct ac_rtld_binary *rtld)
4041 {
4042         const struct si_shader_selector *sel = shader->selector;
4043         const char *part_elfs[5];
4044         size_t part_sizes[5];
4045         unsigned num_parts = 0;
4046
4047 #define add_part(shader_or_part) \
4048         if (shader_or_part) { \
4049                 part_elfs[num_parts] = (shader_or_part)->binary.elf_buffer; \
4050                 part_sizes[num_parts] = (shader_or_part)->binary.elf_size; \
4051                 num_parts++; \
4052         }
4053
4054         add_part(shader->prolog);
4055         add_part(shader->previous_stage);
4056         add_part(shader->prolog2);
4057         add_part(shader);
4058         add_part(shader->epilog);
4059
4060 #undef add_part
4061
4062         struct ac_rtld_symbol lds_symbols[2];
4063         unsigned num_lds_symbols = 0;
4064
4065         if (sel && screen->info.chip_class >= GFX9 && !shader->is_gs_copy_shader &&
4066             (sel->type == PIPE_SHADER_GEOMETRY || shader->key.as_ngg)) {
4067                 /* We add this symbol even on LLVM <= 8 to ensure that
4068                  * shader->config.lds_size is set correctly below.
4069                  */
4070                 struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
4071                 sym->name = "esgs_ring";
4072                 sym->size = shader->gs_info.esgs_ring_size;
4073                 sym->align = 64 * 1024;
4074         }
4075
4076         if (shader->key.as_ngg && sel->type == PIPE_SHADER_GEOMETRY) {
4077                 struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
4078                 sym->name = "ngg_emit";
4079                 sym->size = shader->ngg.ngg_emit_size * 4;
4080                 sym->align = 4;
4081         }
4082
4083         bool ok = ac_rtld_open(rtld, (struct ac_rtld_open_info){
4084                         .info = &screen->info,
4085                         .options = {
4086                                 .halt_at_entry = screen->options.halt_shaders,
4087                         },
4088                         .shader_type = tgsi_processor_to_shader_stage(sel->type),
4089                         .wave_size = si_get_shader_wave_size(shader),
4090                         .num_parts = num_parts,
4091                         .elf_ptrs = part_elfs,
4092                         .elf_sizes = part_sizes,
4093                         .num_shared_lds_symbols = num_lds_symbols,
4094                         .shared_lds_symbols = lds_symbols });
4095
4096         if (rtld->lds_size > 0) {
4097                 unsigned alloc_granularity = screen->info.chip_class >= GFX7 ? 512 : 256;
4098                 shader->config.lds_size =
4099                         align(rtld->lds_size, alloc_granularity) / alloc_granularity;
4100         }
4101
4102         return ok;
4103 }
4104
4105 static unsigned si_get_shader_binary_size(struct si_screen *screen, struct si_shader *shader)
4106 {
4107         struct ac_rtld_binary rtld;
4108         si_shader_binary_open(screen, shader, &rtld);
4109         return rtld.exec_size;
4110 }
4111
4112 static bool si_get_external_symbol(void *data, const char *name, uint64_t *value)
4113 {
4114         uint64_t *scratch_va = data;
4115
4116         if (!strcmp(scratch_rsrc_dword0_symbol, name)) {
4117                 *value = (uint32_t)*scratch_va;
4118                 return true;
4119         }
4120         if (!strcmp(scratch_rsrc_dword1_symbol, name)) {
4121                 /* Enable scratch coalescing. */
4122                 *value = S_008F04_BASE_ADDRESS_HI(*scratch_va >> 32) |
4123                          S_008F04_SWIZZLE_ENABLE(1);
4124                 return true;
4125         }
4126
4127         return false;
4128 }
4129
4130 bool si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader,
4131                              uint64_t scratch_va)
4132 {
4133         struct ac_rtld_binary binary;
4134         if (!si_shader_binary_open(sscreen, shader, &binary))
4135                 return false;
4136
4137         si_resource_reference(&shader->bo, NULL);
4138         shader->bo = si_aligned_buffer_create(&sscreen->b,
4139                                               sscreen->info.cpdma_prefetch_writes_memory ?
4140                                                 0 : SI_RESOURCE_FLAG_READ_ONLY,
4141                                               PIPE_USAGE_IMMUTABLE,
4142                                               align(binary.rx_size, SI_CPDMA_ALIGNMENT),
4143                                               256);
4144         if (!shader->bo)
4145                 return false;
4146
4147         /* Upload. */
4148         struct ac_rtld_upload_info u = {};
4149         u.binary = &binary;
4150         u.get_external_symbol = si_get_external_symbol;
4151         u.cb_data = &scratch_va;
4152         u.rx_va = shader->bo->gpu_address;
4153         u.rx_ptr = sscreen->ws->buffer_map(shader->bo->buf, NULL,
4154                                         PIPE_TRANSFER_READ_WRITE |
4155                                         PIPE_TRANSFER_UNSYNCHRONIZED |
4156                                         RADEON_TRANSFER_TEMPORARY);
4157         if (!u.rx_ptr)
4158                 return false;
4159
4160         bool ok = ac_rtld_upload(&u);
4161
4162         sscreen->ws->buffer_unmap(shader->bo->buf);
4163         ac_rtld_close(&binary);
4164
4165         return ok;
4166 }
4167
4168 static void si_shader_dump_disassembly(struct si_screen *screen,
4169                                        const struct si_shader_binary *binary,
4170                                        enum pipe_shader_type shader_type,
4171                                        unsigned wave_size,
4172                                        struct pipe_debug_callback *debug,
4173                                        const char *name, FILE *file)
4174 {
4175         struct ac_rtld_binary rtld_binary;
4176
4177         if (!ac_rtld_open(&rtld_binary, (struct ac_rtld_open_info){
4178                         .info = &screen->info,
4179                         .shader_type = tgsi_processor_to_shader_stage(shader_type),
4180                         .wave_size = wave_size,
4181                         .num_parts = 1,
4182                         .elf_ptrs = &binary->elf_buffer,
4183                         .elf_sizes = &binary->elf_size }))
4184                 return;
4185
4186         const char *disasm;
4187         size_t nbytes;
4188
4189         if (!ac_rtld_get_section_by_name(&rtld_binary, ".AMDGPU.disasm", &disasm, &nbytes))
4190                 goto out;
4191
4192         if (nbytes > INT_MAX)
4193                 goto out;
4194
4195         if (debug && debug->debug_message) {
4196                 /* Very long debug messages are cut off, so send the
4197                  * disassembly one line at a time. This causes more
4198                  * overhead, but on the plus side it simplifies
4199                  * parsing of resulting logs.
4200                  */
4201                 pipe_debug_message(debug, SHADER_INFO,
4202                                    "Shader Disassembly Begin");
4203
4204                 uint64_t line = 0;
4205                 while (line < nbytes) {
4206                         int count = nbytes - line;
4207                         const char *nl = memchr(disasm + line, '\n', nbytes - line);
4208                         if (nl)
4209                                 count = nl - (disasm + line);
4210
4211                         if (count) {
4212                                 pipe_debug_message(debug, SHADER_INFO,
4213                                                    "%.*s", count, disasm + line);
4214                         }
4215
4216                         line += count + 1;
4217                 }
4218
4219                 pipe_debug_message(debug, SHADER_INFO,
4220                                    "Shader Disassembly End");
4221         }
4222
4223         if (file) {
4224                 fprintf(file, "Shader %s disassembly:\n", name);
4225                 fprintf(file, "%*s", (int)nbytes, disasm);
4226         }
4227
4228 out:
4229         ac_rtld_close(&rtld_binary);
4230 }
4231
4232 static void si_calculate_max_simd_waves(struct si_shader *shader)
4233 {
4234         struct si_screen *sscreen = shader->selector->screen;
4235         struct ac_shader_config *conf = &shader->config;
4236         unsigned num_inputs = shader->selector->info.num_inputs;
4237         unsigned lds_increment = sscreen->info.chip_class >= GFX7 ? 512 : 256;
4238         unsigned lds_per_wave = 0;
4239         unsigned max_simd_waves;
4240
4241         max_simd_waves = sscreen->info.max_wave64_per_simd;
4242
4243         /* Compute LDS usage for PS. */
4244         switch (shader->selector->type) {
4245         case PIPE_SHADER_FRAGMENT:
4246                 /* The minimum usage per wave is (num_inputs * 48). The maximum
4247                  * usage is (num_inputs * 48 * 16).
4248                  * We can get anything in between and it varies between waves.
4249                  *
4250                  * The 48 bytes per input for a single primitive is equal to
4251                  * 4 bytes/component * 4 components/input * 3 points.
4252                  *
4253                  * Other stages don't know the size at compile time or don't
4254                  * allocate LDS per wave, but instead they do it per thread group.
4255                  */
4256                 lds_per_wave = conf->lds_size * lds_increment +
4257                                align(num_inputs * 48, lds_increment);
4258                 break;
4259         case PIPE_SHADER_COMPUTE:
4260                 if (shader->selector) {
4261                         unsigned max_workgroup_size =
4262                                 si_get_max_workgroup_size(shader);
4263                         lds_per_wave = (conf->lds_size * lds_increment) /
4264                                        DIV_ROUND_UP(max_workgroup_size,
4265                                                     sscreen->compute_wave_size);
4266                 }
4267                 break;
4268         default:;
4269         }
4270
4271         /* Compute the per-SIMD wave counts. */
4272         if (conf->num_sgprs) {
4273                 max_simd_waves =
4274                         MIN2(max_simd_waves,
4275                              sscreen->info.num_physical_sgprs_per_simd / conf->num_sgprs);
4276         }
4277
4278         if (conf->num_vgprs) {
4279                 /* Always print wave limits as Wave64, so that we can compare
4280                  * Wave32 and Wave64 with shader-db fairly. */
4281                 unsigned max_vgprs = sscreen->info.num_physical_wave64_vgprs_per_simd;
4282                 max_simd_waves = MIN2(max_simd_waves, max_vgprs / conf->num_vgprs);
4283         }
4284
4285         /* LDS is 64KB per CU (4 SIMDs) on GFX6-9, which is 16KB per SIMD (usage above
4286          * 16KB makes some SIMDs unoccupied).
4287          *
4288          * LDS is 128KB in WGP mode and 64KB in CU mode. Assume the WGP mode is used.
4289          */
4290         unsigned max_lds_size = sscreen->info.chip_class >= GFX10 ? 128*1024 : 64*1024;
4291         unsigned max_lds_per_simd = max_lds_size / 4;
4292         if (lds_per_wave)
4293                 max_simd_waves = MIN2(max_simd_waves, max_lds_per_simd / lds_per_wave);
4294
4295         shader->info.max_simd_waves = max_simd_waves;
4296 }
4297
4298 void si_shader_dump_stats_for_shader_db(struct si_screen *screen,
4299                                         struct si_shader *shader,
4300                                         struct pipe_debug_callback *debug)
4301 {
4302         const struct ac_shader_config *conf = &shader->config;
4303
4304         if (screen->options.debug_disassembly)
4305                 si_shader_dump_disassembly(screen, &shader->binary,
4306                                            shader->selector->type,
4307                                            si_get_shader_wave_size(shader),
4308                                            debug, "main", NULL);
4309
4310         pipe_debug_message(debug, SHADER_INFO,
4311                            "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
4312                            "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
4313                            "Spilled VGPRs: %d PrivMem VGPRs: %d",
4314                            conf->num_sgprs, conf->num_vgprs,
4315                            si_get_shader_binary_size(screen, shader),
4316                            conf->lds_size, conf->scratch_bytes_per_wave,
4317                            shader->info.max_simd_waves, conf->spilled_sgprs,
4318                            conf->spilled_vgprs, shader->info.private_mem_vgprs);
4319 }
4320
4321 static void si_shader_dump_stats(struct si_screen *sscreen,
4322                                  struct si_shader *shader,
4323                                  FILE *file,
4324                                  bool check_debug_option)
4325 {
4326         const struct ac_shader_config *conf = &shader->config;
4327
4328         if (!check_debug_option ||
4329             si_can_dump_shader(sscreen, shader->selector->type)) {
4330                 if (shader->selector->type == PIPE_SHADER_FRAGMENT) {
4331                         fprintf(file, "*** SHADER CONFIG ***\n"
4332                                 "SPI_PS_INPUT_ADDR = 0x%04x\n"
4333                                 "SPI_PS_INPUT_ENA  = 0x%04x\n",
4334                                 conf->spi_ps_input_addr, conf->spi_ps_input_ena);
4335                 }
4336
4337                 fprintf(file, "*** SHADER STATS ***\n"
4338                         "SGPRS: %d\n"
4339                         "VGPRS: %d\n"
4340                         "Spilled SGPRs: %d\n"
4341                         "Spilled VGPRs: %d\n"
4342                         "Private memory VGPRs: %d\n"
4343                         "Code Size: %d bytes\n"
4344                         "LDS: %d blocks\n"
4345                         "Scratch: %d bytes per wave\n"
4346                         "Max Waves: %d\n"
4347                         "********************\n\n\n",
4348                         conf->num_sgprs, conf->num_vgprs,
4349                         conf->spilled_sgprs, conf->spilled_vgprs,
4350                         shader->info.private_mem_vgprs,
4351                         si_get_shader_binary_size(sscreen, shader),
4352                         conf->lds_size, conf->scratch_bytes_per_wave,
4353                         shader->info.max_simd_waves);
4354         }
4355 }
4356
4357 const char *si_get_shader_name(const struct si_shader *shader)
4358 {
4359         switch (shader->selector->type) {
4360         case PIPE_SHADER_VERTEX:
4361                 if (shader->key.as_es)
4362                         return "Vertex Shader as ES";
4363                 else if (shader->key.as_ls)
4364                         return "Vertex Shader as LS";
4365                 else if (shader->key.opt.vs_as_prim_discard_cs)
4366                         return "Vertex Shader as Primitive Discard CS";
4367                 else if (shader->key.as_ngg)
4368                         return "Vertex Shader as ESGS";
4369                 else
4370                         return "Vertex Shader as VS";
4371         case PIPE_SHADER_TESS_CTRL:
4372                 return "Tessellation Control Shader";
4373         case PIPE_SHADER_TESS_EVAL:
4374                 if (shader->key.as_es)
4375                         return "Tessellation Evaluation Shader as ES";
4376                 else if (shader->key.as_ngg)
4377                         return "Tessellation Evaluation Shader as ESGS";
4378                 else
4379                         return "Tessellation Evaluation Shader as VS";
4380         case PIPE_SHADER_GEOMETRY:
4381                 if (shader->is_gs_copy_shader)
4382                         return "GS Copy Shader as VS";
4383                 else
4384                         return "Geometry Shader";
4385         case PIPE_SHADER_FRAGMENT:
4386                 return "Pixel Shader";
4387         case PIPE_SHADER_COMPUTE:
4388                 return "Compute Shader";
4389         default:
4390                 return "Unknown Shader";
4391         }
4392 }
4393
4394 void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
4395                     struct pipe_debug_callback *debug,
4396                     FILE *file, bool check_debug_option)
4397 {
4398         enum pipe_shader_type shader_type = shader->selector->type;
4399
4400         if (!check_debug_option ||
4401             si_can_dump_shader(sscreen, shader_type))
4402                 si_dump_shader_key(shader, file);
4403
4404         if (!check_debug_option && shader->binary.llvm_ir_string) {
4405                 if (shader->previous_stage &&
4406                     shader->previous_stage->binary.llvm_ir_string) {
4407                         fprintf(file, "\n%s - previous stage - LLVM IR:\n\n",
4408                                 si_get_shader_name(shader));
4409                         fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string);
4410                 }
4411
4412                 fprintf(file, "\n%s - main shader part - LLVM IR:\n\n",
4413                         si_get_shader_name(shader));
4414                 fprintf(file, "%s\n", shader->binary.llvm_ir_string);
4415         }
4416
4417         if (!check_debug_option ||
4418             (si_can_dump_shader(sscreen, shader_type) &&
4419              !(sscreen->debug_flags & DBG(NO_ASM)))) {
4420                 unsigned wave_size = si_get_shader_wave_size(shader);
4421
4422                 fprintf(file, "\n%s:\n", si_get_shader_name(shader));
4423
4424                 if (shader->prolog)
4425                         si_shader_dump_disassembly(sscreen, &shader->prolog->binary,
4426                                                    shader_type, wave_size, debug, "prolog", file);
4427                 if (shader->previous_stage)
4428                         si_shader_dump_disassembly(sscreen, &shader->previous_stage->binary,
4429                                                    shader_type, wave_size, debug, "previous stage", file);
4430                 if (shader->prolog2)
4431                         si_shader_dump_disassembly(sscreen, &shader->prolog2->binary,
4432                                                    shader_type, wave_size, debug, "prolog2", file);
4433
4434                 si_shader_dump_disassembly(sscreen, &shader->binary, shader_type,
4435                                            wave_size, debug, "main", file);
4436
4437                 if (shader->epilog)
4438                         si_shader_dump_disassembly(sscreen, &shader->epilog->binary,
4439                                                    shader_type, wave_size, debug, "epilog", file);
4440                 fprintf(file, "\n");
4441         }
4442
4443         si_shader_dump_stats(sscreen, shader, file, check_debug_option);
4444 }
4445
4446 static int si_compile_llvm(struct si_screen *sscreen,
4447                            struct si_shader_binary *binary,
4448                            struct ac_shader_config *conf,
4449                            struct ac_llvm_compiler *compiler,
4450                            LLVMModuleRef mod,
4451                            struct pipe_debug_callback *debug,
4452                            enum pipe_shader_type shader_type,
4453                            unsigned wave_size,
4454                            const char *name,
4455                            bool less_optimized)
4456 {
4457         unsigned count = p_atomic_inc_return(&sscreen->num_compilations);
4458
4459         if (si_can_dump_shader(sscreen, shader_type)) {
4460                 fprintf(stderr, "radeonsi: Compiling shader %d\n", count);
4461
4462                 if (!(sscreen->debug_flags & (DBG(NO_IR) | DBG(PREOPT_IR)))) {
4463                         fprintf(stderr, "%s LLVM IR:\n\n", name);
4464                         ac_dump_module(mod);
4465                         fprintf(stderr, "\n");
4466                 }
4467         }
4468
4469         if (sscreen->record_llvm_ir) {
4470                 char *ir = LLVMPrintModuleToString(mod);
4471                 binary->llvm_ir_string = strdup(ir);
4472                 LLVMDisposeMessage(ir);
4473         }
4474
4475         if (!si_replace_shader(count, binary)) {
4476                 unsigned r = si_llvm_compile(mod, binary, compiler, debug,
4477                                              less_optimized, wave_size);
4478                 if (r)
4479                         return r;
4480         }
4481
4482         struct ac_rtld_binary rtld;
4483         if (!ac_rtld_open(&rtld, (struct ac_rtld_open_info){
4484                         .info = &sscreen->info,
4485                         .shader_type = tgsi_processor_to_shader_stage(shader_type),
4486                         .wave_size = wave_size,
4487                         .num_parts = 1,
4488                         .elf_ptrs = &binary->elf_buffer,
4489                         .elf_sizes = &binary->elf_size }))
4490                 return -1;
4491
4492         bool ok = ac_rtld_read_config(&rtld, conf);
4493         ac_rtld_close(&rtld);
4494         if (!ok)
4495                 return -1;
4496
4497         /* Enable 64-bit and 16-bit denormals, because there is no performance
4498          * cost.
4499          *
4500          * If denormals are enabled, all floating-point output modifiers are
4501          * ignored.
4502          *
4503          * Don't enable denormals for 32-bit floats, because:
4504          * - Floating-point output modifiers would be ignored by the hw.
4505          * - Some opcodes don't support denormals, such as v_mad_f32. We would
4506          *   have to stop using those.
4507          * - GFX6 & GFX7 would be very slow.
4508          */
4509         conf->float_mode |= V_00B028_FP_64_DENORMS;
4510
4511         return 0;
4512 }
4513
4514 static void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret)
4515 {
4516         if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
4517                 LLVMBuildRetVoid(ctx->ac.builder);
4518         else
4519                 LLVMBuildRet(ctx->ac.builder, ret);
4520 }
4521
4522 /* Generate code for the hardware VS shader stage to go with a geometry shader */
4523 struct si_shader *
4524 si_generate_gs_copy_shader(struct si_screen *sscreen,
4525                            struct ac_llvm_compiler *compiler,
4526                            struct si_shader_selector *gs_selector,
4527                            struct pipe_debug_callback *debug)
4528 {
4529         struct si_shader_context ctx;
4530         struct si_shader *shader;
4531         LLVMBuilderRef builder;
4532         struct si_shader_output_values outputs[SI_MAX_VS_OUTPUTS];
4533         struct tgsi_shader_info *gsinfo = &gs_selector->info;
4534         int i;
4535
4536
4537         shader = CALLOC_STRUCT(si_shader);
4538         if (!shader)
4539                 return NULL;
4540
4541         /* We can leave the fence as permanently signaled because the GS copy
4542          * shader only becomes visible globally after it has been compiled. */
4543         util_queue_fence_init(&shader->ready);
4544
4545         shader->selector = gs_selector;
4546         shader->is_gs_copy_shader = true;
4547
4548         si_llvm_context_init(&ctx, sscreen, compiler,
4549                              si_get_wave_size(sscreen, PIPE_SHADER_VERTEX, false, false),
4550                              64);
4551         ctx.shader = shader;
4552         ctx.type = PIPE_SHADER_VERTEX;
4553
4554         builder = ctx.ac.builder;
4555
4556         create_function(&ctx);
4557         preload_ring_buffers(&ctx);
4558
4559         LLVMValueRef voffset =
4560                 LLVMBuildMul(ctx.ac.builder, ctx.abi.vertex_id,
4561                              LLVMConstInt(ctx.i32, 4, 0), "");
4562
4563         /* Fetch the vertex stream ID.*/
4564         LLVMValueRef stream_id;
4565
4566         if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs)
4567                 stream_id = si_unpack_param(&ctx, ctx.streamout_config, 24, 2);
4568         else
4569                 stream_id = ctx.i32_0;
4570
4571         /* Fill in output information. */
4572         for (i = 0; i < gsinfo->num_outputs; ++i) {
4573                 outputs[i].semantic_name = gsinfo->output_semantic_name[i];
4574                 outputs[i].semantic_index = gsinfo->output_semantic_index[i];
4575
4576                 for (int chan = 0; chan < 4; chan++) {
4577                         outputs[i].vertex_stream[chan] =
4578                                 (gsinfo->output_streams[i] >> (2 * chan)) & 3;
4579                 }
4580         }
4581
4582         LLVMBasicBlockRef end_bb;
4583         LLVMValueRef switch_inst;
4584
4585         end_bb = LLVMAppendBasicBlockInContext(ctx.ac.context, ctx.main_fn, "end");
4586         switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4);
4587
4588         for (int stream = 0; stream < 4; stream++) {
4589                 LLVMBasicBlockRef bb;
4590                 unsigned offset;
4591
4592                 if (!gsinfo->num_stream_output_components[stream])
4593                         continue;
4594
4595                 if (stream > 0 && !gs_selector->so.num_outputs)
4596                         continue;
4597
4598                 bb = LLVMInsertBasicBlockInContext(ctx.ac.context, end_bb, "out");
4599                 LLVMAddCase(switch_inst, LLVMConstInt(ctx.i32, stream, 0), bb);
4600                 LLVMPositionBuilderAtEnd(builder, bb);
4601
4602                 /* Fetch vertex data from GSVS ring */
4603                 offset = 0;
4604                 for (i = 0; i < gsinfo->num_outputs; ++i) {
4605                         for (unsigned chan = 0; chan < 4; chan++) {
4606                                 if (!(gsinfo->output_usagemask[i] & (1 << chan)) ||
4607                                     outputs[i].vertex_stream[chan] != stream) {
4608                                         outputs[i].values[chan] = LLVMGetUndef(ctx.f32);
4609                                         continue;
4610                                 }
4611
4612                                 LLVMValueRef soffset = LLVMConstInt(ctx.i32,
4613                                         offset * gs_selector->gs_max_out_vertices * 16 * 4, 0);
4614                                 offset++;
4615
4616                                 outputs[i].values[chan] =
4617                                         ac_build_buffer_load(&ctx.ac,
4618                                                              ctx.gsvs_ring[0], 1,
4619                                                              ctx.i32_0, voffset,
4620                                                              soffset, 0, ac_glc | ac_slc,
4621                                                              true, false);
4622                         }
4623                 }
4624
4625                 /* Streamout and exports. */
4626                 if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs) {
4627                         si_llvm_emit_streamout(&ctx, outputs,
4628                                                gsinfo->num_outputs,
4629                                                stream);
4630                 }
4631
4632                 if (stream == 0)
4633                         si_llvm_export_vs(&ctx, outputs, gsinfo->num_outputs);
4634
4635                 LLVMBuildBr(builder, end_bb);
4636         }
4637
4638         LLVMPositionBuilderAtEnd(builder, end_bb);
4639
4640         LLVMBuildRetVoid(ctx.ac.builder);
4641
4642         ctx.type = PIPE_SHADER_GEOMETRY; /* override for shader dumping */
4643         si_llvm_optimize_module(&ctx);
4644
4645         bool ok = false;
4646         if (si_compile_llvm(sscreen, &ctx.shader->binary,
4647                             &ctx.shader->config, ctx.compiler,
4648                             ctx.ac.module,
4649                             debug, PIPE_SHADER_GEOMETRY, ctx.ac.wave_size,
4650                             "GS Copy Shader", false) == 0) {
4651                 if (si_can_dump_shader(sscreen, PIPE_SHADER_GEOMETRY))
4652                         fprintf(stderr, "GS Copy Shader:\n");
4653                 si_shader_dump(sscreen, ctx.shader, debug, stderr, true);
4654
4655                 if (!ctx.shader->config.scratch_bytes_per_wave)
4656                         ok = si_shader_binary_upload(sscreen, ctx.shader, 0);
4657                 else
4658                         ok = true;
4659         }
4660
4661         si_llvm_dispose(&ctx);
4662
4663         if (!ok) {
4664                 FREE(shader);
4665                 shader = NULL;
4666         } else {
4667                 si_fix_resource_usage(sscreen, shader);
4668         }
4669         return shader;
4670 }
4671
4672 static void si_dump_shader_key_vs(const struct si_shader_key *key,
4673                                   const struct si_vs_prolog_bits *prolog,
4674                                   const char *prefix, FILE *f)
4675 {
4676         fprintf(f, "  %s.instance_divisor_is_one = %u\n",
4677                 prefix, prolog->instance_divisor_is_one);
4678         fprintf(f, "  %s.instance_divisor_is_fetched = %u\n",
4679                 prefix, prolog->instance_divisor_is_fetched);
4680         fprintf(f, "  %s.unpack_instance_id_from_vertex_id = %u\n",
4681                 prefix, prolog->unpack_instance_id_from_vertex_id);
4682         fprintf(f, "  %s.ls_vgpr_fix = %u\n",
4683                 prefix, prolog->ls_vgpr_fix);
4684
4685         fprintf(f, "  mono.vs.fetch_opencode = %x\n", key->mono.vs_fetch_opencode);
4686         fprintf(f, "  mono.vs.fix_fetch = {");
4687         for (int i = 0; i < SI_MAX_ATTRIBS; i++) {
4688                 union si_vs_fix_fetch fix = key->mono.vs_fix_fetch[i];
4689                 if (i)
4690                         fprintf(f, ", ");
4691                 if (!fix.bits)
4692                         fprintf(f, "0");
4693                 else
4694                         fprintf(f, "%u.%u.%u.%u", fix.u.reverse, fix.u.log_size,
4695                                 fix.u.num_channels_m1, fix.u.format);
4696         }
4697         fprintf(f, "}\n");
4698 }
4699
4700 static void si_dump_shader_key(const struct si_shader *shader, FILE *f)
4701 {
4702         const struct si_shader_key *key = &shader->key;
4703         enum pipe_shader_type shader_type = shader->selector->type;
4704
4705         fprintf(f, "SHADER KEY\n");
4706
4707         switch (shader_type) {
4708         case PIPE_SHADER_VERTEX:
4709                 si_dump_shader_key_vs(key, &key->part.vs.prolog,
4710                                       "part.vs.prolog", f);
4711                 fprintf(f, "  as_es = %u\n", key->as_es);
4712                 fprintf(f, "  as_ls = %u\n", key->as_ls);
4713                 fprintf(f, "  as_ngg = %u\n", key->as_ngg);
4714                 fprintf(f, "  mono.u.vs_export_prim_id = %u\n",
4715                         key->mono.u.vs_export_prim_id);
4716                 fprintf(f, "  opt.vs_as_prim_discard_cs = %u\n",
4717                         key->opt.vs_as_prim_discard_cs);
4718                 fprintf(f, "  opt.cs_prim_type = %s\n",
4719                         tgsi_primitive_names[key->opt.cs_prim_type]);
4720                 fprintf(f, "  opt.cs_indexed = %u\n",
4721                         key->opt.cs_indexed);
4722                 fprintf(f, "  opt.cs_instancing = %u\n",
4723                         key->opt.cs_instancing);
4724                 fprintf(f, "  opt.cs_primitive_restart = %u\n",
4725                         key->opt.cs_primitive_restart);
4726                 fprintf(f, "  opt.cs_provoking_vertex_first = %u\n",
4727                         key->opt.cs_provoking_vertex_first);
4728                 fprintf(f, "  opt.cs_need_correct_orientation = %u\n",
4729                         key->opt.cs_need_correct_orientation);
4730                 fprintf(f, "  opt.cs_cull_front = %u\n",
4731                         key->opt.cs_cull_front);
4732                 fprintf(f, "  opt.cs_cull_back = %u\n",
4733                         key->opt.cs_cull_back);
4734                 fprintf(f, "  opt.cs_cull_z = %u\n",
4735                         key->opt.cs_cull_z);
4736                 fprintf(f, "  opt.cs_halfz_clip_space = %u\n",
4737                         key->opt.cs_halfz_clip_space);
4738                 break;
4739
4740         case PIPE_SHADER_TESS_CTRL:
4741                 if (shader->selector->screen->info.chip_class >= GFX9) {
4742                         si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog,
4743                                               "part.tcs.ls_prolog", f);
4744                 }
4745                 fprintf(f, "  part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode);
4746                 fprintf(f, "  mono.u.ff_tcs_inputs_to_copy = 0x%"PRIx64"\n", key->mono.u.ff_tcs_inputs_to_copy);
4747                 break;
4748
4749         case PIPE_SHADER_TESS_EVAL:
4750                 fprintf(f, "  as_es = %u\n", key->as_es);
4751                 fprintf(f, "  as_ngg = %u\n", key->as_ngg);
4752                 fprintf(f, "  mono.u.vs_export_prim_id = %u\n",
4753                         key->mono.u.vs_export_prim_id);
4754                 break;
4755
4756         case PIPE_SHADER_GEOMETRY:
4757                 if (shader->is_gs_copy_shader)
4758                         break;
4759
4760                 if (shader->selector->screen->info.chip_class >= GFX9 &&
4761                     key->part.gs.es->type == PIPE_SHADER_VERTEX) {
4762                         si_dump_shader_key_vs(key, &key->part.gs.vs_prolog,
4763                                               "part.gs.vs_prolog", f);
4764                 }
4765                 fprintf(f, "  part.gs.prolog.tri_strip_adj_fix = %u\n", key->part.gs.prolog.tri_strip_adj_fix);
4766                 fprintf(f, "  part.gs.prolog.gfx9_prev_is_vs = %u\n", key->part.gs.prolog.gfx9_prev_is_vs);
4767                 fprintf(f, "  as_ngg = %u\n", key->as_ngg);
4768                 break;
4769
4770         case PIPE_SHADER_COMPUTE:
4771                 break;
4772
4773         case PIPE_SHADER_FRAGMENT:
4774                 fprintf(f, "  part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side);
4775                 fprintf(f, "  part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors);
4776                 fprintf(f, "  part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple);
4777                 fprintf(f, "  part.ps.prolog.force_persp_sample_interp = %u\n", key->part.ps.prolog.force_persp_sample_interp);
4778                 fprintf(f, "  part.ps.prolog.force_linear_sample_interp = %u\n", key->part.ps.prolog.force_linear_sample_interp);
4779                 fprintf(f, "  part.ps.prolog.force_persp_center_interp = %u\n", key->part.ps.prolog.force_persp_center_interp);
4780                 fprintf(f, "  part.ps.prolog.force_linear_center_interp = %u\n", key->part.ps.prolog.force_linear_center_interp);
4781                 fprintf(f, "  part.ps.prolog.bc_optimize_for_persp = %u\n", key->part.ps.prolog.bc_optimize_for_persp);
4782                 fprintf(f, "  part.ps.prolog.bc_optimize_for_linear = %u\n", key->part.ps.prolog.bc_optimize_for_linear);
4783                 fprintf(f, "  part.ps.prolog.samplemask_log_ps_iter = %u\n", key->part.ps.prolog.samplemask_log_ps_iter);
4784                 fprintf(f, "  part.ps.epilog.spi_shader_col_format = 0x%x\n", key->part.ps.epilog.spi_shader_col_format);
4785                 fprintf(f, "  part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8);
4786                 fprintf(f, "  part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10);
4787                 fprintf(f, "  part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf);
4788                 fprintf(f, "  part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func);
4789                 fprintf(f, "  part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one);
4790                 fprintf(f, "  part.ps.epilog.poly_line_smoothing = %u\n", key->part.ps.epilog.poly_line_smoothing);
4791                 fprintf(f, "  part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color);
4792                 fprintf(f, "  mono.u.ps.interpolate_at_sample_force_center = %u\n", key->mono.u.ps.interpolate_at_sample_force_center);
4793                 fprintf(f, "  mono.u.ps.fbfetch_msaa = %u\n", key->mono.u.ps.fbfetch_msaa);
4794                 fprintf(f, "  mono.u.ps.fbfetch_is_1D = %u\n", key->mono.u.ps.fbfetch_is_1D);
4795                 fprintf(f, "  mono.u.ps.fbfetch_layered = %u\n", key->mono.u.ps.fbfetch_layered);
4796                 break;
4797
4798         default:
4799                 assert(0);
4800         }
4801
4802         if ((shader_type == PIPE_SHADER_GEOMETRY ||
4803              shader_type == PIPE_SHADER_TESS_EVAL ||
4804              shader_type == PIPE_SHADER_VERTEX) &&
4805             !key->as_es && !key->as_ls) {
4806                 fprintf(f, "  opt.kill_outputs = 0x%"PRIx64"\n", key->opt.kill_outputs);
4807                 fprintf(f, "  opt.clip_disable = %u\n", key->opt.clip_disable);
4808         }
4809 }
4810
4811 static void si_optimize_vs_outputs(struct si_shader_context *ctx)
4812 {
4813         struct si_shader *shader = ctx->shader;
4814         struct tgsi_shader_info *info = &shader->selector->info;
4815
4816         if ((ctx->type != PIPE_SHADER_VERTEX &&
4817              ctx->type != PIPE_SHADER_TESS_EVAL) ||
4818             shader->key.as_ls ||
4819             shader->key.as_es)
4820                 return;
4821
4822         ac_optimize_vs_outputs(&ctx->ac,
4823                                ctx->main_fn,
4824                                shader->info.vs_output_param_offset,
4825                                info->num_outputs,
4826                                &shader->info.nr_param_exports);
4827 }
4828
4829 static void si_init_exec_from_input(struct si_shader_context *ctx,
4830                                     struct ac_arg param, unsigned bitoffset)
4831 {
4832         LLVMValueRef args[] = {
4833                 ac_get_arg(&ctx->ac, param),
4834                 LLVMConstInt(ctx->i32, bitoffset, 0),
4835         };
4836         ac_build_intrinsic(&ctx->ac,
4837                            "llvm.amdgcn.init.exec.from.input",
4838                            ctx->voidt, args, 2, AC_FUNC_ATTR_CONVERGENT);
4839 }
4840
4841 static bool si_vs_needs_prolog(const struct si_shader_selector *sel,
4842                                const struct si_vs_prolog_bits *key)
4843 {
4844         /* VGPR initialization fixup for Vega10 and Raven is always done in the
4845          * VS prolog. */
4846         return sel->vs_needs_prolog ||
4847                key->ls_vgpr_fix ||
4848                key->unpack_instance_id_from_vertex_id;
4849 }
4850
4851 LLVMValueRef si_is_es_thread(struct si_shader_context *ctx)
4852 {
4853         /* Return true if the current thread should execute an ES thread. */
4854         return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
4855                              ac_get_thread_id(&ctx->ac),
4856                              si_unpack_param(ctx, ctx->merged_wave_info, 0, 8), "");
4857 }
4858
4859 LLVMValueRef si_is_gs_thread(struct si_shader_context *ctx)
4860 {
4861         /* Return true if the current thread should execute a GS thread. */
4862         return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
4863                              ac_get_thread_id(&ctx->ac),
4864                              si_unpack_param(ctx, ctx->merged_wave_info, 8, 8), "");
4865 }
4866
4867 static void si_llvm_emit_kill(struct ac_shader_abi *abi, LLVMValueRef visible)
4868 {
4869         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
4870         LLVMBuilderRef builder = ctx->ac.builder;
4871
4872         if (ctx->shader->selector->force_correct_derivs_after_kill) {
4873                 /* Kill immediately while maintaining WQM. */
4874                 ac_build_kill_if_false(&ctx->ac,
4875                                        ac_build_wqm_vote(&ctx->ac, visible));
4876
4877                 LLVMValueRef mask = LLVMBuildLoad(builder, ctx->postponed_kill, "");
4878                 mask = LLVMBuildAnd(builder, mask, visible, "");
4879                 LLVMBuildStore(builder, mask, ctx->postponed_kill);
4880                 return;
4881         }
4882
4883         ac_build_kill_if_false(&ctx->ac, visible);
4884 }
4885
4886 static bool si_compile_tgsi_main(struct si_shader_context *ctx,
4887                                  struct nir_shader *nir, bool free_nir)
4888 {
4889         struct si_shader *shader = ctx->shader;
4890         struct si_shader_selector *sel = shader->selector;
4891
4892         // TODO clean all this up!
4893         switch (ctx->type) {
4894         case PIPE_SHADER_VERTEX:
4895                 if (shader->key.as_ls)
4896                         ctx->abi.emit_outputs = si_llvm_emit_ls_epilogue;
4897                 else if (shader->key.as_es)
4898                         ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
4899                 else if (shader->key.opt.vs_as_prim_discard_cs)
4900                         ctx->abi.emit_outputs = si_llvm_emit_prim_discard_cs_epilogue;
4901                 else if (shader->key.as_ngg)
4902                         ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue;
4903                 else
4904                         ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
4905                 ctx->abi.load_base_vertex = get_base_vertex;
4906                 break;
4907         case PIPE_SHADER_TESS_CTRL:
4908                 ctx->abi.load_tess_varyings = si_nir_load_tcs_varyings;
4909                 ctx->abi.load_tess_level = si_load_tess_level;
4910                 ctx->abi.store_tcs_outputs = si_nir_store_output_tcs;
4911                 ctx->abi.emit_outputs = si_llvm_emit_tcs_epilogue;
4912                 ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in;
4913                 break;
4914         case PIPE_SHADER_TESS_EVAL:
4915                 ctx->abi.load_tess_varyings = si_nir_load_input_tes;
4916                 ctx->abi.load_tess_coord = si_load_tess_coord;
4917                 ctx->abi.load_tess_level = si_load_tess_level;
4918                 ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in;
4919                 if (shader->key.as_es)
4920                         ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
4921                 else if (shader->key.as_ngg)
4922                         ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue;
4923                 else
4924                         ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
4925                 break;
4926         case PIPE_SHADER_GEOMETRY:
4927                 ctx->abi.load_inputs = si_nir_load_input_gs;
4928                 ctx->abi.emit_vertex = si_llvm_emit_vertex;
4929                 ctx->abi.emit_primitive = si_llvm_emit_primitive;
4930                 ctx->abi.emit_outputs = si_llvm_emit_gs_epilogue;
4931                 break;
4932         case PIPE_SHADER_FRAGMENT:
4933                 ctx->abi.emit_outputs = si_llvm_return_fs_outputs;
4934                 ctx->abi.load_sample_position = load_sample_position;
4935                 ctx->abi.load_sample_mask_in = load_sample_mask_in;
4936                 ctx->abi.emit_fbfetch = si_nir_emit_fbfetch;
4937                 ctx->abi.emit_kill = si_llvm_emit_kill;
4938                 break;
4939         case PIPE_SHADER_COMPUTE:
4940                 ctx->abi.load_local_group_size = get_block_size;
4941                 break;
4942         default:
4943                 assert(!"Unsupported shader type");
4944                 return false;
4945         }
4946
4947         ctx->abi.load_ubo = load_ubo;
4948         ctx->abi.load_ssbo = load_ssbo;
4949
4950         create_function(ctx);
4951         preload_ring_buffers(ctx);
4952
4953         if (ctx->type == PIPE_SHADER_TESS_CTRL &&
4954             sel->tcs_info.tessfactors_are_def_in_all_invocs) {
4955                 for (unsigned i = 0; i < 6; i++) {
4956                         ctx->invoc0_tess_factors[i] =
4957                                 ac_build_alloca_undef(&ctx->ac, ctx->i32, "");
4958                 }
4959         }
4960
4961         if (ctx->type == PIPE_SHADER_GEOMETRY) {
4962                 for (unsigned i = 0; i < 4; i++) {
4963                         ctx->gs_next_vertex[i] =
4964                                 ac_build_alloca(&ctx->ac, ctx->i32, "");
4965                 }
4966                 if (shader->key.as_ngg) {
4967                         for (unsigned i = 0; i < 4; ++i) {
4968                                 ctx->gs_curprim_verts[i] =
4969                                         ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
4970                                 ctx->gs_generated_prims[i] =
4971                                         ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
4972                         }
4973
4974                         unsigned scratch_size = 8;
4975                         if (sel->so.num_outputs)
4976                                 scratch_size = 44;
4977
4978                         LLVMTypeRef ai32 = LLVMArrayType(ctx->i32, scratch_size);
4979                         ctx->gs_ngg_scratch = LLVMAddGlobalInAddressSpace(ctx->ac.module,
4980                                 ai32, "ngg_scratch", AC_ADDR_SPACE_LDS);
4981                         LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(ai32));
4982                         LLVMSetAlignment(ctx->gs_ngg_scratch, 4);
4983
4984                         ctx->gs_ngg_emit = LLVMAddGlobalInAddressSpace(ctx->ac.module,
4985                                 LLVMArrayType(ctx->i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS);
4986                         LLVMSetLinkage(ctx->gs_ngg_emit, LLVMExternalLinkage);
4987                         LLVMSetAlignment(ctx->gs_ngg_emit, 4);
4988                 }
4989         }
4990
4991         if (ctx->type != PIPE_SHADER_GEOMETRY &&
4992             (shader->key.as_ngg && !shader->key.as_es)) {
4993                 /* Unconditionally declare scratch space base for streamout and
4994                  * vertex compaction. Whether space is actually allocated is
4995                  * determined during linking / PM4 creation.
4996                  *
4997                  * Add an extra dword per vertex to ensure an odd stride, which
4998                  * avoids bank conflicts for SoA accesses.
4999                  */
5000                 if (!gfx10_is_ngg_passthrough(shader))
5001                         declare_esgs_ring(ctx);
5002
5003                 /* This is really only needed when streamout and / or vertex
5004                  * compaction is enabled.
5005                  */
5006                 if (sel->so.num_outputs && !ctx->gs_ngg_scratch) {
5007                         LLVMTypeRef asi32 = LLVMArrayType(ctx->i32, 8);
5008                         ctx->gs_ngg_scratch = LLVMAddGlobalInAddressSpace(ctx->ac.module,
5009                                 asi32, "ngg_scratch", AC_ADDR_SPACE_LDS);
5010                         LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(asi32));
5011                         LLVMSetAlignment(ctx->gs_ngg_scratch, 4);
5012                 }
5013         }
5014
5015         /* For GFX9 merged shaders:
5016          * - Set EXEC for the first shader. If the prolog is present, set
5017          *   EXEC there instead.
5018          * - Add a barrier before the second shader.
5019          * - In the second shader, reset EXEC to ~0 and wrap the main part in
5020          *   an if-statement. This is required for correctness in geometry
5021          *   shaders, to ensure that empty GS waves do not send GS_EMIT and
5022          *   GS_CUT messages.
5023          *
5024          * For monolithic merged shaders, the first shader is wrapped in an
5025          * if-block together with its prolog in si_build_wrapper_function.
5026          *
5027          * NGG vertex and tess eval shaders running as the last
5028          * vertex/geometry stage handle execution explicitly using
5029          * if-statements.
5030          */
5031         if (ctx->screen->info.chip_class >= GFX9) {
5032                 if (!shader->is_monolithic &&
5033                     sel->info.num_instructions > 1 && /* not empty shader */
5034                     (shader->key.as_es || shader->key.as_ls) &&
5035                     (ctx->type == PIPE_SHADER_TESS_EVAL ||
5036                      (ctx->type == PIPE_SHADER_VERTEX &&
5037                       !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog)))) {
5038                         si_init_exec_from_input(ctx,
5039                                                 ctx->merged_wave_info, 0);
5040                 } else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
5041                            ctx->type == PIPE_SHADER_GEOMETRY ||
5042                            (shader->key.as_ngg && !shader->key.as_es)) {
5043                         LLVMValueRef thread_enabled;
5044                         bool nested_barrier;
5045
5046                         if (!shader->is_monolithic ||
5047                             (ctx->type == PIPE_SHADER_TESS_EVAL &&
5048                              (shader->key.as_ngg && !shader->key.as_es)))
5049                                 ac_init_exec_full_mask(&ctx->ac);
5050
5051                         if (ctx->type == PIPE_SHADER_TESS_CTRL ||
5052                             ctx->type == PIPE_SHADER_GEOMETRY) {
5053                                 if (ctx->type == PIPE_SHADER_GEOMETRY && shader->key.as_ngg) {
5054                                         gfx10_ngg_gs_emit_prologue(ctx);
5055                                         nested_barrier = false;
5056                                 } else {
5057                                         nested_barrier = true;
5058                                 }
5059
5060                                 thread_enabled = si_is_gs_thread(ctx);
5061                         } else {
5062                                 thread_enabled = si_is_es_thread(ctx);
5063                                 nested_barrier = false;
5064                         }
5065
5066                         ctx->merged_wrap_if_entry_block = LLVMGetInsertBlock(ctx->ac.builder);
5067                         ctx->merged_wrap_if_label = 11500;
5068                         ac_build_ifcc(&ctx->ac, thread_enabled, ctx->merged_wrap_if_label);
5069
5070                         if (nested_barrier) {
5071                                 /* Execute a barrier before the second shader in
5072                                  * a merged shader.
5073                                  *
5074                                  * Execute the barrier inside the conditional block,
5075                                  * so that empty waves can jump directly to s_endpgm,
5076                                  * which will also signal the barrier.
5077                                  *
5078                                  * This is possible in gfx9, because an empty wave
5079                                  * for the second shader does not participate in
5080                                  * the epilogue. With NGG, empty waves may still
5081                                  * be required to export data (e.g. GS output vertices),
5082                                  * so we cannot let them exit early.
5083                                  *
5084                                  * If the shader is TCS and the TCS epilog is present
5085                                  * and contains a barrier, it will wait there and then
5086                                  * reach s_endpgm.
5087                                  */
5088                                 si_llvm_emit_barrier(ctx);
5089                         }
5090                 }
5091         }
5092
5093         if (sel->force_correct_derivs_after_kill) {
5094                 ctx->postponed_kill = ac_build_alloca_undef(&ctx->ac, ctx->i1, "");
5095                 /* true = don't kill. */
5096                 LLVMBuildStore(ctx->ac.builder, ctx->i1true,
5097                                ctx->postponed_kill);
5098         }
5099
5100         bool success = si_nir_build_llvm(ctx, nir);
5101         if (free_nir)
5102                 ralloc_free(nir);
5103         if (!success) {
5104                 fprintf(stderr, "Failed to translate shader from NIR to LLVM\n");
5105                 return false;
5106         }
5107
5108         si_llvm_build_ret(ctx, ctx->return_value);
5109         return true;
5110 }
5111
5112 /**
5113  * Compute the VS prolog key, which contains all the information needed to
5114  * build the VS prolog function, and set shader->info bits where needed.
5115  *
5116  * \param info             Shader info of the vertex shader.
5117  * \param num_input_sgprs  Number of input SGPRs for the vertex shader.
5118  * \param prolog_key       Key of the VS prolog
5119  * \param shader_out       The vertex shader, or the next shader if merging LS+HS or ES+GS.
5120  * \param key              Output shader part key.
5121  */
5122 static void si_get_vs_prolog_key(const struct tgsi_shader_info *info,
5123                                  unsigned num_input_sgprs,
5124                                  const struct si_vs_prolog_bits *prolog_key,
5125                                  struct si_shader *shader_out,
5126                                  union si_shader_part_key *key)
5127 {
5128         memset(key, 0, sizeof(*key));
5129         key->vs_prolog.states = *prolog_key;
5130         key->vs_prolog.num_input_sgprs = num_input_sgprs;
5131         key->vs_prolog.num_inputs = info->num_inputs;
5132         key->vs_prolog.as_ls = shader_out->key.as_ls;
5133         key->vs_prolog.as_es = shader_out->key.as_es;
5134         key->vs_prolog.as_ngg = shader_out->key.as_ngg;
5135
5136         if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) {
5137                 key->vs_prolog.as_ls = 1;
5138                 key->vs_prolog.num_merged_next_stage_vgprs = 2;
5139         } else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY) {
5140                 key->vs_prolog.as_es = 1;
5141                 key->vs_prolog.num_merged_next_stage_vgprs = 5;
5142         } else if (shader_out->key.as_ngg) {
5143                 key->vs_prolog.num_merged_next_stage_vgprs = 5;
5144         }
5145
5146         /* Enable loading the InstanceID VGPR. */
5147         uint16_t input_mask = u_bit_consecutive(0, info->num_inputs);
5148
5149         if ((key->vs_prolog.states.instance_divisor_is_one |
5150              key->vs_prolog.states.instance_divisor_is_fetched) & input_mask)
5151                 shader_out->info.uses_instanceid = true;
5152 }
5153
5154 /**
5155  * Compute the PS prolog key, which contains all the information needed to
5156  * build the PS prolog function, and set related bits in shader->config.
5157  */
5158 static void si_get_ps_prolog_key(struct si_shader *shader,
5159                                  union si_shader_part_key *key,
5160                                  bool separate_prolog)
5161 {
5162         struct tgsi_shader_info *info = &shader->selector->info;
5163
5164         memset(key, 0, sizeof(*key));
5165         key->ps_prolog.states = shader->key.part.ps.prolog;
5166         key->ps_prolog.colors_read = info->colors_read;
5167         key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
5168         key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
5169         key->ps_prolog.wqm = info->uses_derivatives &&
5170                 (key->ps_prolog.colors_read ||
5171                  key->ps_prolog.states.force_persp_sample_interp ||
5172                  key->ps_prolog.states.force_linear_sample_interp ||
5173                  key->ps_prolog.states.force_persp_center_interp ||
5174                  key->ps_prolog.states.force_linear_center_interp ||
5175                  key->ps_prolog.states.bc_optimize_for_persp ||
5176                  key->ps_prolog.states.bc_optimize_for_linear);
5177         key->ps_prolog.ancillary_vgpr_index = shader->info.ancillary_vgpr_index;
5178
5179         if (info->colors_read) {
5180                 unsigned *color = shader->selector->color_attr_index;
5181
5182                 if (shader->key.part.ps.prolog.color_two_side) {
5183                         /* BCOLORs are stored after the last input. */
5184                         key->ps_prolog.num_interp_inputs = info->num_inputs;
5185                         key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
5186                         if (separate_prolog)
5187                                 shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
5188                 }
5189
5190                 for (unsigned i = 0; i < 2; i++) {
5191                         unsigned interp = info->input_interpolate[color[i]];
5192                         unsigned location = info->input_interpolate_loc[color[i]];
5193
5194                         if (!(info->colors_read & (0xf << i*4)))
5195                                 continue;
5196
5197                         key->ps_prolog.color_attr_index[i] = color[i];
5198
5199                         if (shader->key.part.ps.prolog.flatshade_colors &&
5200                             interp == TGSI_INTERPOLATE_COLOR)
5201                                 interp = TGSI_INTERPOLATE_CONSTANT;
5202
5203                         switch (interp) {
5204                         case TGSI_INTERPOLATE_CONSTANT:
5205                                 key->ps_prolog.color_interp_vgpr_index[i] = -1;
5206                                 break;
5207                         case TGSI_INTERPOLATE_PERSPECTIVE:
5208                         case TGSI_INTERPOLATE_COLOR:
5209                                 /* Force the interpolation location for colors here. */
5210                                 if (shader->key.part.ps.prolog.force_persp_sample_interp)
5211                                         location = TGSI_INTERPOLATE_LOC_SAMPLE;
5212                                 if (shader->key.part.ps.prolog.force_persp_center_interp)
5213                                         location = TGSI_INTERPOLATE_LOC_CENTER;
5214
5215                                 switch (location) {
5216                                 case TGSI_INTERPOLATE_LOC_SAMPLE:
5217                                         key->ps_prolog.color_interp_vgpr_index[i] = 0;
5218                                         if (separate_prolog) {
5219                                                 shader->config.spi_ps_input_ena |=
5220                                                         S_0286CC_PERSP_SAMPLE_ENA(1);
5221                                         }
5222                                         break;
5223                                 case TGSI_INTERPOLATE_LOC_CENTER:
5224                                         key->ps_prolog.color_interp_vgpr_index[i] = 2;
5225                                         if (separate_prolog) {
5226                                                 shader->config.spi_ps_input_ena |=
5227                                                         S_0286CC_PERSP_CENTER_ENA(1);
5228                                         }
5229                                         break;
5230                                 case TGSI_INTERPOLATE_LOC_CENTROID:
5231                                         key->ps_prolog.color_interp_vgpr_index[i] = 4;
5232                                         if (separate_prolog) {
5233                                                 shader->config.spi_ps_input_ena |=
5234                                                         S_0286CC_PERSP_CENTROID_ENA(1);
5235                                         }
5236                                         break;
5237                                 default:
5238                                         assert(0);
5239                                 }
5240                                 break;
5241                         case TGSI_INTERPOLATE_LINEAR:
5242                                 /* Force the interpolation location for colors here. */
5243                                 if (shader->key.part.ps.prolog.force_linear_sample_interp)
5244                                         location = TGSI_INTERPOLATE_LOC_SAMPLE;
5245                                 if (shader->key.part.ps.prolog.force_linear_center_interp)
5246                                         location = TGSI_INTERPOLATE_LOC_CENTER;
5247
5248                                 /* The VGPR assignment for non-monolithic shaders
5249                                  * works because InitialPSInputAddr is set on the
5250                                  * main shader and PERSP_PULL_MODEL is never used.
5251                                  */
5252                                 switch (location) {
5253                                 case TGSI_INTERPOLATE_LOC_SAMPLE:
5254                                         key->ps_prolog.color_interp_vgpr_index[i] =
5255                                                 separate_prolog ? 6 : 9;
5256                                         if (separate_prolog) {
5257                                                 shader->config.spi_ps_input_ena |=
5258                                                         S_0286CC_LINEAR_SAMPLE_ENA(1);
5259                                         }
5260                                         break;
5261                                 case TGSI_INTERPOLATE_LOC_CENTER:
5262                                         key->ps_prolog.color_interp_vgpr_index[i] =
5263                                                 separate_prolog ? 8 : 11;
5264                                         if (separate_prolog) {
5265                                                 shader->config.spi_ps_input_ena |=
5266                                                         S_0286CC_LINEAR_CENTER_ENA(1);
5267                                         }
5268                                         break;
5269                                 case TGSI_INTERPOLATE_LOC_CENTROID:
5270                                         key->ps_prolog.color_interp_vgpr_index[i] =
5271                                                 separate_prolog ? 10 : 13;
5272                                         if (separate_prolog) {
5273                                                 shader->config.spi_ps_input_ena |=
5274                                                         S_0286CC_LINEAR_CENTROID_ENA(1);
5275                                         }
5276                                         break;
5277                                 default:
5278                                         assert(0);
5279                                 }
5280                                 break;
5281                         default:
5282                                 assert(0);
5283                         }
5284                 }
5285         }
5286 }
5287
5288 /**
5289  * Check whether a PS prolog is required based on the key.
5290  */
5291 static bool si_need_ps_prolog(const union si_shader_part_key *key)
5292 {
5293         return key->ps_prolog.colors_read ||
5294                key->ps_prolog.states.force_persp_sample_interp ||
5295                key->ps_prolog.states.force_linear_sample_interp ||
5296                key->ps_prolog.states.force_persp_center_interp ||
5297                key->ps_prolog.states.force_linear_center_interp ||
5298                key->ps_prolog.states.bc_optimize_for_persp ||
5299                key->ps_prolog.states.bc_optimize_for_linear ||
5300                key->ps_prolog.states.poly_stipple ||
5301                key->ps_prolog.states.samplemask_log_ps_iter;
5302 }
5303
5304 /**
5305  * Compute the PS epilog key, which contains all the information needed to
5306  * build the PS epilog function.
5307  */
5308 static void si_get_ps_epilog_key(struct si_shader *shader,
5309                                  union si_shader_part_key *key)
5310 {
5311         struct tgsi_shader_info *info = &shader->selector->info;
5312         memset(key, 0, sizeof(*key));
5313         key->ps_epilog.colors_written = info->colors_written;
5314         key->ps_epilog.writes_z = info->writes_z;
5315         key->ps_epilog.writes_stencil = info->writes_stencil;
5316         key->ps_epilog.writes_samplemask = info->writes_samplemask;
5317         key->ps_epilog.states = shader->key.part.ps.epilog;
5318 }
5319
5320 /**
5321  * Build the GS prolog function. Rotate the input vertices for triangle strips
5322  * with adjacency.
5323  */
5324 static void si_build_gs_prolog_function(struct si_shader_context *ctx,
5325                                         union si_shader_part_key *key)
5326 {
5327         unsigned num_sgprs, num_vgprs;
5328         LLVMBuilderRef builder = ctx->ac.builder;
5329         LLVMTypeRef returns[AC_MAX_ARGS];
5330         LLVMValueRef func, ret;
5331
5332         memset(&ctx->args, 0, sizeof(ctx->args));
5333
5334         if (ctx->screen->info.chip_class >= GFX9) {
5335                 if (key->gs_prolog.states.gfx9_prev_is_vs)
5336                         num_sgprs = 8 + GFX9_VSGS_NUM_USER_SGPR;
5337                 else
5338                         num_sgprs = 8 + GFX9_TESGS_NUM_USER_SGPR;
5339                 num_vgprs = 5; /* ES inputs are not needed by GS */
5340         } else {
5341                 num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
5342                 num_vgprs = 8;
5343         }
5344
5345         for (unsigned i = 0; i < num_sgprs; ++i) {
5346                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
5347                 returns[i] = ctx->i32;
5348         }
5349
5350         for (unsigned i = 0; i < num_vgprs; ++i) {
5351                 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL);
5352                 returns[num_sgprs + i] = ctx->f32;
5353         }
5354
5355         /* Create the function. */
5356         si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs,
5357                            0);
5358         func = ctx->main_fn;
5359
5360         /* Set the full EXEC mask for the prolog, because we are only fiddling
5361          * with registers here. The main shader part will set the correct EXEC
5362          * mask.
5363          */
5364         if (ctx->screen->info.chip_class >= GFX9 && !key->gs_prolog.is_monolithic)
5365                 ac_init_exec_full_mask(&ctx->ac);
5366
5367         /* Copy inputs to outputs. This should be no-op, as the registers match,
5368          * but it will prevent the compiler from overwriting them unintentionally.
5369          */
5370         ret = ctx->return_value;
5371         for (unsigned i = 0; i < num_sgprs; i++) {
5372                 LLVMValueRef p = LLVMGetParam(func, i);
5373                 ret = LLVMBuildInsertValue(builder, ret, p, i, "");
5374         }
5375         for (unsigned i = 0; i < num_vgprs; i++) {
5376                 LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
5377                 p = ac_to_float(&ctx->ac, p);
5378                 ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, "");
5379         }
5380
5381         if (key->gs_prolog.states.tri_strip_adj_fix) {
5382                 /* Remap the input vertices for every other primitive. */
5383                 const struct ac_arg gfx6_vtx_params[6] = {
5384                         { .used = true, .arg_index = num_sgprs },
5385                         { .used = true, .arg_index = num_sgprs + 1 },
5386                         { .used = true, .arg_index = num_sgprs + 3 },
5387                         { .used = true, .arg_index = num_sgprs + 4 },
5388                         { .used = true, .arg_index = num_sgprs + 5 },
5389                         { .used = true, .arg_index = num_sgprs + 6 },
5390                 };
5391                 const struct ac_arg gfx9_vtx_params[3] = {
5392                         { .used = true, .arg_index = num_sgprs },
5393                         { .used = true, .arg_index = num_sgprs + 1 },
5394                         { .used = true, .arg_index = num_sgprs + 4 },
5395                 };
5396                 LLVMValueRef vtx_in[6], vtx_out[6];
5397                 LLVMValueRef prim_id, rotate;
5398
5399                 if (ctx->screen->info.chip_class >= GFX9) {
5400                         for (unsigned i = 0; i < 3; i++) {
5401                                 vtx_in[i*2] = si_unpack_param(ctx, gfx9_vtx_params[i], 0, 16);
5402                                 vtx_in[i*2+1] = si_unpack_param(ctx, gfx9_vtx_params[i], 16, 16);
5403                         }
5404                 } else {
5405                         for (unsigned i = 0; i < 6; i++)
5406                                 vtx_in[i] = ac_get_arg(&ctx->ac, gfx6_vtx_params[i]);
5407                 }
5408
5409                 prim_id = LLVMGetParam(func, num_sgprs + 2);
5410                 rotate = LLVMBuildTrunc(builder, prim_id, ctx->i1, "");
5411
5412                 for (unsigned i = 0; i < 6; ++i) {
5413                         LLVMValueRef base, rotated;
5414                         base = vtx_in[i];
5415                         rotated = vtx_in[(i + 4) % 6];
5416                         vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, "");
5417                 }
5418
5419                 if (ctx->screen->info.chip_class >= GFX9) {
5420                         for (unsigned i = 0; i < 3; i++) {
5421                                 LLVMValueRef hi, out;
5422
5423                                 hi = LLVMBuildShl(builder, vtx_out[i*2+1],
5424                                                   LLVMConstInt(ctx->i32, 16, 0), "");
5425                                 out = LLVMBuildOr(builder, vtx_out[i*2], hi, "");
5426                                 out = ac_to_float(&ctx->ac, out);
5427                                 ret = LLVMBuildInsertValue(builder, ret, out,
5428                                                            gfx9_vtx_params[i].arg_index, "");
5429                         }
5430                 } else {
5431                         for (unsigned i = 0; i < 6; i++) {
5432                                 LLVMValueRef out;
5433
5434                                 out = ac_to_float(&ctx->ac, vtx_out[i]);
5435                                 ret = LLVMBuildInsertValue(builder, ret, out,
5436                                                            gfx6_vtx_params[i].arg_index, "");
5437                         }
5438                 }
5439         }
5440
5441         LLVMBuildRet(builder, ret);
5442 }
5443
/**
 * Given a list of shader part functions, build a wrapper function that
 * runs them in sequence to form a monolithic shader.
 *
 * Every part is marked always-inline and given private linkage. The wrapper
 * takes as many SGPR/VGPR dwords as the first part, with argument types
 * taken from the main part, and returns whatever the last part returns.
 * The values returned by each part are fed as arguments to the next one.
 *
 * \param ctx                     shader context; ctx->args is rebuilt for
 *                                the wrapper function
 * \param parts                   part functions in execution order
 * \param num_parts               number of entries in \p parts
 * \param main_part               index in \p parts of the part whose
 *                                parameter types define the wrapper's
 *                                argument types
 * \param next_shader_first_part  for multi-part shaders: index of the first
 *                                part that runs after the conditional block;
 *                                that part receives the wrapper's own
 *                                arguments instead of the previous part's
 *                                return value
 */
static void si_build_wrapper_function(struct si_shader_context *ctx,
                                      LLVMValueRef *parts,
                                      unsigned num_parts,
                                      unsigned main_part,
                                      unsigned next_shader_first_part)
{
        LLVMBuilderRef builder = ctx->ac.builder;
        /* PS epilog has one arg per color component; gfx9 merged shader
         * prologs need to forward 40 SGPRs.
         */
        LLVMValueRef initial[AC_MAX_ARGS], out[AC_MAX_ARGS];
        LLVMTypeRef function_type;
        unsigned num_first_params;
        unsigned num_out, initial_num_out;
        /* Number of leading SGPR dwords in out[]. Besides the asserts, it is
         * also used to skip returned SGPRs that the next part doesn't take. */
        ASSERTED unsigned num_out_sgpr;
        /* Saved copy of num_out_sgpr for the wrapper's own arguments. */
        ASSERTED unsigned initial_num_out_sgpr;
        unsigned num_sgprs, num_vgprs;
        unsigned gprs;

        memset(&ctx->args, 0, sizeof(ctx->args));

        /* All parts get inlined into the wrapper. */
        for (unsigned i = 0; i < num_parts; ++i) {
                ac_add_function_attr(ctx->ac.context, parts[i], -1,
                                     AC_FUNC_ATTR_ALWAYSINLINE);
                LLVMSetLinkage(parts[i], LLVMPrivateLinkage);
        }

        /* The parameters of the wrapper function correspond to those of the
         * first part in terms of SGPRs and VGPRs, but we use the types of the
         * main part to get the right types. This is relevant for the
         * dereferenceable attribute on descriptor table pointers.
         */
        num_sgprs = 0;
        num_vgprs = 0;

        function_type = LLVMGetElementType(LLVMTypeOf(parts[0]));
        num_first_params = LLVMCountParamTypes(function_type);

        /* Count the SGPR and VGPR dwords of the first part. All SGPR
         * parameters are expected to precede the VGPR parameters.
         */
        for (unsigned i = 0; i < num_first_params; ++i) {
                LLVMValueRef param = LLVMGetParam(parts[0], i);

                if (ac_is_sgpr_param(param)) {
                        assert(num_vgprs == 0);
                        num_sgprs += ac_get_type_size(LLVMTypeOf(param)) / 4;
                } else {
                        num_vgprs += ac_get_type_size(LLVMTypeOf(param)) / 4;
                }
        }

        /* Declare wrapper arguments, walking the main part's parameters,
         * until the dword count of the first part has been covered.
         */
        gprs = 0;
        while (gprs < num_sgprs + num_vgprs) {
                LLVMValueRef param = LLVMGetParam(parts[main_part], ctx->args.arg_count);
                LLVMTypeRef type = LLVMTypeOf(param);
                unsigned size = ac_get_type_size(type) / 4;

                /* This is going to get casted anyways, so we don't have to
                 * have the exact same type. But we do have to preserve the
                 * pointer-ness so that LLVM knows about it.
                 */
                enum ac_arg_type arg_type = AC_ARG_INT;
                if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) {
                        arg_type = AC_ARG_CONST_PTR;
                }

                ac_add_arg(&ctx->args, gprs < num_sgprs ? AC_ARG_SGPR : AC_ARG_VGPR,
                           size, arg_type, NULL);

                /* A main-part parameter must not straddle the SGPR/VGPR
                 * boundary nor exceed the total register count.
                 */
                assert(ac_is_sgpr_param(param) == (gprs < num_sgprs));
                assert(gprs + size <= num_sgprs + num_vgprs &&
                       (gprs >= num_sgprs || gprs + size <= num_sgprs));

                gprs += size;
        }

        /* Prepare the return type: the wrapper returns what the last part
         * returns (a struct of registers, or void).
         */
        unsigned num_returns = 0;
        LLVMTypeRef returns[AC_MAX_ARGS], last_func_type, return_type;

        last_func_type = LLVMGetElementType(LLVMTypeOf(parts[num_parts - 1]));
        return_type = LLVMGetReturnType(last_func_type);

        switch (LLVMGetTypeKind(return_type)) {
        case LLVMStructTypeKind:
                num_returns = LLVMCountStructElementTypes(return_type);
                assert(num_returns <= ARRAY_SIZE(returns));
                LLVMGetStructElementTypes(return_type, returns);
                break;
        case LLVMVoidTypeKind:
                break;
        default:
                unreachable("unexpected type");
        }

        si_create_function(ctx, "wrapper", returns, num_returns,
                           si_get_max_workgroup_size(ctx->shader));

        if (is_merged_shader(ctx))
                ac_init_exec_full_mask(&ctx->ac);

        /* Record the arguments of the function as if they were an output of
         * a previous part.
         */
        num_out = 0;
        num_out_sgpr = 0;

        /* Flatten every argument into single dwords: i32 for SGPRs and f32
         * for VGPRs. Pointers are converted to integers first.
         */
        for (unsigned i = 0; i < ctx->args.arg_count; ++i) {
                LLVMValueRef param = LLVMGetParam(ctx->main_fn, i);
                LLVMTypeRef param_type = LLVMTypeOf(param);
                LLVMTypeRef out_type = ctx->args.args[i].file == AC_ARG_SGPR ? ctx->i32 : ctx->f32;
                unsigned size = ac_get_type_size(param_type) / 4;

                if (size == 1) {
                        if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
                                param = LLVMBuildPtrToInt(builder, param, ctx->i32, "");
                                param_type = ctx->i32;
                        }

                        if (param_type != out_type)
                                param = LLVMBuildBitCast(builder, param, out_type, "");
                        out[num_out++] = param;
                } else {
                        /* Multi-dword argument: view it as a vector and
                         * extract the individual dwords.
                         */
                        LLVMTypeRef vector_type = LLVMVectorType(out_type, size);

                        if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
                                param = LLVMBuildPtrToInt(builder, param, ctx->i64, "");
                                param_type = ctx->i64;
                        }

                        if (param_type != vector_type)
                                param = LLVMBuildBitCast(builder, param, vector_type, "");

                        for (unsigned j = 0; j < size; ++j)
                                out[num_out++] = LLVMBuildExtractElement(
                                        builder, param, LLVMConstInt(ctx->i32, j, 0), "");
                }

                if (ctx->args.args[i].file == AC_ARG_SGPR)
                        num_out_sgpr = num_out;
        }

        /* Keep a copy of the wrapper's own inputs; a multi-part shader
         * restores them for its second half.
         */
        memcpy(initial, out, sizeof(out));
        initial_num_out = num_out;
        initial_num_out_sgpr = num_out_sgpr;

        /* Now chain the parts. */
        LLVMValueRef ret = NULL;
        for (unsigned part = 0; part < num_parts; ++part) {
                LLVMValueRef in[AC_MAX_ARGS];
                LLVMTypeRef ret_type;
                unsigned out_idx = 0;
                unsigned num_params = LLVMCountParams(parts[part]);

                /* Merged shaders are executed conditionally depending
                 * on the number of enabled threads passed in the input SGPRs.
                 * The count is the low 7 bits of the wrapper's input dword 3;
                 * the block opened here is closed by the ac_build_endif call
                 * below that uses the same label id. */
                if (is_multi_part_shader(ctx) && part == 0) {
                        LLVMValueRef ena, count = initial[3];

                        count = LLVMBuildAnd(builder, count,
                                             LLVMConstInt(ctx->i32, 0x7f, 0), "");
                        ena = LLVMBuildICmp(builder, LLVMIntULT,
                                            ac_get_thread_id(&ctx->ac), count, "");
                        ac_build_ifcc(&ctx->ac, ena, 6506);
                }

                /* Derive arguments for the next part from outputs of the
                 * previous one.
                 */
                for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) {
                        LLVMValueRef param;
                        LLVMTypeRef param_type;
                        bool is_sgpr;
                        unsigned param_size;
                        LLVMValueRef arg = NULL;

                        param = LLVMGetParam(parts[part], param_idx);
                        param_type = LLVMTypeOf(param);
                        param_size = ac_get_type_size(param_type) / 4;
                        is_sgpr = ac_is_sgpr_param(param);

                        if (is_sgpr) {
                                ac_add_function_attr(ctx->ac.context, parts[part],
                                                     param_idx + 1, AC_FUNC_ATTR_INREG);
                        } else if (out_idx < num_out_sgpr) {
                                /* Skip returned SGPRs the current part doesn't
                                 * declare on the input. */
                                out_idx = num_out_sgpr;
                        }

                        assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out));

                        /* Reassemble the argument from consecutive dwords. */
                        if (param_size == 1)
                                arg = out[out_idx];
                        else
                                arg = ac_build_gather_values(&ctx->ac, &out[out_idx], param_size);

                        /* Convert to the exact parameter type. Pointers go
                         * through an integer of matching width: i32 for the
                         * 32-bit constant address space, i64 otherwise.
                         */
                        if (LLVMTypeOf(arg) != param_type) {
                                if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
                                        if (LLVMGetPointerAddressSpace(param_type) ==
                                            AC_ADDR_SPACE_CONST_32BIT) {
                                                arg = LLVMBuildBitCast(builder, arg, ctx->i32, "");
                                                arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
                                        } else {
                                                arg = LLVMBuildBitCast(builder, arg, ctx->i64, "");
                                                arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
                                        }
                                } else {
                                        arg = LLVMBuildBitCast(builder, arg, param_type, "");
                                }
                        }

                        in[param_idx] = arg;
                        out_idx += param_size;
                }

                ret = ac_build_call(&ctx->ac, parts[part], in, num_params);

                if (is_multi_part_shader(ctx) &&
                    part + 1 == next_shader_first_part) {
                        ac_build_endif(&ctx->ac, 6506);

                        /* The second half of the merged shader should use
                         * the inputs from the toplevel (wrapper) function,
                         * not the return value from the last call.
                         *
                         * That's because the last call was executed condi-
                         * tionally, so we can't consume it in the main
                         * block.
                         */
                        memcpy(out, initial, sizeof(initial));
                        num_out = initial_num_out;
                        num_out_sgpr = initial_num_out_sgpr;
                        continue;
                }

                /* Extract the returned GPRs. */
                ret_type = LLVMTypeOf(ret);
                num_out = 0;
                num_out_sgpr = 0;

                if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) {
                        assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind);

                        unsigned ret_size = LLVMCountStructElementTypes(ret_type);

                        for (unsigned i = 0; i < ret_size; ++i) {
                                LLVMValueRef val =
                                        LLVMBuildExtractValue(builder, ret, i, "");

                                assert(num_out < ARRAY_SIZE(out));
                                out[num_out++] = val;

                                /* i32 elements count as SGPRs; they must
                                 * all come before any other element. */
                                if (LLVMTypeOf(val) == ctx->i32) {
                                        assert(num_out_sgpr + 1 == num_out);
                                        num_out_sgpr = num_out;
                                }
                        }
                }
        }

        /* Return the value from the last part. */
        if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
                LLVMBuildRetVoid(builder);
        else
                LLVMBuildRet(builder, ret);
}
5713
5714 static bool si_should_optimize_less(struct ac_llvm_compiler *compiler,
5715                                     struct si_shader_selector *sel)
5716 {
5717         if (!compiler->low_opt_passes)
5718                 return false;
5719
5720         /* Assume a slow CPU. */
5721         assert(!sel->screen->info.has_dedicated_vram &&
5722                sel->screen->info.chip_class <= GFX8);
5723
5724         /* For a crazy dEQP test containing 2597 memory opcodes, mostly
5725          * buffer stores. */
5726         return sel->type == PIPE_SHADER_COMPUTE &&
5727                sel->info.num_memory_instructions > 1000;
5728 }
5729
5730 static struct nir_shader *get_nir_shader(struct si_shader_selector *sel,
5731                                          bool *free_nir)
5732 {
5733         *free_nir = false;
5734
5735         if (sel->nir) {
5736                 return sel->nir;
5737         } else if (sel->nir_binary) {
5738                 struct pipe_screen *screen = &sel->screen->b;
5739                 const void *options =
5740                         screen->get_compiler_options(screen, PIPE_SHADER_IR_NIR,
5741                                                      sel->type);
5742
5743                 struct blob_reader blob_reader;
5744                 blob_reader_init(&blob_reader, sel->nir_binary, sel->nir_size);
5745                 *free_nir = true;
5746                 return nir_deserialize(NULL, options, &blob_reader);
5747         }
5748         return NULL;
5749 }
5750
5751 int si_compile_shader(struct si_screen *sscreen,
5752                       struct ac_llvm_compiler *compiler,
5753                       struct si_shader *shader,
5754                       struct pipe_debug_callback *debug)
5755 {
5756         struct si_shader_selector *sel = shader->selector;
5757         struct si_shader_context ctx;
5758         bool free_nir;
5759         struct nir_shader *nir = get_nir_shader(sel, &free_nir);
5760         int r = -1;
5761
5762         /* Dump TGSI code before doing TGSI->LLVM conversion in case the
5763          * conversion fails. */
5764         if (si_can_dump_shader(sscreen, sel->type) &&
5765             !(sscreen->debug_flags & DBG(NO_TGSI))) {
5766                 nir_print_shader(nir, stderr);
5767                 si_dump_streamout(&sel->so);
5768         }
5769
5770         si_llvm_context_init(&ctx, sscreen, compiler, si_get_shader_wave_size(shader), 64);
5771         si_llvm_context_set_ir(&ctx, shader);
5772
5773         memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
5774                sizeof(shader->info.vs_output_param_offset));
5775
5776         shader->info.uses_instanceid = sel->info.uses_instanceid;
5777
5778         if (!si_compile_tgsi_main(&ctx, nir, free_nir)) {
5779                 si_llvm_dispose(&ctx);
5780                 return -1;
5781         }
5782
5783         if (shader->is_monolithic && ctx.type == PIPE_SHADER_VERTEX) {
5784                 LLVMValueRef parts[2];
5785                 bool need_prolog = si_vs_needs_prolog(sel, &shader->key.part.vs.prolog);
5786
5787                 parts[1] = ctx.main_fn;
5788
5789                 if (need_prolog) {
5790                         union si_shader_part_key prolog_key;
5791                         si_get_vs_prolog_key(&sel->info,
5792                                              shader->info.num_input_sgprs,
5793                                              &shader->key.part.vs.prolog,
5794                                              shader, &prolog_key);
5795                         prolog_key.vs_prolog.is_monolithic = true;
5796                         si_build_vs_prolog_function(&ctx, &prolog_key);
5797                         parts[0] = ctx.main_fn;
5798                 }
5799
5800                 si_build_wrapper_function(&ctx, parts + !need_prolog,
5801                                           1 + need_prolog, need_prolog, 0);
5802
5803                 if (ctx.shader->key.opt.vs_as_prim_discard_cs)
5804                         si_build_prim_discard_compute_shader(&ctx);
5805         } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
5806                 if (sscreen->info.chip_class >= GFX9) {
5807                         struct si_shader_selector *ls = shader->key.part.tcs.ls;
5808                         LLVMValueRef parts[4];
5809                         bool vs_needs_prolog =
5810                                 si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog);
5811
5812                         /* TCS main part */
5813                         parts[2] = ctx.main_fn;
5814
5815                         /* TCS epilog */
5816                         union si_shader_part_key tcs_epilog_key;
5817                         memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key));
5818                         tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
5819                         si_build_tcs_epilog_function(&ctx, &tcs_epilog_key);
5820                         parts[3] = ctx.main_fn;
5821
5822                         /* VS as LS main part */
5823                         nir = get_nir_shader(ls, &free_nir);
5824                         struct si_shader shader_ls = {};
5825                         shader_ls.selector = ls;
5826                         shader_ls.key.as_ls = 1;
5827                         shader_ls.key.mono = shader->key.mono;
5828                         shader_ls.key.opt = shader->key.opt;
5829                         shader_ls.is_monolithic = true;
5830                         si_llvm_context_set_ir(&ctx, &shader_ls);
5831
5832                         if (!si_compile_tgsi_main(&ctx, nir, free_nir)) {
5833                                 si_llvm_dispose(&ctx);
5834                                 return -1;
5835                         }
5836                         shader->info.uses_instanceid |= ls->info.uses_instanceid;
5837                         parts[1] = ctx.main_fn;
5838
5839                         /* LS prolog */
5840                         if (vs_needs_prolog) {
5841                                 union si_shader_part_key vs_prolog_key;
5842                                 si_get_vs_prolog_key(&ls->info,
5843                                                      shader_ls.info.num_input_sgprs,
5844                                                      &shader->key.part.tcs.ls_prolog,
5845                                                      shader, &vs_prolog_key);
5846                                 vs_prolog_key.vs_prolog.is_monolithic = true;
5847                                 si_build_vs_prolog_function(&ctx, &vs_prolog_key);
5848                                 parts[0] = ctx.main_fn;
5849                         }
5850
5851                         /* Reset the shader context. */
5852                         ctx.shader = shader;
5853                         ctx.type = PIPE_SHADER_TESS_CTRL;
5854
5855                         si_build_wrapper_function(&ctx,
5856                                                   parts + !vs_needs_prolog,
5857                                                   4 - !vs_needs_prolog, vs_needs_prolog,
5858                                                   vs_needs_prolog ? 2 : 1);
5859                 } else {
5860                         LLVMValueRef parts[2];
5861                         union si_shader_part_key epilog_key;
5862
5863                         parts[0] = ctx.main_fn;
5864
5865                         memset(&epilog_key, 0, sizeof(epilog_key));
5866                         epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
5867                         si_build_tcs_epilog_function(&ctx, &epilog_key);
5868                         parts[1] = ctx.main_fn;
5869
5870                         si_build_wrapper_function(&ctx, parts, 2, 0, 0);
5871                 }
5872         } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) {
5873                 if (ctx.screen->info.chip_class >= GFX9) {
5874                         struct si_shader_selector *es = shader->key.part.gs.es;
5875                         LLVMValueRef es_prolog = NULL;
5876                         LLVMValueRef es_main = NULL;
5877                         LLVMValueRef gs_prolog = NULL;
5878                         LLVMValueRef gs_main = ctx.main_fn;
5879
5880                         /* GS prolog */
5881                         union si_shader_part_key gs_prolog_key;
5882                         memset(&gs_prolog_key, 0, sizeof(gs_prolog_key));
5883                         gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
5884                         gs_prolog_key.gs_prolog.is_monolithic = true;
5885                         gs_prolog_key.gs_prolog.as_ngg = shader->key.as_ngg;
5886                         si_build_gs_prolog_function(&ctx, &gs_prolog_key);
5887                         gs_prolog = ctx.main_fn;
5888
5889                         /* ES main part */
5890                         nir = get_nir_shader(es, &free_nir);
5891                         struct si_shader shader_es = {};
5892                         shader_es.selector = es;
5893                         shader_es.key.as_es = 1;
5894                         shader_es.key.as_ngg = shader->key.as_ngg;
5895                         shader_es.key.mono = shader->key.mono;
5896                         shader_es.key.opt = shader->key.opt;
5897                         shader_es.is_monolithic = true;
5898                         si_llvm_context_set_ir(&ctx, &shader_es);
5899
5900                         if (!si_compile_tgsi_main(&ctx, nir, free_nir)) {
5901                                 si_llvm_dispose(&ctx);
5902                                 return -1;
5903                         }
5904                         shader->info.uses_instanceid |= es->info.uses_instanceid;
5905                         es_main = ctx.main_fn;
5906
5907                         /* ES prolog */
5908                         if (es->type == PIPE_SHADER_VERTEX &&
5909                             si_vs_needs_prolog(es, &shader->key.part.gs.vs_prolog)) {
5910                                 union si_shader_part_key vs_prolog_key;
5911                                 si_get_vs_prolog_key(&es->info,
5912                                                      shader_es.info.num_input_sgprs,
5913                                                      &shader->key.part.gs.vs_prolog,
5914                                                      shader, &vs_prolog_key);
5915                                 vs_prolog_key.vs_prolog.is_monolithic = true;
5916                                 si_build_vs_prolog_function(&ctx, &vs_prolog_key);
5917                                 es_prolog = ctx.main_fn;
5918                         }
5919
5920                         /* Reset the shader context. */
5921                         ctx.shader = shader;
5922                         ctx.type = PIPE_SHADER_GEOMETRY;
5923
5924                         /* Prepare the array of shader parts. */
5925                         LLVMValueRef parts[4];
5926                         unsigned num_parts = 0, main_part, next_first_part;
5927
5928                         if (es_prolog)
5929                                 parts[num_parts++] = es_prolog;
5930
5931                         parts[main_part = num_parts++] = es_main;
5932                         parts[next_first_part = num_parts++] = gs_prolog;
5933                         parts[num_parts++] = gs_main;
5934
5935                         si_build_wrapper_function(&ctx, parts, num_parts,
5936                                                   main_part, next_first_part);
5937                 } else {
5938                         LLVMValueRef parts[2];
5939                         union si_shader_part_key prolog_key;
5940
5941                         parts[1] = ctx.main_fn;
5942
5943                         memset(&prolog_key, 0, sizeof(prolog_key));
5944                         prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
5945                         si_build_gs_prolog_function(&ctx, &prolog_key);
5946                         parts[0] = ctx.main_fn;
5947
5948                         si_build_wrapper_function(&ctx, parts, 2, 1, 0);
5949                 }
5950         } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) {
5951                 LLVMValueRef parts[3];
5952                 union si_shader_part_key prolog_key;
5953                 union si_shader_part_key epilog_key;
5954                 bool need_prolog;
5955
5956                 si_get_ps_prolog_key(shader, &prolog_key, false);
5957                 need_prolog = si_need_ps_prolog(&prolog_key);
5958
5959                 parts[need_prolog ? 1 : 0] = ctx.main_fn;
5960
5961                 if (need_prolog) {
5962                         si_build_ps_prolog_function(&ctx, &prolog_key);
5963                         parts[0] = ctx.main_fn;
5964                 }
5965
5966                 si_get_ps_epilog_key(shader, &epilog_key);
5967                 si_build_ps_epilog_function(&ctx, &epilog_key);
5968                 parts[need_prolog ? 2 : 1] = ctx.main_fn;
5969
5970                 si_build_wrapper_function(&ctx, parts, need_prolog ? 3 : 2,
5971                                           need_prolog ? 1 : 0, 0);
5972         }
5973
5974         si_llvm_optimize_module(&ctx);
5975
5976         /* Post-optimization transformations and analysis. */
5977         si_optimize_vs_outputs(&ctx);
5978
5979         if ((debug && debug->debug_message) ||
5980             si_can_dump_shader(sscreen, ctx.type)) {
5981                 ctx.shader->info.private_mem_vgprs =
5982                         ac_count_scratch_private_memory(ctx.main_fn);
5983         }
5984
5985         /* Make sure the input is a pointer and not integer followed by inttoptr. */
5986         assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx.main_fn, 0))) ==
5987                LLVMPointerTypeKind);
5988
5989         /* Compile to bytecode. */
5990         r = si_compile_llvm(sscreen, &shader->binary, &shader->config, compiler,
5991                             ctx.ac.module, debug, ctx.type, ctx.ac.wave_size,
5992                             si_get_shader_name(shader),
5993                             si_should_optimize_less(compiler, shader->selector));
5994         si_llvm_dispose(&ctx);
5995         if (r) {
5996                 fprintf(stderr, "LLVM failed to compile shader\n");
5997                 return r;
5998         }
5999
6000         /* Validate SGPR and VGPR usage for compute to detect compiler bugs.
6001          * LLVM 3.9svn has this bug.
6002          */
6003         if (sel->type == PIPE_SHADER_COMPUTE) {
6004                 unsigned wave_size = sscreen->compute_wave_size;
6005                 unsigned max_vgprs = sscreen->info.num_physical_wave64_vgprs_per_simd *
6006                                      (wave_size == 32 ? 2 : 1);
6007                 unsigned max_sgprs = sscreen->info.num_physical_sgprs_per_simd;
6008                 unsigned max_sgprs_per_wave = 128;
6009                 unsigned simds_per_tg = 4; /* assuming WGP mode on gfx10 */
6010                 unsigned threads_per_tg = si_get_max_workgroup_size(shader);
6011                 unsigned waves_per_tg = DIV_ROUND_UP(threads_per_tg, wave_size);
6012                 unsigned waves_per_simd = DIV_ROUND_UP(waves_per_tg, simds_per_tg);
6013
6014                 max_vgprs = max_vgprs / waves_per_simd;
6015                 max_sgprs = MIN2(max_sgprs / waves_per_simd, max_sgprs_per_wave);
6016
6017                 if (shader->config.num_sgprs > max_sgprs ||
6018                     shader->config.num_vgprs > max_vgprs) {
6019                         fprintf(stderr, "LLVM failed to compile a shader correctly: "
6020                                 "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
6021                                 shader->config.num_sgprs, shader->config.num_vgprs,
6022                                 max_sgprs, max_vgprs);
6023
6024                         /* Just terminate the process, because dependent
6025                          * shaders can hang due to bad input data, but use
6026                          * the env var to allow shader-db to work.
6027                          */
6028                         if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
6029                                 abort();
6030                 }
6031         }
6032
6033         /* Add the scratch offset to input SGPRs. */
6034         if (shader->config.scratch_bytes_per_wave && !is_merged_shader(&ctx))
6035                 shader->info.num_input_sgprs += 1; /* scratch byte offset */
6036
6037         /* Calculate the number of fragment input VGPRs. */
6038         if (ctx.type == PIPE_SHADER_FRAGMENT) {
6039                 shader->info.num_input_vgprs = ac_get_fs_input_vgpr_cnt(&shader->config,
6040                                                 &shader->info.face_vgpr_index,
6041                                                 &shader->info.ancillary_vgpr_index);
6042         }
6043
6044         si_calculate_max_simd_waves(shader);
6045         si_shader_dump_stats_for_shader_db(sscreen, shader, debug);
6046         return 0;
6047 }
6048
6049 /**
6050  * Create, compile and return a shader part (prolog or epilog).
6051  *
6052  * \param sscreen       screen
6053  * \param list          list of shader parts of the same category
6054  * \param type          shader type
6055  * \param key           shader part key
6056  * \param prolog        whether the part being requested is a prolog
6057  * \param tm            LLVM target machine
6058  * \param debug         debug callback
6059  * \param build         the callback responsible for building the main function
6060  * \return              non-NULL on success
6061  */
6062 static struct si_shader_part *
6063 si_get_shader_part(struct si_screen *sscreen,
6064                    struct si_shader_part **list,
6065                    enum pipe_shader_type type,
6066                    bool prolog,
6067                    union si_shader_part_key *key,
6068                    struct ac_llvm_compiler *compiler,
6069                    struct pipe_debug_callback *debug,
6070                    void (*build)(struct si_shader_context *,
6071                                  union si_shader_part_key *),
6072                    const char *name)
6073 {
6074         struct si_shader_part *result;
6075
6076         simple_mtx_lock(&sscreen->shader_parts_mutex);
6077
6078         /* Find existing. */
6079         for (result = *list; result; result = result->next) {
6080                 if (memcmp(&result->key, key, sizeof(*key)) == 0) {
6081                         simple_mtx_unlock(&sscreen->shader_parts_mutex);
6082                         return result;
6083                 }
6084         }
6085
6086         /* Compile a new one. */
6087         result = CALLOC_STRUCT(si_shader_part);
6088         result->key = *key;
6089
6090         struct si_shader shader = {};
6091
6092         switch (type) {
6093         case PIPE_SHADER_VERTEX:
6094                 shader.key.as_ls = key->vs_prolog.as_ls;
6095                 shader.key.as_es = key->vs_prolog.as_es;
6096                 shader.key.as_ngg = key->vs_prolog.as_ngg;
6097                 break;
6098         case PIPE_SHADER_TESS_CTRL:
6099                 assert(!prolog);
6100                 shader.key.part.tcs.epilog = key->tcs_epilog.states;
6101                 break;
6102         case PIPE_SHADER_GEOMETRY:
6103                 assert(prolog);
6104                 shader.key.as_ngg = key->gs_prolog.as_ngg;
6105                 break;
6106         case PIPE_SHADER_FRAGMENT:
6107                 if (prolog)
6108                         shader.key.part.ps.prolog = key->ps_prolog.states;
6109                 else
6110                         shader.key.part.ps.epilog = key->ps_epilog.states;
6111                 break;
6112         default:
6113                 unreachable("bad shader part");
6114         }
6115
6116         struct si_shader_context ctx;
6117         si_llvm_context_init(&ctx, sscreen, compiler,
6118                              si_get_wave_size(sscreen, type, shader.key.as_ngg,
6119                                               shader.key.as_es),
6120                              64);
6121         ctx.shader = &shader;
6122         ctx.type = type;
6123
6124         build(&ctx, key);
6125
6126         /* Compile. */
6127         si_llvm_optimize_module(&ctx);
6128
6129         if (si_compile_llvm(sscreen, &result->binary, &result->config, compiler,
6130                             ctx.ac.module, debug, ctx.type, ctx.ac.wave_size,
6131                             name, false)) {
6132                 FREE(result);
6133                 result = NULL;
6134                 goto out;
6135         }
6136
6137         result->next = *list;
6138         *list = result;
6139
6140 out:
6141         si_llvm_dispose(&ctx);
6142         simple_mtx_unlock(&sscreen->shader_parts_mutex);
6143         return result;
6144 }
6145
6146 static LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx)
6147 {
6148         LLVMValueRef ptr[2], list;
6149         bool merged_shader = is_merged_shader(ctx);
6150
6151         ptr[0] = LLVMGetParam(ctx->main_fn, (merged_shader ? 8 : 0) + SI_SGPR_RW_BUFFERS);
6152         list = LLVMBuildIntToPtr(ctx->ac.builder, ptr[0],
6153                                  ac_array_in_const32_addr_space(ctx->v4i32), "");
6154         return list;
6155 }
6156
6157 /**
6158  * Build the vertex shader prolog function.
6159  *
6160  * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
6161  * All inputs are returned unmodified. The vertex load indices are
6162  * stored after them, which will be used by the API VS for fetching inputs.
6163  *
6164  * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
6165  *   input_v0,
6166  *   input_v1,
6167  *   input_v2,
6168  *   input_v3,
6169  *   (VertexID + BaseVertex),
6170  *   (InstanceID + StartInstance),
6171  *   (InstanceID / 2 + StartInstance)
6172  */
6173 static void si_build_vs_prolog_function(struct si_shader_context *ctx,
6174                                         union si_shader_part_key *key)
6175 {
6176         LLVMTypeRef *returns;
6177         LLVMValueRef ret, func;
6178         int num_returns, i;
6179         unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs;
6180         unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4;
6181         struct ac_arg input_sgpr_param[key->vs_prolog.num_input_sgprs];
6182         struct ac_arg input_vgpr_param[9];
6183         LLVMValueRef input_vgprs[9];
6184         unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs +
6185                                       num_input_vgprs;
6186         unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;
6187
6188         memset(&ctx->args, 0, sizeof(ctx->args));
6189
6190         /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
6191         returns = alloca((num_all_input_regs + key->vs_prolog.num_inputs) *
6192                          sizeof(LLVMTypeRef));
6193         num_returns = 0;
6194
6195         /* Declare input and output SGPRs. */
6196         for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
6197                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
6198                            &input_sgpr_param[i]);
6199                 returns[num_returns++] = ctx->i32;
6200         }
6201
6202         struct ac_arg merged_wave_info = input_sgpr_param[3];
6203
6204         /* Preloaded VGPRs (outputs must be floats) */
6205         for (i = 0; i < num_input_vgprs; i++) {
6206                 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &input_vgpr_param[i]);
6207                 returns[num_returns++] = ctx->f32;
6208         }
6209
6210         /* Vertex load indices. */
6211         for (i = 0; i < key->vs_prolog.num_inputs; i++)
6212                 returns[num_returns++] = ctx->f32;
6213
6214         /* Create the function. */
6215         si_create_function(ctx, "vs_prolog", returns, num_returns, 0);
6216         func = ctx->main_fn;
6217
6218         for (i = 0; i < num_input_vgprs; i++) {
6219                 input_vgprs[i] = ac_get_arg(&ctx->ac, input_vgpr_param[i]);
6220         }
6221
6222         if (key->vs_prolog.num_merged_next_stage_vgprs) {
6223                 if (!key->vs_prolog.is_monolithic)
6224                         si_init_exec_from_input(ctx, merged_wave_info, 0);
6225
6226                 if (key->vs_prolog.as_ls &&
6227                     ctx->screen->info.has_ls_vgpr_init_bug) {
6228                         /* If there are no HS threads, SPI loads the LS VGPRs
6229                          * starting at VGPR 0. Shift them back to where they
6230                          * belong.
6231                          */
6232                         LLVMValueRef has_hs_threads =
6233                                 LLVMBuildICmp(ctx->ac.builder, LLVMIntNE,
6234                                     si_unpack_param(ctx, input_sgpr_param[3], 8, 8),
6235                                     ctx->i32_0, "");
6236
6237                         for (i = 4; i > 0; --i) {
6238                                 input_vgprs[i + 1] =
6239                                         LLVMBuildSelect(ctx->ac.builder, has_hs_threads,
6240                                                         input_vgprs[i + 1],
6241                                                         input_vgprs[i - 1], "");
6242                         }
6243                 }
6244         }
6245
6246         unsigned vertex_id_vgpr = first_vs_vgpr;
6247         unsigned instance_id_vgpr =
6248                 ctx->screen->info.chip_class >= GFX10 ?
6249                         first_vs_vgpr + 3 :
6250                         first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1);
6251
6252         ctx->abi.vertex_id = input_vgprs[vertex_id_vgpr];
6253         ctx->abi.instance_id = input_vgprs[instance_id_vgpr];
6254
6255         /* InstanceID = VertexID >> 16;
6256          * VertexID   = VertexID & 0xffff;
6257          */
6258         if (key->vs_prolog.states.unpack_instance_id_from_vertex_id) {
6259                 ctx->abi.instance_id = LLVMBuildLShr(ctx->ac.builder, ctx->abi.vertex_id,
6260                                                      LLVMConstInt(ctx->i32, 16, 0), "");
6261                 ctx->abi.vertex_id = LLVMBuildAnd(ctx->ac.builder, ctx->abi.vertex_id,
6262                                                   LLVMConstInt(ctx->i32, 0xffff, 0), "");
6263         }
6264
6265         /* Copy inputs to outputs. This should be no-op, as the registers match,
6266          * but it will prevent the compiler from overwriting them unintentionally.
6267          */
6268         ret = ctx->return_value;
6269         for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
6270                 LLVMValueRef p = LLVMGetParam(func, i);
6271                 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, "");
6272         }
6273         for (i = 0; i < num_input_vgprs; i++) {
6274                 LLVMValueRef p = input_vgprs[i];
6275
6276                 if (i == vertex_id_vgpr)
6277                         p = ctx->abi.vertex_id;
6278                 else if (i == instance_id_vgpr)
6279                         p = ctx->abi.instance_id;
6280
6281                 p = ac_to_float(&ctx->ac, p);
6282                 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p,
6283                                            key->vs_prolog.num_input_sgprs + i, "");
6284         }
6285
6286         /* Compute vertex load indices from instance divisors. */
6287         LLVMValueRef instance_divisor_constbuf = NULL;
6288
6289         if (key->vs_prolog.states.instance_divisor_is_fetched) {
6290                 LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
6291                 LLVMValueRef buf_index =
6292                         LLVMConstInt(ctx->i32, SI_VS_CONST_INSTANCE_DIVISORS, 0);
6293                 instance_divisor_constbuf =
6294                         ac_build_load_to_sgpr(&ctx->ac, list, buf_index);
6295         }
6296
6297         for (i = 0; i < key->vs_prolog.num_inputs; i++) {
6298                 bool divisor_is_one =
6299                         key->vs_prolog.states.instance_divisor_is_one & (1u << i);
6300                 bool divisor_is_fetched =
6301                         key->vs_prolog.states.instance_divisor_is_fetched & (1u << i);
6302                 LLVMValueRef index = NULL;
6303
6304                 if (divisor_is_one) {
6305                         index = ctx->abi.instance_id;
6306                 } else if (divisor_is_fetched) {
6307                         LLVMValueRef udiv_factors[4];
6308
6309                         for (unsigned j = 0; j < 4; j++) {
6310                                 udiv_factors[j] =
6311                                         buffer_load_const(ctx, instance_divisor_constbuf,
6312                                                           LLVMConstInt(ctx->i32, i*16 + j*4, 0));
6313                                 udiv_factors[j] = ac_to_integer(&ctx->ac, udiv_factors[j]);
6314                         }
6315                         /* The faster NUW version doesn't work when InstanceID == UINT_MAX.
6316                          * Such InstanceID might not be achievable in a reasonable time though.
6317                          */
6318                         index = ac_build_fast_udiv_nuw(&ctx->ac, ctx->abi.instance_id,
6319                                                        udiv_factors[0], udiv_factors[1],
6320                                                        udiv_factors[2], udiv_factors[3]);
6321                 }
6322
6323                 if (divisor_is_one || divisor_is_fetched) {
6324                         /* Add StartInstance. */
6325                         index = LLVMBuildAdd(ctx->ac.builder, index,
6326                                              LLVMGetParam(ctx->main_fn, user_sgpr_base +
6327                                                           SI_SGPR_START_INSTANCE), "");
6328                 } else {
6329                         /* VertexID + BaseVertex */
6330                         index = LLVMBuildAdd(ctx->ac.builder,
6331                                              ctx->abi.vertex_id,
6332                                              LLVMGetParam(func, user_sgpr_base +
6333                                                                 SI_SGPR_BASE_VERTEX), "");
6334                 }
6335
6336                 index = ac_to_float(&ctx->ac, index);
6337                 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, index,
6338                                            ctx->args.arg_count + i, "");
6339         }
6340
6341         si_llvm_build_ret(ctx, ret);
6342 }
6343
6344 static bool si_get_vs_prolog(struct si_screen *sscreen,
6345                              struct ac_llvm_compiler *compiler,
6346                              struct si_shader *shader,
6347                              struct pipe_debug_callback *debug,
6348                              struct si_shader *main_part,
6349                              const struct si_vs_prolog_bits *key)
6350 {
6351         struct si_shader_selector *vs = main_part->selector;
6352
6353         if (!si_vs_needs_prolog(vs, key))
6354                 return true;
6355
6356         /* Get the prolog. */
6357         union si_shader_part_key prolog_key;
6358         si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs,
6359                              key, shader, &prolog_key);
6360
6361         shader->prolog =
6362                 si_get_shader_part(sscreen, &sscreen->vs_prologs,
6363                                    PIPE_SHADER_VERTEX, true, &prolog_key, compiler,
6364                                    debug, si_build_vs_prolog_function,
6365                                    "Vertex Shader Prolog");
6366         return shader->prolog != NULL;
6367 }
6368
6369 /**
6370  * Select and compile (or reuse) vertex shader parts (prolog & epilog).
6371  */
6372 static bool si_shader_select_vs_parts(struct si_screen *sscreen,
6373                                       struct ac_llvm_compiler *compiler,
6374                                       struct si_shader *shader,
6375                                       struct pipe_debug_callback *debug)
6376 {
6377         return si_get_vs_prolog(sscreen, compiler, shader, debug, shader,
6378                                 &shader->key.part.vs.prolog);
6379 }
6380
6381 /**
6382  * Compile the TCS epilog function. This writes tesselation factors to memory
6383  * based on the output primitive type of the tesselator (determined by TES).
6384  */
6385 static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
6386                                          union si_shader_part_key *key)
6387 {
6388         memset(&ctx->args, 0, sizeof(ctx->args));
6389
6390         if (ctx->screen->info.chip_class >= GFX9) {
6391                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
6392                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
6393                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
6394                            &ctx->tcs_offchip_offset);
6395                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* wave info */
6396                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
6397                            &ctx->tcs_factor_offset);
6398                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
6399                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
6400                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
6401                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
6402                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
6403                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
6404                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
6405                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
6406                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
6407                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
6408                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
6409                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
6410                            &ctx->tcs_offchip_layout);
6411                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
6412                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
6413                            &ctx->tcs_out_lds_layout);
6414         } else {
6415                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
6416                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
6417                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
6418                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
6419                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
6420                            &ctx->tcs_offchip_layout);
6421                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
6422                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
6423                            &ctx->tcs_out_lds_layout);
6424                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
6425                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
6426                            &ctx->tcs_offchip_offset);
6427                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
6428                            &ctx->tcs_factor_offset);
6429         }
6430
6431         ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* VGPR gap */
6432         ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* VGPR gap */
6433         struct ac_arg rel_patch_id; /* patch index within the wave (REL_PATCH_ID) */
6434         ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &rel_patch_id);
6435         struct ac_arg invocation_id; /* invocation ID within the patch */
6436         ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &invocation_id);
6437         struct ac_arg tcs_out_current_patch_data_offset; /* LDS offset where tess factors should be loaded from */
6438         ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
6439                    &tcs_out_current_patch_data_offset);
6440
6441         struct ac_arg tess_factors[6];
6442         for (unsigned i = 0; i < 6; i++)
6443                 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &tess_factors[i]);
6444
6445         /* Create the function. */
6446         si_create_function(ctx, "tcs_epilog", NULL, 0,
6447                            ctx->screen->info.chip_class >= GFX7 ? 128 : 0);
6448         ac_declare_lds_as_pointer(&ctx->ac);
6449
6450         LLVMValueRef invoc0_tess_factors[6];
6451         for (unsigned i = 0; i < 6; i++)
6452                 invoc0_tess_factors[i] = ac_get_arg(&ctx->ac, tess_factors[i]);
6453
6454         si_write_tess_factors(ctx,
6455                               ac_get_arg(&ctx->ac, rel_patch_id),
6456                               ac_get_arg(&ctx->ac, invocation_id),
6457                               ac_get_arg(&ctx->ac, tcs_out_current_patch_data_offset),
6458                               invoc0_tess_factors, invoc0_tess_factors + 4);
6459
6460         LLVMBuildRetVoid(ctx->ac.builder);
6461 }
6462
6463 /**
6464  * Select and compile (or reuse) TCS parts (epilog).
6465  */
6466 static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
6467                                        struct ac_llvm_compiler *compiler,
6468                                        struct si_shader *shader,
6469                                        struct pipe_debug_callback *debug)
6470 {
6471         if (sscreen->info.chip_class >= GFX9) {
6472                 struct si_shader *ls_main_part =
6473                         shader->key.part.tcs.ls->main_shader_part_ls;
6474
6475                 if (!si_get_vs_prolog(sscreen, compiler, shader, debug, ls_main_part,
6476                                       &shader->key.part.tcs.ls_prolog))
6477                         return false;
6478
6479                 shader->previous_stage = ls_main_part;
6480         }
6481
6482         /* Get the epilog. */
6483         union si_shader_part_key epilog_key;
6484         memset(&epilog_key, 0, sizeof(epilog_key));
6485         epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
6486
6487         shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
6488                                             PIPE_SHADER_TESS_CTRL, false,
6489                                             &epilog_key, compiler, debug,
6490                                             si_build_tcs_epilog_function,
6491                                             "Tessellation Control Shader Epilog");
6492         return shader->epilog != NULL;
6493 }
6494
6495 /**
6496  * Select and compile (or reuse) GS parts (prolog).
6497  */
6498 static bool si_shader_select_gs_parts(struct si_screen *sscreen,
6499                                       struct ac_llvm_compiler *compiler,
6500                                       struct si_shader *shader,
6501                                       struct pipe_debug_callback *debug)
6502 {
6503         if (sscreen->info.chip_class >= GFX9) {
6504                 struct si_shader *es_main_part;
6505                 enum pipe_shader_type es_type = shader->key.part.gs.es->type;
6506
6507                 if (shader->key.as_ngg)
6508                         es_main_part = shader->key.part.gs.es->main_shader_part_ngg_es;
6509                 else
6510                         es_main_part = shader->key.part.gs.es->main_shader_part_es;
6511
6512                 if (es_type == PIPE_SHADER_VERTEX &&
6513                     !si_get_vs_prolog(sscreen, compiler, shader, debug, es_main_part,
6514                                       &shader->key.part.gs.vs_prolog))
6515                         return false;
6516
6517                 shader->previous_stage = es_main_part;
6518         }
6519
6520         if (!shader->key.part.gs.prolog.tri_strip_adj_fix)
6521                 return true;
6522
6523         union si_shader_part_key prolog_key;
6524         memset(&prolog_key, 0, sizeof(prolog_key));
6525         prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
6526         prolog_key.gs_prolog.as_ngg = shader->key.as_ngg;
6527
6528         shader->prolog2 = si_get_shader_part(sscreen, &sscreen->gs_prologs,
6529                                             PIPE_SHADER_GEOMETRY, true,
6530                                             &prolog_key, compiler, debug,
6531                                             si_build_gs_prolog_function,
6532                                             "Geometry Shader Prolog");
6533         return shader->prolog2 != NULL;
6534 }
6535
6536 /**
6537  * Build the pixel shader prolog function. This handles:
6538  * - two-side color selection and interpolation
6539  * - overriding interpolation parameters for the API PS
6540  * - polygon stippling
6541  *
6542  * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
6543  * overriden by other states. (e.g. per-sample interpolation)
6544  * Interpolated colors are stored after the preloaded VGPRs.
6545  */
6546 static void si_build_ps_prolog_function(struct si_shader_context *ctx,
6547                                         union si_shader_part_key *key)
6548 {
6549         LLVMValueRef ret, func;
6550         int num_returns, i, num_color_channels;
6551
6552         assert(si_need_ps_prolog(key));
6553
6554         memset(&ctx->args, 0, sizeof(ctx->args));
6555
6556         /* Declare inputs. */
6557         LLVMTypeRef return_types[AC_MAX_ARGS];
6558         num_returns = 0;
6559         num_color_channels = util_bitcount(key->ps_prolog.colors_read);
6560         assert(key->ps_prolog.num_input_sgprs +
6561                key->ps_prolog.num_input_vgprs +
6562                num_color_channels <= AC_MAX_ARGS);
6563         for (i = 0; i < key->ps_prolog.num_input_sgprs; i++) {
6564                 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
6565                 return_types[num_returns++] = ctx->i32;
6566
6567         }
6568
6569         struct ac_arg pos_fixed_pt;
6570         struct ac_arg ancillary;
6571         struct ac_arg param_sample_mask;
6572         for (i = 0; i < key->ps_prolog.num_input_vgprs; i++) {
6573                 struct ac_arg *arg = NULL;
6574                 if (i == key->ps_prolog.ancillary_vgpr_index) {
6575                         arg = &ancillary;
6576                 } else if (i == key->ps_prolog.ancillary_vgpr_index + 1) {
6577                         arg = &param_sample_mask;
6578                 } else if (i == key->ps_prolog.num_input_vgprs - 1) {
6579                         /* POS_FIXED_PT is always last. */
6580                         arg = &pos_fixed_pt;
6581                 }
6582                 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, arg);
6583                 return_types[num_returns++] = ctx->f32;
6584         }
6585
6586         /* Declare outputs (same as inputs + add colors if needed) */
6587         for (i = 0; i < num_color_channels; i++)
6588                 return_types[num_returns++] = ctx->f32;
6589
6590         /* Create the function. */
6591         si_create_function(ctx, "ps_prolog", return_types, num_returns, 0);
6592         func = ctx->main_fn;
6593
6594         /* Copy inputs to outputs. This should be no-op, as the registers match,
6595          * but it will prevent the compiler from overwriting them unintentionally.
6596          */
6597         ret = ctx->return_value;
6598         for (i = 0; i < ctx->args.arg_count; i++) {
6599                 LLVMValueRef p = LLVMGetParam(func, i);
6600                 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, "");
6601         }
6602
6603         /* Polygon stippling. */
6604         if (key->ps_prolog.states.poly_stipple) {
6605                 LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
6606
6607                 si_llvm_emit_polygon_stipple(ctx, list, pos_fixed_pt);
6608         }
6609
6610         if (key->ps_prolog.states.bc_optimize_for_persp ||
6611             key->ps_prolog.states.bc_optimize_for_linear) {
6612                 unsigned i, base = key->ps_prolog.num_input_sgprs;
6613                 LLVMValueRef center[2], centroid[2], tmp, bc_optimize;
6614
6615                 /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
6616                  * The hw doesn't compute CENTROID if the whole wave only
6617                  * contains fully-covered quads.
6618                  *
6619                  * PRIM_MASK is after user SGPRs.
6620                  */
6621                 bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
6622                 bc_optimize = LLVMBuildLShr(ctx->ac.builder, bc_optimize,
6623                                             LLVMConstInt(ctx->i32, 31, 0), "");
6624                 bc_optimize = LLVMBuildTrunc(ctx->ac.builder, bc_optimize,
6625                                              ctx->i1, "");
6626
6627                 if (key->ps_prolog.states.bc_optimize_for_persp) {
6628                         /* Read PERSP_CENTER. */
6629                         for (i = 0; i < 2; i++)
6630                                 center[i] = LLVMGetParam(func, base + 2 + i);
6631                         /* Read PERSP_CENTROID. */
6632                         for (i = 0; i < 2; i++)
6633                                 centroid[i] = LLVMGetParam(func, base + 4 + i);
6634                         /* Select PERSP_CENTROID. */
6635                         for (i = 0; i < 2; i++) {
6636                                 tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize,
6637                                                       center[i], centroid[i], "");
6638                                 ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
6639                                                            tmp, base + 4 + i, "");
6640                         }
6641                 }
6642                 if (key->ps_prolog.states.bc_optimize_for_linear) {
6643                         /* Read LINEAR_CENTER. */
6644                         for (i = 0; i < 2; i++)
6645                                 center[i] = LLVMGetParam(func, base + 8 + i);
6646                         /* Read LINEAR_CENTROID. */
6647                         for (i = 0; i < 2; i++)
6648                                 centroid[i] = LLVMGetParam(func, base + 10 + i);
6649                         /* Select LINEAR_CENTROID. */
6650                         for (i = 0; i < 2; i++) {
6651                                 tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize,
6652                                                       center[i], centroid[i], "");
6653                                 ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
6654                                                            tmp, base + 10 + i, "");
6655                         }
6656                 }
6657         }
6658
6659         /* Force per-sample interpolation. */
6660         if (key->ps_prolog.states.force_persp_sample_interp) {
6661                 unsigned i, base = key->ps_prolog.num_input_sgprs;
6662                 LLVMValueRef persp_sample[2];
6663
6664                 /* Read PERSP_SAMPLE. */
6665                 for (i = 0; i < 2; i++)
6666                         persp_sample[i] = LLVMGetParam(func, base + i);
6667                 /* Overwrite PERSP_CENTER. */
6668                 for (i = 0; i < 2; i++)
6669                         ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
6670                                                    persp_sample[i], base + 2 + i, "");
6671                 /* Overwrite PERSP_CENTROID. */
6672                 for (i = 0; i < 2; i++)
6673                         ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
6674                                                    persp_sample[i], base + 4 + i, "");
6675         }
6676         if (key->ps_prolog.states.force_linear_sample_interp) {
6677                 unsigned i, base = key->ps_prolog.num_input_sgprs;
6678                 LLVMValueRef linear_sample[2];
6679
6680                 /* Read LINEAR_SAMPLE. */
6681                 for (i = 0; i < 2; i++)
6682                         linear_sample[i] = LLVMGetParam(func, base + 6 + i);
6683                 /* Overwrite LINEAR_CENTER. */
6684                 for (i = 0; i < 2; i++)
6685                         ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
6686                                                    linear_sample[i], base + 8 + i, "");
6687                 /* Overwrite LINEAR_CENTROID. */
6688                 for (i = 0; i < 2; i++)
6689                         ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
6690                                                    linear_sample[i], base + 10 + i, "");
6691         }
6692
6693         /* Force center interpolation. */
6694         if (key->ps_prolog.states.force_persp_center_interp) {
6695                 unsigned i, base = key->ps_prolog.num_input_sgprs;
6696                 LLVMValueRef persp_center[2];
6697
6698                 /* Read PERSP_CENTER. */
6699                 for (i = 0; i < 2; i++)
6700                         persp_center[i] = LLVMGetParam(func, base + 2 + i);
6701                 /* Overwrite PERSP_SAMPLE. */
6702                 for (i = 0; i < 2; i++)
6703                         ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
6704                                                    persp_center[i], base + i, "");
6705                 /* Overwrite PERSP_CENTROID. */
6706                 for (i = 0; i < 2; i++)
6707                         ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
6708                                                    persp_center[i], base + 4 + i, "");
6709         }
6710         if (key->ps_prolog.states.force_linear_center_interp) {
6711                 unsigned i, base = key->ps_prolog.num_input_sgprs;
6712                 LLVMValueRef linear_center[2];
6713
6714                 /* Read LINEAR_CENTER. */
6715                 for (i = 0; i < 2; i++)
6716                         linear_center[i] = LLVMGetParam(func, base + 8 + i);
6717                 /* Overwrite LINEAR_SAMPLE. */
6718                 for (i = 0; i < 2; i++)
6719                         ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
6720                                                    linear_center[i], base + 6 + i, "");
6721                 /* Overwrite LINEAR_CENTROID. */
6722                 for (i = 0; i < 2; i++)
6723                         ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
6724                                                    linear_center[i], base + 10 + i, "");
6725         }
6726
6727         /* Interpolate colors. */
6728         unsigned color_out_idx = 0;
6729         for (i = 0; i < 2; i++) {
6730                 unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
6731                 unsigned face_vgpr = key->ps_prolog.num_input_sgprs +
6732                                      key->ps_prolog.face_vgpr_index;
6733                 LLVMValueRef interp[2], color[4];
6734                 LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL;
6735
6736                 if (!writemask)
6737                         continue;
6738
6739                 /* If the interpolation qualifier is not CONSTANT (-1). */
6740                 if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
6741                         unsigned interp_vgpr = key->ps_prolog.num_input_sgprs +
6742                                                key->ps_prolog.color_interp_vgpr_index[i];
6743
6744                         /* Get the (i,j) updated by bc_optimize handling. */
6745                         interp[0] = LLVMBuildExtractValue(ctx->ac.builder, ret,
6746                                                           interp_vgpr, "");
6747                         interp[1] = LLVMBuildExtractValue(ctx->ac.builder, ret,
6748                                                           interp_vgpr + 1, "");
6749                         interp_ij = ac_build_gather_values(&ctx->ac, interp, 2);
6750                 }
6751
6752                 /* Use the absolute location of the input. */
6753                 prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
6754
6755                 if (key->ps_prolog.states.color_two_side) {
6756                         face = LLVMGetParam(func, face_vgpr);
6757                         face = ac_to_integer(&ctx->ac, face);
6758                 }
6759
6760                 interp_fs_color(ctx,
6761                                 key->ps_prolog.color_attr_index[i], i,
6762                                 key->ps_prolog.num_interp_inputs,
6763                                 key->ps_prolog.colors_read, interp_ij,
6764                                 prim_mask, face, color);
6765
6766                 while (writemask) {
6767                         unsigned chan = u_bit_scan(&writemask);
6768                         ret = LLVMBuildInsertValue(ctx->ac.builder, ret, color[chan],
6769                                                    ctx->args.arg_count + color_out_idx++, "");
6770                 }
6771         }
6772
6773         /* Section 15.2.2 (Shader Inputs) of the OpenGL 4.5 (Core Profile) spec
6774          * says:
6775          *
6776          *    "When per-sample shading is active due to the use of a fragment
6777          *     input qualified by sample or due to the use of the gl_SampleID
6778          *     or gl_SamplePosition variables, only the bit for the current
6779          *     sample is set in gl_SampleMaskIn. When state specifies multiple
6780          *     fragment shader invocations for a given fragment, the sample
6781          *     mask for any single fragment shader invocation may specify a
6782          *     subset of the covered samples for the fragment. In this case,
6783          *     the bit corresponding to each covered sample will be set in
6784          *     exactly one fragment shader invocation."
6785          *
6786          * The samplemask loaded by hardware is always the coverage of the
6787          * entire pixel/fragment, so mask bits out based on the sample ID.
6788          */
6789         if (key->ps_prolog.states.samplemask_log_ps_iter) {
6790                 /* The bit pattern matches that used by fixed function fragment
6791                  * processing. */
6792                 static const uint16_t ps_iter_masks[] = {
6793                         0xffff, /* not used */
6794                         0x5555,
6795                         0x1111,
6796                         0x0101,
6797                         0x0001,
6798                 };
6799                 assert(key->ps_prolog.states.samplemask_log_ps_iter < ARRAY_SIZE(ps_iter_masks));
6800
6801                 uint32_t ps_iter_mask = ps_iter_masks[key->ps_prolog.states.samplemask_log_ps_iter];
6802                 LLVMValueRef sampleid = si_unpack_param(ctx, ancillary, 8, 4);
6803                 LLVMValueRef samplemask = ac_get_arg(&ctx->ac, param_sample_mask);
6804
6805                 samplemask = ac_to_integer(&ctx->ac, samplemask);
6806                 samplemask = LLVMBuildAnd(
6807                         ctx->ac.builder,
6808                         samplemask,
6809                         LLVMBuildShl(ctx->ac.builder,
6810                                      LLVMConstInt(ctx->i32, ps_iter_mask, false),
6811                                      sampleid, ""),
6812                         "");
6813                 samplemask = ac_to_float(&ctx->ac, samplemask);
6814
6815                 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, samplemask,
6816                                            param_sample_mask.arg_index, "");
6817         }
6818
6819         /* Tell LLVM to insert WQM instruction sequence when needed. */
6820         if (key->ps_prolog.wqm) {
6821                 LLVMAddTargetDependentFunctionAttr(func,
6822                                                    "amdgpu-ps-wqm-outputs", "");
6823         }
6824
6825         si_llvm_build_ret(ctx, ret);
6826 }
6827
6828 /**
6829  * Build the pixel shader epilog function. This handles everything that must be
6830  * emulated for pixel shader exports. (alpha-test, format conversions, etc)
6831  */
6832 static void si_build_ps_epilog_function(struct si_shader_context *ctx,
6833                                         union si_shader_part_key *key)
6834 {
6835         LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
6836         int i;
6837         struct si_ps_exports exp = {};
6838
6839         memset(&ctx->args, 0, sizeof(ctx->args));
6840
6841         /* Declare input SGPRs. */
6842         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->rw_buffers);
6843         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
6844                    &ctx->bindless_samplers_and_images);
6845         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
6846                    &ctx->const_and_shader_buffers);
6847         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
6848                    &ctx->samplers_and_images);
6849         add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT,
6850                         NULL, SI_PARAM_ALPHA_REF);
6851
6852         /* Declare input VGPRs. */
6853         unsigned required_num_params =
6854                      ctx->args.num_sgprs_used +
6855                      util_bitcount(key->ps_epilog.colors_written) * 4 +
6856                      key->ps_epilog.writes_z +
6857                      key->ps_epilog.writes_stencil +
6858                      key->ps_epilog.writes_samplemask;
6859
6860         required_num_params = MAX2(required_num_params,
6861                                    ctx->args.num_sgprs_used + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
6862
6863         while (ctx->args.arg_count < required_num_params)
6864                 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL);
6865
6866         /* Create the function. */
6867         si_create_function(ctx, "ps_epilog", NULL, 0, 0);
6868         /* Disable elimination of unused inputs. */
6869         ac_llvm_add_target_dep_function_attr(ctx->main_fn,
6870                                              "InitialPSInputAddr", 0xffffff);
6871
6872         /* Process colors. */
6873         unsigned vgpr = ctx->args.num_sgprs_used;
6874         unsigned colors_written = key->ps_epilog.colors_written;
6875         int last_color_export = -1;
6876
6877         /* Find the last color export. */
6878         if (!key->ps_epilog.writes_z &&
6879             !key->ps_epilog.writes_stencil &&
6880             !key->ps_epilog.writes_samplemask) {
6881                 unsigned spi_format = key->ps_epilog.states.spi_shader_col_format;
6882
6883                 /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
6884                 if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
6885                         /* Just set this if any of the colorbuffers are enabled. */
6886                         if (spi_format &
6887                             ((1ull << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
6888                                 last_color_export = 0;
6889                 } else {
6890                         for (i = 0; i < 8; i++)
6891                                 if (colors_written & (1 << i) &&
6892                                     (spi_format >> (i * 4)) & 0xf)
6893                                         last_color_export = i;
6894                 }
6895         }
6896
6897         while (colors_written) {
6898                 LLVMValueRef color[4];
6899                 int mrt = u_bit_scan(&colors_written);
6900
6901                 for (i = 0; i < 4; i++)
6902                         color[i] = LLVMGetParam(ctx->main_fn, vgpr++);
6903
6904                 si_export_mrt_color(ctx, color, mrt,
6905                                     ctx->args.arg_count - 1,
6906                                     mrt == last_color_export, &exp);
6907         }
6908
6909         /* Process depth, stencil, samplemask. */
6910         if (key->ps_epilog.writes_z)
6911                 depth = LLVMGetParam(ctx->main_fn, vgpr++);
6912         if (key->ps_epilog.writes_stencil)
6913                 stencil = LLVMGetParam(ctx->main_fn, vgpr++);
6914         if (key->ps_epilog.writes_samplemask)
6915                 samplemask = LLVMGetParam(ctx->main_fn, vgpr++);
6916
6917         if (depth || stencil || samplemask)
6918                 si_export_mrt_z(ctx, depth, stencil, samplemask, &exp);
6919         else if (last_color_export == -1)
6920                 ac_build_export_null(&ctx->ac);
6921
6922         if (exp.num)
6923                 si_emit_ps_exports(ctx, &exp);
6924
6925         /* Compile. */
6926         LLVMBuildRetVoid(ctx->ac.builder);
6927 }
6928
6929 /**
6930  * Select and compile (or reuse) pixel shader parts (prolog & epilog).
6931  */
6932 static bool si_shader_select_ps_parts(struct si_screen *sscreen,
6933                                       struct ac_llvm_compiler *compiler,
6934                                       struct si_shader *shader,
6935                                       struct pipe_debug_callback *debug)
6936 {
6937         union si_shader_part_key prolog_key;
6938         union si_shader_part_key epilog_key;
6939
6940         /* Get the prolog. */
6941         si_get_ps_prolog_key(shader, &prolog_key, true);
6942
6943         /* The prolog is a no-op if these aren't set. */
6944         if (si_need_ps_prolog(&prolog_key)) {
6945                 shader->prolog =
6946                         si_get_shader_part(sscreen, &sscreen->ps_prologs,
6947                                            PIPE_SHADER_FRAGMENT, true,
6948                                            &prolog_key, compiler, debug,
6949                                            si_build_ps_prolog_function,
6950                                            "Fragment Shader Prolog");
6951                 if (!shader->prolog)
6952                         return false;
6953         }
6954
6955         /* Get the epilog. */
6956         si_get_ps_epilog_key(shader, &epilog_key);
6957
6958         shader->epilog =
6959                 si_get_shader_part(sscreen, &sscreen->ps_epilogs,
6960                                    PIPE_SHADER_FRAGMENT, false,
6961                                    &epilog_key, compiler, debug,
6962                                    si_build_ps_epilog_function,
6963                                    "Fragment Shader Epilog");
6964         if (!shader->epilog)
6965                 return false;
6966
6967         /* Enable POS_FIXED_PT if polygon stippling is enabled. */
6968         if (shader->key.part.ps.prolog.poly_stipple) {
6969                 shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
6970                 assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
6971         }
6972
6973         /* Set up the enable bits for per-sample shading if needed. */
6974         if (shader->key.part.ps.prolog.force_persp_sample_interp &&
6975             (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
6976              G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
6977                 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
6978                 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
6979                 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
6980         }
6981         if (shader->key.part.ps.prolog.force_linear_sample_interp &&
6982             (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
6983              G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
6984                 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
6985                 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
6986                 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
6987         }
6988         if (shader->key.part.ps.prolog.force_persp_center_interp &&
6989             (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
6990              G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
6991                 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
6992                 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
6993                 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
6994         }
6995         if (shader->key.part.ps.prolog.force_linear_center_interp &&
6996             (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
6997              G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
6998                 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
6999                 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
7000                 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
7001         }
7002
7003         /* POW_W_FLOAT requires that one of the perspective weights is enabled. */
7004         if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
7005             !(shader->config.spi_ps_input_ena & 0xf)) {
7006                 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
7007                 assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
7008         }
7009
7010         /* At least one pair of interpolation weights must be enabled. */
7011         if (!(shader->config.spi_ps_input_ena & 0x7f)) {
7012                 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
7013                 assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
7014         }
7015
7016         /* Samplemask fixup requires the sample ID. */
7017         if (shader->key.part.ps.prolog.samplemask_log_ps_iter) {
7018                 shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1);
7019                 assert(G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr));
7020         }
7021
7022         /* The sample mask input is always enabled, because the API shader always
7023          * passes it through to the epilog. Disable it here if it's unused.
7024          */
7025         if (!shader->key.part.ps.epilog.poly_line_smoothing &&
7026             !shader->selector->info.reads_samplemask)
7027                 shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;
7028
7029         return true;
7030 }
7031
/**
 * LDS size workaround for the SPI barrier management bug (disabled).
 *
 * The workaround itself consisted of raising *lds_size to at least 8 on
 * CHIP_BONAIRE and CHIP_KABINI, i.e. making sure at least 4k of LDS is in
 * use, for workgroup sizes of more than one wavefront.
 *
 * It is not needed if tessellation is all offchip and on-chip GS isn't used,
 * so this function has been an unconditional no-op: the code that applied
 * the workaround sat behind a bare "return" and could never execute. That
 * unreachable code has been dropped; the entry point is kept for callers.
 *
 * \param sscreen   unused
 * \param lds_size  unused; the pointed-to value is left untouched
 */
void si_multiwave_lds_size_workaround(struct si_screen *sscreen,
                                      unsigned *lds_size)
{
        (void)sscreen;
        (void)lds_size;
}
7048
7049 static void si_fix_resource_usage(struct si_screen *sscreen,
7050                                   struct si_shader *shader)
7051 {
7052         unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
7053
7054         shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
7055
7056         if (shader->selector->type == PIPE_SHADER_COMPUTE &&
7057             si_get_max_workgroup_size(shader) > sscreen->compute_wave_size) {
7058                 si_multiwave_lds_size_workaround(sscreen,
7059                                                  &shader->config.lds_size);
7060         }
7061 }
7062
7063 bool si_shader_create(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
7064                      struct si_shader *shader,
7065                      struct pipe_debug_callback *debug)
7066 {
7067         struct si_shader_selector *sel = shader->selector;
7068         struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key);
7069         int r;
7070
7071         /* LS, ES, VS are compiled on demand if the main part hasn't been
7072          * compiled for that stage.
7073          *
7074          * GS are compiled on demand if the main part hasn't been compiled
7075          * for the chosen NGG-ness.
7076          *
7077          * Vertex shaders are compiled on demand when a vertex fetch
7078          * workaround must be applied.
7079          */
7080         if (shader->is_monolithic) {
7081                 /* Monolithic shader (compiled as a whole, has many variants,
7082                  * may take a long time to compile).
7083                  */
7084                 r = si_compile_shader(sscreen, compiler, shader, debug);
7085                 if (r)
7086                         return false;
7087         } else {
7088                 /* The shader consists of several parts:
7089                  *
7090                  * - the middle part is the user shader, it has 1 variant only
7091                  *   and it was compiled during the creation of the shader
7092                  *   selector
7093                  * - the prolog part is inserted at the beginning
7094                  * - the epilog part is inserted at the end
7095                  *
7096                  * The prolog and epilog have many (but simple) variants.
7097                  *
7098                  * Starting with gfx9, geometry and tessellation control
7099                  * shaders also contain the prolog and user shader parts of
7100                  * the previous shader stage.
7101                  */
7102
7103                 if (!mainp)
7104                         return false;
7105
7106                 /* Copy the compiled TGSI shader data over. */
7107                 shader->is_binary_shared = true;
7108                 shader->binary = mainp->binary;
7109                 shader->config = mainp->config;
7110                 shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
7111                 shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
7112                 shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
7113                 shader->info.ancillary_vgpr_index = mainp->info.ancillary_vgpr_index;
7114                 memcpy(shader->info.vs_output_param_offset,
7115                        mainp->info.vs_output_param_offset,
7116                        sizeof(mainp->info.vs_output_param_offset));
7117                 shader->info.uses_instanceid = mainp->info.uses_instanceid;
7118                 shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
7119                 shader->info.nr_param_exports = mainp->info.nr_param_exports;
7120
7121                 /* Select prologs and/or epilogs. */
7122                 switch (sel->type) {
7123                 case PIPE_SHADER_VERTEX:
7124                         if (!si_shader_select_vs_parts(sscreen, compiler, shader, debug))
7125                                 return false;
7126                         break;
7127                 case PIPE_SHADER_TESS_CTRL:
7128                         if (!si_shader_select_tcs_parts(sscreen, compiler, shader, debug))
7129                                 return false;
7130                         break;
7131                 case PIPE_SHADER_TESS_EVAL:
7132                         break;
7133                 case PIPE_SHADER_GEOMETRY:
7134                         if (!si_shader_select_gs_parts(sscreen, compiler, shader, debug))
7135                                 return false;
7136                         break;
7137                 case PIPE_SHADER_FRAGMENT:
7138                         if (!si_shader_select_ps_parts(sscreen, compiler, shader, debug))
7139                                 return false;
7140
7141                         /* Make sure we have at least as many VGPRs as there
7142                          * are allocated inputs.
7143                          */
7144                         shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
7145                                                         shader->info.num_input_vgprs);
7146                         break;
7147                 default:;
7148                 }
7149
7150                 /* Update SGPR and VGPR counts. */
7151                 if (shader->prolog) {
7152                         shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
7153                                                         shader->prolog->config.num_sgprs);
7154                         shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
7155                                                         shader->prolog->config.num_vgprs);
7156                 }
7157                 if (shader->previous_stage) {
7158                         shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
7159                                                         shader->previous_stage->config.num_sgprs);
7160                         shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
7161                                                         shader->previous_stage->config.num_vgprs);
7162                         shader->config.spilled_sgprs =
7163                                 MAX2(shader->config.spilled_sgprs,
7164                                      shader->previous_stage->config.spilled_sgprs);
7165                         shader->config.spilled_vgprs =
7166                                 MAX2(shader->config.spilled_vgprs,
7167                                      shader->previous_stage->config.spilled_vgprs);
7168                         shader->info.private_mem_vgprs =
7169                                 MAX2(shader->info.private_mem_vgprs,
7170                                      shader->previous_stage->info.private_mem_vgprs);
7171                         shader->config.scratch_bytes_per_wave =
7172                                 MAX2(shader->config.scratch_bytes_per_wave,
7173                                      shader->previous_stage->config.scratch_bytes_per_wave);
7174                         shader->info.uses_instanceid |=
7175                                 shader->previous_stage->info.uses_instanceid;
7176                 }
7177                 if (shader->prolog2) {
7178                         shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
7179                                                         shader->prolog2->config.num_sgprs);
7180                         shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
7181                                                         shader->prolog2->config.num_vgprs);
7182                 }
7183                 if (shader->epilog) {
7184                         shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
7185                                                         shader->epilog->config.num_sgprs);
7186                         shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
7187                                                         shader->epilog->config.num_vgprs);
7188                 }
7189                 si_calculate_max_simd_waves(shader);
7190         }
7191
7192         if (shader->key.as_ngg) {
7193                 assert(!shader->key.as_es && !shader->key.as_ls);
7194                 gfx10_ngg_calculate_subgroup_info(shader);
7195         } else if (sscreen->info.chip_class >= GFX9 && sel->type == PIPE_SHADER_GEOMETRY) {
7196                 gfx9_get_gs_info(shader->previous_stage_sel, sel, &shader->gs_info);
7197         }
7198
7199         si_fix_resource_usage(sscreen, shader);
7200         si_shader_dump(sscreen, shader, debug, stderr, true);
7201
7202         /* Upload. */
7203         if (!si_shader_binary_upload(sscreen, shader, 0)) {
7204                 fprintf(stderr, "LLVM failed to upload shader\n");
7205                 return false;
7206         }
7207
7208         return true;
7209 }
7210
7211 void si_shader_destroy(struct si_shader *shader)
7212 {
7213         if (shader->scratch_bo)
7214                 si_resource_reference(&shader->scratch_bo, NULL);
7215
7216         si_resource_reference(&shader->bo, NULL);
7217
7218         if (!shader->is_binary_shared)
7219                 si_shader_binary_clean(&shader->binary);
7220
7221         free(shader->shader_log);
7222 }