src/gallium/drivers/radeonsi/si_shader.c

   1 /*
   2  * Copyright 2012 Advanced Micro Devices, Inc.
   3  * All Rights Reserved.
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * on the rights to use, copy, modify, merge, publish, distribute, sub
   9  * license, and/or sell copies of the Software, and to permit persons to whom
  10  * the Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice (including the next
  13  * paragraph) shall be included in all copies or substantial portions of the
  14  * Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
  20  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  21  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  22  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  */
  24
  25 #include "util/u_memory.h"
  26 #include "util/u_string.h"
  27 #include "tgsi/tgsi_build.h"
  28 #include "tgsi/tgsi_util.h"
  29 #include "tgsi/tgsi_dump.h"
  30
  31 #include "ac_exp_param.h"
  32 #include "ac_shader_util.h"
  33 #include "ac_llvm_util.h"
  34 #include "si_shader_internal.h"
  35 #include "si_pipe.h"
  36 #include "sid.h"
  37
  38 #include "compiler/nir/nir.h"
  39
/* Names of the two scratch resource descriptor dword symbols.
 * NOTE(review): presumably matched against symbols of the compiled shader
 * binary so the values can be patched in -- confirm at the use sites.
 */
static const char *scratch_rsrc_dword0_symbol =
        "SCRATCH_RSRC_DWORD0";

static const char *scratch_rsrc_dword1_symbol =
        "SCRATCH_RSRC_DWORD1";
  45
/**
 * One shader output: up to four LLVM values together with the TGSI semantic
 * (name + index) that identifies it and one vertex stream number per value.
 * NOTE(review): values[] are presumably the x/y/z/w channels -- confirm at
 * the places that fill this struct.
 */
struct si_shader_output_values
{
        LLVMValueRef values[4];
        unsigned semantic_name;
        unsigned semantic_index;
        ubyte vertex_stream[4];
};
  53
/**
 * Used to collect types and other info about arguments of the LLVM function
 * before the function is created.
 *
 * Entries are appended with add_arg_assign(); at most 100 arguments fit.
 */
struct si_function_info {
        /* LLVM type of each argument, indexed by argument position. */
        LLVMTypeRef types[100];
        /* Optional per-argument pointer recorded by add_arg_assign() (NULL if
         * none). NOTE(review): presumably receives the LLVM parameter value
         * once the function is created -- confirm in the creation code. */
        LLVMValueRef *assign[100];
        /* Number of leading arguments that are SGPRs. */
        unsigned num_sgpr_params;
        /* Total number of arguments collected so far. */
        unsigned num_params;
};
  64
/* Register file a function argument is placed in. add_arg_assign() asserts
 * that all SGPR arguments are added before the first VGPR argument.
 */
enum si_arg_regfile {
        ARG_SGPR,
        ARG_VGPR
};
  69
/* Forward declarations of file-local (static) functions. */

/* Shader context initialization. */
static void si_init_shader_ctx(struct si_shader_context *ctx,
                               struct si_screen *sscreen,
                               struct ac_llvm_compiler *compiler);

/* TGSI action callback that emits a barrier. */
static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
                                 struct lp_build_tgsi_context *bld_base,
                                 struct lp_build_emit_data *emit_data);

/* Debug helper writing a shader key to the stream \p f. */
static void si_dump_shader_key(unsigned processor, const struct si_shader *shader,
                               FILE *f);

/* Builders for shader parts (prologs / epilogs), each driven by a part key. */
static void si_build_vs_prolog_function(struct si_shader_context *ctx,
                                        union si_shader_part_key *key);
static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
                                         union si_shader_part_key *key);
static void si_build_ps_prolog_function(struct si_shader_context *ctx,
                                        union si_shader_part_key *key);
static void si_build_ps_epilog_function(struct si_shader_context *ctx,
                                        union si_shader_part_key *key);
static void si_fix_resource_usage(struct si_screen *sscreen,
                                  struct si_shader *shader);
  91
/* Ideally pass the sample mask input to the PS epilog as v14, which
 * is its usual location, so that the shader doesn't have to add v_mov.
 * NOTE(review): per its name this is the lowest location the sample mask
 * may be placed at -- confirm where the PS epilog arguments are laid out.
 */
#define PS_EPILOG_SAMPLEMASK_MIN_LOC 14
  96
  97 static bool llvm_type_is_64bit(struct si_shader_context *ctx,
  98                                LLVMTypeRef type)
  99 {
 100         if (type == ctx->ac.i64 || type == ctx->ac.f64)
 101                 return true;
 102
 103         return false;
 104 }
 105
 106 static bool is_merged_shader(struct si_shader_context *ctx)
 107 {
 108         if (ctx->screen->info.chip_class <= VI)
 109                 return false;
 110
 111         return ctx->shader->key.as_ls ||
 112                ctx->shader->key.as_es ||
 113                ctx->type == PIPE_SHADER_TESS_CTRL ||
 114                ctx->type == PIPE_SHADER_GEOMETRY;
 115 }
 116
 117 static void si_init_function_info(struct si_function_info *fninfo)
 118 {
 119         fninfo->num_params = 0;
 120         fninfo->num_sgpr_params = 0;
 121 }
 122
 123 static unsigned add_arg_assign(struct si_function_info *fninfo,
 124                         enum si_arg_regfile regfile, LLVMTypeRef type,
 125                         LLVMValueRef *assign)
 126 {
 127         assert(regfile != ARG_SGPR || fninfo->num_sgpr_params == fninfo->num_params);
 128
 129         unsigned idx = fninfo->num_params++;
 130         assert(idx < ARRAY_SIZE(fninfo->types));
 131
 132         if (regfile == ARG_SGPR)
 133                 fninfo->num_sgpr_params = fninfo->num_params;
 134
 135         fninfo->types[idx] = type;
 136         fninfo->assign[idx] = assign;
 137         return idx;
 138 }
 139
 140 static unsigned add_arg(struct si_function_info *fninfo,
 141                         enum si_arg_regfile regfile, LLVMTypeRef type)
 142 {
 143         return add_arg_assign(fninfo, regfile, type, NULL);
 144 }
 145
 146 static void add_arg_assign_checked(struct si_function_info *fninfo,
 147                                    enum si_arg_regfile regfile, LLVMTypeRef type,
 148                                    LLVMValueRef *assign, unsigned idx)
 149 {
 150         MAYBE_UNUSED unsigned actual = add_arg_assign(fninfo, regfile, type, assign);
 151         assert(actual == idx);
 152 }
 153
 154 static void add_arg_checked(struct si_function_info *fninfo,
 155                             enum si_arg_regfile regfile, LLVMTypeRef type,
 156                             unsigned idx)
 157 {
 158         add_arg_assign_checked(fninfo, regfile, type, NULL, idx);
 159 }
 160
 161 /**
 162  * Returns a unique index for a per-patch semantic name and index. The index
 163  * must be less than 32, so that a 32-bit bitmask of used inputs or outputs
 164  * can be calculated.
 165  */
 166 unsigned si_shader_io_get_unique_index_patch(unsigned semantic_name, unsigned index)
 167 {
 168         switch (semantic_name) {
 169         case TGSI_SEMANTIC_TESSOUTER:
 170                 return 0;
 171         case TGSI_SEMANTIC_TESSINNER:
 172                 return 1;
 173         case TGSI_SEMANTIC_PATCH:
 174                 assert(index < 30);
 175                 return 2 + index;
 176
 177         default:
 178                 assert(!"invalid semantic name");
 179                 return 0;
 180         }
 181 }
 182
 183 /**
 184  * Returns a unique index for a semantic name and index. The index must be
 185  * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
 186  * calculated.
 187  */
 188 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index,
 189                                        unsigned is_varying)
 190 {
 191         switch (semantic_name) {
 192         case TGSI_SEMANTIC_POSITION:
 193                 return 0;
 194         case TGSI_SEMANTIC_GENERIC:
 195                 /* Since some shader stages use the the highest used IO index
 196                  * to determine the size to allocate for inputs/outputs
 197                  * (in LDS, tess and GS rings). GENERIC should be placed right
 198                  * after POSITION to make that size as small as possible.
 199                  */
 200                 if (index < SI_MAX_IO_GENERIC)
 201                         return 1 + index;
 202
 203                 assert(!"invalid generic index");
 204                 return 0;
 205         case TGSI_SEMANTIC_PSIZE:
 206                 return SI_MAX_IO_GENERIC + 1;
 207         case TGSI_SEMANTIC_CLIPDIST:
 208                 assert(index <= 1);
 209                 return SI_MAX_IO_GENERIC + 2 + index;
 210         case TGSI_SEMANTIC_FOG:
 211                 return SI_MAX_IO_GENERIC + 4;
 212         case TGSI_SEMANTIC_LAYER:
 213                 return SI_MAX_IO_GENERIC + 5;
 214         case TGSI_SEMANTIC_VIEWPORT_INDEX:
 215                 return SI_MAX_IO_GENERIC + 6;
 216         case TGSI_SEMANTIC_PRIMID:
 217                 return SI_MAX_IO_GENERIC + 7;
 218         case TGSI_SEMANTIC_COLOR:
 219                 assert(index < 2);
 220                 return SI_MAX_IO_GENERIC + 8 + index;
 221         case TGSI_SEMANTIC_BCOLOR:
 222                 assert(index < 2);
 223                 /* If it's a varying, COLOR and BCOLOR alias. */
 224                 if (is_varying)
 225                         return SI_MAX_IO_GENERIC + 8 + index;
 226                 else
 227                         return SI_MAX_IO_GENERIC + 10 + index;
 228         case TGSI_SEMANTIC_TEXCOORD:
 229                 assert(index < 8);
 230                 STATIC_ASSERT(SI_MAX_IO_GENERIC + 12 + 8 <= 63);
 231                 return SI_MAX_IO_GENERIC + 12 + index;
 232         case TGSI_SEMANTIC_CLIPVERTEX:
 233                 return 63;
 234         default:
 235                 fprintf(stderr, "invalid semantic name = %u\n", semantic_name);
 236                 assert(!"invalid semantic name");
 237                 return 0;
 238         }
 239 }
 240
 241 /**
 242  * Get the value of a shader input parameter and extract a bitfield.
 243  */
 244 static LLVMValueRef unpack_llvm_param(struct si_shader_context *ctx,
 245                                       LLVMValueRef value, unsigned rshift,
 246                                       unsigned bitwidth)
 247 {
 248         if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
 249                 value = ac_to_integer(&ctx->ac, value);
 250
 251         if (rshift)
 252                 value = LLVMBuildLShr(ctx->ac.builder, value,
 253                                       LLVMConstInt(ctx->i32, rshift, 0), "");
 254
 255         if (rshift + bitwidth < 32) {
 256                 unsigned mask = (1 << bitwidth) - 1;
 257                 value = LLVMBuildAnd(ctx->ac.builder, value,
 258                                      LLVMConstInt(ctx->i32, mask, 0), "");
 259         }
 260
 261         return value;
 262 }
 263
 264 LLVMValueRef si_unpack_param(struct si_shader_context *ctx,
 265                              unsigned param, unsigned rshift,
 266                              unsigned bitwidth)
 267 {
 268         LLVMValueRef value = LLVMGetParam(ctx->main_fn, param);
 269
 270         return unpack_llvm_param(ctx, value, rshift, bitwidth);
 271 }
 272
 273 static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx)
 274 {
 275         switch (ctx->type) {
 276         case PIPE_SHADER_TESS_CTRL:
 277                 return unpack_llvm_param(ctx, ctx->abi.tcs_rel_ids, 0, 8);
 278
 279         case PIPE_SHADER_TESS_EVAL:
 280                 return LLVMGetParam(ctx->main_fn,
 281                                     ctx->param_tes_rel_patch_id);
 282
 283         default:
 284                 assert(0);
 285                 return NULL;
 286         }
 287 }
 288
 289 /* Tessellation shaders pass outputs to the next shader using LDS.
 290  *
 291  * LS outputs = TCS inputs
 292  * TCS outputs = TES inputs
 293  *
 294  * The LDS layout is:
 295  * - TCS inputs for patch 0
 296  * - TCS inputs for patch 1
 297  * - TCS inputs for patch 2             = get_tcs_in_current_patch_offset (if RelPatchID==2)
 298  * - ...
 299  * - TCS outputs for patch 0            = get_tcs_out_patch0_offset
 300  * - Per-patch TCS outputs for patch 0  = get_tcs_out_patch0_patch_data_offset
 301  * - TCS outputs for patch 1
 302  * - Per-patch TCS outputs for patch 1
 303  * - TCS outputs for patch 2            = get_tcs_out_current_patch_offset (if RelPatchID==2)
 304  * - Per-patch TCS outputs for patch 2  = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
 305  * - ...
 306  *
 307  * All three shaders VS(LS), TCS, TES share the same LDS space.
 308  */
 309
 310 static LLVMValueRef
 311 get_tcs_in_patch_stride(struct si_shader_context *ctx)
 312 {
 313         return si_unpack_param(ctx, ctx->param_vs_state_bits, 8, 13);
 314 }
 315
 316 static unsigned get_tcs_out_vertex_dw_stride_constant(struct si_shader_context *ctx)
 317 {
 318         assert(ctx->type == PIPE_SHADER_TESS_CTRL);
 319
 320         if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy)
 321                 return util_last_bit64(ctx->shader->key.mono.u.ff_tcs_inputs_to_copy) * 4;
 322
 323         return util_last_bit64(ctx->shader->selector->outputs_written) * 4;
 324 }
 325
 326 static LLVMValueRef get_tcs_out_vertex_dw_stride(struct si_shader_context *ctx)
 327 {
 328         unsigned stride = get_tcs_out_vertex_dw_stride_constant(ctx);
 329
 330         return LLVMConstInt(ctx->i32, stride, 0);
 331 }
 332
 333 static LLVMValueRef get_tcs_out_patch_stride(struct si_shader_context *ctx)
 334 {
 335         if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy)
 336                 return si_unpack_param(ctx, ctx->param_tcs_out_lds_layout, 0, 13);
 337
 338         const struct tgsi_shader_info *info = &ctx->shader->selector->info;
 339         unsigned tcs_out_vertices = info->properties[TGSI_PROPERTY_TCS_VERTICES_OUT];
 340         unsigned vertex_dw_stride = get_tcs_out_vertex_dw_stride_constant(ctx);
 341         unsigned num_patch_outputs = util_last_bit64(ctx->shader->selector->patch_outputs_written);
 342         unsigned patch_dw_stride = tcs_out_vertices * vertex_dw_stride +
 343                                    num_patch_outputs * 4;
 344         return LLVMConstInt(ctx->i32, patch_dw_stride, 0);
 345 }
 346
 347 static LLVMValueRef
 348 get_tcs_out_patch0_offset(struct si_shader_context *ctx)
 349 {
 350         return LLVMBuildMul(ctx->ac.builder,
 351                             si_unpack_param(ctx,
 352                                             ctx->param_tcs_out_lds_offsets,
 353                                             0, 16),
 354                             LLVMConstInt(ctx->i32, 4, 0), "");
 355 }
 356
 357 static LLVMValueRef
 358 get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
 359 {
 360         return LLVMBuildMul(ctx->ac.builder,
 361                             si_unpack_param(ctx,
 362                                             ctx->param_tcs_out_lds_offsets,
 363                                             16, 16),
 364                             LLVMConstInt(ctx->i32, 4, 0), "");
 365 }
 366
 367 static LLVMValueRef
 368 get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
 369 {
 370         LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
 371         LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
 372
 373         return LLVMBuildMul(ctx->ac.builder, patch_stride, rel_patch_id, "");
 374 }
 375
 376 static LLVMValueRef
 377 get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
 378 {
 379         LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
 380         LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
 381         LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
 382
 383         return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id, patch0_offset);
 384 }
 385
 386 static LLVMValueRef
 387 get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
 388 {
 389         LLVMValueRef patch0_patch_data_offset =
 390                 get_tcs_out_patch0_patch_data_offset(ctx);
 391         LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
 392         LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
 393
 394         return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id, patch0_patch_data_offset);
 395 }
 396
 397 static LLVMValueRef get_num_tcs_out_vertices(struct si_shader_context *ctx)
 398 {
 399         unsigned tcs_out_vertices =
 400                 ctx->shader->selector ?
 401                 ctx->shader->selector->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT] : 0;
 402
 403         /* If !tcs_out_vertices, it's either the fixed-func TCS or the TCS epilog. */
 404         if (ctx->type == PIPE_SHADER_TESS_CTRL && tcs_out_vertices)
 405                 return LLVMConstInt(ctx->i32, tcs_out_vertices, 0);
 406
 407         return si_unpack_param(ctx, ctx->param_tcs_offchip_layout, 6, 6);
 408 }
 409
 410 static LLVMValueRef get_tcs_in_vertex_dw_stride(struct si_shader_context *ctx)
 411 {
 412         unsigned stride;
 413
 414         switch (ctx->type) {
 415         case PIPE_SHADER_VERTEX:
 416                 stride = ctx->shader->selector->lshs_vertex_stride / 4;
 417                 return LLVMConstInt(ctx->i32, stride, 0);
 418
 419         case PIPE_SHADER_TESS_CTRL:
 420                 if (ctx->screen->info.chip_class >= GFX9 &&
 421                     ctx->shader->is_monolithic) {
 422                         stride = ctx->shader->key.part.tcs.ls->lshs_vertex_stride / 4;
 423                         return LLVMConstInt(ctx->i32, stride, 0);
 424                 }
 425                 return si_unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
 426
 427         default:
 428                 assert(0);
 429                 return NULL;
 430         }
 431 }
 432
 433 static LLVMValueRef unpack_sint16(struct si_shader_context *ctx,
 434                                  LLVMValueRef i32, unsigned index)
 435 {
 436         assert(index <= 1);
 437
 438         if (index == 1)
 439                 return LLVMBuildAShr(ctx->ac.builder, i32,
 440                                      LLVMConstInt(ctx->i32, 16, 0), "");
 441
 442         return LLVMBuildSExt(ctx->ac.builder,
 443                              LLVMBuildTrunc(ctx->ac.builder, i32,
 444                                             ctx->ac.i16, ""),
 445                              ctx->i32, "");
 446 }
 447
 448 void si_llvm_load_input_vs(
 449         struct si_shader_context *ctx,
 450         unsigned input_index,
 451         LLVMValueRef out[4])
 452 {
 453         const struct tgsi_shader_info *info = &ctx->shader->selector->info;
 454         unsigned vs_blit_property = info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS];
 455
 456         if (vs_blit_property) {
 457                 LLVMValueRef vertex_id = ctx->abi.vertex_id;
 458                 LLVMValueRef sel_x1 = LLVMBuildICmp(ctx->ac.builder,
 459                                                     LLVMIntULE, vertex_id,
 460                                                     ctx->i32_1, "");
 461                 /* Use LLVMIntNE, because we have 3 vertices and only
 462                  * the middle one should use y2.
 463                  */
 464                 LLVMValueRef sel_y1 = LLVMBuildICmp(ctx->ac.builder,
 465                                                     LLVMIntNE, vertex_id,
 466                                                     ctx->i32_1, "");
 467
 468                 if (input_index == 0) {
 469                         /* Position: */
 470                         LLVMValueRef x1y1 = LLVMGetParam(ctx->main_fn,
 471                                                          ctx->param_vs_blit_inputs);
 472                         LLVMValueRef x2y2 = LLVMGetParam(ctx->main_fn,
 473                                                          ctx->param_vs_blit_inputs + 1);
 474
 475                         LLVMValueRef x1 = unpack_sint16(ctx, x1y1, 0);
 476                         LLVMValueRef y1 = unpack_sint16(ctx, x1y1, 1);
 477                         LLVMValueRef x2 = unpack_sint16(ctx, x2y2, 0);
 478                         LLVMValueRef y2 = unpack_sint16(ctx, x2y2, 1);
 479
 480                         LLVMValueRef x = LLVMBuildSelect(ctx->ac.builder, sel_x1,
 481                                                          x1, x2, "");
 482                         LLVMValueRef y = LLVMBuildSelect(ctx->ac.builder, sel_y1,
 483                                                          y1, y2, "");
 484
 485                         out[0] = LLVMBuildSIToFP(ctx->ac.builder, x, ctx->f32, "");
 486                         out[1] = LLVMBuildSIToFP(ctx->ac.builder, y, ctx->f32, "");
 487                         out[2] = LLVMGetParam(ctx->main_fn,
 488                                               ctx->param_vs_blit_inputs + 2);
 489                         out[3] = ctx->ac.f32_1;
 490                         return;
 491                 }
 492
 493                 /* Color or texture coordinates: */
 494                 assert(input_index == 1);
 495
 496                 if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
 497                         for (int i = 0; i < 4; i++) {
 498                                 out[i] = LLVMGetParam(ctx->main_fn,
 499                                                       ctx->param_vs_blit_inputs + 3 + i);
 500                         }
 501                 } else {
 502                         assert(vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD);
 503                         LLVMValueRef x1 = LLVMGetParam(ctx->main_fn,
 504                                                        ctx->param_vs_blit_inputs + 3);
 505                         LLVMValueRef y1 = LLVMGetParam(ctx->main_fn,
 506                                                        ctx->param_vs_blit_inputs + 4);
 507                         LLVMValueRef x2 = LLVMGetParam(ctx->main_fn,
 508                                                        ctx->param_vs_blit_inputs + 5);
 509                         LLVMValueRef y2 = LLVMGetParam(ctx->main_fn,
 510                                                        ctx->param_vs_blit_inputs + 6);
 511
 512                         out[0] = LLVMBuildSelect(ctx->ac.builder, sel_x1,
 513                                                  x1, x2, "");
 514                         out[1] = LLVMBuildSelect(ctx->ac.builder, sel_y1,
 515                                                  y1, y2, "");
 516                         out[2] = LLVMGetParam(ctx->main_fn,
 517                                               ctx->param_vs_blit_inputs + 7);
 518                         out[3] = LLVMGetParam(ctx->main_fn,
 519                                               ctx->param_vs_blit_inputs + 8);
 520                 }
 521                 return;
 522         }
 523
 524         union si_vs_fix_fetch fix_fetch;
 525         LLVMValueRef t_list_ptr;
 526         LLVMValueRef t_offset;
 527         LLVMValueRef t_list;
 528         LLVMValueRef vertex_index;
 529         LLVMValueRef tmp;
 530
 531         /* Load the T list */
 532         t_list_ptr = LLVMGetParam(ctx->main_fn, ctx->param_vertex_buffers);
 533
 534         t_offset = LLVMConstInt(ctx->i32, input_index, 0);
 535
 536         t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset);
 537
 538         vertex_index = LLVMGetParam(ctx->main_fn,
 539                                     ctx->param_vertex_index0 +
 540                                     input_index);
 541
 542         /* Use the open-coded implementation for all loads of doubles and
 543          * of dword-sized data that needs fixups. We need to insert conversion
 544          * code anyway, and the amd/common code does it for us.
 545          *
 546          * Note: On LLVM <= 8, we can only open-code formats with
 547          * channel size >= 4 bytes.
 548          */
 549         bool opencode = ctx->shader->key.mono.vs_fetch_opencode & (1 << input_index);
 550         fix_fetch.bits = ctx->shader->key.mono.vs_fix_fetch[input_index].bits;
 551         if (opencode ||
 552             (fix_fetch.u.log_size == 3 && fix_fetch.u.format == AC_FETCH_FORMAT_FLOAT) ||
 553             (fix_fetch.u.log_size == 2)) {
 554                 tmp = ac_build_opencoded_load_format(
 555                                 &ctx->ac, fix_fetch.u.log_size, fix_fetch.u.num_channels_m1 + 1,
 556                                 fix_fetch.u.format, fix_fetch.u.reverse, !opencode,
 557                                 t_list, vertex_index, ctx->ac.i32_0, ctx->ac.i32_0,
 558                                 false, false, true);
 559                 for (unsigned i = 0; i < 4; ++i)
 560                         out[i] = LLVMBuildExtractElement(ctx->ac.builder, tmp, LLVMConstInt(ctx->i32, i, false), "");
 561                 return;
 562         }
 563
 564         /* Do multiple loads for special formats. */
 565         unsigned required_channels = util_last_bit(info->input_usage_mask[input_index]);
 566         LLVMValueRef fetches[4];
 567         unsigned num_fetches;
 568         unsigned fetch_stride;
 569         unsigned channels_per_fetch;
 570
 571         if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2) {
 572                 num_fetches = MIN2(required_channels, 3);
 573                 fetch_stride = 1 << fix_fetch.u.log_size;
 574                 channels_per_fetch = 1;
 575         } else {
 576                 num_fetches = 1;
 577                 fetch_stride = 0;
 578                 channels_per_fetch = required_channels;
 579         }
 580
 581         for (unsigned i = 0; i < num_fetches; ++i) {
 582                 LLVMValueRef voffset = LLVMConstInt(ctx->i32, fetch_stride * i, 0);
 583                 fetches[i] = ac_build_buffer_load_format(&ctx->ac, t_list, vertex_index, voffset,
 584                                                          channels_per_fetch, false, true);
 585         }
 586
 587         if (num_fetches == 1 && channels_per_fetch > 1) {
 588                 LLVMValueRef fetch = fetches[0];
 589                 for (unsigned i = 0; i < channels_per_fetch; ++i) {
 590                         tmp = LLVMConstInt(ctx->i32, i, false);
 591                         fetches[i] = LLVMBuildExtractElement(
 592                                 ctx->ac.builder, fetch, tmp, "");
 593                 }
 594                 num_fetches = channels_per_fetch;
 595                 channels_per_fetch = 1;
 596         }
 597
 598         for (unsigned i = num_fetches; i < 4; ++i)
 599                 fetches[i] = LLVMGetUndef(ctx->f32);
 600
 601         if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2 &&
 602             required_channels == 4) {
 603                 if (fix_fetch.u.format == AC_FETCH_FORMAT_UINT || fix_fetch.u.format == AC_FETCH_FORMAT_SINT)
 604                         fetches[3] = ctx->ac.i32_1;
 605                 else
 606                         fetches[3] = ctx->ac.f32_1;
 607         } else if (fix_fetch.u.log_size == 3 &&
 608                    (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ||
 609                     fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED ||
 610                     fix_fetch.u.format == AC_FETCH_FORMAT_SINT) &&
 611                    required_channels == 4) {
 612                 /* For 2_10_10_10, the hardware returns an unsigned value;
 613                  * convert it to a signed one.
 614                  */
 615                 LLVMValueRef tmp = fetches[3];
 616                 LLVMValueRef c30 = LLVMConstInt(ctx->i32, 30, 0);
 617
 618                 /* First, recover the sign-extended signed integer value. */
 619                 if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED)
 620                         tmp = LLVMBuildFPToUI(ctx->ac.builder, tmp, ctx->i32, "");
 621                 else
 622                         tmp = ac_to_integer(&ctx->ac, tmp);
 623
 624                 /* For the integer-like cases, do a natural sign extension.
 625                  *
 626                  * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
 627                  * and happen to contain 0, 1, 2, 3 as the two LSBs of the
 628                  * exponent.
 629                  */
 630                 tmp = LLVMBuildShl(ctx->ac.builder, tmp,
 631                                    fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ?
 632                                    LLVMConstInt(ctx->i32, 7, 0) : c30, "");
 633                 tmp = LLVMBuildAShr(ctx->ac.builder, tmp, c30, "");
 634
 635                 /* Convert back to the right type. */
 636                 if (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM) {
 637                         LLVMValueRef clamp;
 638                         LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
 639                         tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->f32, "");
 640                         clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, tmp, neg_one, "");
 641                         tmp = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, tmp, "");
 642                 } else if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED) {
 643                         tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->f32, "");
 644                 }
 645
 646                 fetches[3] = tmp;
 647         }
 648
 649         for (unsigned i = 0; i < 4; ++i)
 650                 out[i] = ac_to_float(&ctx->ac, fetches[i]);
 651 }
 652
 653 static void declare_input_vs(
 654         struct si_shader_context *ctx,
 655         unsigned input_index,
 656         const struct tgsi_full_declaration *decl,
 657         LLVMValueRef out[4])
 658 {
 659         si_llvm_load_input_vs(ctx, input_index, out);
 660 }
 661
 662 static LLVMValueRef get_primitive_id(struct si_shader_context *ctx,
 663                                      unsigned swizzle)
 664 {
 665         if (swizzle > 0)
 666                 return ctx->i32_0;
 667
 668         switch (ctx->type) {
 669         case PIPE_SHADER_VERTEX:
 670                 return LLVMGetParam(ctx->main_fn,
 671                                     ctx->param_vs_prim_id);
 672         case PIPE_SHADER_TESS_CTRL:
 673                 return ctx->abi.tcs_patch_id;
 674         case PIPE_SHADER_TESS_EVAL:
 675                 return ctx->abi.tes_patch_id;
 676         case PIPE_SHADER_GEOMETRY:
 677                 return ctx->abi.gs_prim_id;
 678         default:
 679                 assert(0);
 680                 return ctx->i32_0;
 681         }
 682 }
 683
 684 /**
 685  * Return the value of tgsi_ind_register for indexing.
 686  * This is the indirect index with the constant offset added to it.
 687  */
 688 LLVMValueRef si_get_indirect_index(struct si_shader_context *ctx,
 689                                    const struct tgsi_ind_register *ind,
 690                                    unsigned addr_mul,
 691                                    int rel_index)
 692 {
 693         LLVMValueRef result;
 694
 695         if (ind->File == TGSI_FILE_ADDRESS) {
 696                 result = ctx->addrs[ind->Index][ind->Swizzle];
 697                 result = LLVMBuildLoad(ctx->ac.builder, result, "");
 698         } else {
 699                 struct tgsi_full_src_register src = {};
 700
 701                 src.Register.File = ind->File;
 702                 src.Register.Index = ind->Index;
 703
 704                 /* Set the second index to 0 for constants. */
 705                 if (ind->File == TGSI_FILE_CONSTANT)
 706                         src.Register.Dimension = 1;
 707
 708                 result = ctx->bld_base.emit_fetch_funcs[ind->File](&ctx->bld_base, &src,
 709                                                                    TGSI_TYPE_SIGNED,
 710                                                                    ind->Swizzle);
 711                 result = ac_to_integer(&ctx->ac, result);
 712         }
 713
 714         return ac_build_imad(&ctx->ac, result, LLVMConstInt(ctx->i32, addr_mul, 0),
 715                              LLVMConstInt(ctx->i32, rel_index, 0));
 716 }
 717
 718 /**
 719  * Like si_get_indirect_index, but restricts the return value to a (possibly
 720  * undefined) value inside [0..num).
 721  */
 722 LLVMValueRef si_get_bounded_indirect_index(struct si_shader_context *ctx,
 723                                            const struct tgsi_ind_register *ind,
 724                                            int rel_index, unsigned num)
 725 {
 726         LLVMValueRef result = si_get_indirect_index(ctx, ind, 1, rel_index);
 727
 728         return si_llvm_bound_index(ctx, result, num);
 729 }
 730
 731 static LLVMValueRef get_dw_address_from_generic_indices(struct si_shader_context *ctx,
 732                                                         LLVMValueRef vertex_dw_stride,
 733                                                         LLVMValueRef base_addr,
 734                                                         LLVMValueRef vertex_index,
 735                                                         LLVMValueRef param_index,
 736                                                         unsigned input_index,
 737                                                         ubyte *name,
 738                                                         ubyte *index,
 739                                                         bool is_patch)
 740 {
 741         if (vertex_dw_stride) {
 742                 base_addr = ac_build_imad(&ctx->ac, vertex_index,
 743                                           vertex_dw_stride, base_addr);
 744         }
 745
 746         if (param_index) {
 747                 base_addr = ac_build_imad(&ctx->ac, param_index,
 748                                           LLVMConstInt(ctx->i32, 4, 0), base_addr);
 749         }
 750
 751         int param = is_patch ?
 752                 si_shader_io_get_unique_index_patch(name[input_index],
 753                                                     index[input_index]) :
 754                 si_shader_io_get_unique_index(name[input_index],
 755                                               index[input_index], false);
 756
 757         /* Add the base address of the element. */
 758         return LLVMBuildAdd(ctx->ac.builder, base_addr,
 759                             LLVMConstInt(ctx->i32, param * 4, 0), "");
 760 }
 761
 762 /**
 763  * Calculate a dword address given an input or output register and a stride.
 764  */
 765 static LLVMValueRef get_dw_address(struct si_shader_context *ctx,
 766                                    const struct tgsi_full_dst_register *dst,
 767                                    const struct tgsi_full_src_register *src,
 768                                    LLVMValueRef vertex_dw_stride,
 769                                    LLVMValueRef base_addr)
 770 {
 771         struct tgsi_shader_info *info = &ctx->shader->selector->info;
 772         ubyte *name, *index, *array_first;
 773         int input_index;
 774         struct tgsi_full_dst_register reg;
 775         LLVMValueRef vertex_index = NULL;
 776         LLVMValueRef ind_index = NULL;
 777
 778         /* Set the register description. The address computation is the same
 779          * for sources and destinations. */
 780         if (src) {
 781                 reg.Register.File = src->Register.File;
 782                 reg.Register.Index = src->Register.Index;
 783                 reg.Register.Indirect = src->Register.Indirect;
 784                 reg.Register.Dimension = src->Register.Dimension;
 785                 reg.Indirect = src->Indirect;
 786                 reg.Dimension = src->Dimension;
 787                 reg.DimIndirect = src->DimIndirect;
 788         } else
 789                 reg = *dst;
 790
 791         /* If the register is 2-dimensional (e.g. an array of vertices
 792          * in a primitive), calculate the base address of the vertex. */
 793         if (reg.Register.Dimension) {
 794                 if (reg.Dimension.Indirect)
 795                         vertex_index = si_get_indirect_index(ctx, &reg.DimIndirect,
 796                                                       1, reg.Dimension.Index);
 797                 else
 798                         vertex_index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0);
 799         }
 800
 801         /* Get information about the register. */
 802         if (reg.Register.File == TGSI_FILE_INPUT) {
 803                 name = info->input_semantic_name;
 804                 index = info->input_semantic_index;
 805                 array_first = info->input_array_first;
 806         } else if (reg.Register.File == TGSI_FILE_OUTPUT) {
 807                 name = info->output_semantic_name;
 808                 index = info->output_semantic_index;
 809                 array_first = info->output_array_first;
 810         } else {
 811                 assert(0);
 812                 return NULL;
 813         }
 814
 815         if (reg.Register.Indirect) {
 816                 /* Add the relative address of the element. */
 817                 if (reg.Indirect.ArrayID)
 818                         input_index = array_first[reg.Indirect.ArrayID];
 819                 else
 820                         input_index = reg.Register.Index;
 821
 822                 ind_index = si_get_indirect_index(ctx, &reg.Indirect,
 823                                                   1, reg.Register.Index - input_index);
 824         } else {
 825                 input_index = reg.Register.Index;
 826         }
 827
 828         return get_dw_address_from_generic_indices(ctx, vertex_dw_stride,
 829                                                    base_addr, vertex_index,
 830                                                    ind_index, input_index,
 831                                                    name, index,
 832                                                    !reg.Register.Dimension);
 833 }
 834
 835 /* The offchip buffer layout for TCS->TES is
 836  *
 837  * - attribute 0 of patch 0 vertex 0
 838  * - attribute 0 of patch 0 vertex 1
 839  * - attribute 0 of patch 0 vertex 2
 840  *   ...
 841  * - attribute 0 of patch 1 vertex 0
 842  * - attribute 0 of patch 1 vertex 1
 843  *   ...
 844  * - attribute 1 of patch 0 vertex 0
 845  * - attribute 1 of patch 0 vertex 1
 846  *   ...
 847  * - per patch attribute 0 of patch 0
 848  * - per patch attribute 0 of patch 1
 849  *   ...
 850  *
 851  * Note that every attribute has 4 components.
 852  */
 853 static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
 854                                                LLVMValueRef rel_patch_id,
 855                                                LLVMValueRef vertex_index,
 856                                                LLVMValueRef param_index)
 857 {
 858         LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
 859         LLVMValueRef param_stride, constant16;
 860
 861         vertices_per_patch = get_num_tcs_out_vertices(ctx);
 862         num_patches = si_unpack_param(ctx, ctx->param_tcs_offchip_layout, 0, 6);
 863         total_vertices = LLVMBuildMul(ctx->ac.builder, vertices_per_patch,
 864                                       num_patches, "");
 865
 866         constant16 = LLVMConstInt(ctx->i32, 16, 0);
 867         if (vertex_index) {
 868                 base_addr = ac_build_imad(&ctx->ac, rel_patch_id,
 869                                           vertices_per_patch, vertex_index);
 870                 param_stride = total_vertices;
 871         } else {
 872                 base_addr = rel_patch_id;
 873                 param_stride = num_patches;
 874         }
 875
 876         base_addr = ac_build_imad(&ctx->ac, param_index, param_stride, base_addr);
 877         base_addr = LLVMBuildMul(ctx->ac.builder, base_addr, constant16, "");
 878
 879         if (!vertex_index) {
 880                 LLVMValueRef patch_data_offset =
 881                            si_unpack_param(ctx, ctx->param_tcs_offchip_layout, 12, 20);
 882
 883                 base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr,
 884                                          patch_data_offset, "");
 885         }
 886         return base_addr;
 887 }
 888
 889 /* This is a generic helper that can be shared by the NIR and TGSI backends */
 890 static LLVMValueRef get_tcs_tes_buffer_address_from_generic_indices(
 891                                         struct si_shader_context *ctx,
 892                                         LLVMValueRef vertex_index,
 893                                         LLVMValueRef param_index,
 894                                         unsigned param_base,
 895                                         ubyte *name,
 896                                         ubyte *index,
 897                                         bool is_patch)
 898 {
 899         unsigned param_index_base;
 900
 901         param_index_base = is_patch ?
 902                 si_shader_io_get_unique_index_patch(name[param_base], index[param_base]) :
 903                 si_shader_io_get_unique_index(name[param_base], index[param_base], false);
 904
 905         if (param_index) {
 906                 param_index = LLVMBuildAdd(ctx->ac.builder, param_index,
 907                                            LLVMConstInt(ctx->i32, param_index_base, 0),
 908                                            "");
 909         } else {
 910                 param_index = LLVMConstInt(ctx->i32, param_index_base, 0);
 911         }
 912
 913         return get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx),
 914                                           vertex_index, param_index);
 915 }
 916
 917 static LLVMValueRef get_tcs_tes_buffer_address_from_reg(
 918                                        struct si_shader_context *ctx,
 919                                        const struct tgsi_full_dst_register *dst,
 920                                        const struct tgsi_full_src_register *src)
 921 {
 922         struct tgsi_shader_info *info = &ctx->shader->selector->info;
 923         ubyte *name, *index, *array_first;
 924         struct tgsi_full_src_register reg;
 925         LLVMValueRef vertex_index = NULL;
 926         LLVMValueRef param_index = NULL;
 927         unsigned param_base;
 928
 929         reg = src ? *src : tgsi_full_src_register_from_dst(dst);
 930
 931         if (reg.Register.Dimension) {
 932
 933                 if (reg.Dimension.Indirect)
 934                         vertex_index = si_get_indirect_index(ctx, &reg.DimIndirect,
 935                                                              1, reg.Dimension.Index);
 936                 else
 937                         vertex_index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0);
 938         }
 939
 940         /* Get information about the register. */
 941         if (reg.Register.File == TGSI_FILE_INPUT) {
 942                 name = info->input_semantic_name;
 943                 index = info->input_semantic_index;
 944                 array_first = info->input_array_first;
 945         } else if (reg.Register.File == TGSI_FILE_OUTPUT) {
 946                 name = info->output_semantic_name;
 947                 index = info->output_semantic_index;
 948                 array_first = info->output_array_first;
 949         } else {
 950                 assert(0);
 951                 return NULL;
 952         }
 953
 954         if (reg.Register.Indirect) {
 955                 if (reg.Indirect.ArrayID)
 956                         param_base = array_first[reg.Indirect.ArrayID];
 957                 else
 958                         param_base = reg.Register.Index;
 959
 960                 param_index = si_get_indirect_index(ctx, &reg.Indirect,
 961                                                     1, reg.Register.Index - param_base);
 962
 963         } else {
 964                 param_base = reg.Register.Index;
 965         }
 966
 967         return get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index,
 968                                                                param_index, param_base,
 969                                                                name, index, !reg.Register.Dimension);
 970 }
 971
 972 static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base,
 973                                 LLVMTypeRef type, unsigned swizzle,
 974                                 LLVMValueRef buffer, LLVMValueRef offset,
 975                                 LLVMValueRef base, bool can_speculate)
 976 {
 977         struct si_shader_context *ctx = si_shader_context(bld_base);
 978         LLVMValueRef value, value2;
 979         LLVMTypeRef vec_type = LLVMVectorType(type, 4);
 980
 981         if (swizzle == ~0) {
 982                 value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
 983                                              0, 1, 0, can_speculate, false);
 984
 985                 return LLVMBuildBitCast(ctx->ac.builder, value, vec_type, "");
 986         }
 987
 988         if (!llvm_type_is_64bit(ctx, type)) {
 989                 value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
 990                                              0, 1, 0, can_speculate, false);
 991
 992                 value = LLVMBuildBitCast(ctx->ac.builder, value, vec_type, "");
 993                 return LLVMBuildExtractElement(ctx->ac.builder, value,
 994                                     LLVMConstInt(ctx->i32, swizzle, 0), "");
 995         }
 996
 997         value = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
 998                                   swizzle * 4, 1, 0, can_speculate, false);
 999
1000         value2 = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
1001                                    swizzle * 4 + 4, 1, 0, can_speculate, false);
1002
1003         return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
1004 }
1005
1006 /**
1007  * Load from LDS.
1008  *
1009  * \param type          output value type
1010  * \param swizzle       offset (typically 0..3); it can be ~0, which loads a vec4
1011  * \param dw_addr       address in dwords
1012  */
1013 static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
1014                              LLVMTypeRef type, unsigned swizzle,
1015                              LLVMValueRef dw_addr)
1016 {
1017         struct si_shader_context *ctx = si_shader_context(bld_base);
1018         LLVMValueRef value;
1019
1020         if (swizzle == ~0) {
1021                 LLVMValueRef values[TGSI_NUM_CHANNELS];
1022
1023                 for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++)
1024                         values[chan] = lds_load(bld_base, type, chan, dw_addr);
1025
1026                 return ac_build_gather_values(&ctx->ac, values,
1027                                               TGSI_NUM_CHANNELS);
1028         }
1029
1030         /* Split 64-bit loads. */
1031         if (llvm_type_is_64bit(ctx, type)) {
1032                 LLVMValueRef lo, hi;
1033
1034                 lo = lds_load(bld_base, ctx->i32, swizzle, dw_addr);
1035                 hi = lds_load(bld_base, ctx->i32, swizzle + 1, dw_addr);
1036                 return si_llvm_emit_fetch_64bit(bld_base, type, lo, hi);
1037         }
1038
1039         dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
1040                                LLVMConstInt(ctx->i32, swizzle, 0), "");
1041
1042         value = ac_lds_load(&ctx->ac, dw_addr);
1043
1044         return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
1045 }
1046
1047 /**
1048  * Store to LDS.
1049  *
1050  * \param swizzle       offset (typically 0..3)
1051  * \param dw_addr       address in dwords
1052  * \param value         value to store
1053  */
1054 static void lds_store(struct si_shader_context *ctx,
1055                       unsigned dw_offset_imm, LLVMValueRef dw_addr,
1056                       LLVMValueRef value)
1057 {
1058         dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
1059                                LLVMConstInt(ctx->i32, dw_offset_imm, 0), "");
1060
1061         ac_lds_store(&ctx->ac, dw_addr, value);
1062 }
1063
/* Selects which tessellation ring get_tess_ring_descriptor() describes. */
enum si_tess_ring {
        /* Masked TCS address plus the screen's tess_offchip_ring_size. */
        TCS_FACTOR_RING,
        /* Address taken from param_tcs_out_lds_layout (high bits only). */
        TESS_OFFCHIP_RING_TCS,
        /* Address taken from param_tes_offchip_addr. */
        TESS_OFFCHIP_RING_TES,
};
1069
1070 static LLVMValueRef get_tess_ring_descriptor(struct si_shader_context *ctx,
1071                                              enum si_tess_ring ring)
1072 {
1073         LLVMBuilderRef builder = ctx->ac.builder;
1074         unsigned param = ring == TESS_OFFCHIP_RING_TES ? ctx->param_tes_offchip_addr :
1075                                                          ctx->param_tcs_out_lds_layout;
1076         LLVMValueRef addr = LLVMGetParam(ctx->main_fn, param);
1077
1078         /* TCS only receives high 13 bits of the address. */
1079         if (ring == TESS_OFFCHIP_RING_TCS || ring == TCS_FACTOR_RING) {
1080                 addr = LLVMBuildAnd(builder, addr,
1081                                     LLVMConstInt(ctx->i32, 0xfff80000, 0), "");
1082         }
1083
1084         if (ring == TCS_FACTOR_RING) {
1085                 unsigned tf_offset = ctx->screen->tess_offchip_ring_size;
1086                 addr = LLVMBuildAdd(builder, addr,
1087                                     LLVMConstInt(ctx->i32, tf_offset, 0), "");
1088         }
1089
1090         LLVMValueRef desc[4];
1091         desc[0] = addr;
1092         desc[1] = LLVMConstInt(ctx->i32,
1093                                S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0);
1094         desc[2] = LLVMConstInt(ctx->i32, 0xffffffff, 0);
1095         desc[3] = LLVMConstInt(ctx->i32,
1096                                S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
1097                                S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
1098                                S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
1099                                S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
1100                                S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
1101                                S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32), 0);
1102
1103         return ac_build_gather_values(&ctx->ac, desc, 4);
1104 }
1105
1106 static LLVMValueRef fetch_input_tcs(
1107         struct lp_build_tgsi_context *bld_base,
1108         const struct tgsi_full_src_register *reg,
1109         enum tgsi_opcode_type type, unsigned swizzle_in)
1110 {
1111         struct si_shader_context *ctx = si_shader_context(bld_base);
1112         LLVMValueRef dw_addr, stride;
1113         unsigned swizzle = swizzle_in & 0xffff;
1114         stride = get_tcs_in_vertex_dw_stride(ctx);
1115         dw_addr = get_tcs_in_current_patch_offset(ctx);
1116         dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
1117
1118         return lds_load(bld_base, tgsi2llvmtype(bld_base, type), swizzle, dw_addr);
1119 }
1120
1121 static LLVMValueRef si_nir_load_tcs_varyings(struct ac_shader_abi *abi,
1122                                              LLVMTypeRef type,
1123                                              LLVMValueRef vertex_index,
1124                                              LLVMValueRef param_index,
1125                                              unsigned const_index,
1126                                              unsigned location,
1127                                              unsigned driver_location,
1128                                              unsigned component,
1129                                              unsigned num_components,
1130                                              bool is_patch,
1131                                              bool is_compact,
1132                                              bool load_input)
1133 {
1134         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1135         struct tgsi_shader_info *info = &ctx->shader->selector->info;
1136         struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
1137         LLVMValueRef dw_addr, stride;
1138
1139         driver_location = driver_location / 4;
1140
1141         if (load_input) {
1142                 stride = get_tcs_in_vertex_dw_stride(ctx);
1143                 dw_addr = get_tcs_in_current_patch_offset(ctx);
1144         } else {
1145                 if (is_patch) {
1146                         stride = NULL;
1147                         dw_addr = get_tcs_out_current_patch_data_offset(ctx);
1148                 } else {
1149                         stride = get_tcs_out_vertex_dw_stride(ctx);
1150                         dw_addr = get_tcs_out_current_patch_offset(ctx);
1151                 }
1152         }
1153
1154         if (param_index) {
1155                 /* Add the constant index to the indirect index */
1156                 param_index = LLVMBuildAdd(ctx->ac.builder, param_index,
1157                                            LLVMConstInt(ctx->i32, const_index, 0), "");
1158         } else {
1159                 param_index = LLVMConstInt(ctx->i32, const_index, 0);
1160         }
1161
1162         ubyte *names;
1163         ubyte *indices;
1164         if (load_input) {
1165                 names = info->input_semantic_name;
1166                 indices = info->input_semantic_index;
1167         } else {
1168                 names = info->output_semantic_name;
1169                 indices = info->output_semantic_index;
1170         }
1171
1172         dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr,
1173                                                       vertex_index, param_index,
1174                                                       driver_location,
1175                                                       names, indices,
1176                                                       is_patch);
1177
1178         LLVMValueRef value[4];
1179         for (unsigned i = 0; i < num_components; i++) {
1180                 unsigned offset = i;
1181                 if (llvm_type_is_64bit(ctx, type))
1182                         offset *= 2;
1183
1184                 offset += component;
1185                 value[i + component] = lds_load(bld_base, type, offset, dw_addr);
1186         }
1187
1188         return ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
1189 }
1190
1191 static LLVMValueRef fetch_output_tcs(
1192                 struct lp_build_tgsi_context *bld_base,
1193                 const struct tgsi_full_src_register *reg,
1194                 enum tgsi_opcode_type type, unsigned swizzle_in)
1195 {
1196         struct si_shader_context *ctx = si_shader_context(bld_base);
1197         LLVMValueRef dw_addr, stride;
1198         unsigned swizzle = (swizzle_in & 0xffff);
1199
1200         if (reg->Register.Dimension) {
1201                 stride = get_tcs_out_vertex_dw_stride(ctx);
1202                 dw_addr = get_tcs_out_current_patch_offset(ctx);
1203                 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
1204         } else {
1205                 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
1206                 dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr);
1207         }
1208
1209         return lds_load(bld_base, tgsi2llvmtype(bld_base, type), swizzle, dw_addr);
1210 }
1211
1212 static LLVMValueRef fetch_input_tes(
1213         struct lp_build_tgsi_context *bld_base,
1214         const struct tgsi_full_src_register *reg,
1215         enum tgsi_opcode_type type, unsigned swizzle_in)
1216 {
1217         struct si_shader_context *ctx = si_shader_context(bld_base);
1218         LLVMValueRef base, addr;
1219         unsigned swizzle = (swizzle_in & 0xffff);
1220
1221         base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
1222         addr = get_tcs_tes_buffer_address_from_reg(ctx, NULL, reg);
1223
1224         return buffer_load(bld_base, tgsi2llvmtype(bld_base, type), swizzle,
1225                            ctx->tess_offchip_ring, base, addr, true);
1226 }
1227
1228 LLVMValueRef si_nir_load_input_tes(struct ac_shader_abi *abi,
1229                                    LLVMTypeRef type,
1230                                    LLVMValueRef vertex_index,
1231                                    LLVMValueRef param_index,
1232                                    unsigned const_index,
1233                                    unsigned location,
1234                                    unsigned driver_location,
1235                                    unsigned component,
1236                                    unsigned num_components,
1237                                    bool is_patch,
1238                                    bool is_compact,
1239                                    bool load_input)
1240 {
1241         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1242         struct tgsi_shader_info *info = &ctx->shader->selector->info;
1243         LLVMValueRef base, addr;
1244
1245         driver_location = driver_location / 4;
1246
1247         base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
1248
1249         if (param_index) {
1250                 /* Add the constant index to the indirect index */
1251                 param_index = LLVMBuildAdd(ctx->ac.builder, param_index,
1252                                            LLVMConstInt(ctx->i32, const_index, 0), "");
1253         } else {
1254                 param_index = LLVMConstInt(ctx->i32, const_index, 0);
1255         }
1256
1257         addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index,
1258                                                                param_index, driver_location,
1259                                                                info->input_semantic_name,
1260                                                                info->input_semantic_index,
1261                                                                is_patch);
1262
1263         /* TODO: This will generate rather ordinary llvm code, although it
1264          * should be easy for the optimiser to fix up. In future we might want
1265          * to refactor buffer_load(), but for now this maximises code sharing
1266          * between the NIR and TGSI backends.
1267          */
1268         LLVMValueRef value[4];
1269         for (unsigned i = 0; i < num_components; i++) {
1270                 unsigned offset = i;
1271                 if (llvm_type_is_64bit(ctx, type))
1272                         offset *= 2;
1273
1274                 offset += component;
1275                 value[i + component] = buffer_load(&ctx->bld_base, type, offset,
1276                                                    ctx->tess_offchip_ring, base, addr, true);
1277         }
1278
1279         return ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
1280 }
1281
1282 static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
1283                              const struct tgsi_full_instruction *inst,
1284                              const struct tgsi_opcode_info *info,
1285                              unsigned index,
1286                              LLVMValueRef dst[4])
1287 {
1288         struct si_shader_context *ctx = si_shader_context(bld_base);
1289         const struct tgsi_full_dst_register *reg = &inst->Dst[index];
1290         const struct tgsi_shader_info *sh_info = &ctx->shader->selector->info;
1291         unsigned chan_index;
1292         LLVMValueRef dw_addr, stride;
1293         LLVMValueRef buffer, base, buf_addr;
1294         LLVMValueRef values[4];
1295         bool skip_lds_store;
1296         bool is_tess_factor = false, is_tess_inner = false;
1297
1298         /* Only handle per-patch and per-vertex outputs here.
1299          * Vectors will be lowered to scalars and this function will be called again.
1300          */
1301         if (reg->Register.File != TGSI_FILE_OUTPUT ||
1302             (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) {
1303                 si_llvm_emit_store(bld_base, inst, info, index, dst);
1304                 return;
1305         }
1306
1307         if (reg->Register.Dimension) {
1308                 stride = get_tcs_out_vertex_dw_stride(ctx);
1309                 dw_addr = get_tcs_out_current_patch_offset(ctx);
1310                 dw_addr = get_dw_address(ctx, reg, NULL, stride, dw_addr);
1311                 skip_lds_store = !sh_info->reads_pervertex_outputs;
1312         } else {
1313                 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
1314                 dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr);
1315                 skip_lds_store = !sh_info->reads_perpatch_outputs;
1316
1317                 if (!reg->Register.Indirect) {
1318                         int name = sh_info->output_semantic_name[reg->Register.Index];
1319
1320                         /* Always write tess factors into LDS for the TCS epilog. */
1321                         if (name == TGSI_SEMANTIC_TESSINNER ||
1322                             name == TGSI_SEMANTIC_TESSOUTER) {
1323                                 /* The epilog doesn't read LDS if invocation 0 defines tess factors. */
1324                                 skip_lds_store = !sh_info->reads_tessfactor_outputs &&
1325                                                  ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs;
1326                                 is_tess_factor = true;
1327                                 is_tess_inner = name == TGSI_SEMANTIC_TESSINNER;
1328                         }
1329                 }
1330         }
1331
1332         buffer = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS);
1333
1334         base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
1335         buf_addr = get_tcs_tes_buffer_address_from_reg(ctx, reg, NULL);
1336
1337         uint32_t writemask = reg->Register.WriteMask;
1338         while (writemask) {
1339                 chan_index = u_bit_scan(&writemask);
1340                 LLVMValueRef value = dst[chan_index];
1341
1342                 if (inst->Instruction.Saturate)
1343                         value = ac_build_clamp(&ctx->ac, value);
1344
1345                 /* Skip LDS stores if there is no LDS read of this output. */
1346                 if (!skip_lds_store)
1347                         lds_store(ctx, chan_index, dw_addr, value);
1348
1349                 value = ac_to_integer(&ctx->ac, value);
1350                 values[chan_index] = value;
1351
1352                 if (reg->Register.WriteMask != 0xF && !is_tess_factor) {
1353                         ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1,
1354                                                     buf_addr, base,
1355                                                     4 * chan_index, 1, 0, true, false);
1356                 }
1357
1358                 /* Write tess factors into VGPRs for the epilog. */
1359                 if (is_tess_factor &&
1360                     ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs) {
1361                         if (!is_tess_inner) {
1362                                 LLVMBuildStore(ctx->ac.builder, value, /* outer */
1363                                                ctx->invoc0_tess_factors[chan_index]);
1364                         } else if (chan_index < 2) {
1365                                 LLVMBuildStore(ctx->ac.builder, value, /* inner */
1366                                                ctx->invoc0_tess_factors[4 + chan_index]);
1367                         }
1368                 }
1369         }
1370
1371         if (reg->Register.WriteMask == 0xF && !is_tess_factor) {
1372                 LLVMValueRef value = ac_build_gather_values(&ctx->ac,
1373                                                             values, 4);
1374                 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buf_addr,
1375                                             base, 0, 1, 0, true, false);
1376         }
1377 }
1378
1379 static void si_nir_store_output_tcs(struct ac_shader_abi *abi,
1380                                     const struct nir_variable *var,
1381                                     LLVMValueRef vertex_index,
1382                                     LLVMValueRef param_index,
1383                                     unsigned const_index,
1384                                     LLVMValueRef src,
1385                                     unsigned writemask)
1386 {
1387         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1388         struct tgsi_shader_info *info = &ctx->shader->selector->info;
1389         const unsigned component = var->data.location_frac;
1390         const bool is_patch = var->data.patch;
1391         unsigned driver_location = var->data.driver_location;
1392         LLVMValueRef dw_addr, stride;
1393         LLVMValueRef buffer, base, addr;
1394         LLVMValueRef values[4];
1395         bool skip_lds_store;
1396         bool is_tess_factor = false, is_tess_inner = false;
1397
1398         driver_location = driver_location / 4;
1399
1400         if (param_index) {
1401                 /* Add the constant index to the indirect index */
1402                 param_index = LLVMBuildAdd(ctx->ac.builder, param_index,
1403                                            LLVMConstInt(ctx->i32, const_index, 0), "");
1404         } else {
1405                 if (const_index != 0)
1406                         param_index = LLVMConstInt(ctx->i32, const_index, 0);
1407         }
1408
1409         if (!is_patch) {
1410                 stride = get_tcs_out_vertex_dw_stride(ctx);
1411                 dw_addr = get_tcs_out_current_patch_offset(ctx);
1412                 dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr,
1413                                                               vertex_index, param_index,
1414                                                               driver_location,
1415                                                               info->output_semantic_name,
1416                                                               info->output_semantic_index,
1417                                                               is_patch);
1418
1419                 skip_lds_store = !info->reads_pervertex_outputs;
1420         } else {
1421                 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
1422                 dw_addr = get_dw_address_from_generic_indices(ctx, NULL, dw_addr,
1423                                                               vertex_index, param_index,
1424                                                               driver_location,
1425                                                               info->output_semantic_name,
1426                                                               info->output_semantic_index,
1427                                                               is_patch);
1428
1429                 skip_lds_store = !info->reads_perpatch_outputs;
1430
1431                 if (!param_index) {
1432                         int name = info->output_semantic_name[driver_location];
1433
1434                         /* Always write tess factors into LDS for the TCS epilog. */
1435                         if (name == TGSI_SEMANTIC_TESSINNER ||
1436                             name == TGSI_SEMANTIC_TESSOUTER) {
1437                                 /* The epilog doesn't read LDS if invocation 0 defines tess factors. */
1438                                 skip_lds_store = !info->reads_tessfactor_outputs &&
1439                                                  ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs;
1440                                 is_tess_factor = true;
1441                                 is_tess_inner = name == TGSI_SEMANTIC_TESSINNER;
1442                         }
1443                 }
1444         }
1445
1446         buffer = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS);
1447
1448         base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
1449
1450         addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index,
1451                                                                param_index, driver_location,
1452                                                                info->output_semantic_name,
1453                                                                info->output_semantic_index,
1454                                                                is_patch);
1455
1456         for (unsigned chan = 0; chan < 4; chan++) {
1457                 if (!(writemask & (1 << chan)))
1458                         continue;
1459                 LLVMValueRef value = ac_llvm_extract_elem(&ctx->ac, src, chan - component);
1460
1461                 /* Skip LDS stores if there is no LDS read of this output. */
1462                 if (!skip_lds_store)
1463                         lds_store(ctx, chan, dw_addr, value);
1464
1465                 value = ac_to_integer(&ctx->ac, value);
1466                 values[chan] = value;
1467
1468                 if (writemask != 0xF && !is_tess_factor) {
1469                         ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1,
1470                                                     addr, base,
1471                                                     4 * chan, 1, 0, true, false);
1472                 }
1473
1474                 /* Write tess factors into VGPRs for the epilog. */
1475                 if (is_tess_factor &&
1476                     ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs) {
1477                         if (!is_tess_inner) {
1478                                 LLVMBuildStore(ctx->ac.builder, value, /* outer */
1479                                                ctx->invoc0_tess_factors[chan]);
1480                         } else if (chan < 2) {
1481                                 LLVMBuildStore(ctx->ac.builder, value, /* inner */
1482                                                ctx->invoc0_tess_factors[4 + chan]);
1483                         }
1484                 }
1485         }
1486
1487         if (writemask == 0xF && !is_tess_factor) {
1488                 LLVMValueRef value = ac_build_gather_values(&ctx->ac,
1489                                                             values, 4);
1490                 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, addr,
1491                                             base, 0, 1, 0, true, false);
1492         }
1493 }
1494
1495 LLVMValueRef si_llvm_load_input_gs(struct ac_shader_abi *abi,
1496                                    unsigned input_index,
1497                                    unsigned vtx_offset_param,
1498                                    LLVMTypeRef type,
1499                                    unsigned swizzle)
1500 {
1501         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1502         struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
1503         struct si_shader *shader = ctx->shader;
1504         LLVMValueRef vtx_offset, soffset;
1505         struct tgsi_shader_info *info = &shader->selector->info;
1506         unsigned semantic_name = info->input_semantic_name[input_index];
1507         unsigned semantic_index = info->input_semantic_index[input_index];
1508         unsigned param;
1509         LLVMValueRef value;
1510
1511         param = si_shader_io_get_unique_index(semantic_name, semantic_index, false);
1512
1513         /* GFX9 has the ESGS ring in LDS. */
1514         if (ctx->screen->info.chip_class >= GFX9) {
1515                 unsigned index = vtx_offset_param;
1516
1517                 switch (index / 2) {
1518                 case 0:
1519                         vtx_offset = si_unpack_param(ctx, ctx->param_gs_vtx01_offset,
1520                                                   index % 2 ? 16 : 0, 16);
1521                         break;
1522                 case 1:
1523                         vtx_offset = si_unpack_param(ctx, ctx->param_gs_vtx23_offset,
1524                                                   index % 2 ? 16 : 0, 16);
1525                         break;
1526                 case 2:
1527                         vtx_offset = si_unpack_param(ctx, ctx->param_gs_vtx45_offset,
1528                                                   index % 2 ? 16 : 0, 16);
1529                         break;
1530                 default:
1531                         assert(0);
1532                         return NULL;
1533                 }
1534
1535                 vtx_offset = LLVMBuildAdd(ctx->ac.builder, vtx_offset,
1536                                           LLVMConstInt(ctx->i32, param * 4, 0), "");
1537                 return lds_load(bld_base, type, swizzle, vtx_offset);
1538         }
1539
1540         /* GFX6: input load from the ESGS ring in memory. */
1541         if (swizzle == ~0) {
1542                 LLVMValueRef values[TGSI_NUM_CHANNELS];
1543                 unsigned chan;
1544                 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1545                         values[chan] = si_llvm_load_input_gs(abi, input_index, vtx_offset_param,
1546                                                              type, chan);
1547                 }
1548                 return ac_build_gather_values(&ctx->ac, values,
1549                                               TGSI_NUM_CHANNELS);
1550         }
1551
1552         /* Get the vertex offset parameter on GFX6. */
1553         LLVMValueRef gs_vtx_offset = ctx->gs_vtx_offset[vtx_offset_param];
1554
1555         vtx_offset = LLVMBuildMul(ctx->ac.builder, gs_vtx_offset,
1556                                   LLVMConstInt(ctx->i32, 4, 0), "");
1557
1558         soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle) * 256, 0);
1559
1560         value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->i32_0,
1561                                      vtx_offset, soffset, 0, 1, 0, true, false);
1562         if (llvm_type_is_64bit(ctx, type)) {
1563                 LLVMValueRef value2;
1564                 soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle + 1) * 256, 0);
1565
1566                 value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1,
1567                                               ctx->i32_0, vtx_offset, soffset,
1568                                               0, 1, 0, true, false);
1569                 return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
1570         }
1571         return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
1572 }
1573
1574 static LLVMValueRef si_nir_load_input_gs(struct ac_shader_abi *abi,
1575                                          unsigned location,
1576                                          unsigned driver_location,
1577                                          unsigned component,
1578                                          unsigned num_components,
1579                                          unsigned vertex_index,
1580                                          unsigned const_index,
1581                                          LLVMTypeRef type)
1582 {
1583         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1584
1585         LLVMValueRef value[4];
1586         for (unsigned i = 0; i < num_components; i++) {
1587                 unsigned offset = i;
1588                 if (llvm_type_is_64bit(ctx, type))
1589                         offset *= 2;
1590
1591                 offset += component;
1592                 value[i + component] = si_llvm_load_input_gs(&ctx->abi, driver_location  / 4,
1593                                                              vertex_index, type, offset);
1594         }
1595
1596         return ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
1597 }
1598
1599 static LLVMValueRef fetch_input_gs(
1600         struct lp_build_tgsi_context *bld_base,
1601         const struct tgsi_full_src_register *reg,
1602         enum tgsi_opcode_type type,
1603         unsigned swizzle_in)
1604 {
1605         struct si_shader_context *ctx = si_shader_context(bld_base);
1606         struct tgsi_shader_info *info = &ctx->shader->selector->info;
1607         unsigned swizzle = swizzle_in & 0xffff;
1608
1609         unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
1610         if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
1611                 return get_primitive_id(ctx, swizzle);
1612
1613         if (!reg->Register.Dimension)
1614                 return NULL;
1615
1616         return si_llvm_load_input_gs(&ctx->abi, reg->Register.Index,
1617                                      reg->Dimension.Index,
1618                                      tgsi2llvmtype(bld_base, type),
1619                                      swizzle);
1620 }
1621
1622 static int lookup_interp_param_index(unsigned interpolate, unsigned location)
1623 {
1624         switch (interpolate) {
1625         case TGSI_INTERPOLATE_CONSTANT:
1626                 return 0;
1627
1628         case TGSI_INTERPOLATE_LINEAR:
1629                 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1630                         return SI_PARAM_LINEAR_SAMPLE;
1631                 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1632                         return SI_PARAM_LINEAR_CENTROID;
1633                 else
1634                         return SI_PARAM_LINEAR_CENTER;
1635                 break;
1636         case TGSI_INTERPOLATE_COLOR:
1637         case TGSI_INTERPOLATE_PERSPECTIVE:
1638                 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1639                         return SI_PARAM_PERSP_SAMPLE;
1640                 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1641                         return SI_PARAM_PERSP_CENTROID;
1642                 else
1643                         return SI_PARAM_PERSP_CENTER;
1644                 break;
1645         default:
1646                 fprintf(stderr, "Warning: Unhandled interpolation mode.\n");
1647                 return -1;
1648         }
1649 }
1650
1651 static LLVMValueRef si_build_fs_interp(struct si_shader_context *ctx,
1652                                        unsigned attr_index, unsigned chan,
1653                                        LLVMValueRef prim_mask,
1654                                        LLVMValueRef i, LLVMValueRef j)
1655 {
1656         if (i || j) {
1657                 return ac_build_fs_interp(&ctx->ac,
1658                                           LLVMConstInt(ctx->i32, chan, 0),
1659                                           LLVMConstInt(ctx->i32, attr_index, 0),
1660                                           prim_mask, i, j);
1661         }
1662         return ac_build_fs_interp_mov(&ctx->ac,
1663                                       LLVMConstInt(ctx->i32, 2, 0), /* P0 */
1664                                       LLVMConstInt(ctx->i32, chan, 0),
1665                                       LLVMConstInt(ctx->i32, attr_index, 0),
1666                                       prim_mask);
1667 }
1668
1669 /**
1670  * Interpolate a fragment shader input.
1671  *
1672  * @param ctx           context
1673  * @param input_index           index of the input in hardware
1674  * @param semantic_name         TGSI_SEMANTIC_*
1675  * @param semantic_index        semantic index
1676  * @param num_interp_inputs     number of all interpolated inputs (= BCOLOR offset)
1677  * @param colors_read_mask      color components read (4 bits for each color, 8 bits in total)
1678  * @param interp_param          interpolation weights (i,j)
1679  * @param prim_mask             SI_PARAM_PRIM_MASK
1680  * @param face                  SI_PARAM_FRONT_FACE
1681  * @param result                the return value (4 components)
1682  */
1683 static void interp_fs_input(struct si_shader_context *ctx,
1684                             unsigned input_index,
1685                             unsigned semantic_name,
1686                             unsigned semantic_index,
1687                             unsigned num_interp_inputs,
1688                             unsigned colors_read_mask,
1689                             LLVMValueRef interp_param,
1690                             LLVMValueRef prim_mask,
1691                             LLVMValueRef face,
1692                             LLVMValueRef result[4])
1693 {
1694         LLVMValueRef i = NULL, j = NULL;
1695         unsigned chan;
1696
1697         /* fs.constant returns the param from the middle vertex, so it's not
1698          * really useful for flat shading. It's meant to be used for custom
1699          * interpolation (but the intrinsic can't fetch from the other two
1700          * vertices).
1701          *
1702          * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
1703          * to do the right thing. The only reason we use fs.constant is that
1704          * fs.interp cannot be used on integers, because they can be equal
1705          * to NaN.
1706          *
1707          * When interp is false we will use fs.constant or for newer llvm,
1708          * amdgcn.interp.mov.
1709          */
1710         bool interp = interp_param != NULL;
1711
1712         if (interp) {
1713                 interp_param = LLVMBuildBitCast(ctx->ac.builder, interp_param,
1714                                                 LLVMVectorType(ctx->f32, 2), "");
1715
1716                 i = LLVMBuildExtractElement(ctx->ac.builder, interp_param,
1717                                                 ctx->i32_0, "");
1718                 j = LLVMBuildExtractElement(ctx->ac.builder, interp_param,
1719                                                 ctx->i32_1, "");
1720         }
1721
1722         if (semantic_name == TGSI_SEMANTIC_COLOR &&
1723             ctx->shader->key.part.ps.prolog.color_two_side) {
1724                 LLVMValueRef is_face_positive;
1725
1726                 /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
1727                  * otherwise it's at offset "num_inputs".
1728                  */
1729                 unsigned back_attr_offset = num_interp_inputs;
1730                 if (semantic_index == 1 && colors_read_mask & 0xf)
1731                         back_attr_offset += 1;
1732
1733                 is_face_positive = LLVMBuildICmp(ctx->ac.builder, LLVMIntNE,
1734                                                  face, ctx->i32_0, "");
1735
1736                 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1737                         LLVMValueRef front, back;
1738
1739                         front = si_build_fs_interp(ctx,
1740                                                    input_index, chan,
1741                                                    prim_mask, i, j);
1742                         back = si_build_fs_interp(ctx,
1743                                                   back_attr_offset, chan,
1744                                                   prim_mask, i, j);
1745
1746                         result[chan] = LLVMBuildSelect(ctx->ac.builder,
1747                                                 is_face_positive,
1748                                                 front,
1749                                                 back,
1750                                                 "");
1751                 }
1752         } else if (semantic_name == TGSI_SEMANTIC_FOG) {
1753                 result[0] = si_build_fs_interp(ctx, input_index,
1754                                                0, prim_mask, i, j);
1755                 result[1] =
1756                 result[2] = LLVMConstReal(ctx->f32, 0.0f);
1757                 result[3] = LLVMConstReal(ctx->f32, 1.0f);
1758         } else {
1759                 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1760                         result[chan] = si_build_fs_interp(ctx,
1761                                                           input_index, chan,
1762                                                           prim_mask, i, j);
1763                 }
1764         }
1765 }
1766
1767 void si_llvm_load_input_fs(
1768         struct si_shader_context *ctx,
1769         unsigned input_index,
1770         LLVMValueRef out[4])
1771 {
1772         struct si_shader *shader = ctx->shader;
1773         struct tgsi_shader_info *info = &shader->selector->info;
1774         LLVMValueRef main_fn = ctx->main_fn;
1775         LLVMValueRef interp_param = NULL;
1776         int interp_param_idx;
1777         enum tgsi_semantic semantic_name = info->input_semantic_name[input_index];
1778         unsigned semantic_index = info->input_semantic_index[input_index];
1779         enum tgsi_interpolate_mode interp_mode = info->input_interpolate[input_index];
1780         enum tgsi_interpolate_loc interp_loc = info->input_interpolate_loc[input_index];
1781
1782         /* Get colors from input VGPRs (set by the prolog). */
1783         if (semantic_name == TGSI_SEMANTIC_COLOR) {
1784                 unsigned colors_read = shader->selector->info.colors_read;
1785                 unsigned mask = colors_read >> (semantic_index * 4);
1786                 unsigned offset = SI_PARAM_POS_FIXED_PT + 1 +
1787                                   (semantic_index ? util_bitcount(colors_read & 0xf) : 0);
1788                 LLVMValueRef undef = LLVMGetUndef(ctx->f32);
1789
1790                 out[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : undef;
1791                 out[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : undef;
1792                 out[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : undef;
1793                 out[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : undef;
1794                 return;
1795         }
1796
1797         interp_param_idx = lookup_interp_param_index(interp_mode, interp_loc);
1798         if (interp_param_idx == -1)
1799                 return;
1800         else if (interp_param_idx) {
1801                 interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
1802         }
1803
1804         interp_fs_input(ctx, input_index, semantic_name,
1805                         semantic_index, 0, /* this param is unused */
1806                         shader->selector->info.colors_read, interp_param,
1807                         ctx->abi.prim_mask,
1808                         LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE),
1809                         &out[0]);
1810 }
1811
1812 static void declare_input_fs(
1813         struct si_shader_context *ctx,
1814         unsigned input_index,
1815         const struct tgsi_full_declaration *decl,
1816         LLVMValueRef out[4])
1817 {
1818         si_llvm_load_input_fs(ctx, input_index, out);
1819 }
1820
1821 LLVMValueRef si_get_sample_id(struct si_shader_context *ctx)
1822 {
1823         return si_unpack_param(ctx, SI_PARAM_ANCILLARY, 8, 4);
1824 }
1825
1826 static LLVMValueRef get_base_vertex(struct ac_shader_abi *abi)
1827 {
1828         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1829
1830         /* For non-indexed draws, the base vertex set by the driver
1831          * (for direct draws) or the CP (for indirect draws) is the
1832          * first vertex ID, but GLSL expects 0 to be returned.
1833          */
1834         LLVMValueRef vs_state = LLVMGetParam(ctx->main_fn,
1835                                              ctx->param_vs_state_bits);
1836         LLVMValueRef indexed;
1837
1838         indexed = LLVMBuildLShr(ctx->ac.builder, vs_state, ctx->i32_1, "");
1839         indexed = LLVMBuildTrunc(ctx->ac.builder, indexed, ctx->i1, "");
1840
1841         return LLVMBuildSelect(ctx->ac.builder, indexed, ctx->abi.base_vertex,
1842                                ctx->i32_0, "");
1843 }
1844
1845 static LLVMValueRef get_block_size(struct ac_shader_abi *abi)
1846 {
1847         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1848
1849         LLVMValueRef values[3];
1850         LLVMValueRef result;
1851         unsigned i;
1852         unsigned *properties = ctx->shader->selector->info.properties;
1853
1854         if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) {
1855                 unsigned sizes[3] = {
1856                         properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
1857                         properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
1858                         properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
1859                 };
1860
1861                 for (i = 0; i < 3; ++i)
1862                         values[i] = LLVMConstInt(ctx->i32, sizes[i], 0);
1863
1864                 result = ac_build_gather_values(&ctx->ac, values, 3);
1865         } else {
1866                 result = LLVMGetParam(ctx->main_fn, ctx->param_block_size);
1867         }
1868
1869         return result;
1870 }
1871
1872 /**
1873  * Load a dword from a constant buffer.
1874  */
1875 static LLVMValueRef buffer_load_const(struct si_shader_context *ctx,
1876                                       LLVMValueRef resource,
1877                                       LLVMValueRef offset)
1878 {
1879         return ac_build_buffer_load(&ctx->ac, resource, 1, NULL, offset, NULL,
1880                                     0, 0, 0, true, true);
1881 }
1882
1883 static LLVMValueRef load_sample_position(struct ac_shader_abi *abi, LLVMValueRef sample_id)
1884 {
1885         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1886         LLVMValueRef desc = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
1887         LLVMValueRef buf_index = LLVMConstInt(ctx->i32, SI_PS_CONST_SAMPLE_POSITIONS, 0);
1888         LLVMValueRef resource = ac_build_load_to_sgpr(&ctx->ac, desc, buf_index);
1889
1890         /* offset = sample_id * 8  (8 = 2 floats containing samplepos.xy) */
1891         LLVMValueRef offset0 = LLVMBuildMul(ctx->ac.builder, sample_id, LLVMConstInt(ctx->i32, 8, 0), "");
1892         LLVMValueRef offset1 = LLVMBuildAdd(ctx->ac.builder, offset0, LLVMConstInt(ctx->i32, 4, 0), "");
1893
1894         LLVMValueRef pos[4] = {
1895                 buffer_load_const(ctx, resource, offset0),
1896                 buffer_load_const(ctx, resource, offset1),
1897                 LLVMConstReal(ctx->f32, 0),
1898                 LLVMConstReal(ctx->f32, 0)
1899         };
1900
1901         return ac_build_gather_values(&ctx->ac, pos, 4);
1902 }
1903
1904 static LLVMValueRef load_sample_mask_in(struct ac_shader_abi *abi)
1905 {
1906         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1907         return ac_to_integer(&ctx->ac, abi->sample_coverage);
1908 }
1909
1910 static LLVMValueRef si_load_tess_coord(struct ac_shader_abi *abi)
1911 {
1912         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1913         LLVMValueRef coord[4] = {
1914                 LLVMGetParam(ctx->main_fn, ctx->param_tes_u),
1915                 LLVMGetParam(ctx->main_fn, ctx->param_tes_v),
1916                 ctx->ac.f32_0,
1917                 ctx->ac.f32_0
1918         };
1919
1920         /* For triangles, the vector should be (u, v, 1-u-v). */
1921         if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
1922             PIPE_PRIM_TRIANGLES) {
1923                 coord[2] = LLVMBuildFSub(ctx->ac.builder, ctx->ac.f32_1,
1924                                          LLVMBuildFAdd(ctx->ac.builder,
1925                                                        coord[0], coord[1], ""), "");
1926         }
1927         return ac_build_gather_values(&ctx->ac, coord, 4);
1928 }
1929
1930 static LLVMValueRef load_tess_level(struct si_shader_context *ctx,
1931                                     unsigned semantic_name)
1932 {
1933         LLVMValueRef base, addr;
1934
1935         int param = si_shader_io_get_unique_index_patch(semantic_name, 0);
1936
1937         base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
1938         addr = get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), NULL,
1939                                           LLVMConstInt(ctx->i32, param, 0));
1940
1941         return buffer_load(&ctx->bld_base, ctx->f32,
1942                            ~0, ctx->tess_offchip_ring, base, addr, true);
1943
1944 }
1945
1946 static LLVMValueRef si_load_tess_level(struct ac_shader_abi *abi,
1947                                        unsigned varying_id)
1948 {
1949         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1950         unsigned semantic_name;
1951
1952         switch (varying_id) {
1953         case VARYING_SLOT_TESS_LEVEL_INNER:
1954                 semantic_name = TGSI_SEMANTIC_TESSINNER;
1955                 break;
1956         case VARYING_SLOT_TESS_LEVEL_OUTER:
1957                 semantic_name = TGSI_SEMANTIC_TESSOUTER;
1958                 break;
1959         default:
1960                 unreachable("unknown tess level");
1961         }
1962
1963         return load_tess_level(ctx, semantic_name);
1964
1965 }
1966
1967 static LLVMValueRef si_load_patch_vertices_in(struct ac_shader_abi *abi)
1968 {
1969         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1970         if (ctx->type == PIPE_SHADER_TESS_CTRL)
1971                 return si_unpack_param(ctx, ctx->param_tcs_out_lds_layout, 13, 6);
1972         else if (ctx->type == PIPE_SHADER_TESS_EVAL)
1973                 return get_num_tcs_out_vertices(ctx);
1974         else
1975                 unreachable("invalid shader stage for TGSI_SEMANTIC_VERTICESIN");
1976 }
1977
1978 void si_load_system_value(struct si_shader_context *ctx,
1979                           unsigned index,
1980                           const struct tgsi_full_declaration *decl)
1981 {
1982         LLVMValueRef value = 0;
1983
1984         assert(index < RADEON_LLVM_MAX_SYSTEM_VALUES);
1985
1986         switch (decl->Semantic.Name) {
1987         case TGSI_SEMANTIC_INSTANCEID:
1988                 value = ctx->abi.instance_id;
1989                 break;
1990
1991         case TGSI_SEMANTIC_VERTEXID:
1992                 value = LLVMBuildAdd(ctx->ac.builder,
1993                                      ctx->abi.vertex_id,
1994                                      ctx->abi.base_vertex, "");
1995                 break;
1996
1997         case TGSI_SEMANTIC_VERTEXID_NOBASE:
1998                 /* Unused. Clarify the meaning in indexed vs. non-indexed
1999                  * draws if this is ever used again. */
2000                 assert(false);
2001                 break;
2002
2003         case TGSI_SEMANTIC_BASEVERTEX:
2004                 value = get_base_vertex(&ctx->abi);
2005                 break;
2006
2007         case TGSI_SEMANTIC_BASEINSTANCE:
2008                 value = ctx->abi.start_instance;
2009                 break;
2010
2011         case TGSI_SEMANTIC_DRAWID:
2012                 value = ctx->abi.draw_id;
2013                 break;
2014
2015         case TGSI_SEMANTIC_INVOCATIONID:
2016                 if (ctx->type == PIPE_SHADER_TESS_CTRL)
2017                         value = unpack_llvm_param(ctx, ctx->abi.tcs_rel_ids, 8, 5);
2018                 else if (ctx->type == PIPE_SHADER_GEOMETRY)
2019                         value = ctx->abi.gs_invocation_id;
2020                 else
2021                         assert(!"INVOCATIONID not implemented");
2022                 break;
2023
2024         case TGSI_SEMANTIC_POSITION:
2025         {
2026                 LLVMValueRef pos[4] = {
2027                         LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT),
2028                         LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT),
2029                         LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Z_FLOAT),
2030                         ac_build_fdiv(&ctx->ac, ctx->ac.f32_1,
2031                                       LLVMGetParam(ctx->main_fn, SI_PARAM_POS_W_FLOAT)),
2032                 };
2033                 value = ac_build_gather_values(&ctx->ac, pos, 4);
2034                 break;
2035         }
2036
2037         case TGSI_SEMANTIC_FACE:
2038                 value = ctx->abi.front_face;
2039                 break;
2040
2041         case TGSI_SEMANTIC_SAMPLEID:
2042                 value = si_get_sample_id(ctx);
2043                 break;
2044
2045         case TGSI_SEMANTIC_SAMPLEPOS: {
2046                 LLVMValueRef pos[4] = {
2047                         LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT),
2048                         LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT),
2049                         LLVMConstReal(ctx->f32, 0),
2050                         LLVMConstReal(ctx->f32, 0)
2051                 };
2052                 pos[0] = ac_build_fract(&ctx->ac, pos[0], 32);
2053                 pos[1] = ac_build_fract(&ctx->ac, pos[1], 32);
2054                 value = ac_build_gather_values(&ctx->ac, pos, 4);
2055                 break;
2056         }
2057
2058         case TGSI_SEMANTIC_SAMPLEMASK:
2059                 /* This can only occur with the OpenGL Core profile, which
2060                  * doesn't support smoothing.
2061                  */
2062                 value = LLVMGetParam(ctx->main_fn, SI_PARAM_SAMPLE_COVERAGE);
2063                 break;
2064
2065         case TGSI_SEMANTIC_TESSCOORD:
2066                 value = si_load_tess_coord(&ctx->abi);
2067                 break;
2068
2069         case TGSI_SEMANTIC_VERTICESIN:
2070                 value = si_load_patch_vertices_in(&ctx->abi);
2071                 break;
2072
2073         case TGSI_SEMANTIC_TESSINNER:
2074         case TGSI_SEMANTIC_TESSOUTER:
2075                 value = load_tess_level(ctx, decl->Semantic.Name);
2076                 break;
2077
2078         case TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI:
2079         case TGSI_SEMANTIC_DEFAULT_TESSINNER_SI:
2080         {
2081                 LLVMValueRef buf, slot, val[4];
2082                 int i, offset;
2083
2084                 slot = LLVMConstInt(ctx->i32, SI_HS_CONST_DEFAULT_TESS_LEVELS, 0);
2085                 buf = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
2086                 buf = ac_build_load_to_sgpr(&ctx->ac, buf, slot);
2087                 offset = decl->Semantic.Name == TGSI_SEMANTIC_DEFAULT_TESSINNER_SI ? 4 : 0;
2088
2089                 for (i = 0; i < 4; i++)
2090                         val[i] = buffer_load_const(ctx, buf,
2091                                                    LLVMConstInt(ctx->i32, (offset + i) * 4, 0));
2092                 value = ac_build_gather_values(&ctx->ac, val, 4);
2093                 break;
2094         }
2095
2096         case TGSI_SEMANTIC_PRIMID:
2097                 value = get_primitive_id(ctx, 0);
2098                 break;
2099
2100         case TGSI_SEMANTIC_GRID_SIZE:
2101                 value = ctx->abi.num_work_groups;
2102                 break;
2103
2104         case TGSI_SEMANTIC_BLOCK_SIZE:
2105                 value = get_block_size(&ctx->abi);
2106                 break;
2107
2108         case TGSI_SEMANTIC_BLOCK_ID:
2109         {
2110                 LLVMValueRef values[3];
2111
2112                 for (int i = 0; i < 3; i++) {
2113                         values[i] = ctx->i32_0;
2114                         if (ctx->abi.workgroup_ids[i]) {
2115                                 values[i] = ctx->abi.workgroup_ids[i];
2116                         }
2117                 }
2118                 value = ac_build_gather_values(&ctx->ac, values, 3);
2119                 break;
2120         }
2121
2122         case TGSI_SEMANTIC_THREAD_ID:
2123                 value = ctx->abi.local_invocation_ids;
2124                 break;
2125
2126         case TGSI_SEMANTIC_HELPER_INVOCATION:
2127                 value = ac_build_load_helper_invocation(&ctx->ac);
2128                 break;
2129
2130         case TGSI_SEMANTIC_SUBGROUP_SIZE:
2131                 value = LLVMConstInt(ctx->i32, 64, 0);
2132                 break;
2133
2134         case TGSI_SEMANTIC_SUBGROUP_INVOCATION:
2135                 value = ac_get_thread_id(&ctx->ac);
2136                 break;
2137
2138         case TGSI_SEMANTIC_SUBGROUP_EQ_MASK:
2139         {
2140                 LLVMValueRef id = ac_get_thread_id(&ctx->ac);
2141                 id = LLVMBuildZExt(ctx->ac.builder, id, ctx->i64, "");
2142                 value = LLVMBuildShl(ctx->ac.builder, LLVMConstInt(ctx->i64, 1, 0), id, "");
2143                 value = LLVMBuildBitCast(ctx->ac.builder, value, ctx->v2i32, "");
2144                 break;
2145         }
2146
2147         case TGSI_SEMANTIC_SUBGROUP_GE_MASK:
2148         case TGSI_SEMANTIC_SUBGROUP_GT_MASK:
2149         case TGSI_SEMANTIC_SUBGROUP_LE_MASK:
2150         case TGSI_SEMANTIC_SUBGROUP_LT_MASK:
2151         {
2152                 LLVMValueRef id = ac_get_thread_id(&ctx->ac);
2153                 if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_GT_MASK ||
2154                     decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK) {
2155                         /* All bits set except LSB */
2156                         value = LLVMConstInt(ctx->i64, -2, 0);
2157                 } else {
2158                         /* All bits set */
2159                         value = LLVMConstInt(ctx->i64, -1, 0);
2160                 }
2161                 id = LLVMBuildZExt(ctx->ac.builder, id, ctx->i64, "");
2162                 value = LLVMBuildShl(ctx->ac.builder, value, id, "");
2163                 if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK ||
2164                     decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LT_MASK)
2165                         value = LLVMBuildNot(ctx->ac.builder, value, "");
2166                 value = LLVMBuildBitCast(ctx->ac.builder, value, ctx->v2i32, "");
2167                 break;
2168         }
2169
2170         case TGSI_SEMANTIC_CS_USER_DATA:
2171                 value = LLVMGetParam(ctx->main_fn, ctx->param_cs_user_data);
2172                 break;
2173
2174         default:
2175                 assert(!"unknown system value");
2176                 return;
2177         }
2178
2179         ctx->system_values[index] = value;
2180 }
2181
2182 void si_declare_compute_memory(struct si_shader_context *ctx)
2183 {
2184         struct si_shader_selector *sel = ctx->shader->selector;
2185         unsigned lds_size = sel->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE];
2186
2187         LLVMTypeRef i8p = LLVMPointerType(ctx->i8, AC_ADDR_SPACE_LDS);
2188         LLVMValueRef var;
2189
2190         assert(!ctx->ac.lds);
2191
2192         var = LLVMAddGlobalInAddressSpace(ctx->ac.module,
2193                                           LLVMArrayType(ctx->i8, lds_size),
2194                                           "compute_lds",
2195                                           AC_ADDR_SPACE_LDS);
2196         LLVMSetAlignment(var, 4);
2197
2198         ctx->ac.lds = LLVMBuildBitCast(ctx->ac.builder, var, i8p, "");
2199 }
2200
2201 void si_tgsi_declare_compute_memory(struct si_shader_context *ctx,
2202                                     const struct tgsi_full_declaration *decl)
2203 {
2204         assert(decl->Declaration.MemType == TGSI_MEMORY_TYPE_SHARED);
2205         assert(decl->Range.First == decl->Range.Last);
2206
2207         si_declare_compute_memory(ctx);
2208 }
2209
2210 static LLVMValueRef load_const_buffer_desc_fast_path(struct si_shader_context *ctx)
2211 {
2212         LLVMValueRef ptr =
2213                 LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers);
2214         struct si_shader_selector *sel = ctx->shader->selector;
2215
2216         /* Do the bounds checking with a descriptor, because
2217          * doing computation and manual bounds checking of 64-bit
2218          * addresses generates horrible VALU code with very high
2219          * VGPR usage and very low SIMD occupancy.
2220          */
2221         ptr = LLVMBuildPtrToInt(ctx->ac.builder, ptr, ctx->ac.intptr, "");
2222
2223         LLVMValueRef desc0, desc1;
2224         desc0 = ptr;
2225         desc1 = LLVMConstInt(ctx->i32,
2226                              S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0);
2227
2228         LLVMValueRef desc_elems[] = {
2229                 desc0,
2230                 desc1,
2231                 LLVMConstInt(ctx->i32, (sel->info.const_file_max[0] + 1) * 16, 0),
2232                 LLVMConstInt(ctx->i32,
2233                         S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
2234                         S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
2235                         S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
2236                         S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
2237                         S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
2238                         S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32), 0)
2239         };
2240
2241         return ac_build_gather_values(&ctx->ac, desc_elems, 4);
2242 }
2243
2244 static LLVMValueRef load_const_buffer_desc(struct si_shader_context *ctx, int i)
2245 {
2246         LLVMValueRef list_ptr = LLVMGetParam(ctx->main_fn,
2247                                              ctx->param_const_and_shader_buffers);
2248
2249         return ac_build_load_to_sgpr(&ctx->ac, list_ptr,
2250                                      LLVMConstInt(ctx->i32, si_get_constbuf_slot(i), 0));
2251 }
2252
2253 static LLVMValueRef load_ubo(struct ac_shader_abi *abi, LLVMValueRef index)
2254 {
2255         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
2256         struct si_shader_selector *sel = ctx->shader->selector;
2257
2258         LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers);
2259
2260         if (sel->info.const_buffers_declared == 1 &&
2261             sel->info.shader_buffers_declared == 0) {
2262                 return load_const_buffer_desc_fast_path(ctx);
2263         }
2264
2265         index = si_llvm_bound_index(ctx, index, ctx->num_const_buffers);
2266         index = LLVMBuildAdd(ctx->ac.builder, index,
2267                              LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS, 0), "");
2268
2269         return ac_build_load_to_sgpr(&ctx->ac, ptr, index);
2270 }
2271
2272 static LLVMValueRef
2273 load_ssbo(struct ac_shader_abi *abi, LLVMValueRef index, bool write)
2274 {
2275         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
2276         LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn,
2277                                              ctx->param_const_and_shader_buffers);
2278
2279         index = si_llvm_bound_index(ctx, index, ctx->num_shader_buffers);
2280         index = LLVMBuildSub(ctx->ac.builder,
2281                              LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS - 1, 0),
2282                              index, "");
2283
2284         return ac_build_load_to_sgpr(&ctx->ac, rsrc_ptr, index);
2285 }
2286
2287 static LLVMValueRef fetch_constant(
2288         struct lp_build_tgsi_context *bld_base,
2289         const struct tgsi_full_src_register *reg,
2290         enum tgsi_opcode_type type,
2291         unsigned swizzle_in)
2292 {
2293         struct si_shader_context *ctx = si_shader_context(bld_base);
2294         struct si_shader_selector *sel = ctx->shader->selector;
2295         const struct tgsi_ind_register *ireg = &reg->Indirect;
2296         unsigned buf, idx;
2297         unsigned swizzle = swizzle_in & 0xffff;
2298
2299         LLVMValueRef addr, bufp;
2300
2301         if (swizzle_in == LP_CHAN_ALL) {
2302                 unsigned chan;
2303                 LLVMValueRef values[4];
2304                 for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
2305                         values[chan] = fetch_constant(bld_base, reg, type, chan);
2306
2307                 return ac_build_gather_values(&ctx->ac, values, 4);
2308         }
2309
2310         /* Split 64-bit loads. */
2311         if (tgsi_type_is_64bit(type)) {
2312                 LLVMValueRef lo, hi;
2313
2314                 lo = fetch_constant(bld_base, reg, TGSI_TYPE_UNSIGNED, swizzle);
2315                 hi = fetch_constant(bld_base, reg, TGSI_TYPE_UNSIGNED, (swizzle_in >> 16));
2316                 return si_llvm_emit_fetch_64bit(bld_base, tgsi2llvmtype(bld_base, type),
2317                                                 lo, hi);
2318         }
2319
2320         idx = reg->Register.Index * 4 + swizzle;
2321         if (reg->Register.Indirect) {
2322                 addr = si_get_indirect_index(ctx, ireg, 16, idx * 4);
2323         } else {
2324                 addr = LLVMConstInt(ctx->i32, idx * 4, 0);
2325         }
2326
2327         /* Fast path when user data SGPRs point to constant buffer 0 directly. */
2328         if (sel->info.const_buffers_declared == 1 &&
2329             sel->info.shader_buffers_declared == 0) {
2330                 LLVMValueRef desc = load_const_buffer_desc_fast_path(ctx);
2331                 LLVMValueRef result = buffer_load_const(ctx, desc, addr);
2332                 return bitcast(bld_base, type, result);
2333         }
2334
2335         assert(reg->Register.Dimension);
2336         buf = reg->Dimension.Index;
2337
2338         if (reg->Dimension.Indirect) {
2339                 LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers);
2340                 LLVMValueRef index;
2341                 index = si_get_bounded_indirect_index(ctx, &reg->DimIndirect,
2342                                                       reg->Dimension.Index,
2343                                                       ctx->num_const_buffers);
2344                 index = LLVMBuildAdd(ctx->ac.builder, index,
2345                                      LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS, 0), "");
2346                 bufp = ac_build_load_to_sgpr(&ctx->ac, ptr, index);
2347         } else
2348                 bufp = load_const_buffer_desc(ctx, buf);
2349
2350         return bitcast(bld_base, type, buffer_load_const(ctx, bufp, addr));
2351 }
2352
2353 /* Initialize arguments for the shader export intrinsic */
2354 static void si_llvm_init_export_args(struct si_shader_context *ctx,
2355                                      LLVMValueRef *values,
2356                                      unsigned target,
2357                                      struct ac_export_args *args)
2358 {
2359         LLVMValueRef f32undef = LLVMGetUndef(ctx->ac.f32);
2360         unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR;
2361         unsigned chan;
2362         bool is_int8, is_int10;
2363
2364         /* Default is 0xf. Adjusted below depending on the format. */
2365         args->enabled_channels = 0xf; /* writemask */
2366
2367         /* Specify whether the EXEC mask represents the valid mask */
2368         args->valid_mask = 0;
2369
2370         /* Specify whether this is the last export */
2371         args->done = 0;
2372
2373         /* Specify the target we are exporting */
2374         args->target = target;
2375
2376         if (ctx->type == PIPE_SHADER_FRAGMENT) {
2377                 const struct si_shader_key *key = &ctx->shader->key;
2378                 unsigned col_formats = key->part.ps.epilog.spi_shader_col_format;
2379                 int cbuf = target - V_008DFC_SQ_EXP_MRT;
2380
2381                 assert(cbuf >= 0 && cbuf < 8);
2382                 spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
2383                 is_int8 = (key->part.ps.epilog.color_is_int8 >> cbuf) & 0x1;
2384                 is_int10 = (key->part.ps.epilog.color_is_int10 >> cbuf) & 0x1;
2385         }
2386
2387         args->compr = false;
2388         args->out[0] = f32undef;
2389         args->out[1] = f32undef;
2390         args->out[2] = f32undef;
2391         args->out[3] = f32undef;
2392
2393         LLVMValueRef (*packf)(struct ac_llvm_context *ctx, LLVMValueRef args[2]) = NULL;
2394         LLVMValueRef (*packi)(struct ac_llvm_context *ctx, LLVMValueRef args[2],
2395                               unsigned bits, bool hi) = NULL;
2396
2397         switch (spi_shader_col_format) {
2398         case V_028714_SPI_SHADER_ZERO:
2399                 args->enabled_channels = 0; /* writemask */
2400                 args->target = V_008DFC_SQ_EXP_NULL;
2401                 break;
2402
2403         case V_028714_SPI_SHADER_32_R:
2404                 args->enabled_channels = 1; /* writemask */
2405                 args->out[0] = values[0];
2406                 break;
2407
2408         case V_028714_SPI_SHADER_32_GR:
2409                 args->enabled_channels = 0x3; /* writemask */
2410                 args->out[0] = values[0];
2411                 args->out[1] = values[1];
2412                 break;
2413
2414         case V_028714_SPI_SHADER_32_AR:
2415                 args->enabled_channels = 0x9; /* writemask */
2416                 args->out[0] = values[0];
2417                 args->out[3] = values[3];
2418                 break;
2419
2420         case V_028714_SPI_SHADER_FP16_ABGR:
2421                 packf = ac_build_cvt_pkrtz_f16;
2422                 break;
2423
2424         case V_028714_SPI_SHADER_UNORM16_ABGR:
2425                 packf = ac_build_cvt_pknorm_u16;
2426                 break;
2427
2428         case V_028714_SPI_SHADER_SNORM16_ABGR:
2429                 packf = ac_build_cvt_pknorm_i16;
2430                 break;
2431
2432         case V_028714_SPI_SHADER_UINT16_ABGR:
2433                 packi = ac_build_cvt_pk_u16;
2434                 break;
2435
2436         case V_028714_SPI_SHADER_SINT16_ABGR:
2437                 packi = ac_build_cvt_pk_i16;
2438                 break;
2439
2440         case V_028714_SPI_SHADER_32_ABGR:
2441                 memcpy(&args->out[0], values, sizeof(values[0]) * 4);
2442                 break;
2443         }
2444
2445         /* Pack f16 or norm_i16/u16. */
2446         if (packf) {
2447                 for (chan = 0; chan < 2; chan++) {
2448                         LLVMValueRef pack_args[2] = {
2449                                 values[2 * chan],
2450                                 values[2 * chan + 1]
2451                         };
2452                         LLVMValueRef packed;
2453
2454                         packed = packf(&ctx->ac, pack_args);
2455                         args->out[chan] = ac_to_float(&ctx->ac, packed);
2456                 }
2457                 args->compr = 1; /* COMPR flag */
2458         }
2459         /* Pack i16/u16. */
2460         if (packi) {
2461                 for (chan = 0; chan < 2; chan++) {
2462                         LLVMValueRef pack_args[2] = {
2463                                 ac_to_integer(&ctx->ac, values[2 * chan]),
2464                                 ac_to_integer(&ctx->ac, values[2 * chan + 1])
2465                         };
2466                         LLVMValueRef packed;
2467
2468                         packed = packi(&ctx->ac, pack_args,
2469                                        is_int8 ? 8 : is_int10 ? 10 : 16,
2470                                        chan == 1);
2471                         args->out[chan] = ac_to_float(&ctx->ac, packed);
2472                 }
2473                 args->compr = 1; /* COMPR flag */
2474         }
2475 }
2476
2477 static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
2478                           LLVMValueRef alpha)
2479 {
2480         struct si_shader_context *ctx = si_shader_context(bld_base);
2481
2482         if (ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_NEVER) {
2483                 static LLVMRealPredicate cond_map[PIPE_FUNC_ALWAYS + 1] = {
2484                         [PIPE_FUNC_LESS] = LLVMRealOLT,
2485                         [PIPE_FUNC_EQUAL] = LLVMRealOEQ,
2486                         [PIPE_FUNC_LEQUAL] = LLVMRealOLE,
2487                         [PIPE_FUNC_GREATER] = LLVMRealOGT,
2488                         [PIPE_FUNC_NOTEQUAL] = LLVMRealONE,
2489                         [PIPE_FUNC_GEQUAL] = LLVMRealOGE,
2490                 };
2491                 LLVMRealPredicate cond = cond_map[ctx->shader->key.part.ps.epilog.alpha_func];
2492                 assert(cond);
2493
2494                 LLVMValueRef alpha_ref = LLVMGetParam(ctx->main_fn,
2495                                 SI_PARAM_ALPHA_REF);
2496                 LLVMValueRef alpha_pass =
2497                         LLVMBuildFCmp(ctx->ac.builder, cond, alpha, alpha_ref, "");
2498                 ac_build_kill_if_false(&ctx->ac, alpha_pass);
2499         } else {
2500                 ac_build_kill_if_false(&ctx->ac, ctx->i1false);
2501         }
2502 }
2503
2504 static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
2505                                                   LLVMValueRef alpha,
2506                                                   unsigned samplemask_param)
2507 {
2508         struct si_shader_context *ctx = si_shader_context(bld_base);
2509         LLVMValueRef coverage;
2510
2511         /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
2512         coverage = LLVMGetParam(ctx->main_fn,
2513                                 samplemask_param);
2514         coverage = ac_to_integer(&ctx->ac, coverage);
2515
2516         coverage = ac_build_intrinsic(&ctx->ac, "llvm.ctpop.i32",
2517                                    ctx->i32,
2518                                    &coverage, 1, AC_FUNC_ATTR_READNONE);
2519
2520         coverage = LLVMBuildUIToFP(ctx->ac.builder, coverage,
2521                                    ctx->f32, "");
2522
2523         coverage = LLVMBuildFMul(ctx->ac.builder, coverage,
2524                                  LLVMConstReal(ctx->f32,
2525                                         1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
2526
2527         return LLVMBuildFMul(ctx->ac.builder, alpha, coverage, "");
2528 }
2529
2530 static void si_llvm_emit_clipvertex(struct si_shader_context *ctx,
2531                                     struct ac_export_args *pos, LLVMValueRef *out_elts)
2532 {
2533         unsigned reg_index;
2534         unsigned chan;
2535         unsigned const_chan;
2536         LLVMValueRef base_elt;
2537         LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
2538         LLVMValueRef constbuf_index = LLVMConstInt(ctx->i32,
2539                                                    SI_VS_CONST_CLIP_PLANES, 0);
2540         LLVMValueRef const_resource = ac_build_load_to_sgpr(&ctx->ac, ptr, constbuf_index);
2541
2542         for (reg_index = 0; reg_index < 2; reg_index ++) {
2543                 struct ac_export_args *args = &pos[2 + reg_index];
2544
2545                 args->out[0] =
2546                 args->out[1] =
2547                 args->out[2] =
2548                 args->out[3] = LLVMConstReal(ctx->f32, 0.0f);
2549
2550                 /* Compute dot products of position and user clip plane vectors */
2551                 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2552                         for (const_chan = 0; const_chan < TGSI_NUM_CHANNELS; const_chan++) {
2553                                 LLVMValueRef addr =
2554                                         LLVMConstInt(ctx->i32, ((reg_index * 4 + chan) * 4 +
2555                                                                 const_chan) * 4, 0);
2556                                 base_elt = buffer_load_const(ctx, const_resource,
2557                                                              addr);
2558                                 args->out[chan] = ac_build_fmad(&ctx->ac, base_elt,
2559                                                                 out_elts[const_chan], args->out[chan]);
2560                         }
2561                 }
2562
2563                 args->enabled_channels = 0xf;
2564                 args->valid_mask = 0;
2565                 args->done = 0;
2566                 args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index;
2567                 args->compr = 0;
2568         }
2569 }
2570
2571 static void si_dump_streamout(struct pipe_stream_output_info *so)
2572 {
2573         unsigned i;
2574
2575         if (so->num_outputs)
2576                 fprintf(stderr, "STREAMOUT\n");
2577
2578         for (i = 0; i < so->num_outputs; i++) {
2579                 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
2580                                 so->output[i].start_component;
2581                 fprintf(stderr, "  %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
2582                         i, so->output[i].output_buffer,
2583                         so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
2584                         so->output[i].register_index,
2585                         mask & 1 ? "x" : "",
2586                         mask & 2 ? "y" : "",
2587                         mask & 4 ? "z" : "",
2588                         mask & 8 ? "w" : "");
2589         }
2590 }
2591
2592 static void emit_streamout_output(struct si_shader_context *ctx,
2593                                   LLVMValueRef const *so_buffers,
2594                                   LLVMValueRef const *so_write_offsets,
2595                                   struct pipe_stream_output *stream_out,
2596                                   struct si_shader_output_values *shader_out)
2597 {
2598         unsigned buf_idx = stream_out->output_buffer;
2599         unsigned start = stream_out->start_component;
2600         unsigned num_comps = stream_out->num_components;
2601         LLVMValueRef out[4];
2602
2603         assert(num_comps && num_comps <= 4);
2604         if (!num_comps || num_comps > 4)
2605                 return;
2606
2607         /* Load the output as int. */
2608         for (int j = 0; j < num_comps; j++) {
2609                 assert(stream_out->stream == shader_out->vertex_stream[start + j]);
2610
2611                 out[j] = ac_to_integer(&ctx->ac, shader_out->values[start + j]);
2612         }
2613
2614         /* Pack the output. */
2615         LLVMValueRef vdata = NULL;
2616
2617         switch (num_comps) {
2618         case 1: /* as i32 */
2619                 vdata = out[0];
2620                 break;
2621         case 2: /* as v2i32 */
2622         case 3: /* as v4i32 (aligned to 4) */
2623                 out[3] = LLVMGetUndef(ctx->i32);
2624                 /* fall through */
2625         case 4: /* as v4i32 */
2626                 vdata = ac_build_gather_values(&ctx->ac, out, util_next_power_of_two(num_comps));
2627                 break;
2628         }
2629
2630         ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx],
2631                                     vdata, num_comps,
2632                                     so_write_offsets[buf_idx],
2633                                     ctx->i32_0,
2634                                     stream_out->dst_offset * 4, 1, 1, true, false);
2635 }
2636
2637 /**
2638  * Write streamout data to buffers for vertex stream @p stream (different
2639  * vertex streams can occur for GS copy shaders).
2640  */
2641 static void si_llvm_emit_streamout(struct si_shader_context *ctx,
2642                                    struct si_shader_output_values *outputs,
2643                                    unsigned noutput, unsigned stream)
2644 {
2645         struct si_shader_selector *sel = ctx->shader->selector;
2646         struct pipe_stream_output_info *so = &sel->so;
2647         LLVMBuilderRef builder = ctx->ac.builder;
2648         int i;
2649         struct lp_build_if_state if_ctx;
2650
2651         /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
2652         LLVMValueRef so_vtx_count =
2653                 si_unpack_param(ctx, ctx->param_streamout_config, 16, 7);
2654
2655         LLVMValueRef tid = ac_get_thread_id(&ctx->ac);
2656
2657         /* can_emit = tid < so_vtx_count; */
2658         LLVMValueRef can_emit =
2659                 LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");
2660
2661         /* Emit the streamout code conditionally. This actually avoids
2662          * out-of-bounds buffer access. The hw tells us via the SGPR
2663          * (so_vtx_count) which threads are allowed to emit streamout data. */
2664         lp_build_if(&if_ctx, &ctx->gallivm, can_emit);
2665         {
2666                 /* The buffer offset is computed as follows:
2667                  *   ByteOffset = streamout_offset[buffer_id]*4 +
2668                  *                (streamout_write_index + thread_id)*stride[buffer_id] +
2669                  *                attrib_offset
2670                  */
2671
2672                 LLVMValueRef so_write_index =
2673                         LLVMGetParam(ctx->main_fn,
2674                                      ctx->param_streamout_write_index);
2675
2676                 /* Compute (streamout_write_index + thread_id). */
2677                 so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");
2678
2679                 /* Load the descriptor and compute the write offset for each
2680                  * enabled buffer. */
2681                 LLVMValueRef so_write_offset[4] = {};
2682                 LLVMValueRef so_buffers[4];
2683                 LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
2684                                                     ctx->param_rw_buffers);
2685
2686                 for (i = 0; i < 4; i++) {
2687                         if (!so->stride[i])
2688                                 continue;
2689
2690                         LLVMValueRef offset = LLVMConstInt(ctx->i32,
2691                                                            SI_VS_STREAMOUT_BUF0 + i, 0);
2692
2693                         so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
2694
2695                         LLVMValueRef so_offset = LLVMGetParam(ctx->main_fn,
2696                                                               ctx->param_streamout_offset[i]);
2697                         so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), "");
2698
2699                         so_write_offset[i] = ac_build_imad(&ctx->ac, so_write_index,
2700                                                            LLVMConstInt(ctx->i32, so->stride[i]*4, 0),
2701                                                            so_offset);
2702                 }
2703
2704                 /* Write streamout data. */
2705                 for (i = 0; i < so->num_outputs; i++) {
2706                         unsigned reg = so->output[i].register_index;
2707
2708                         if (reg >= noutput)
2709                                 continue;
2710
2711                         if (stream != so->output[i].stream)
2712                                 continue;
2713
2714                         emit_streamout_output(ctx, so_buffers, so_write_offset,
2715                                               &so->output[i], &outputs[reg]);
2716                 }
2717         }
2718         lp_build_endif(&if_ctx);
2719 }
2720
2721 static void si_export_param(struct si_shader_context *ctx, unsigned index,
2722                             LLVMValueRef *values)
2723 {
2724         struct ac_export_args args;
2725
2726         si_llvm_init_export_args(ctx, values,
2727                                  V_008DFC_SQ_EXP_PARAM + index, &args);
2728         ac_build_export(&ctx->ac, &args);
2729 }
2730
2731 static void si_build_param_exports(struct si_shader_context *ctx,
2732                                    struct si_shader_output_values *outputs,
2733                                    unsigned noutput)
2734 {
2735         struct si_shader *shader = ctx->shader;
2736         unsigned param_count = 0;
2737
2738         for (unsigned i = 0; i < noutput; i++) {
2739                 unsigned semantic_name = outputs[i].semantic_name;
2740                 unsigned semantic_index = outputs[i].semantic_index;
2741
2742                 if (outputs[i].vertex_stream[0] != 0 &&
2743                     outputs[i].vertex_stream[1] != 0 &&
2744                     outputs[i].vertex_stream[2] != 0 &&
2745                     outputs[i].vertex_stream[3] != 0)
2746                         continue;
2747
2748                 switch (semantic_name) {
2749                 case TGSI_SEMANTIC_LAYER:
2750                 case TGSI_SEMANTIC_VIEWPORT_INDEX:
2751                 case TGSI_SEMANTIC_CLIPDIST:
2752                 case TGSI_SEMANTIC_COLOR:
2753                 case TGSI_SEMANTIC_BCOLOR:
2754                 case TGSI_SEMANTIC_PRIMID:
2755                 case TGSI_SEMANTIC_FOG:
2756                 case TGSI_SEMANTIC_TEXCOORD:
2757                 case TGSI_SEMANTIC_GENERIC:
2758                         break;
2759                 default:
2760                         continue;
2761                 }
2762
2763                 if ((semantic_name != TGSI_SEMANTIC_GENERIC ||
2764                      semantic_index < SI_MAX_IO_GENERIC) &&
2765                     shader->key.opt.kill_outputs &
2766                     (1ull << si_shader_io_get_unique_index(semantic_name,
2767                                                            semantic_index, true)))
2768                         continue;
2769
2770                 si_export_param(ctx, param_count, outputs[i].values);
2771
2772                 assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
2773                 shader->info.vs_output_param_offset[i] = param_count++;
2774         }
2775
2776         shader->info.nr_param_exports = param_count;
2777 }
2778
2779 /* Generate export instructions for hardware VS shader stage */
2780 static void si_llvm_export_vs(struct si_shader_context *ctx,
2781                               struct si_shader_output_values *outputs,
2782                               unsigned noutput)
2783 {
2784         struct si_shader *shader = ctx->shader;
2785         struct ac_export_args pos_args[4] = {};
2786         LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
2787         unsigned pos_idx;
2788         int i;
2789
2790         /* Build position exports. */
2791         for (i = 0; i < noutput; i++) {
2792                 switch (outputs[i].semantic_name) {
2793                 case TGSI_SEMANTIC_POSITION:
2794                         si_llvm_init_export_args(ctx, outputs[i].values,
2795                                                  V_008DFC_SQ_EXP_POS, &pos_args[0]);
2796                         break;
2797                 case TGSI_SEMANTIC_PSIZE:
2798                         psize_value = outputs[i].values[0];
2799                         break;
2800                 case TGSI_SEMANTIC_LAYER:
2801                         layer_value = outputs[i].values[0];
2802                         break;
2803                 case TGSI_SEMANTIC_VIEWPORT_INDEX:
2804                         viewport_index_value = outputs[i].values[0];
2805                         break;
2806                 case TGSI_SEMANTIC_EDGEFLAG:
2807                         edgeflag_value = outputs[i].values[0];
2808                         break;
2809                 case TGSI_SEMANTIC_CLIPDIST:
2810                         if (!shader->key.opt.clip_disable) {
2811                                 unsigned index = 2 + outputs[i].semantic_index;
2812                                 si_llvm_init_export_args(ctx, outputs[i].values,
2813                                                          V_008DFC_SQ_EXP_POS + index,
2814                                                          &pos_args[index]);
2815                         }
2816                         break;
2817                 case TGSI_SEMANTIC_CLIPVERTEX:
2818                         if (!shader->key.opt.clip_disable) {
2819                                 si_llvm_emit_clipvertex(ctx, pos_args,
2820                                                         outputs[i].values);
2821                         }
2822                         break;
2823                 }
2824         }
2825
2826         /* We need to add the position output manually if it's missing. */
2827         if (!pos_args[0].out[0]) {
2828                 pos_args[0].enabled_channels = 0xf; /* writemask */
2829                 pos_args[0].valid_mask = 0; /* EXEC mask */
2830                 pos_args[0].done = 0; /* last export? */
2831                 pos_args[0].target = V_008DFC_SQ_EXP_POS;
2832                 pos_args[0].compr = 0; /* COMPR flag */
2833                 pos_args[0].out[0] = ctx->ac.f32_0; /* X */
2834                 pos_args[0].out[1] = ctx->ac.f32_0; /* Y */
2835                 pos_args[0].out[2] = ctx->ac.f32_0; /* Z */
2836                 pos_args[0].out[3] = ctx->ac.f32_1;  /* W */
2837         }
2838
2839         /* Write the misc vector (point size, edgeflag, layer, viewport). */
2840         if (shader->selector->info.writes_psize ||
2841             shader->selector->info.writes_edgeflag ||
2842             shader->selector->info.writes_viewport_index ||
2843             shader->selector->info.writes_layer) {
2844                 pos_args[1].enabled_channels = shader->selector->info.writes_psize |
2845                                                (shader->selector->info.writes_edgeflag << 1) |
2846                                                (shader->selector->info.writes_layer << 2);
2847
2848                 pos_args[1].valid_mask = 0; /* EXEC mask */
2849                 pos_args[1].done = 0; /* last export? */
2850                 pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
2851                 pos_args[1].compr = 0; /* COMPR flag */
2852                 pos_args[1].out[0] = ctx->ac.f32_0; /* X */
2853                 pos_args[1].out[1] = ctx->ac.f32_0; /* Y */
2854                 pos_args[1].out[2] = ctx->ac.f32_0; /* Z */
2855                 pos_args[1].out[3] = ctx->ac.f32_0; /* W */
2856
2857                 if (shader->selector->info.writes_psize)
2858                         pos_args[1].out[0] = psize_value;
2859
2860                 if (shader->selector->info.writes_edgeflag) {
2861                         /* The output is a float, but the hw expects an integer
2862                          * with the first bit containing the edge flag. */
2863                         edgeflag_value = LLVMBuildFPToUI(ctx->ac.builder,
2864                                                          edgeflag_value,
2865                                                          ctx->i32, "");
2866                         edgeflag_value = ac_build_umin(&ctx->ac,
2867                                                       edgeflag_value,
2868                                                       ctx->i32_1);
2869
2870                         /* The LLVM intrinsic expects a float. */
2871                         pos_args[1].out[1] = ac_to_float(&ctx->ac, edgeflag_value);
2872                 }
2873
2874                 if (ctx->screen->info.chip_class >= GFX9) {
2875                         /* GFX9 has the layer in out.z[10:0] and the viewport
2876                          * index in out.z[19:16].
2877                          */
2878                         if (shader->selector->info.writes_layer)
2879                                 pos_args[1].out[2] = layer_value;
2880
2881                         if (shader->selector->info.writes_viewport_index) {
2882                                 LLVMValueRef v = viewport_index_value;
2883
2884                                 v = ac_to_integer(&ctx->ac, v);
2885                                 v = LLVMBuildShl(ctx->ac.builder, v,
2886                                                  LLVMConstInt(ctx->i32, 16, 0), "");
2887                                 v = LLVMBuildOr(ctx->ac.builder, v,
2888                                                 ac_to_integer(&ctx->ac,  pos_args[1].out[2]), "");
2889                                 pos_args[1].out[2] = ac_to_float(&ctx->ac, v);
2890                                 pos_args[1].enabled_channels |= 1 << 2;
2891                         }
2892                 } else {
2893                         if (shader->selector->info.writes_layer)
2894                                 pos_args[1].out[2] = layer_value;
2895
2896                         if (shader->selector->info.writes_viewport_index) {
2897                                 pos_args[1].out[3] = viewport_index_value;
2898                                 pos_args[1].enabled_channels |= 1 << 3;
2899                         }
2900                 }
2901         }
2902
2903         for (i = 0; i < 4; i++)
2904                 if (pos_args[i].out[0])
2905                         shader->info.nr_pos_exports++;
2906
2907         pos_idx = 0;
2908         for (i = 0; i < 4; i++) {
2909                 if (!pos_args[i].out[0])
2910                         continue;
2911
2912                 /* Specify the target we are exporting */
2913                 pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;
2914
2915                 if (pos_idx == shader->info.nr_pos_exports)
2916                         /* Specify that this is the last export */
2917                         pos_args[i].done = 1;
2918
2919                 ac_build_export(&ctx->ac, &pos_args[i]);
2920         }
2921
2922         /* Build parameter exports. */
2923         si_build_param_exports(ctx, outputs, noutput);
2924 }
2925
2926 /**
2927  * Forward all outputs from the vertex shader to the TES. This is only used
2928  * for the fixed function TCS.
2929  */
2930 static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
2931 {
2932         struct si_shader_context *ctx = si_shader_context(bld_base);
2933         LLVMValueRef invocation_id, buffer, buffer_offset;
2934         LLVMValueRef lds_vertex_stride, lds_base;
2935         uint64_t inputs;
2936
2937         invocation_id = unpack_llvm_param(ctx, ctx->abi.tcs_rel_ids, 8, 5);
2938         buffer = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS);
2939         buffer_offset = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
2940
2941         lds_vertex_stride = get_tcs_in_vertex_dw_stride(ctx);
2942         lds_base = get_tcs_in_current_patch_offset(ctx);
2943         lds_base = ac_build_imad(&ctx->ac, invocation_id, lds_vertex_stride,
2944                                  lds_base);
2945
2946         inputs = ctx->shader->key.mono.u.ff_tcs_inputs_to_copy;
2947         while (inputs) {
2948                 unsigned i = u_bit_scan64(&inputs);
2949
2950                 LLVMValueRef lds_ptr = LLVMBuildAdd(ctx->ac.builder, lds_base,
2951                                             LLVMConstInt(ctx->i32, 4 * i, 0),
2952                                              "");
2953
2954                 LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx,
2955                                               get_rel_patch_id(ctx),
2956                                               invocation_id,
2957                                               LLVMConstInt(ctx->i32, i, 0));
2958
2959                 LLVMValueRef value = lds_load(bld_base, ctx->ac.i32, ~0,
2960                                               lds_ptr);
2961
2962                 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buffer_addr,
2963                                             buffer_offset, 0, 1, 0, true, false);
2964         }
2965 }
2966
2967 static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
2968                                   LLVMValueRef rel_patch_id,
2969                                   LLVMValueRef invocation_id,
2970                                   LLVMValueRef tcs_out_current_patch_data_offset,
2971                                   LLVMValueRef invoc0_tf_outer[4],
2972                                   LLVMValueRef invoc0_tf_inner[2])
2973 {
2974         struct si_shader_context *ctx = si_shader_context(bld_base);
2975         struct si_shader *shader = ctx->shader;
2976         unsigned tess_inner_index, tess_outer_index;
2977         LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
2978         LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4];
2979         unsigned stride, outer_comps, inner_comps, i, offset;
2980         struct lp_build_if_state if_ctx, inner_if_ctx;
2981
2982         /* Add a barrier before loading tess factors from LDS. */
2983         if (!shader->key.part.tcs.epilog.invoc0_tess_factors_are_def)
2984                 si_llvm_emit_barrier(NULL, bld_base, NULL);
2985
2986         /* Do this only for invocation 0, because the tess levels are per-patch,
2987          * not per-vertex.
2988          *
2989          * This can't jump, because invocation 0 executes this. It should
2990          * at least mask out the loads and stores for other invocations.
2991          */
2992         lp_build_if(&if_ctx, &ctx->gallivm,
2993                     LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ,
2994                                   invocation_id, ctx->i32_0, ""));
2995
2996         /* Determine the layout of one tess factor element in the buffer. */
2997         switch (shader->key.part.tcs.epilog.prim_mode) {
2998         case PIPE_PRIM_LINES:
2999                 stride = 2; /* 2 dwords, 1 vec2 store */
3000                 outer_comps = 2;
3001                 inner_comps = 0;
3002                 break;
3003         case PIPE_PRIM_TRIANGLES:
3004                 stride = 4; /* 4 dwords, 1 vec4 store */
3005                 outer_comps = 3;
3006                 inner_comps = 1;
3007                 break;
3008         case PIPE_PRIM_QUADS:
3009                 stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
3010                 outer_comps = 4;
3011                 inner_comps = 2;
3012                 break;
3013         default:
3014                 assert(0);
3015                 return;
3016         }
3017
3018         for (i = 0; i < 4; i++) {
3019                 inner[i] = LLVMGetUndef(ctx->i32);
3020                 outer[i] = LLVMGetUndef(ctx->i32);
3021         }
3022
3023         if (shader->key.part.tcs.epilog.invoc0_tess_factors_are_def) {
3024                 /* Tess factors are in VGPRs. */
3025                 for (i = 0; i < outer_comps; i++)
3026                         outer[i] = out[i] = invoc0_tf_outer[i];
3027                 for (i = 0; i < inner_comps; i++)
3028                         inner[i] = out[outer_comps+i] = invoc0_tf_inner[i];
3029         } else {
3030                 /* Load tess_inner and tess_outer from LDS.
3031                  * Any invocation can write them, so we can't get them from a temporary.
3032                  */
3033                 tess_inner_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0);
3034                 tess_outer_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0);
3035
3036                 lds_base = tcs_out_current_patch_data_offset;
3037                 lds_inner = LLVMBuildAdd(ctx->ac.builder, lds_base,
3038                                          LLVMConstInt(ctx->i32,
3039                                                       tess_inner_index * 4, 0), "");
3040                 lds_outer = LLVMBuildAdd(ctx->ac.builder, lds_base,
3041                                          LLVMConstInt(ctx->i32,
3042                                                       tess_outer_index * 4, 0), "");
3043
3044                 for (i = 0; i < outer_comps; i++) {
3045                         outer[i] = out[i] =
3046                                 lds_load(bld_base, ctx->ac.i32, i, lds_outer);
3047                 }
3048                 for (i = 0; i < inner_comps; i++) {
3049                         inner[i] = out[outer_comps+i] =
3050                                 lds_load(bld_base, ctx->ac.i32, i, lds_inner);
3051                 }
3052         }
3053
3054         if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) {
3055                 /* For isolines, the hardware expects tess factors in the
3056                  * reverse order from what GLSL / TGSI specify.
3057                  */
3058                 LLVMValueRef tmp = out[0];
3059                 out[0] = out[1];
3060                 out[1] = tmp;
3061         }
3062
3063         /* Convert the outputs to vectors for stores. */
3064         vec0 = ac_build_gather_values(&ctx->ac, out, MIN2(stride, 4));
3065         vec1 = NULL;
3066
3067         if (stride > 4)
3068                 vec1 = ac_build_gather_values(&ctx->ac, out+4, stride - 4);
3069
3070         /* Get the buffer. */
3071         buffer = get_tess_ring_descriptor(ctx, TCS_FACTOR_RING);
3072
3073         /* Get the offset. */
3074         tf_base = LLVMGetParam(ctx->main_fn,
3075                                ctx->param_tcs_factor_offset);
3076         byteoffset = LLVMBuildMul(ctx->ac.builder, rel_patch_id,
3077                                   LLVMConstInt(ctx->i32, 4 * stride, 0), "");
3078
3079         lp_build_if(&inner_if_ctx, &ctx->gallivm,
3080                     LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ,
3081                                   rel_patch_id, ctx->i32_0, ""));
3082
3083         /* Store the dynamic HS control word. */
3084         offset = 0;
3085         if (ctx->screen->info.chip_class <= VI) {
3086                 ac_build_buffer_store_dword(&ctx->ac, buffer,
3087                                             LLVMConstInt(ctx->i32, 0x80000000, 0),
3088                                             1, ctx->i32_0, tf_base,
3089                                             offset, 1, 0, true, false);
3090                 offset += 4;
3091         }
3092
3093         lp_build_endif(&inner_if_ctx);
3094
3095         /* Store the tessellation factors. */
3096         ac_build_buffer_store_dword(&ctx->ac, buffer, vec0,
3097                                     MIN2(stride, 4), byteoffset, tf_base,
3098                                     offset, 1, 0, true, false);
3099         offset += 16;
3100         if (vec1)
3101                 ac_build_buffer_store_dword(&ctx->ac, buffer, vec1,
3102                                             stride - 4, byteoffset, tf_base,
3103                                             offset, 1, 0, true, false);
3104
3105         /* Store the tess factors into the offchip buffer if TES reads them. */
3106         if (shader->key.part.tcs.epilog.tes_reads_tess_factors) {
3107                 LLVMValueRef buf, base, inner_vec, outer_vec, tf_outer_offset;
3108                 LLVMValueRef tf_inner_offset;
3109                 unsigned param_outer, param_inner;
3110
3111                 buf = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS);
3112                 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
3113
3114                 param_outer = si_shader_io_get_unique_index_patch(
3115                                       TGSI_SEMANTIC_TESSOUTER, 0);
3116                 tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
3117                                         LLVMConstInt(ctx->i32, param_outer, 0));
3118
3119                 outer_vec = ac_build_gather_values(&ctx->ac, outer,
3120                                                    util_next_power_of_two(outer_comps));
3121
3122                 ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec,
3123                                             outer_comps, tf_outer_offset,
3124                                             base, 0, 1, 0, true, false);
3125                 if (inner_comps) {
3126                         param_inner = si_shader_io_get_unique_index_patch(
3127                                               TGSI_SEMANTIC_TESSINNER, 0);
3128                         tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
3129                                         LLVMConstInt(ctx->i32, param_inner, 0));
3130
3131                         inner_vec = inner_comps == 1 ? inner[0] :
3132                                     ac_build_gather_values(&ctx->ac, inner, inner_comps);
3133                         ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec,
3134                                                     inner_comps, tf_inner_offset,
3135                                                     base, 0, 1, 0, true, false);
3136                 }
3137         }
3138
3139         lp_build_endif(&if_ctx);
3140 }
3141
3142 static LLVMValueRef
3143 si_insert_input_ret(struct si_shader_context *ctx, LLVMValueRef ret,
3144                     unsigned param, unsigned return_index)
3145 {
3146         return LLVMBuildInsertValue(ctx->ac.builder, ret,
3147                                     LLVMGetParam(ctx->main_fn, param),
3148                                     return_index, "");
3149 }
3150
3151 static LLVMValueRef
3152 si_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueRef ret,
3153                           unsigned param, unsigned return_index)
3154 {
3155         LLVMBuilderRef builder = ctx->ac.builder;
3156         LLVMValueRef p = LLVMGetParam(ctx->main_fn, param);
3157
3158         return LLVMBuildInsertValue(builder, ret,
3159                                     ac_to_float(&ctx->ac, p),
3160                                     return_index, "");
3161 }
3162
3163 static LLVMValueRef
3164 si_insert_input_ptr(struct si_shader_context *ctx, LLVMValueRef ret,
3165                     unsigned param, unsigned return_index)
3166 {
3167         LLVMBuilderRef builder = ctx->ac.builder;
3168         LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, param);
3169         ptr = LLVMBuildPtrToInt(builder, ptr, ctx->i32, "");
3170         return LLVMBuildInsertValue(builder, ret, ptr, return_index, "");
3171 }
3172
3173 /* This only writes the tessellation factor levels. */
3174 static void si_llvm_emit_tcs_epilogue(struct ac_shader_abi *abi,
3175                                       unsigned max_outputs,
3176                                       LLVMValueRef *addrs)
3177 {
3178         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
3179         struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
3180         LLVMBuilderRef builder = ctx->ac.builder;
3181         LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;
3182
3183         si_copy_tcs_inputs(bld_base);
3184
3185         rel_patch_id = get_rel_patch_id(ctx);
3186         invocation_id = unpack_llvm_param(ctx, ctx->abi.tcs_rel_ids, 8, 5);
3187         tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);
3188
3189         if (ctx->screen->info.chip_class >= GFX9) {
3190                 LLVMBasicBlockRef blocks[2] = {
3191                         LLVMGetInsertBlock(builder),
3192                         ctx->merged_wrap_if_state.entry_block
3193                 };
3194                 LLVMValueRef values[2];
3195
3196                 lp_build_endif(&ctx->merged_wrap_if_state);
3197
3198                 values[0] = rel_patch_id;
3199                 values[1] = LLVMGetUndef(ctx->i32);
3200                 rel_patch_id = ac_build_phi(&ctx->ac, ctx->i32, 2, values, blocks);
3201
3202                 values[0] = tf_lds_offset;
3203                 values[1] = LLVMGetUndef(ctx->i32);
3204                 tf_lds_offset = ac_build_phi(&ctx->ac, ctx->i32, 2, values, blocks);
3205
3206                 values[0] = invocation_id;
3207                 values[1] = ctx->i32_1; /* cause the epilog to skip threads */
3208                 invocation_id = ac_build_phi(&ctx->ac, ctx->i32, 2, values, blocks);
3209         }
3210
3211         /* Return epilog parameters from this function. */
3212         LLVMValueRef ret = ctx->return_value;
3213         unsigned vgpr;
3214
3215         if (ctx->screen->info.chip_class >= GFX9) {
3216                 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
3217                                           8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
3218                 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_layout,
3219                                           8 + GFX9_SGPR_TCS_OUT_LAYOUT);
3220                 /* Tess offchip and tess factor offsets are at the beginning. */
3221                 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2);
3222                 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4);
3223                 vgpr = 8 + GFX9_SGPR_TCS_OUT_LAYOUT + 1;
3224         } else {
3225                 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
3226                                           GFX6_SGPR_TCS_OFFCHIP_LAYOUT);
3227                 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_layout,
3228                                           GFX6_SGPR_TCS_OUT_LAYOUT);
3229                 /* Tess offchip and tess factor offsets are after user SGPRs. */
3230                 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset,
3231                                           GFX6_TCS_NUM_USER_SGPR);
3232                 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset,
3233                                           GFX6_TCS_NUM_USER_SGPR + 1);
3234                 vgpr = GFX6_TCS_NUM_USER_SGPR + 2;
3235         }
3236
3237         /* VGPRs */
3238         rel_patch_id = ac_to_float(&ctx->ac, rel_patch_id);
3239         invocation_id = ac_to_float(&ctx->ac, invocation_id);
3240         tf_lds_offset = ac_to_float(&ctx->ac, tf_lds_offset);
3241
3242         /* Leave a hole corresponding to the two input VGPRs. This ensures that
3243          * the invocation_id output does not alias the tcs_rel_ids input,
3244          * which saves a V_MOV on gfx9.
3245          */
3246         vgpr += 2;
3247
3248         ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
3249         ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
3250
3251         if (ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs) {
3252                 vgpr++; /* skip the tess factor LDS offset */
3253                 for (unsigned i = 0; i < 6; i++) {
3254                         LLVMValueRef value =
3255                                 LLVMBuildLoad(builder, ctx->invoc0_tess_factors[i], "");
3256                         value = ac_to_float(&ctx->ac, value);
3257                         ret = LLVMBuildInsertValue(builder, ret, value, vgpr++, "");
3258                 }
3259         } else {
3260                 ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
3261         }
3262         ctx->return_value = ret;
3263 }
3264
3265 /* Pass TCS inputs from LS to TCS on GFX9. */
3266 static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx)
3267 {
3268         LLVMValueRef ret = ctx->return_value;
3269
3270         ret = si_insert_input_ptr(ctx, ret, 0, 0);
3271         ret = si_insert_input_ptr(ctx, ret, 1, 1);
3272         ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2);
3273         ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
3274         ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4);
3275         ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);
3276
3277         ret = si_insert_input_ptr(ctx, ret, ctx->param_rw_buffers,
3278                                   8 + SI_SGPR_RW_BUFFERS);
3279         ret = si_insert_input_ptr(ctx, ret,
3280                                   ctx->param_bindless_samplers_and_images,
3281                                   8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
3282
3283         ret = si_insert_input_ret(ctx, ret, ctx->param_vs_state_bits,
3284                                   8 + SI_SGPR_VS_STATE_BITS);
3285
3286         ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
3287                                   8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
3288         ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_offsets,
3289                                   8 + GFX9_SGPR_TCS_OUT_OFFSETS);
3290         ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_layout,
3291                                   8 + GFX9_SGPR_TCS_OUT_LAYOUT);
3292
3293         unsigned vgpr = 8 + GFX9_TCS_NUM_USER_SGPR;
3294         ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
3295                                    ac_to_float(&ctx->ac, ctx->abi.tcs_patch_id),
3296                                    vgpr++, "");
3297         ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
3298                                    ac_to_float(&ctx->ac, ctx->abi.tcs_rel_ids),
3299                                    vgpr++, "");
3300         ctx->return_value = ret;
3301 }
3302
3303 /* Pass GS inputs from ES to GS on GFX9. */
3304 static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
3305 {
3306         LLVMValueRef ret = ctx->return_value;
3307
3308         ret = si_insert_input_ptr(ctx, ret, 0, 0);
3309         ret = si_insert_input_ptr(ctx, ret, 1, 1);
3310         ret = si_insert_input_ret(ctx, ret, ctx->param_gs2vs_offset, 2);
3311         ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
3312         ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);
3313
3314         ret = si_insert_input_ptr(ctx, ret, ctx->param_rw_buffers,
3315                                   8 + SI_SGPR_RW_BUFFERS);
3316         ret = si_insert_input_ptr(ctx, ret,
3317                                   ctx->param_bindless_samplers_and_images,
3318                                   8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
3319
3320         unsigned vgpr;
3321         if (ctx->type == PIPE_SHADER_VERTEX)
3322                 vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR;
3323         else
3324                 vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR;
3325
3326         for (unsigned i = 0; i < 5; i++) {
3327                 unsigned param = ctx->param_gs_vtx01_offset + i;
3328                 ret = si_insert_input_ret_float(ctx, ret, param, vgpr++);
3329         }
3330         ctx->return_value = ret;
3331 }
3332
3333 static void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi,
3334                                      unsigned max_outputs,
3335                                      LLVMValueRef *addrs)
3336 {
3337         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
3338         struct si_shader *shader = ctx->shader;
3339         struct tgsi_shader_info *info = &shader->selector->info;
3340         unsigned i, chan;
3341         LLVMValueRef vertex_id = LLVMGetParam(ctx->main_fn,
3342                                               ctx->param_rel_auto_id);
3343         LLVMValueRef vertex_dw_stride = get_tcs_in_vertex_dw_stride(ctx);
3344         LLVMValueRef base_dw_addr = LLVMBuildMul(ctx->ac.builder, vertex_id,
3345                                                  vertex_dw_stride, "");
3346
3347         /* Write outputs to LDS. The next shader (TCS aka HS) will read
3348          * its inputs from it. */
3349         for (i = 0; i < info->num_outputs; i++) {
3350                 unsigned name = info->output_semantic_name[i];
3351                 unsigned index = info->output_semantic_index[i];
3352
3353                 /* The ARB_shader_viewport_layer_array spec contains the
3354                  * following issue:
3355                  *
3356                  *    2) What happens if gl_ViewportIndex or gl_Layer is
3357                  *    written in the vertex shader and a geometry shader is
3358                  *    present?
3359                  *
3360                  *    RESOLVED: The value written by the last vertex processing
3361                  *    stage is used. If the last vertex processing stage
3362                  *    (vertex, tessellation evaluation or geometry) does not
3363                  *    statically assign to gl_ViewportIndex or gl_Layer, index
3364                  *    or layer zero is assumed.
3365                  *
3366                  * So writes to those outputs in VS-as-LS are simply ignored.
3367                  */
3368                 if (name == TGSI_SEMANTIC_LAYER ||
3369                     name == TGSI_SEMANTIC_VIEWPORT_INDEX)
3370                         continue;
3371
3372                 int param = si_shader_io_get_unique_index(name, index, false);
3373                 LLVMValueRef dw_addr = LLVMBuildAdd(ctx->ac.builder, base_dw_addr,
3374                                         LLVMConstInt(ctx->i32, param * 4, 0), "");
3375
3376                 for (chan = 0; chan < 4; chan++) {
3377                         if (!(info->output_usagemask[i] & (1 << chan)))
3378                                 continue;
3379
3380                         lds_store(ctx, chan, dw_addr,
3381                                   LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""));
3382                 }
3383         }
3384
3385         if (ctx->screen->info.chip_class >= GFX9)
3386                 si_set_ls_return_value_for_tcs(ctx);
3387 }
3388
3389 static void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi,
3390                                      unsigned max_outputs,
3391                                      LLVMValueRef *addrs)
3392 {
3393         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
3394         struct si_shader *es = ctx->shader;
3395         struct tgsi_shader_info *info = &es->selector->info;
3396         LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
3397                                             ctx->param_es2gs_offset);
3398         LLVMValueRef lds_base = NULL;
3399         unsigned chan;
3400         int i;
3401
3402         if (ctx->screen->info.chip_class >= GFX9 && info->num_outputs) {
3403                 unsigned itemsize_dw = es->selector->esgs_itemsize / 4;
3404                 LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac);
3405                 LLVMValueRef wave_idx = si_unpack_param(ctx, ctx->param_merged_wave_info, 24, 4);
3406                 vertex_idx = LLVMBuildOr(ctx->ac.builder, vertex_idx,
3407                                          LLVMBuildMul(ctx->ac.builder, wave_idx,
3408                                                       LLVMConstInt(ctx->i32, 64, false), ""), "");
3409                 lds_base = LLVMBuildMul(ctx->ac.builder, vertex_idx,
3410                                         LLVMConstInt(ctx->i32, itemsize_dw, 0), "");
3411         }
3412
3413         for (i = 0; i < info->num_outputs; i++) {
3414                 int param;
3415
3416                 if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
3417                     info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
3418                         continue;
3419
3420                 param = si_shader_io_get_unique_index(info->output_semantic_name[i],
3421                                                       info->output_semantic_index[i], false);
3422
3423                 for (chan = 0; chan < 4; chan++) {
3424                         if (!(info->output_usagemask[i] & (1 << chan)))
3425                                 continue;
3426
3427                         LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
3428                         out_val = ac_to_integer(&ctx->ac, out_val);
3429
3430                         /* GFX9 has the ESGS ring in LDS. */
3431                         if (ctx->screen->info.chip_class >= GFX9) {
3432                                 lds_store(ctx, param * 4 + chan, lds_base, out_val);
3433                                 continue;
3434                         }
3435
3436                         ac_build_buffer_store_dword(&ctx->ac,
3437                                                     ctx->esgs_ring,
3438                                                     out_val, 1, NULL, soffset,
3439                                                     (4 * param + chan) * 4,
3440                                                     1, 1, true, true);
3441                 }
3442         }
3443
3444         if (ctx->screen->info.chip_class >= GFX9)
3445                 si_set_es_return_value_for_gs(ctx);
3446 }
3447
3448 static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx)
3449 {
3450         if (ctx->screen->info.chip_class >= GFX9)
3451                 return si_unpack_param(ctx, ctx->param_merged_wave_info, 16, 8);
3452         else
3453                 return LLVMGetParam(ctx->main_fn, ctx->param_gs_wave_id);
3454 }
3455
3456 static void emit_gs_epilogue(struct si_shader_context *ctx)
3457 {
3458         ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE,
3459                          si_get_gs_wave_id(ctx));
3460
3461         if (ctx->screen->info.chip_class >= GFX9)
3462                 lp_build_endif(&ctx->merged_wrap_if_state);
3463 }
3464
3465 static void si_llvm_emit_gs_epilogue(struct ac_shader_abi *abi,
3466                                      unsigned max_outputs,
3467                                      LLVMValueRef *addrs)
3468 {
3469         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
3470         struct tgsi_shader_info UNUSED *info = &ctx->shader->selector->info;
3471
3472         assert(info->num_outputs <= max_outputs);
3473
3474         emit_gs_epilogue(ctx);
3475 }
3476
/* TGSI-level GS epilogue callback: forwards to the common GS epilogue. */
static void si_tgsi_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
{
        emit_gs_epilogue(si_shader_context(bld_base));
}
3482
3483 static void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi,
3484                                      unsigned max_outputs,
3485                                      LLVMValueRef *addrs)
3486 {
3487         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
3488         struct tgsi_shader_info *info = &ctx->shader->selector->info;
3489         struct si_shader_output_values *outputs = NULL;
3490         int i,j;
3491
3492         assert(!ctx->shader->is_gs_copy_shader);
3493         assert(info->num_outputs <= max_outputs);
3494
3495         outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));
3496
3497         /* Vertex color clamping.
3498          *
3499          * This uses a state constant loaded in a user data SGPR and
3500          * an IF statement is added that clamps all colors if the constant
3501          * is true.
3502          */
3503         struct lp_build_if_state if_ctx;
3504         LLVMValueRef cond = NULL;
3505         LLVMValueRef addr, val;
3506
3507         for (i = 0; i < info->num_outputs; i++) {
3508                 if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR &&
3509                     info->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR)
3510                         continue;
3511
3512                 /* We've found a color. */
3513                 if (!cond) {
3514                         /* The state is in the first bit of the user SGPR. */
3515                         cond = LLVMGetParam(ctx->main_fn,
3516                                             ctx->param_vs_state_bits);
3517                         cond = LLVMBuildTrunc(ctx->ac.builder, cond,
3518                                               ctx->i1, "");
3519                         lp_build_if(&if_ctx, &ctx->gallivm, cond);
3520                 }
3521
3522                 for (j = 0; j < 4; j++) {
3523                         addr = addrs[4 * i + j];
3524                         val = LLVMBuildLoad(ctx->ac.builder, addr, "");
3525                         val = ac_build_clamp(&ctx->ac, val);
3526                         LLVMBuildStore(ctx->ac.builder, val, addr);
3527                 }
3528         }
3529
3530         if (cond)
3531                 lp_build_endif(&if_ctx);
3532
3533         for (i = 0; i < info->num_outputs; i++) {
3534                 outputs[i].semantic_name = info->output_semantic_name[i];
3535                 outputs[i].semantic_index = info->output_semantic_index[i];
3536
3537                 for (j = 0; j < 4; j++) {
3538                         outputs[i].values[j] =
3539                                 LLVMBuildLoad(ctx->ac.builder,
3540                                               addrs[4 * i + j],
3541                                               "");
3542                         outputs[i].vertex_stream[j] =
3543                                 (info->output_streams[i] >> (2 * j)) & 3;
3544                 }
3545         }
3546
3547         if (ctx->shader->selector->so.num_outputs)
3548                 si_llvm_emit_streamout(ctx, outputs, i, 0);
3549
3550         /* Export PrimitiveID. */
3551         if (ctx->shader->key.mono.u.vs_export_prim_id) {
3552                 outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID;
3553                 outputs[i].semantic_index = 0;
3554                 outputs[i].values[0] = ac_to_float(&ctx->ac, get_primitive_id(ctx, 0));
3555                 for (j = 1; j < 4; j++)
3556                         outputs[i].values[j] = LLVMConstReal(ctx->f32, 0);
3557
3558                 memset(outputs[i].vertex_stream, 0,
3559                        sizeof(outputs[i].vertex_stream));
3560                 i++;
3561         }
3562
3563         si_llvm_export_vs(ctx, outputs, i);
3564         FREE(outputs);
3565 }
3566
3567 static void si_tgsi_emit_epilogue(struct lp_build_tgsi_context *bld_base)
3568 {
3569         struct si_shader_context *ctx = si_shader_context(bld_base);
3570
3571         ctx->abi.emit_outputs(&ctx->abi, RADEON_LLVM_MAX_OUTPUTS,
3572                               &ctx->outputs[0][0]);
3573 }
3574
/* Pixel shader export instructions collected for later emission. */
struct si_ps_exports {
        /* Count of entries of args filled in so far. */
        unsigned num;
        /* Export arguments; entries [0, num) are valid. */
        struct ac_export_args args[10];
};
3579
3580 static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
3581                             LLVMValueRef depth, LLVMValueRef stencil,
3582                             LLVMValueRef samplemask, struct si_ps_exports *exp)
3583 {
3584         struct si_shader_context *ctx = si_shader_context(bld_base);
3585         struct ac_export_args args;
3586
3587         ac_export_mrt_z(&ctx->ac, depth, stencil, samplemask, &args);
3588
3589         memcpy(&exp->args[exp->num++], &args, sizeof(args));
3590 }
3591
3592 static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
3593                                 LLVMValueRef *color, unsigned index,
3594                                 unsigned samplemask_param,
3595                                 bool is_last, struct si_ps_exports *exp)
3596 {
3597         struct si_shader_context *ctx = si_shader_context(bld_base);
3598         int i;
3599
3600         /* Clamp color */
3601         if (ctx->shader->key.part.ps.epilog.clamp_color)
3602                 for (i = 0; i < 4; i++)
3603                         color[i] = ac_build_clamp(&ctx->ac, color[i]);
3604
3605         /* Alpha to one */
3606         if (ctx->shader->key.part.ps.epilog.alpha_to_one)
3607                 color[3] = ctx->ac.f32_1;
3608
3609         /* Alpha test */
3610         if (index == 0 &&
3611             ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
3612                 si_alpha_test(bld_base, color[3]);
3613
3614         /* Line & polygon smoothing */
3615         if (ctx->shader->key.part.ps.epilog.poly_line_smoothing)
3616                 color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3],
3617                                                          samplemask_param);
3618
3619         /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
3620         if (ctx->shader->key.part.ps.epilog.last_cbuf > 0) {
3621                 struct ac_export_args args[8];
3622                 int c, last = -1;
3623
3624                 /* Get the export arguments, also find out what the last one is. */
3625                 for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
3626                         si_llvm_init_export_args(ctx, color,
3627                                                  V_008DFC_SQ_EXP_MRT + c, &args[c]);
3628                         if (args[c].enabled_channels)
3629                                 last = c;
3630                 }
3631
3632                 /* Emit all exports. */
3633                 for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
3634                         if (is_last && last == c) {
3635                                 args[c].valid_mask = 1; /* whether the EXEC mask is valid */
3636                                 args[c].done = 1; /* DONE bit */
3637                         } else if (!args[c].enabled_channels)
3638                                 continue; /* unnecessary NULL export */
3639
3640                         memcpy(&exp->args[exp->num++], &args[c], sizeof(args[c]));
3641                 }
3642         } else {
3643                 struct ac_export_args args;
3644
3645                 /* Export */
3646                 si_llvm_init_export_args(ctx, color, V_008DFC_SQ_EXP_MRT + index,
3647                                          &args);
3648                 if (is_last) {
3649                         args.valid_mask = 1; /* whether the EXEC mask is valid */
3650                         args.done = 1; /* DONE bit */
3651                 } else if (!args.enabled_channels)
3652                         return; /* unnecessary NULL export */
3653
3654                 memcpy(&exp->args[exp->num++], &args, sizeof(args));
3655         }
3656 }
3657
3658 static void si_emit_ps_exports(struct si_shader_context *ctx,
3659                                struct si_ps_exports *exp)
3660 {
3661         for (unsigned i = 0; i < exp->num; i++)
3662                 ac_build_export(&ctx->ac, &exp->args[i]);
3663 }
3664
3665 /**
3666  * Return PS outputs in this order:
3667  *
3668  * v[0:3] = color0.xyzw
3669  * v[4:7] = color1.xyzw
3670  * ...
3671  * vN+0 = Depth
3672  * vN+1 = Stencil
3673  * vN+2 = SampleMask
3674  * vN+3 = SampleMaskIn (used for OpenGL smoothing)
3675  *
3676  * The alpha-ref SGPR is returned via its original location.
3677  */
3678 static void si_llvm_return_fs_outputs(struct ac_shader_abi *abi,
3679                                       unsigned max_outputs,
3680                                       LLVMValueRef *addrs)
3681 {
3682         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
3683         struct si_shader *shader = ctx->shader;
3684         struct tgsi_shader_info *info = &shader->selector->info;
3685         LLVMBuilderRef builder = ctx->ac.builder;
3686         unsigned i, j, first_vgpr, vgpr;
3687
3688         LLVMValueRef color[8][4] = {};
3689         LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
3690         LLVMValueRef ret;
3691
3692         if (ctx->postponed_kill)
3693                 ac_build_kill_if_false(&ctx->ac, LLVMBuildLoad(builder, ctx->postponed_kill, ""));
3694
3695         /* Read the output values. */
3696         for (i = 0; i < info->num_outputs; i++) {
3697                 unsigned semantic_name = info->output_semantic_name[i];
3698                 unsigned semantic_index = info->output_semantic_index[i];
3699
3700                 switch (semantic_name) {
3701                 case TGSI_SEMANTIC_COLOR:
3702                         assert(semantic_index < 8);
3703                         for (j = 0; j < 4; j++) {
3704                                 LLVMValueRef ptr = addrs[4 * i + j];
3705                                 LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
3706                                 color[semantic_index][j] = result;
3707                         }
3708                         break;
3709                 case TGSI_SEMANTIC_POSITION:
3710                         depth = LLVMBuildLoad(builder,
3711                                               addrs[4 * i + 2], "");
3712                         break;
3713                 case TGSI_SEMANTIC_STENCIL:
3714                         stencil = LLVMBuildLoad(builder,
3715                                                 addrs[4 * i + 1], "");
3716                         break;
3717                 case TGSI_SEMANTIC_SAMPLEMASK:
3718                         samplemask = LLVMBuildLoad(builder,
3719                                                    addrs[4 * i + 0], "");
3720                         break;
3721                 default:
3722                         fprintf(stderr, "Warning: SI unhandled fs output type:%d\n",
3723                                 semantic_name);
3724                 }
3725         }
3726
3727         /* Fill the return structure. */
3728         ret = ctx->return_value;
3729
3730         /* Set SGPRs. */
3731         ret = LLVMBuildInsertValue(builder, ret,
3732                                    ac_to_integer(&ctx->ac,
3733                                                  LLVMGetParam(ctx->main_fn,
3734                                                               SI_PARAM_ALPHA_REF)),
3735                                    SI_SGPR_ALPHA_REF, "");
3736
3737         /* Set VGPRs */
3738         first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
3739         for (i = 0; i < ARRAY_SIZE(color); i++) {
3740                 if (!color[i][0])
3741                         continue;
3742
3743                 for (j = 0; j < 4; j++)
3744                         ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
3745         }
3746         if (depth)
3747                 ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
3748         if (stencil)
3749                 ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
3750         if (samplemask)
3751                 ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");
3752
3753         /* Add the input sample mask for smoothing at the end. */
3754         if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
3755                 vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
3756         ret = LLVMBuildInsertValue(builder, ret,
3757                                    LLVMGetParam(ctx->main_fn,
3758                                                 SI_PARAM_SAMPLE_COVERAGE), vgpr++, "");
3759
3760         ctx->return_value = ret;
3761 }
3762
3763 static void membar_emit(
3764                 const struct lp_build_tgsi_action *action,
3765                 struct lp_build_tgsi_context *bld_base,
3766                 struct lp_build_emit_data *emit_data)
3767 {
3768         struct si_shader_context *ctx = si_shader_context(bld_base);
3769         LLVMValueRef src0 = lp_build_emit_fetch(bld_base, emit_data->inst, 0, 0);
3770         unsigned flags = LLVMConstIntGetZExtValue(src0);
3771         unsigned waitcnt = NOOP_WAITCNT;
3772
3773         if (flags & TGSI_MEMBAR_THREAD_GROUP)
3774                 waitcnt &= VM_CNT & LGKM_CNT;
3775
3776         if (flags & (TGSI_MEMBAR_ATOMIC_BUFFER |
3777                      TGSI_MEMBAR_SHADER_BUFFER |
3778                      TGSI_MEMBAR_SHADER_IMAGE))
3779                 waitcnt &= VM_CNT;
3780
3781         if (flags & TGSI_MEMBAR_SHARED)
3782                 waitcnt &= LGKM_CNT;
3783
3784         if (waitcnt != NOOP_WAITCNT)
3785                 ac_build_waitcnt(&ctx->ac, waitcnt);
3786 }
3787
3788 static void clock_emit(
3789                 const struct lp_build_tgsi_action *action,
3790                 struct lp_build_tgsi_context *bld_base,
3791                 struct lp_build_emit_data *emit_data)
3792 {
3793         struct si_shader_context *ctx = si_shader_context(bld_base);
3794         LLVMValueRef tmp = ac_build_shader_clock(&ctx->ac);
3795
3796         emit_data->output[0] =
3797                 LLVMBuildExtractElement(ctx->ac.builder, tmp, ctx->i32_0, "");
3798         emit_data->output[1] =
3799                 LLVMBuildExtractElement(ctx->ac.builder, tmp, ctx->i32_1, "");
3800 }
3801
3802 static void si_llvm_emit_ddxy(
3803         const struct lp_build_tgsi_action *action,
3804         struct lp_build_tgsi_context *bld_base,
3805         struct lp_build_emit_data *emit_data)
3806 {
3807         struct si_shader_context *ctx = si_shader_context(bld_base);
3808         unsigned opcode = emit_data->info->opcode;
3809         LLVMValueRef val;
3810         int idx;
3811         unsigned mask;
3812
3813         if (opcode == TGSI_OPCODE_DDX_FINE)
3814                 mask = AC_TID_MASK_LEFT;
3815         else if (opcode == TGSI_OPCODE_DDY_FINE)
3816                 mask = AC_TID_MASK_TOP;
3817         else
3818                 mask = AC_TID_MASK_TOP_LEFT;
3819
3820         /* for DDX we want to next X pixel, DDY next Y pixel. */
3821         idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;
3822
3823         val = ac_to_integer(&ctx->ac, emit_data->args[0]);
3824         val = ac_build_ddxy(&ctx->ac, mask, idx, val);
3825         emit_data->output[emit_data->chan] = val;
3826 }
3827
3828 static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
3829                                 struct lp_build_tgsi_context *bld_base,
3830                                 struct lp_build_emit_data *emit_data)
3831 {
3832         struct si_shader_context *ctx = si_shader_context(bld_base);
3833         struct si_shader *shader = ctx->shader;
3834         const struct tgsi_shader_info *info = &shader->selector->info;
3835         LLVMValueRef interp_param;
3836         const struct tgsi_full_instruction *inst = emit_data->inst;
3837         const struct tgsi_full_src_register *input = &inst->Src[0];
3838         int input_base, input_array_size;
3839         int chan;
3840         int i;
3841         LLVMValueRef prim_mask = ctx->abi.prim_mask;
3842         LLVMValueRef array_idx, offset_x = NULL, offset_y = NULL;
3843         int interp_param_idx;
3844         unsigned interp;
3845         unsigned location;
3846
3847         if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
3848                 /* offset is in second src, first two channels */
3849                 offset_x = lp_build_emit_fetch(bld_base, emit_data->inst, 1,
3850                                                TGSI_CHAN_X);
3851                 offset_y = lp_build_emit_fetch(bld_base, emit_data->inst, 1,
3852                                                TGSI_CHAN_Y);
3853         } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
3854                 LLVMValueRef sample_position;
3855                 LLVMValueRef sample_id;
3856                 LLVMValueRef halfval = LLVMConstReal(ctx->f32, 0.5f);
3857
3858                 /* fetch sample ID, then fetch its sample position,
3859                  * and place into first two channels.
3860                  */
3861                 sample_id = lp_build_emit_fetch(bld_base,
3862                                                 emit_data->inst, 1, TGSI_CHAN_X);
3863                 sample_id = ac_to_integer(&ctx->ac, sample_id);
3864
3865                 /* Section 8.13.2 (Interpolation Functions) of the OpenGL Shading
3866                  * Language 4.50 spec says about interpolateAtSample:
3867                  *
3868                  *    "Returns the value of the input interpolant variable at
3869                  *     the location of sample number sample. If multisample
3870                  *     buffers are not available, the input variable will be
3871                  *     evaluated at the center of the pixel. If sample sample
3872                  *     does not exist, the position used to interpolate the
3873                  *     input variable is undefined."
3874                  *
3875                  * This means that sample_id values outside of the valid are
3876                  * in fact valid input, and the usual mechanism for loading the
3877                  * sample position doesn't work.
3878                  */
3879                 if (ctx->shader->key.mono.u.ps.interpolate_at_sample_force_center) {
3880                         LLVMValueRef center[4] = {
3881                                 LLVMConstReal(ctx->f32, 0.5),
3882                                 LLVMConstReal(ctx->f32, 0.5),
3883                                 ctx->ac.f32_0,
3884                                 ctx->ac.f32_0,
3885                         };
3886
3887                         sample_position = ac_build_gather_values(&ctx->ac, center, 4);
3888                 } else {
3889                         sample_position = load_sample_position(&ctx->abi, sample_id);
3890                 }
3891
3892                 offset_x = LLVMBuildExtractElement(ctx->ac.builder, sample_position,
3893                                                    ctx->i32_0, "");
3894
3895                 offset_x = LLVMBuildFSub(ctx->ac.builder, offset_x, halfval, "");
3896                 offset_y = LLVMBuildExtractElement(ctx->ac.builder, sample_position,
3897                                                    ctx->i32_1, "");
3898                 offset_y = LLVMBuildFSub(ctx->ac.builder, offset_y, halfval, "");
3899         }
3900
3901         assert(input->Register.File == TGSI_FILE_INPUT);
3902
3903         if (input->Register.Indirect) {
3904                 unsigned array_id = input->Indirect.ArrayID;
3905
3906                 if (array_id) {
3907                         input_base = info->input_array_first[array_id];
3908                         input_array_size = info->input_array_last[array_id] - input_base + 1;
3909                 } else {
3910                         input_base = inst->Src[0].Register.Index;
3911                         input_array_size = info->num_inputs - input_base;
3912                 }
3913
3914                 array_idx = si_get_indirect_index(ctx, &input->Indirect,
3915                                                   1, input->Register.Index - input_base);
3916         } else {
3917                 input_base = inst->Src[0].Register.Index;
3918                 input_array_size = 1;
3919                 array_idx = ctx->i32_0;
3920         }
3921
3922         interp = shader->selector->info.input_interpolate[input_base];
3923
3924         if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
3925             inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE)
3926                 location = TGSI_INTERPOLATE_LOC_CENTER;
3927         else
3928                 location = TGSI_INTERPOLATE_LOC_CENTROID;
3929
3930         interp_param_idx = lookup_interp_param_index(interp, location);
3931         if (interp_param_idx == -1)
3932                 return;
3933         else if (interp_param_idx)
3934                 interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
3935         else
3936                 interp_param = NULL;
3937
3938         if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
3939             inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
3940                 LLVMValueRef ij_out[2];
3941                 LLVMValueRef ddxy_out = ac_build_ddxy_interp(&ctx->ac, interp_param);
3942
3943                 /*
3944                  * take the I then J parameters, and the DDX/Y for it, and
3945                  * calculate the IJ inputs for the interpolator.
3946                  * temp1 = ddx * offset/sample.x + I;
3947                  * interp_param.I = ddy * offset/sample.y + temp1;
3948                  * temp1 = ddx * offset/sample.x + J;
3949                  * interp_param.J = ddy * offset/sample.y + temp1;
3950                  */
3951                 for (i = 0; i < 2; i++) {
3952                         LLVMValueRef ix_ll = LLVMConstInt(ctx->i32, i, 0);
3953                         LLVMValueRef iy_ll = LLVMConstInt(ctx->i32, i + 2, 0);
3954                         LLVMValueRef ddx_el = LLVMBuildExtractElement(ctx->ac.builder,
3955                                                                       ddxy_out, ix_ll, "");
3956                         LLVMValueRef ddy_el = LLVMBuildExtractElement(ctx->ac.builder,
3957                                                                       ddxy_out, iy_ll, "");
3958                         LLVMValueRef interp_el = LLVMBuildExtractElement(ctx->ac.builder,
3959                                                                          interp_param, ix_ll, "");
3960                         LLVMValueRef temp;
3961
3962                         interp_el = ac_to_float(&ctx->ac, interp_el);
3963
3964                         temp = ac_build_fmad(&ctx->ac, ddx_el, offset_x, interp_el);
3965                         ij_out[i] = ac_build_fmad(&ctx->ac, ddy_el, offset_y, temp);
3966                 }
3967                 interp_param = ac_build_gather_values(&ctx->ac, ij_out, 2);
3968         }
3969
3970         if (interp_param)
3971                 interp_param = ac_to_float(&ctx->ac, interp_param);
3972
3973         for (chan = 0; chan < 4; chan++) {
3974                 LLVMValueRef gather = LLVMGetUndef(LLVMVectorType(ctx->f32, input_array_size));
3975                 unsigned schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan);
3976
3977                 for (unsigned idx = 0; idx < input_array_size; ++idx) {
3978                         LLVMValueRef v, i = NULL, j = NULL;
3979
3980                         if (interp_param) {
3981                                 i = LLVMBuildExtractElement(
3982                                         ctx->ac.builder, interp_param, ctx->i32_0, "");
3983                                 j = LLVMBuildExtractElement(
3984                                         ctx->ac.builder, interp_param, ctx->i32_1, "");
3985                         }
3986                         v = si_build_fs_interp(ctx, input_base + idx, schan,
3987                                                prim_mask, i, j);
3988
3989                         gather = LLVMBuildInsertElement(ctx->ac.builder,
3990                                 gather, v, LLVMConstInt(ctx->i32, idx, false), "");
3991                 }
3992
3993                 emit_data->output[chan] = LLVMBuildExtractElement(
3994                         ctx->ac.builder, gather, array_idx, "");
3995         }
3996 }
3997
3998 static void vote_all_emit(
3999         const struct lp_build_tgsi_action *action,
4000         struct lp_build_tgsi_context *bld_base,
4001         struct lp_build_emit_data *emit_data)
4002 {
4003         struct si_shader_context *ctx = si_shader_context(bld_base);
4004
4005         LLVMValueRef tmp = ac_build_vote_all(&ctx->ac, emit_data->args[0]);
4006         emit_data->output[emit_data->chan] =
4007                 LLVMBuildSExt(ctx->ac.builder, tmp, ctx->i32, "");
4008 }
4009
4010 static void vote_any_emit(
4011         const struct lp_build_tgsi_action *action,
4012         struct lp_build_tgsi_context *bld_base,
4013         struct lp_build_emit_data *emit_data)
4014 {
4015         struct si_shader_context *ctx = si_shader_context(bld_base);
4016
4017         LLVMValueRef tmp = ac_build_vote_any(&ctx->ac, emit_data->args[0]);
4018         emit_data->output[emit_data->chan] =
4019                 LLVMBuildSExt(ctx->ac.builder, tmp, ctx->i32, "");
4020 }
4021
4022 static void vote_eq_emit(
4023         const struct lp_build_tgsi_action *action,
4024         struct lp_build_tgsi_context *bld_base,
4025         struct lp_build_emit_data *emit_data)
4026 {
4027         struct si_shader_context *ctx = si_shader_context(bld_base);
4028
4029         LLVMValueRef tmp = ac_build_vote_eq(&ctx->ac, emit_data->args[0]);
4030         emit_data->output[emit_data->chan] =
4031                 LLVMBuildSExt(ctx->ac.builder, tmp, ctx->i32, "");
4032 }
4033
4034 static void ballot_emit(
4035         const struct lp_build_tgsi_action *action,
4036         struct lp_build_tgsi_context *bld_base,
4037         struct lp_build_emit_data *emit_data)
4038 {
4039         struct si_shader_context *ctx = si_shader_context(bld_base);
4040         LLVMBuilderRef builder = ctx->ac.builder;
4041         LLVMValueRef tmp;
4042
4043         tmp = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
4044         tmp = ac_build_ballot(&ctx->ac, tmp);
4045         tmp = LLVMBuildBitCast(builder, tmp, ctx->v2i32, "");
4046
4047         emit_data->output[0] = LLVMBuildExtractElement(builder, tmp, ctx->i32_0, "");
4048         emit_data->output[1] = LLVMBuildExtractElement(builder, tmp, ctx->i32_1, "");
4049 }
4050
4051 static void read_lane_emit(
4052         const struct lp_build_tgsi_action *action,
4053         struct lp_build_tgsi_context *bld_base,
4054         struct lp_build_emit_data *emit_data)
4055 {
4056         struct si_shader_context *ctx = si_shader_context(bld_base);
4057
4058         if (emit_data->inst->Instruction.Opcode == TGSI_OPCODE_READ_INVOC) {
4059                 emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst,
4060                                                          0, emit_data->src_chan);
4061
4062                 /* Always read the source invocation (= lane) from the X channel. */
4063                 emit_data->args[1] = lp_build_emit_fetch(bld_base, emit_data->inst,
4064                                                          1, TGSI_CHAN_X);
4065                 emit_data->arg_count = 2;
4066         }
4067
4068         /* We currently have no other way to prevent LLVM from lifting the icmp
4069          * calls to a dominating basic block.
4070          */
4071         ac_build_optimization_barrier(&ctx->ac, &emit_data->args[0]);
4072
4073         for (unsigned i = 0; i < emit_data->arg_count; ++i)
4074                 emit_data->args[i] = ac_to_integer(&ctx->ac, emit_data->args[i]);
4075
4076         emit_data->output[emit_data->chan] =
4077                 ac_build_intrinsic(&ctx->ac, action->intr_name,
4078                                    ctx->i32, emit_data->args, emit_data->arg_count,
4079                                    AC_FUNC_ATTR_READNONE |
4080                                    AC_FUNC_ATTR_CONVERGENT);
4081 }
4082
4083 static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base,
4084                                        struct lp_build_emit_data *emit_data)
4085 {
4086         struct si_shader_context *ctx = si_shader_context(bld_base);
4087         struct tgsi_src_register src0 = emit_data->inst->Src[0].Register;
4088         LLVMValueRef imm;
4089         unsigned stream;
4090
4091         assert(src0.File == TGSI_FILE_IMMEDIATE);
4092
4093         imm = ctx->imms[src0.Index * TGSI_NUM_CHANNELS + src0.SwizzleX];
4094         stream = LLVMConstIntGetZExtValue(imm) & 0x3;
4095         return stream;
4096 }
4097
4098 /* Emit one vertex from the geometry shader */
4099 static void si_llvm_emit_vertex(struct ac_shader_abi *abi,
4100                                 unsigned stream,
4101                                 LLVMValueRef *addrs)
4102 {
4103         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
4104         struct tgsi_shader_info *info = &ctx->shader->selector->info;
4105         struct si_shader *shader = ctx->shader;
4106         struct lp_build_if_state if_state;
4107         LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
4108                                             ctx->param_gs2vs_offset);
4109         LLVMValueRef gs_next_vertex;
4110         LLVMValueRef can_emit;
4111         unsigned chan, offset;
4112         int i;
4113
4114         /* Write vertex attribute values to GSVS ring */
4115         gs_next_vertex = LLVMBuildLoad(ctx->ac.builder,
4116                                        ctx->gs_next_vertex[stream],
4117                                        "");
4118
4119         /* If this thread has already emitted the declared maximum number of
4120          * vertices, skip the write: excessive vertex emissions are not
4121          * supposed to have any effect.
4122          *
4123          * If the shader has no writes to memory, kill it instead. This skips
4124          * further memory loads and may allow LLVM to skip to the end
4125          * altogether.
4126          */
4127         can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex,
4128                                  LLVMConstInt(ctx->i32,
4129                                               shader->selector->gs_max_out_vertices, 0), "");
4130
4131         bool use_kill = !info->writes_memory;
4132         if (use_kill) {
4133                 ac_build_kill_if_false(&ctx->ac, can_emit);
4134         } else {
4135                 lp_build_if(&if_state, &ctx->gallivm, can_emit);
4136         }
4137
4138         offset = 0;
4139         for (i = 0; i < info->num_outputs; i++) {
4140                 for (chan = 0; chan < 4; chan++) {
4141                         if (!(info->output_usagemask[i] & (1 << chan)) ||
4142                             ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
4143                                 continue;
4144
4145                         LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
4146                         LLVMValueRef voffset =
4147                                 LLVMConstInt(ctx->i32, offset *
4148                                              shader->selector->gs_max_out_vertices, 0);
4149                         offset++;
4150
4151                         voffset = LLVMBuildAdd(ctx->ac.builder, voffset, gs_next_vertex, "");
4152                         voffset = LLVMBuildMul(ctx->ac.builder, voffset,
4153                                                LLVMConstInt(ctx->i32, 4, 0), "");
4154
4155                         out_val = ac_to_integer(&ctx->ac, out_val);
4156
4157                         ac_build_buffer_store_dword(&ctx->ac,
4158                                                     ctx->gsvs_ring[stream],
4159                                                     out_val, 1,
4160                                                     voffset, soffset, 0,
4161                                                     1, 1, true, true);
4162                 }
4163         }
4164
4165         gs_next_vertex = LLVMBuildAdd(ctx->ac.builder, gs_next_vertex, ctx->i32_1, "");
4166         LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex[stream]);
4167
4168         /* Signal vertex emission if vertex data was written. */
4169         if (offset) {
4170                 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
4171                                  si_get_gs_wave_id(ctx));
4172         }
4173
4174         if (!use_kill)
4175                 lp_build_endif(&if_state);
4176 }
4177
4178 /* Emit one vertex from the geometry shader */
4179 static void si_tgsi_emit_vertex(
4180         const struct lp_build_tgsi_action *action,
4181         struct lp_build_tgsi_context *bld_base,
4182         struct lp_build_emit_data *emit_data)
4183 {
4184         struct si_shader_context *ctx = si_shader_context(bld_base);
4185         unsigned stream = si_llvm_get_stream(bld_base, emit_data);
4186
4187         si_llvm_emit_vertex(&ctx->abi, stream, ctx->outputs[0]);
4188 }
4189
4190 /* Cut one primitive from the geometry shader */
4191 static void si_llvm_emit_primitive(struct ac_shader_abi *abi,
4192                                    unsigned stream)
4193 {
4194         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
4195
4196         /* Signal primitive cut */
4197         ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
4198                          si_get_gs_wave_id(ctx));
4199 }
4200
4201 /* Cut one primitive from the geometry shader */
4202 static void si_tgsi_emit_primitive(
4203         const struct lp_build_tgsi_action *action,
4204         struct lp_build_tgsi_context *bld_base,
4205         struct lp_build_emit_data *emit_data)
4206 {
4207         struct si_shader_context *ctx = si_shader_context(bld_base);
4208
4209         si_llvm_emit_primitive(&ctx->abi, si_llvm_get_stream(bld_base, emit_data));
4210 }
4211
4212 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
4213                                  struct lp_build_tgsi_context *bld_base,
4214                                  struct lp_build_emit_data *emit_data)
4215 {
4216         struct si_shader_context *ctx = si_shader_context(bld_base);
4217
4218         /* SI only (thanks to a hw bug workaround):
4219          * The real barrier instruction isn’t needed, because an entire patch
4220          * always fits into a single wave.
4221          */
4222         if (ctx->screen->info.chip_class == SI &&
4223             ctx->type == PIPE_SHADER_TESS_CTRL) {
4224                 ac_build_waitcnt(&ctx->ac, LGKM_CNT & VM_CNT);
4225                 return;
4226         }
4227
4228         ac_build_s_barrier(&ctx->ac);
4229 }
4230
4231 static void si_create_function(struct si_shader_context *ctx,
4232                                const char *name,
4233                                LLVMTypeRef *returns, unsigned num_returns,
4234                                struct si_function_info *fninfo,
4235                                unsigned max_workgroup_size)
4236 {
4237         int i;
4238
4239         si_llvm_create_func(ctx, name, returns, num_returns,
4240                             fninfo->types, fninfo->num_params);
4241         ctx->return_value = LLVMGetUndef(ctx->return_type);
4242
4243         for (i = 0; i < fninfo->num_sgpr_params; ++i) {
4244                 LLVMValueRef P = LLVMGetParam(ctx->main_fn, i);
4245
4246                 /* The combination of:
4247                  * - noalias
4248                  * - dereferenceable
4249                  * - invariant.load
4250                  * allows the optimization passes to move loads and reduces
4251                  * SGPR spilling significantly.
4252                  */
4253                 ac_add_function_attr(ctx->ac.context, ctx->main_fn, i + 1,
4254                                      AC_FUNC_ATTR_INREG);
4255
4256                 if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
4257                         ac_add_function_attr(ctx->ac.context, ctx->main_fn, i + 1,
4258                                              AC_FUNC_ATTR_NOALIAS);
4259                         ac_add_attr_dereferenceable(P, UINT64_MAX);
4260                 }
4261         }
4262
4263         for (i = 0; i < fninfo->num_params; ++i) {
4264                 if (fninfo->assign[i])
4265                         *fninfo->assign[i] = LLVMGetParam(ctx->main_fn, i);
4266         }
4267
4268         if (ctx->screen->info.address32_hi) {
4269                 ac_llvm_add_target_dep_function_attr(ctx->main_fn,
4270                                                      "amdgpu-32bit-address-high-bits",
4271                                                      ctx->screen->info.address32_hi);
4272         }
4273
4274         if (max_workgroup_size) {
4275                 ac_llvm_add_target_dep_function_attr(ctx->main_fn,
4276                                                      "amdgpu-max-work-group-size",
4277                                                      max_workgroup_size);
4278         }
4279         LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
4280                                            "no-signed-zeros-fp-math",
4281                                            "true");
4282
4283         if (ctx->screen->debug_flags & DBG(UNSAFE_MATH)) {
4284                 /* These were copied from some LLVM test. */
4285                 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
4286                                                    "less-precise-fpmad",
4287                                                    "true");
4288                 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
4289                                                    "no-infs-fp-math",
4290                                                    "true");
4291                 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
4292                                                    "no-nans-fp-math",
4293                                                    "true");
4294                 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
4295                                                    "unsafe-fp-math",
4296                                                    "true");
4297         }
4298 }
4299
4300 static void declare_streamout_params(struct si_shader_context *ctx,
4301                                      struct pipe_stream_output_info *so,
4302                                      struct si_function_info *fninfo)
4303 {
4304         int i;
4305
4306         /* Streamout SGPRs. */
4307         if (so->num_outputs) {
4308                 if (ctx->type != PIPE_SHADER_TESS_EVAL)
4309                         ctx->param_streamout_config = add_arg(fninfo, ARG_SGPR, ctx->ac.i32);
4310                 else
4311                         ctx->param_streamout_config = fninfo->num_params - 1;
4312
4313                 ctx->param_streamout_write_index = add_arg(fninfo, ARG_SGPR, ctx->ac.i32);
4314         }
4315         /* A streamout buffer offset is loaded if the stride is non-zero. */
4316         for (i = 0; i < 4; i++) {
4317                 if (!so->stride[i])
4318                         continue;
4319
4320                 ctx->param_streamout_offset[i] = add_arg(fninfo, ARG_SGPR, ctx->ac.i32);
4321         }
4322 }
4323
4324 static unsigned si_get_max_workgroup_size(const struct si_shader *shader)
4325 {
4326         switch (shader->selector->type) {
4327         case PIPE_SHADER_TESS_CTRL:
4328                 /* Return this so that LLVM doesn't remove s_barrier
4329                  * instructions on chips where we use s_barrier. */
4330                 return shader->selector->screen->info.chip_class >= CIK ? 128 : 64;
4331
4332         case PIPE_SHADER_GEOMETRY:
4333                 return shader->selector->screen->info.chip_class >= GFX9 ? 128 : 64;
4334
4335         case PIPE_SHADER_COMPUTE:
4336                 break; /* see below */
4337
4338         default:
4339                 return 0;
4340         }
4341
4342         const unsigned *properties = shader->selector->info.properties;
4343         unsigned max_work_group_size =
4344                        properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
4345                        properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
4346                        properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
4347
4348         if (!max_work_group_size) {
4349                 /* This is a variable group size compute shader,
4350                  * compile it for the maximum possible group size.
4351                  */
4352                 max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
4353         }
4354         return max_work_group_size;
4355 }
4356
4357 static void declare_const_and_shader_buffers(struct si_shader_context *ctx,
4358                                              struct si_function_info *fninfo,
4359                                              bool assign_params)
4360 {
4361         LLVMTypeRef const_shader_buf_type;
4362
4363         if (ctx->shader->selector->info.const_buffers_declared == 1 &&
4364             ctx->shader->selector->info.shader_buffers_declared == 0)
4365                 const_shader_buf_type = ctx->f32;
4366         else
4367                 const_shader_buf_type = ctx->v4i32;
4368
4369         unsigned const_and_shader_buffers =
4370                 add_arg(fninfo, ARG_SGPR,
4371                         ac_array_in_const32_addr_space(const_shader_buf_type));
4372
4373         if (assign_params)
4374                 ctx->param_const_and_shader_buffers = const_and_shader_buffers;
4375 }
4376
4377 static void declare_samplers_and_images(struct si_shader_context *ctx,
4378                                         struct si_function_info *fninfo,
4379                                         bool assign_params)
4380 {
4381         unsigned samplers_and_images =
4382                 add_arg(fninfo, ARG_SGPR,
4383                         ac_array_in_const32_addr_space(ctx->v8i32));
4384
4385         if (assign_params)
4386                 ctx->param_samplers_and_images = samplers_and_images;
4387 }
4388
/* Declare both per-stage descriptor pointers, in this order:
 * const/shader buffers first, then samplers/images.
 * `assign_params` is passed through unchanged to both helpers.
 */
static void declare_per_stage_desc_pointers(struct si_shader_context *ctx,
                                            struct si_function_info *fninfo,
                                            bool assign_params)
{
        /* Argument order matters: buffers come before samplers/images. */
        declare_const_and_shader_buffers(ctx, fninfo, assign_params);

        declare_samplers_and_images(ctx, fninfo, assign_params);
}
4396
4397 static void declare_global_desc_pointers(struct si_shader_context *ctx,
4398                                          struct si_function_info *fninfo)
4399 {
4400         ctx->param_rw_buffers = add_arg(fninfo, ARG_SGPR,
4401                 ac_array_in_const32_addr_space(ctx->v4i32));
4402         ctx->param_bindless_samplers_and_images = add_arg(fninfo, ARG_SGPR,
4403                 ac_array_in_const32_addr_space(ctx->v8i32));
4404 }
4405
4406 static void declare_vs_specific_input_sgprs(struct si_shader_context *ctx,
4407                                             struct si_function_info *fninfo)
4408 {
4409         ctx->param_vs_state_bits = add_arg(fninfo, ARG_SGPR, ctx->i32);
4410         add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.base_vertex);
4411         add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.start_instance);
4412         add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.draw_id);
4413 }
4414
4415 static void declare_vs_input_vgprs(struct si_shader_context *ctx,
4416                                    struct si_function_info *fninfo,
4417                                    unsigned *num_prolog_vgprs)
4418 {
4419         struct si_shader *shader = ctx->shader;
4420
4421         add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.vertex_id);
4422         if (shader->key.as_ls) {
4423                 ctx->param_rel_auto_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
4424                 add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.instance_id);
4425         } else {
4426                 add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.instance_id);
4427                 ctx->param_vs_prim_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
4428         }
4429         add_arg(fninfo, ARG_VGPR, ctx->i32); /* unused */
4430
4431         if (!shader->is_gs_copy_shader) {
4432                 /* Vertex load indices. */
4433                 ctx->param_vertex_index0 = fninfo->num_params;
4434                 for (unsigned i = 0; i < shader->selector->info.num_inputs; i++)
4435                         add_arg(fninfo, ARG_VGPR, ctx->i32);
4436                 *num_prolog_vgprs += shader->selector->info.num_inputs;
4437         }
4438 }
4439
4440 static void declare_vs_blit_inputs(struct si_shader_context *ctx,
4441                                    struct si_function_info *fninfo,
4442                                    unsigned vs_blit_property)
4443 {
4444         ctx->param_vs_blit_inputs = fninfo->num_params;
4445         add_arg(fninfo, ARG_SGPR, ctx->i32); /* i16 x1, y1 */
4446         add_arg(fninfo, ARG_SGPR, ctx->i32); /* i16 x2, y2 */
4447         add_arg(fninfo, ARG_SGPR, ctx->f32); /* depth */
4448
4449         if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
4450                 add_arg(fninfo, ARG_SGPR, ctx->f32); /* color0 */
4451                 add_arg(fninfo, ARG_SGPR, ctx->f32); /* color1 */
4452                 add_arg(fninfo, ARG_SGPR, ctx->f32); /* color2 */
4453                 add_arg(fninfo, ARG_SGPR, ctx->f32); /* color3 */
4454         } else if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD) {
4455                 add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.x1 */
4456                 add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.y1 */
4457                 add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.x2 */
4458                 add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.y2 */
4459                 add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.z */
4460                 add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.w */
4461         }
4462 }
4463
4464 static void declare_tes_input_vgprs(struct si_shader_context *ctx,
4465                                     struct si_function_info *fninfo)
4466 {
4467         ctx->param_tes_u = add_arg(fninfo, ARG_VGPR, ctx->f32);
4468         ctx->param_tes_v = add_arg(fninfo, ARG_VGPR, ctx->f32);
4469         ctx->param_tes_rel_patch_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
4470         add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.tes_patch_id);
4471 }
4472
4473 enum {
4474         /* Convenient merged shader definitions. */
4475         SI_SHADER_MERGED_VERTEX_TESSCTRL = PIPE_SHADER_TYPES,
4476         SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY,
4477 };
4478
4479 static void create_function(struct si_shader_context *ctx)
4480 {
4481         struct si_shader *shader = ctx->shader;
4482         struct si_function_info fninfo;
4483         LLVMTypeRef returns[16+32*4];
4484         unsigned i, num_return_sgprs;
4485         unsigned num_returns = 0;
4486         unsigned num_prolog_vgprs = 0;
4487         unsigned type = ctx->type;
4488         unsigned vs_blit_property =
4489                 shader->selector->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS];
4490
4491         si_init_function_info(&fninfo);
4492
4493         /* Set MERGED shaders. */
4494         if (ctx->screen->info.chip_class >= GFX9) {
4495                 if (shader->key.as_ls || type == PIPE_SHADER_TESS_CTRL)
4496                         type = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
4497                 else if (shader->key.as_es || type == PIPE_SHADER_GEOMETRY)
4498                         type = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
4499         }
4500
4501         LLVMTypeRef v3i32 = LLVMVectorType(ctx->i32, 3);
4502
4503         switch (type) {
4504         case PIPE_SHADER_VERTEX:
4505                 declare_global_desc_pointers(ctx, &fninfo);
4506
4507                 if (vs_blit_property) {
4508                         declare_vs_blit_inputs(ctx, &fninfo, vs_blit_property);
4509
4510                         /* VGPRs */
4511                         declare_vs_input_vgprs(ctx, &fninfo, &num_prolog_vgprs);
4512                         break;
4513                 }
4514
4515                 declare_per_stage_desc_pointers(ctx, &fninfo, true);
4516                 declare_vs_specific_input_sgprs(ctx, &fninfo);
4517                 ctx->param_vertex_buffers = add_arg(&fninfo, ARG_SGPR,
4518                         ac_array_in_const32_addr_space(ctx->v4i32));
4519
4520                 if (shader->key.as_es) {
4521                         ctx->param_es2gs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4522                 } else if (shader->key.as_ls) {
4523                         /* no extra parameters */
4524                 } else {
4525                         if (shader->is_gs_copy_shader) {
4526                                 fninfo.num_params = ctx->param_vs_state_bits + 1;
4527                                 fninfo.num_sgpr_params = fninfo.num_params;
4528                         }
4529
4530                         /* The locations of the other parameters are assigned dynamically. */
4531                         declare_streamout_params(ctx, &shader->selector->so,
4532                                                  &fninfo);
4533                 }
4534
4535                 /* VGPRs */
4536                 declare_vs_input_vgprs(ctx, &fninfo, &num_prolog_vgprs);
4537                 break;
4538
4539         case PIPE_SHADER_TESS_CTRL: /* SI-CI-VI */
4540                 declare_global_desc_pointers(ctx, &fninfo);
4541                 declare_per_stage_desc_pointers(ctx, &fninfo, true);
4542                 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4543                 ctx->param_tcs_out_lds_offsets = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4544                 ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4545                 ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4546                 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4547                 ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4548
4549                 /* VGPRs */
4550                 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.tcs_patch_id);
4551                 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.tcs_rel_ids);
4552
4553                 /* param_tcs_offchip_offset and param_tcs_factor_offset are
4554                  * placed after the user SGPRs.
4555                  */
4556                 for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
4557                         returns[num_returns++] = ctx->i32; /* SGPRs */
4558                 for (i = 0; i < 11; i++)
4559                         returns[num_returns++] = ctx->f32; /* VGPRs */
4560                 break;
4561
4562         case SI_SHADER_MERGED_VERTEX_TESSCTRL:
4563                 /* Merged stages have 8 system SGPRs at the beginning. */
4564                 /* SPI_SHADER_USER_DATA_ADDR_LO/HI_HS */
4565                 declare_per_stage_desc_pointers(ctx, &fninfo,
4566                                                 ctx->type == PIPE_SHADER_TESS_CTRL);
4567                 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4568                 ctx->param_merged_wave_info = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4569                 ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4570                 ctx->param_merged_scratch_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4571                 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4572                 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4573
4574                 declare_global_desc_pointers(ctx, &fninfo);
4575                 declare_per_stage_desc_pointers(ctx, &fninfo,
4576                                                 ctx->type == PIPE_SHADER_VERTEX);
4577                 declare_vs_specific_input_sgprs(ctx, &fninfo);
4578
4579                 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4580                 ctx->param_tcs_out_lds_offsets = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4581                 ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4582                 ctx->param_vertex_buffers = add_arg(&fninfo, ARG_SGPR,
4583                         ac_array_in_const32_addr_space(ctx->v4i32));
4584
4585                 /* VGPRs (first TCS, then VS) */
4586                 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.tcs_patch_id);
4587                 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.tcs_rel_ids);
4588
4589                 if (ctx->type == PIPE_SHADER_VERTEX) {
4590                         declare_vs_input_vgprs(ctx, &fninfo,
4591                                                &num_prolog_vgprs);
4592
4593                         /* LS return values are inputs to the TCS main shader part. */
4594                         for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
4595                                 returns[num_returns++] = ctx->i32; /* SGPRs */
4596                         for (i = 0; i < 2; i++)
4597                                 returns[num_returns++] = ctx->f32; /* VGPRs */
4598                 } else {
4599                         /* TCS return values are inputs to the TCS epilog.
4600                          *
4601                          * param_tcs_offchip_offset, param_tcs_factor_offset,
4602                          * param_tcs_offchip_layout, and param_rw_buffers
4603                          * should be passed to the epilog.
4604                          */
4605                         for (i = 0; i <= 8 + GFX9_SGPR_TCS_OUT_LAYOUT; i++)
4606                                 returns[num_returns++] = ctx->i32; /* SGPRs */
4607                         for (i = 0; i < 11; i++)
4608                                 returns[num_returns++] = ctx->f32; /* VGPRs */
4609                 }
4610                 break;
4611
4612         case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
4613                 /* Merged stages have 8 system SGPRs at the beginning. */
4614                 /* SPI_SHADER_USER_DATA_ADDR_LO/HI_GS */
4615                 declare_per_stage_desc_pointers(ctx, &fninfo,
4616                                                 ctx->type == PIPE_SHADER_GEOMETRY);
4617                 ctx->param_gs2vs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4618                 ctx->param_merged_wave_info = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4619                 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4620                 ctx->param_merged_scratch_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4621                 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_PGM_LO/HI_GS << 8) */
4622                 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */
4623
4624                 declare_global_desc_pointers(ctx, &fninfo);
4625                 declare_per_stage_desc_pointers(ctx, &fninfo,
4626                                                 (ctx->type == PIPE_SHADER_VERTEX ||
4627                                                  ctx->type == PIPE_SHADER_TESS_EVAL));
4628                 if (ctx->type == PIPE_SHADER_VERTEX) {
4629                         declare_vs_specific_input_sgprs(ctx, &fninfo);
4630                 } else {
4631                         ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4632                         ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4633                         ctx->param_tes_offchip_addr = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4634                         /* Declare as many input SGPRs as the VS has. */
4635                 }
4636
4637                 if (ctx->type == PIPE_SHADER_VERTEX) {
4638                         ctx->param_vertex_buffers = add_arg(&fninfo, ARG_SGPR,
4639                                 ac_array_in_const32_addr_space(ctx->v4i32));
4640                 }
4641
4642                 /* VGPRs (first GS, then VS/TES) */
4643                 ctx->param_gs_vtx01_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4644                 ctx->param_gs_vtx23_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4645                 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.gs_prim_id);
4646                 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.gs_invocation_id);
4647                 ctx->param_gs_vtx45_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4648
4649                 if (ctx->type == PIPE_SHADER_VERTEX) {
4650                         declare_vs_input_vgprs(ctx, &fninfo,
4651                                                &num_prolog_vgprs);
4652                 } else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
4653                         declare_tes_input_vgprs(ctx, &fninfo);
4654                 }
4655
4656                 if (ctx->type == PIPE_SHADER_VERTEX ||
4657                     ctx->type == PIPE_SHADER_TESS_EVAL) {
4658                         unsigned num_user_sgprs;
4659
4660                         if (ctx->type == PIPE_SHADER_VERTEX)
4661                                 num_user_sgprs = GFX9_VSGS_NUM_USER_SGPR;
4662                         else
4663                                 num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR;
4664
4665                         /* ES return values are inputs to GS. */
4666                         for (i = 0; i < 8 + num_user_sgprs; i++)
4667                                 returns[num_returns++] = ctx->i32; /* SGPRs */
4668                         for (i = 0; i < 5; i++)
4669                                 returns[num_returns++] = ctx->f32; /* VGPRs */
4670                 }
4671                 break;
4672
4673         case PIPE_SHADER_TESS_EVAL:
4674                 declare_global_desc_pointers(ctx, &fninfo);
4675                 declare_per_stage_desc_pointers(ctx, &fninfo, true);
4676                 ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4677                 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4678                 ctx->param_tes_offchip_addr = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4679
4680                 if (shader->key.as_es) {
4681                         ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4682                         add_arg(&fninfo, ARG_SGPR, ctx->i32);
4683                         ctx->param_es2gs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4684                 } else {
4685                         add_arg(&fninfo, ARG_SGPR, ctx->i32);
4686                         declare_streamout_params(ctx, &shader->selector->so,
4687                                                  &fninfo);
4688                         ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4689                 }
4690
4691                 /* VGPRs */
4692                 declare_tes_input_vgprs(ctx, &fninfo);
4693                 break;
4694
4695         case PIPE_SHADER_GEOMETRY:
4696                 declare_global_desc_pointers(ctx, &fninfo);
4697                 declare_per_stage_desc_pointers(ctx, &fninfo, true);
4698                 ctx->param_gs2vs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4699                 ctx->param_gs_wave_id = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4700
4701                 /* VGPRs */
4702                 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[0]);
4703                 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[1]);
4704                 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.gs_prim_id);
4705                 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[2]);
4706                 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[3]);
4707                 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[4]);
4708                 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[5]);
4709                 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.gs_invocation_id);
4710                 break;
4711
4712         case PIPE_SHADER_FRAGMENT:
4713                 declare_global_desc_pointers(ctx, &fninfo);
4714                 declare_per_stage_desc_pointers(ctx, &fninfo, true);
4715                 add_arg_checked(&fninfo, ARG_SGPR, ctx->f32, SI_PARAM_ALPHA_REF);
4716                 add_arg_assign_checked(&fninfo, ARG_SGPR, ctx->i32,
4717                                        &ctx->abi.prim_mask, SI_PARAM_PRIM_MASK);
4718
4719                 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_SAMPLE);
4720                 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_CENTER);
4721                 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_CENTROID);
4722                 add_arg_checked(&fninfo, ARG_VGPR, v3i32, SI_PARAM_PERSP_PULL_MODEL);
4723                 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_SAMPLE);
4724                 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_CENTER);
4725                 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_CENTROID);
4726                 add_arg_checked(&fninfo, ARG_VGPR, ctx->f32, SI_PARAM_LINE_STIPPLE_TEX);
4727                 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4728                                        &ctx->abi.frag_pos[0], SI_PARAM_POS_X_FLOAT);
4729                 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4730                                        &ctx->abi.frag_pos[1], SI_PARAM_POS_Y_FLOAT);
4731                 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4732                                        &ctx->abi.frag_pos[2], SI_PARAM_POS_Z_FLOAT);
4733                 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4734                                        &ctx->abi.frag_pos[3], SI_PARAM_POS_W_FLOAT);
4735                 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->i32,
4736                                        &ctx->abi.front_face, SI_PARAM_FRONT_FACE);
4737                 shader->info.face_vgpr_index = 20;
4738                 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->i32,
4739                                        &ctx->abi.ancillary, SI_PARAM_ANCILLARY);
4740                 shader->info.ancillary_vgpr_index = 21;
4741                 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4742                                        &ctx->abi.sample_coverage, SI_PARAM_SAMPLE_COVERAGE);
4743                 add_arg_checked(&fninfo, ARG_VGPR, ctx->i32, SI_PARAM_POS_FIXED_PT);
4744
4745                 /* Color inputs from the prolog. */
4746                 if (shader->selector->info.colors_read) {
4747                         unsigned num_color_elements =
4748                                 util_bitcount(shader->selector->info.colors_read);
4749
4750                         assert(fninfo.num_params + num_color_elements <= ARRAY_SIZE(fninfo.types));
4751                         for (i = 0; i < num_color_elements; i++)
4752                                 add_arg(&fninfo, ARG_VGPR, ctx->f32);
4753
4754                         num_prolog_vgprs += num_color_elements;
4755                 }
4756
4757                 /* Outputs for the epilog. */
4758                 num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
4759                 num_returns =
4760                         num_return_sgprs +
4761                         util_bitcount(shader->selector->info.colors_written) * 4 +
4762                         shader->selector->info.writes_z +
4763                         shader->selector->info.writes_stencil +
4764                         shader->selector->info.writes_samplemask +
4765                         1 /* SampleMaskIn */;
4766
4767                 num_returns = MAX2(num_returns,
4768                                    num_return_sgprs +
4769                                    PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
4770
4771                 for (i = 0; i < num_return_sgprs; i++)
4772                         returns[i] = ctx->i32;
4773                 for (; i < num_returns; i++)
4774                         returns[i] = ctx->f32;
4775                 break;
4776
4777         case PIPE_SHADER_COMPUTE:
4778                 declare_global_desc_pointers(ctx, &fninfo);
4779                 declare_per_stage_desc_pointers(ctx, &fninfo, true);
4780                 if (shader->selector->info.uses_grid_size)
4781                         add_arg_assign(&fninfo, ARG_SGPR, v3i32, &ctx->abi.num_work_groups);
4782                 if (shader->selector->info.uses_block_size &&
4783                     shader->selector->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0)
4784                         ctx->param_block_size = add_arg(&fninfo, ARG_SGPR, v3i32);
4785
4786                 unsigned cs_user_data_dwords =
4787                         shader->selector->info.properties[TGSI_PROPERTY_CS_USER_DATA_DWORDS];
4788                 if (cs_user_data_dwords) {
4789                         ctx->param_cs_user_data = add_arg(&fninfo, ARG_SGPR,
4790                                                           LLVMVectorType(ctx->i32, cs_user_data_dwords));
4791                 }
4792
4793                 for (i = 0; i < 3; i++) {
4794                         ctx->abi.workgroup_ids[i] = NULL;
4795                         if (shader->selector->info.uses_block_id[i])
4796                                 add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &ctx->abi.workgroup_ids[i]);
4797                 }
4798
4799                 add_arg_assign(&fninfo, ARG_VGPR, v3i32, &ctx->abi.local_invocation_ids);
4800                 break;
4801         default:
4802                 assert(0 && "unimplemented shader");
4803                 return;
4804         }
4805
4806         si_create_function(ctx, "main", returns, num_returns, &fninfo,
4807                            si_get_max_workgroup_size(shader));
4808
4809         /* Reserve register locations for VGPR inputs the PS prolog may need. */
4810         if (ctx->type == PIPE_SHADER_FRAGMENT && !ctx->shader->is_monolithic) {
4811                 ac_llvm_add_target_dep_function_attr(ctx->main_fn,
4812                                                      "InitialPSInputAddr",
4813                                                      S_0286D0_PERSP_SAMPLE_ENA(1) |
4814                                                      S_0286D0_PERSP_CENTER_ENA(1) |
4815                                                      S_0286D0_PERSP_CENTROID_ENA(1) |
4816                                                      S_0286D0_LINEAR_SAMPLE_ENA(1) |
4817                                                      S_0286D0_LINEAR_CENTER_ENA(1) |
4818                                                      S_0286D0_LINEAR_CENTROID_ENA(1) |
4819                                                      S_0286D0_FRONT_FACE_ENA(1) |
4820                                                      S_0286D0_ANCILLARY_ENA(1) |
4821                                                      S_0286D0_POS_FIXED_PT_ENA(1));
4822         }
4823
4824         shader->info.num_input_sgprs = 0;
4825         shader->info.num_input_vgprs = 0;
4826
4827         for (i = 0; i < fninfo.num_sgpr_params; ++i)
4828                 shader->info.num_input_sgprs += ac_get_type_size(fninfo.types[i]) / 4;
4829
4830         for (; i < fninfo.num_params; ++i)
4831                 shader->info.num_input_vgprs += ac_get_type_size(fninfo.types[i]) / 4;
4832
4833         assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
4834         shader->info.num_input_vgprs -= num_prolog_vgprs;
4835
4836         if (shader->key.as_ls ||
4837             ctx->type == PIPE_SHADER_TESS_CTRL ||
4838             /* GFX9 has the ESGS ring buffer in LDS. */
4839             type == SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY)
4840                 ac_declare_lds_as_pointer(&ctx->ac);
4841 }
4842
4843 /**
4844  * Load ESGS and GSVS ring buffer resource descriptors and save the variables
4845  * for later use.
4846  */
4847 static void preload_ring_buffers(struct si_shader_context *ctx)
4848 {
4849         LLVMBuilderRef builder = ctx->ac.builder;
4850
4851         LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
4852                                             ctx->param_rw_buffers);
4853
4854         if (ctx->screen->info.chip_class <= VI &&
4855             (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY)) {
4856                 unsigned ring =
4857                         ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS
4858                                                              : SI_ES_RING_ESGS;
4859                 LLVMValueRef offset = LLVMConstInt(ctx->i32, ring, 0);
4860
4861                 ctx->esgs_ring =
4862                         ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
4863         }
4864
4865         if (ctx->shader->is_gs_copy_shader) {
4866                 LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);
4867
4868                 ctx->gsvs_ring[0] =
4869                         ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
4870         } else if (ctx->type == PIPE_SHADER_GEOMETRY) {
4871                 const struct si_shader_selector *sel = ctx->shader->selector;
4872                 LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);
4873                 LLVMValueRef base_ring;
4874
4875                 base_ring = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
4876
4877                 /* The conceptual layout of the GSVS ring is
4878                  *   v0c0 .. vLv0 v0c1 .. vLc1 ..
4879                  * but the real memory layout is swizzled across
4880                  * threads:
4881                  *   t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
4882                  *   t16v0c0 ..
4883                  * Override the buffer descriptor accordingly.
4884                  */
4885                 LLVMTypeRef v2i64 = LLVMVectorType(ctx->i64, 2);
4886                 uint64_t stream_offset = 0;
4887
4888                 for (unsigned stream = 0; stream < 4; ++stream) {
4889                         unsigned num_components;
4890                         unsigned stride;
4891                         unsigned num_records;
4892                         LLVMValueRef ring, tmp;
4893
4894                         num_components = sel->info.num_stream_output_components[stream];
4895                         if (!num_components)
4896                                 continue;
4897
4898                         stride = 4 * num_components * sel->gs_max_out_vertices;
4899
4900                         /* Limit on the stride field for <= CIK. */
4901                         assert(stride < (1 << 14));
4902
4903                         num_records = 64;
4904
4905                         ring = LLVMBuildBitCast(builder, base_ring, v2i64, "");
4906                         tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_0, "");
4907                         tmp = LLVMBuildAdd(builder, tmp,
4908                                            LLVMConstInt(ctx->i64,
4909                                                         stream_offset, 0), "");
4910                         stream_offset += stride * 64;
4911
4912                         ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_0, "");
4913                         ring = LLVMBuildBitCast(builder, ring, ctx->v4i32, "");
4914                         tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_1, "");
4915                         tmp = LLVMBuildOr(builder, tmp,
4916                                 LLVMConstInt(ctx->i32,
4917                                              S_008F04_STRIDE(stride) |
4918                                              S_008F04_SWIZZLE_ENABLE(1), 0), "");
4919                         ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_1, "");
4920                         ring = LLVMBuildInsertElement(builder, ring,
4921                                         LLVMConstInt(ctx->i32, num_records, 0),
4922                                         LLVMConstInt(ctx->i32, 2, 0), "");
4923                         ring = LLVMBuildInsertElement(builder, ring,
4924                                 LLVMConstInt(ctx->i32,
4925                                              S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
4926                                              S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
4927                                              S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
4928                                              S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
4929                                              S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
4930                                              S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
4931                                              S_008F0C_ELEMENT_SIZE(1) | /* element_size = 4 (bytes) */
4932                                              S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
4933                                              S_008F0C_ADD_TID_ENABLE(1),
4934                                              0),
4935                                 LLVMConstInt(ctx->i32, 3, 0), "");
4936
4937                         ctx->gsvs_ring[stream] = ring;
4938                 }
4939         } else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
4940                 ctx->tess_offchip_ring = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TES);
4941         }
4942 }
4943
4944 static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
4945                                          LLVMValueRef param_rw_buffers,
4946                                          unsigned param_pos_fixed_pt)
4947 {
4948         LLVMBuilderRef builder = ctx->ac.builder;
4949         LLVMValueRef slot, desc, offset, row, bit, address[2];
4950
4951         /* Use the fixed-point gl_FragCoord input.
4952          * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
4953          * per coordinate to get the repeating effect.
4954          */
4955         address[0] = si_unpack_param(ctx, param_pos_fixed_pt, 0, 5);
4956         address[1] = si_unpack_param(ctx, param_pos_fixed_pt, 16, 5);
4957
4958         /* Load the buffer descriptor. */
4959         slot = LLVMConstInt(ctx->i32, SI_PS_CONST_POLY_STIPPLE, 0);
4960         desc = ac_build_load_to_sgpr(&ctx->ac, param_rw_buffers, slot);
4961
4962         /* The stipple pattern is 32x32, each row has 32 bits. */
4963         offset = LLVMBuildMul(builder, address[1],
4964                               LLVMConstInt(ctx->i32, 4, 0), "");
4965         row = buffer_load_const(ctx, desc, offset);
4966         row = ac_to_integer(&ctx->ac, row);
4967         bit = LLVMBuildLShr(builder, row, address[0], "");
4968         bit = LLVMBuildTrunc(builder, bit, ctx->i1, "");
4969         ac_build_kill_if_false(&ctx->ac, bit);
4970 }
4971
4972 void si_shader_binary_read_config(struct ac_shader_binary *binary,
4973                                   struct si_shader_config *conf,
4974                                   unsigned symbol_offset)
4975 {
4976         unsigned i;
4977         const unsigned char *config =
4978                 ac_shader_binary_config_start(binary, symbol_offset);
4979         bool really_needs_scratch = false;
4980
4981         /* LLVM adds SGPR spills to the scratch size.
4982          * Find out if we really need the scratch buffer.
4983          */
4984         for (i = 0; i < binary->reloc_count; i++) {
4985                 const struct ac_shader_reloc *reloc = &binary->relocs[i];
4986
4987                 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
4988                     !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
4989                         really_needs_scratch = true;
4990                         break;
4991                 }
4992         }
4993
4994         /* XXX: We may be able to emit some of these values directly rather than
4995          * extracting fields to be emitted later.
4996          */
4997
4998         for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
4999                 unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
5000                 unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
5001                 switch (reg) {
5002                 case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
5003                 case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
5004                 case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
5005                 case R_00B428_SPI_SHADER_PGM_RSRC1_HS:
5006                 case R_00B848_COMPUTE_PGM_RSRC1:
5007                         conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
5008                         conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
5009                         conf->float_mode =  G_00B028_FLOAT_MODE(value);
5010                         conf->rsrc1 = value;
5011                         break;
5012                 case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
5013                         conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
5014                         break;
5015                 case R_00B84C_COMPUTE_PGM_RSRC2:
5016                         conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
5017                         conf->rsrc2 = value;
5018                         break;
5019                 case R_0286CC_SPI_PS_INPUT_ENA:
5020                         conf->spi_ps_input_ena = value;
5021                         break;
5022                 case R_0286D0_SPI_PS_INPUT_ADDR:
5023                         conf->spi_ps_input_addr = value;
5024                         break;
5025                 case R_0286E8_SPI_TMPRING_SIZE:
5026                 case R_00B860_COMPUTE_TMPRING_SIZE:
5027                         /* WAVESIZE is in units of 256 dwords. */
5028                         if (really_needs_scratch)
5029                                 conf->scratch_bytes_per_wave =
5030                                         G_00B860_WAVESIZE(value) * 256 * 4;
5031                         break;
5032                 case 0x4: /* SPILLED_SGPRS */
5033                         conf->spilled_sgprs = value;
5034                         break;
5035                 case 0x8: /* SPILLED_VGPRS */
5036                         conf->spilled_vgprs = value;
5037                         break;
5038                 default:
5039                         {
5040                                 static bool printed;
5041
5042                                 if (!printed) {
5043                                         fprintf(stderr, "Warning: LLVM emitted unknown "
5044                                                 "config register: 0x%x\n", reg);
5045                                         printed = true;
5046                                 }
5047                         }
5048                         break;
5049                 }
5050         }
5051
5052         if (!conf->spi_ps_input_addr)
5053                 conf->spi_ps_input_addr = conf->spi_ps_input_ena;
5054 }
5055
5056 void si_shader_apply_scratch_relocs(struct si_shader *shader,
5057                                     uint64_t scratch_va)
5058 {
5059         unsigned i;
5060         uint32_t scratch_rsrc_dword0 = scratch_va;
5061         uint32_t scratch_rsrc_dword1 =
5062                 S_008F04_BASE_ADDRESS_HI(scratch_va >> 32);
5063
5064         /* Enable scratch coalescing. */
5065         scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1);
5066
5067         for (i = 0 ; i < shader->binary.reloc_count; i++) {
5068                 const struct ac_shader_reloc *reloc =
5069                                         &shader->binary.relocs[i];
5070                 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
5071                         util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
5072                         &scratch_rsrc_dword0, 4);
5073                 } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
5074                         util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
5075                         &scratch_rsrc_dword1, 4);
5076                 }
5077         }
5078 }
5079
5080 /* For the UMR disassembler. */
5081 #define DEBUGGER_END_OF_CODE_MARKER     0xbf9f0000 /* invalid instruction */
5082 #define DEBUGGER_NUM_MARKERS            5
5083
5084 static unsigned si_get_shader_binary_size(const struct si_shader *shader)
5085 {
5086         unsigned size = shader->binary.code_size;
5087
5088         if (shader->prolog)
5089                 size += shader->prolog->binary.code_size;
5090         if (shader->previous_stage)
5091                 size += shader->previous_stage->binary.code_size;
5092         if (shader->prolog2)
5093                 size += shader->prolog2->binary.code_size;
5094         if (shader->epilog)
5095                 size += shader->epilog->binary.code_size;
5096         return size + DEBUGGER_NUM_MARKERS * 4;
5097 }
5098
5099 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
5100 {
5101         const struct ac_shader_binary *prolog =
5102                 shader->prolog ? &shader->prolog->binary : NULL;
5103         const struct ac_shader_binary *previous_stage =
5104                 shader->previous_stage ? &shader->previous_stage->binary : NULL;
5105         const struct ac_shader_binary *prolog2 =
5106                 shader->prolog2 ? &shader->prolog2->binary : NULL;
5107         const struct ac_shader_binary *epilog =
5108                 shader->epilog ? &shader->epilog->binary : NULL;
5109         const struct ac_shader_binary *mainb = &shader->binary;
5110         unsigned bo_size = si_get_shader_binary_size(shader) +
5111                            (!epilog ? mainb->rodata_size : 0);
5112         unsigned char *ptr;
5113
5114         assert(!prolog || !prolog->rodata_size);
5115         assert(!previous_stage || !previous_stage->rodata_size);
5116         assert(!prolog2 || !prolog2->rodata_size);
5117         assert((!prolog && !previous_stage && !prolog2 && !epilog) ||
5118                !mainb->rodata_size);
5119         assert(!epilog || !epilog->rodata_size);
5120
5121         si_resource_reference(&shader->bo, NULL);
5122         shader->bo = si_aligned_buffer_create(&sscreen->b,
5123                                               sscreen->cpdma_prefetch_writes_memory ?
5124                                                 0 : SI_RESOURCE_FLAG_READ_ONLY,
5125                                               PIPE_USAGE_IMMUTABLE,
5126                                               align(bo_size, SI_CPDMA_ALIGNMENT),
5127                                               256);
5128         if (!shader->bo)
5129                 return -ENOMEM;
5130
5131         /* Upload. */
5132         ptr = sscreen->ws->buffer_map(shader->bo->buf, NULL,
5133                                         PIPE_TRANSFER_READ_WRITE |
5134                                         PIPE_TRANSFER_UNSYNCHRONIZED |
5135                                         RADEON_TRANSFER_TEMPORARY);
5136
5137         /* Don't use util_memcpy_cpu_to_le32. LLVM binaries are
5138          * endian-independent. */
5139         if (prolog) {
5140                 memcpy(ptr, prolog->code, prolog->code_size);
5141                 ptr += prolog->code_size;
5142         }
5143         if (previous_stage) {
5144                 memcpy(ptr, previous_stage->code, previous_stage->code_size);
5145                 ptr += previous_stage->code_size;
5146         }
5147         if (prolog2) {
5148                 memcpy(ptr, prolog2->code, prolog2->code_size);
5149                 ptr += prolog2->code_size;
5150         }
5151
5152         memcpy(ptr, mainb->code, mainb->code_size);
5153         ptr += mainb->code_size;
5154
5155         if (epilog) {
5156                 memcpy(ptr, epilog->code, epilog->code_size);
5157                 ptr += epilog->code_size;
5158         } else if (mainb->rodata_size > 0) {
5159                 memcpy(ptr, mainb->rodata, mainb->rodata_size);
5160                 ptr += mainb->rodata_size;
5161         }
5162
5163         /* Add end-of-code markers for the UMR disassembler. */
5164         uint32_t *ptr32 = (uint32_t*)ptr;
5165         for (unsigned i = 0; i < DEBUGGER_NUM_MARKERS; i++)
5166                 ptr32[i] = DEBUGGER_END_OF_CODE_MARKER;
5167
5168         sscreen->ws->buffer_unmap(shader->bo->buf);
5169         return 0;
5170 }
5171
5172 static void si_shader_dump_disassembly(const struct ac_shader_binary *binary,
5173                                        struct pipe_debug_callback *debug,
5174                                        const char *name, FILE *file)
5175 {
5176         char *line, *p;
5177         unsigned i, count;
5178
5179         if (binary->disasm_string) {
5180                 fprintf(file, "Shader %s disassembly:\n", name);
5181                 fprintf(file, "%s", binary->disasm_string);
5182
5183                 if (debug && debug->debug_message) {
5184                         /* Very long debug messages are cut off, so send the
5185                          * disassembly one line at a time. This causes more
5186                          * overhead, but on the plus side it simplifies
5187                          * parsing of resulting logs.
5188                          */
5189                         pipe_debug_message(debug, SHADER_INFO,
5190                                            "Shader Disassembly Begin");
5191
5192                         line = binary->disasm_string;
5193                         while (*line) {
5194                                 p = util_strchrnul(line, '\n');
5195                                 count = p - line;
5196
5197                                 if (count) {
5198                                         pipe_debug_message(debug, SHADER_INFO,
5199                                                            "%.*s", count, line);
5200                                 }
5201
5202                                 if (!*p)
5203                                         break;
5204                                 line = p + 1;
5205                         }
5206
5207                         pipe_debug_message(debug, SHADER_INFO,
5208                                            "Shader Disassembly End");
5209                 }
5210         } else {
5211                 fprintf(file, "Shader %s binary:\n", name);
5212                 for (i = 0; i < binary->code_size; i += 4) {
5213                         fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i,
5214                                 binary->code[i + 3], binary->code[i + 2],
5215                                 binary->code[i + 1], binary->code[i]);
5216                 }
5217         }
5218 }
5219
5220 static void si_calculate_max_simd_waves(struct si_shader *shader)
5221 {
5222         struct si_screen *sscreen = shader->selector->screen;
5223         struct si_shader_config *conf = &shader->config;
5224         unsigned num_inputs = shader->selector->info.num_inputs;
5225         unsigned lds_increment = sscreen->info.chip_class >= CIK ? 512 : 256;
5226         unsigned lds_per_wave = 0;
5227         unsigned max_simd_waves;
5228
5229         max_simd_waves = ac_get_max_simd_waves(sscreen->info.family);
5230
5231         /* Compute LDS usage for PS. */
5232         switch (shader->selector->type) {
5233         case PIPE_SHADER_FRAGMENT:
5234                 /* The minimum usage per wave is (num_inputs * 48). The maximum
5235                  * usage is (num_inputs * 48 * 16).
5236                  * We can get anything in between and it varies between waves.
5237                  *
5238                  * The 48 bytes per input for a single primitive is equal to
5239                  * 4 bytes/component * 4 components/input * 3 points.
5240                  *
5241                  * Other stages don't know the size at compile time or don't
5242                  * allocate LDS per wave, but instead they do it per thread group.
5243                  */
5244                 lds_per_wave = conf->lds_size * lds_increment +
5245                                align(num_inputs * 48, lds_increment);
5246                 break;
5247         case PIPE_SHADER_COMPUTE:
5248                 if (shader->selector) {
5249                         unsigned max_workgroup_size =
5250                                 si_get_max_workgroup_size(shader);
5251                         lds_per_wave = (conf->lds_size * lds_increment) /
5252                                        DIV_ROUND_UP(max_workgroup_size, 64);
5253                 }
5254                 break;
5255         }
5256
5257         /* Compute the per-SIMD wave counts. */
5258         if (conf->num_sgprs) {
5259                 max_simd_waves =
5260                         MIN2(max_simd_waves,
5261                              ac_get_num_physical_sgprs(sscreen->info.chip_class) / conf->num_sgprs);
5262         }
5263
5264         if (conf->num_vgprs)
5265                 max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs);
5266
5267         /* LDS is 64KB per CU (4 SIMDs), which is 16KB per SIMD (usage above
5268          * 16KB makes some SIMDs unoccupied). */
5269         if (lds_per_wave)
5270                 max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);
5271
5272         conf->max_simd_waves = max_simd_waves;
5273 }
5274
5275 void si_shader_dump_stats_for_shader_db(const struct si_shader *shader,
5276                                         struct pipe_debug_callback *debug)
5277 {
5278         const struct si_shader_config *conf = &shader->config;
5279
5280         pipe_debug_message(debug, SHADER_INFO,
5281                            "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
5282                            "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
5283                            "Spilled VGPRs: %d PrivMem VGPRs: %d",
5284                            conf->num_sgprs, conf->num_vgprs,
5285                            si_get_shader_binary_size(shader),
5286                            conf->lds_size, conf->scratch_bytes_per_wave,
5287                            conf->max_simd_waves, conf->spilled_sgprs,
5288                            conf->spilled_vgprs, conf->private_mem_vgprs);
5289 }
5290
5291 static void si_shader_dump_stats(struct si_screen *sscreen,
5292                                  const struct si_shader *shader,
5293                                  unsigned processor,
5294                                  FILE *file,
5295                                  bool check_debug_option)
5296 {
5297         const struct si_shader_config *conf = &shader->config;
5298
5299         if (!check_debug_option ||
5300             si_can_dump_shader(sscreen, processor)) {
5301                 if (processor == PIPE_SHADER_FRAGMENT) {
5302                         fprintf(file, "*** SHADER CONFIG ***\n"
5303                                 "SPI_PS_INPUT_ADDR = 0x%04x\n"
5304                                 "SPI_PS_INPUT_ENA  = 0x%04x\n",
5305                                 conf->spi_ps_input_addr, conf->spi_ps_input_ena);
5306                 }
5307
5308                 fprintf(file, "*** SHADER STATS ***\n"
5309                         "SGPRS: %d\n"
5310                         "VGPRS: %d\n"
5311                         "Spilled SGPRs: %d\n"
5312                         "Spilled VGPRs: %d\n"
5313                         "Private memory VGPRs: %d\n"
5314                         "Code Size: %d bytes\n"
5315                         "LDS: %d blocks\n"
5316                         "Scratch: %d bytes per wave\n"
5317                         "Max Waves: %d\n"
5318                         "********************\n\n\n",
5319                         conf->num_sgprs, conf->num_vgprs,
5320                         conf->spilled_sgprs, conf->spilled_vgprs,
5321                         conf->private_mem_vgprs,
5322                         si_get_shader_binary_size(shader),
5323                         conf->lds_size, conf->scratch_bytes_per_wave,
5324                         conf->max_simd_waves);
5325         }
5326 }
5327
5328 const char *si_get_shader_name(const struct si_shader *shader, unsigned processor)
5329 {
5330         switch (processor) {
5331         case PIPE_SHADER_VERTEX:
5332                 if (shader->key.as_es)
5333                         return "Vertex Shader as ES";
5334                 else if (shader->key.as_ls)
5335                         return "Vertex Shader as LS";
5336                 else
5337                         return "Vertex Shader as VS";
5338         case PIPE_SHADER_TESS_CTRL:
5339                 return "Tessellation Control Shader";
5340         case PIPE_SHADER_TESS_EVAL:
5341                 if (shader->key.as_es)
5342                         return "Tessellation Evaluation Shader as ES";
5343                 else
5344                         return "Tessellation Evaluation Shader as VS";
5345         case PIPE_SHADER_GEOMETRY:
5346                 if (shader->is_gs_copy_shader)
5347                         return "GS Copy Shader as VS";
5348                 else
5349                         return "Geometry Shader";
5350         case PIPE_SHADER_FRAGMENT:
5351                 return "Pixel Shader";
5352         case PIPE_SHADER_COMPUTE:
5353                 return "Compute Shader";
5354         default:
5355                 return "Unknown Shader";
5356         }
5357 }
5358
5359 void si_shader_dump(struct si_screen *sscreen, const struct si_shader *shader,
5360                     struct pipe_debug_callback *debug, unsigned processor,
5361                     FILE *file, bool check_debug_option)
5362 {
5363         if (!check_debug_option ||
5364             si_can_dump_shader(sscreen, processor))
5365                 si_dump_shader_key(processor, shader, file);
5366
5367         if (!check_debug_option && shader->binary.llvm_ir_string) {
5368                 if (shader->previous_stage &&
5369                     shader->previous_stage->binary.llvm_ir_string) {
5370                         fprintf(file, "\n%s - previous stage - LLVM IR:\n\n",
5371                                 si_get_shader_name(shader, processor));
5372                         fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string);
5373                 }
5374
5375                 fprintf(file, "\n%s - main shader part - LLVM IR:\n\n",
5376                         si_get_shader_name(shader, processor));
5377                 fprintf(file, "%s\n", shader->binary.llvm_ir_string);
5378         }
5379
5380         if (!check_debug_option ||
5381             (si_can_dump_shader(sscreen, processor) &&
5382              !(sscreen->debug_flags & DBG(NO_ASM)))) {
5383                 fprintf(file, "\n%s:\n", si_get_shader_name(shader, processor));
5384
5385                 if (shader->prolog)
5386                         si_shader_dump_disassembly(&shader->prolog->binary,
5387                                                    debug, "prolog", file);
5388                 if (shader->previous_stage)
5389                         si_shader_dump_disassembly(&shader->previous_stage->binary,
5390                                                    debug, "previous stage", file);
5391                 if (shader->prolog2)
5392                         si_shader_dump_disassembly(&shader->prolog2->binary,
5393                                                    debug, "prolog2", file);
5394
5395                 si_shader_dump_disassembly(&shader->binary, debug, "main", file);
5396
5397                 if (shader->epilog)
5398                         si_shader_dump_disassembly(&shader->epilog->binary,
5399                                                    debug, "epilog", file);
5400                 fprintf(file, "\n");
5401         }
5402
5403         si_shader_dump_stats(sscreen, shader, processor, file,
5404                              check_debug_option);
5405 }
5406
5407 static int si_compile_llvm(struct si_screen *sscreen,
5408                            struct ac_shader_binary *binary,
5409                            struct si_shader_config *conf,
5410                            struct ac_llvm_compiler *compiler,
5411                            LLVMModuleRef mod,
5412                            struct pipe_debug_callback *debug,
5413                            unsigned processor,
5414                            const char *name,
5415                            bool less_optimized)
5416 {
5417         int r = 0;
5418         unsigned count = p_atomic_inc_return(&sscreen->num_compilations);
5419
5420         if (si_can_dump_shader(sscreen, processor)) {
5421                 fprintf(stderr, "radeonsi: Compiling shader %d\n", count);
5422
5423                 if (!(sscreen->debug_flags & (DBG(NO_IR) | DBG(PREOPT_IR)))) {
5424                         fprintf(stderr, "%s LLVM IR:\n\n", name);
5425                         ac_dump_module(mod);
5426                         fprintf(stderr, "\n");
5427                 }
5428         }
5429
5430         if (sscreen->record_llvm_ir) {
5431                 char *ir = LLVMPrintModuleToString(mod);
5432                 binary->llvm_ir_string = strdup(ir);
5433                 LLVMDisposeMessage(ir);
5434         }
5435
5436         if (!si_replace_shader(count, binary)) {
5437                 r = si_llvm_compile(mod, binary, compiler, debug,
5438                                     less_optimized);
5439                 if (r)
5440                         return r;
5441         }
5442
5443         si_shader_binary_read_config(binary, conf, 0);
5444
5445         /* Enable 64-bit and 16-bit denormals, because there is no performance
5446          * cost.
5447          *
5448          * If denormals are enabled, all floating-point output modifiers are
5449          * ignored.
5450          *
5451          * Don't enable denormals for 32-bit floats, because:
5452          * - Floating-point output modifiers would be ignored by the hw.
5453          * - Some opcodes don't support denormals, such as v_mad_f32. We would
5454          *   have to stop using those.
5455          * - SI & CI would be very slow.
5456          */
5457         conf->float_mode |= V_00B028_FP_64_DENORMS;
5458
5459         FREE(binary->config);
5460         FREE(binary->global_symbol_offsets);
5461         binary->config = NULL;
5462         binary->global_symbol_offsets = NULL;
5463
5464         /* Some shaders can't have rodata because their binaries can be
5465          * concatenated.
5466          */
5467         if (binary->rodata_size &&
5468             (processor == PIPE_SHADER_VERTEX ||
5469              processor == PIPE_SHADER_TESS_CTRL ||
5470              processor == PIPE_SHADER_TESS_EVAL ||
5471              processor == PIPE_SHADER_FRAGMENT)) {
5472                 fprintf(stderr, "radeonsi: The shader can't have rodata.");
5473                 return -EINVAL;
5474         }
5475
5476         return r;
5477 }
5478
5479 static void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret)
5480 {
5481         if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
5482                 LLVMBuildRetVoid(ctx->ac.builder);
5483         else
5484                 LLVMBuildRet(ctx->ac.builder, ret);
5485 }
5486
5487 /* Generate code for the hardware VS shader stage to go with a geometry shader */
5488 struct si_shader *
5489 si_generate_gs_copy_shader(struct si_screen *sscreen,
5490                            struct ac_llvm_compiler *compiler,
5491                            struct si_shader_selector *gs_selector,
5492                            struct pipe_debug_callback *debug)
5493 {
5494         struct si_shader_context ctx;
5495         struct si_shader *shader;
5496         LLVMBuilderRef builder;
5497         struct si_shader_output_values outputs[SI_MAX_VS_OUTPUTS];
5498         struct tgsi_shader_info *gsinfo = &gs_selector->info;
5499         int i, r;
5500
5501
5502         shader = CALLOC_STRUCT(si_shader);
5503         if (!shader)
5504                 return NULL;
5505
5506         /* We can leave the fence as permanently signaled because the GS copy
5507          * shader only becomes visible globally after it has been compiled. */
5508         util_queue_fence_init(&shader->ready);
5509
5510         shader->selector = gs_selector;
5511         shader->is_gs_copy_shader = true;
5512
5513         si_init_shader_ctx(&ctx, sscreen, compiler);
5514         ctx.shader = shader;
5515         ctx.type = PIPE_SHADER_VERTEX;
5516
5517         builder = ctx.ac.builder;
5518
5519         create_function(&ctx);
5520         preload_ring_buffers(&ctx);
5521
5522         LLVMValueRef voffset =
5523                 LLVMBuildMul(ctx.ac.builder, ctx.abi.vertex_id,
5524                              LLVMConstInt(ctx.i32, 4, 0), "");
5525
5526         /* Fetch the vertex stream ID.*/
5527         LLVMValueRef stream_id;
5528
5529         if (gs_selector->so.num_outputs)
5530                 stream_id = si_unpack_param(&ctx, ctx.param_streamout_config, 24, 2);
5531         else
5532                 stream_id = ctx.i32_0;
5533
5534         /* Fill in output information. */
5535         for (i = 0; i < gsinfo->num_outputs; ++i) {
5536                 outputs[i].semantic_name = gsinfo->output_semantic_name[i];
5537                 outputs[i].semantic_index = gsinfo->output_semantic_index[i];
5538
5539                 for (int chan = 0; chan < 4; chan++) {
5540                         outputs[i].vertex_stream[chan] =
5541                                 (gsinfo->output_streams[i] >> (2 * chan)) & 3;
5542                 }
5543         }
5544
5545         LLVMBasicBlockRef end_bb;
5546         LLVMValueRef switch_inst;
5547
5548         end_bb = LLVMAppendBasicBlockInContext(ctx.ac.context, ctx.main_fn, "end");
5549         switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4);
5550
5551         for (int stream = 0; stream < 4; stream++) {
5552                 LLVMBasicBlockRef bb;
5553                 unsigned offset;
5554
5555                 if (!gsinfo->num_stream_output_components[stream])
5556                         continue;
5557
5558                 if (stream > 0 && !gs_selector->so.num_outputs)
5559                         continue;
5560
5561                 bb = LLVMInsertBasicBlockInContext(ctx.ac.context, end_bb, "out");
5562                 LLVMAddCase(switch_inst, LLVMConstInt(ctx.i32, stream, 0), bb);
5563                 LLVMPositionBuilderAtEnd(builder, bb);
5564
5565                 /* Fetch vertex data from GSVS ring */
5566                 offset = 0;
5567                 for (i = 0; i < gsinfo->num_outputs; ++i) {
5568                         for (unsigned chan = 0; chan < 4; chan++) {
5569                                 if (!(gsinfo->output_usagemask[i] & (1 << chan)) ||
5570                                     outputs[i].vertex_stream[chan] != stream) {
5571                                         outputs[i].values[chan] = LLVMGetUndef(ctx.f32);
5572                                         continue;
5573                                 }
5574
5575                                 LLVMValueRef soffset = LLVMConstInt(ctx.i32,
5576                                         offset * gs_selector->gs_max_out_vertices * 16 * 4, 0);
5577                                 offset++;
5578
5579                                 outputs[i].values[chan] =
5580                                         ac_build_buffer_load(&ctx.ac,
5581                                                              ctx.gsvs_ring[0], 1,
5582                                                              ctx.i32_0, voffset,
5583                                                              soffset, 0, 1, 1,
5584                                                              true, false);
5585                         }
5586                 }
5587
5588                 /* Streamout and exports. */
5589                 if (gs_selector->so.num_outputs) {
5590                         si_llvm_emit_streamout(&ctx, outputs,
5591                                                gsinfo->num_outputs,
5592                                                stream);
5593                 }
5594
5595                 if (stream == 0) {
5596                         /* Vertex color clamping.
5597                          *
5598                          * This uses a state constant loaded in a user data SGPR and
5599                          * an IF statement is added that clamps all colors if the constant
5600                          * is true.
5601                          */
5602                         struct lp_build_if_state if_ctx;
5603                         LLVMValueRef v[2], cond = NULL;
5604                         LLVMBasicBlockRef blocks[2];
5605
5606                         for (unsigned i = 0; i < gsinfo->num_outputs; i++) {
5607                                 if (gsinfo->output_semantic_name[i] != TGSI_SEMANTIC_COLOR &&
5608                                     gsinfo->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR)
5609                                         continue;
5610
5611                                 /* We've found a color. */
5612                                 if (!cond) {
5613                                         /* The state is in the first bit of the user SGPR. */
5614                                         cond = LLVMGetParam(ctx.main_fn,
5615                                                             ctx.param_vs_state_bits);
5616                                         cond = LLVMBuildTrunc(ctx.ac.builder, cond,
5617                                                               ctx.i1, "");
5618                                         lp_build_if(&if_ctx, &ctx.gallivm, cond);
5619                                         /* Remember blocks for Phi. */
5620                                         blocks[0] = if_ctx.true_block;
5621                                         blocks[1] = if_ctx.entry_block;
5622                                 }
5623
5624                                 for (unsigned j = 0; j < 4; j++) {
5625                                         /* Insert clamp into the true block. */
5626                                         v[0] = ac_build_clamp(&ctx.ac, outputs[i].values[j]);
5627                                         v[1] = outputs[i].values[j];
5628
5629                                         /* Insert Phi into the endif block. */
5630                                         LLVMPositionBuilderAtEnd(ctx.ac.builder, if_ctx.merge_block);
5631                                         outputs[i].values[j] = ac_build_phi(&ctx.ac, ctx.f32, 2, v, blocks);
5632                                         LLVMPositionBuilderAtEnd(ctx.ac.builder, if_ctx.true_block);
5633                                 }
5634                         }
5635                         if (cond)
5636                                 lp_build_endif(&if_ctx);
5637
5638                         si_llvm_export_vs(&ctx, outputs, gsinfo->num_outputs);
5639                 }
5640
5641                 LLVMBuildBr(builder, end_bb);
5642         }
5643
5644         LLVMPositionBuilderAtEnd(builder, end_bb);
5645
5646         LLVMBuildRetVoid(ctx.ac.builder);
5647
5648         ctx.type = PIPE_SHADER_GEOMETRY; /* override for shader dumping */
5649         si_llvm_optimize_module(&ctx);
5650
5651         r = si_compile_llvm(sscreen, &ctx.shader->binary,
5652                             &ctx.shader->config, ctx.compiler,
5653                             ctx.ac.module,
5654                             debug, PIPE_SHADER_GEOMETRY,
5655                             "GS Copy Shader", false);
5656         if (!r) {
5657                 if (si_can_dump_shader(sscreen, PIPE_SHADER_GEOMETRY))
5658                         fprintf(stderr, "GS Copy Shader:\n");
5659                 si_shader_dump(sscreen, ctx.shader, debug,
5660                                PIPE_SHADER_GEOMETRY, stderr, true);
5661                 r = si_shader_binary_upload(sscreen, ctx.shader);
5662         }
5663
5664         si_llvm_dispose(&ctx);
5665
5666         if (r != 0) {
5667                 FREE(shader);
5668                 shader = NULL;
5669         } else {
5670                 si_fix_resource_usage(sscreen, shader);
5671         }
5672         return shader;
5673 }
5674
5675 static void si_dump_shader_key_vs(const struct si_shader_key *key,
5676                                   const struct si_vs_prolog_bits *prolog,
5677                                   const char *prefix, FILE *f)
5678 {
5679         fprintf(f, "  %s.instance_divisor_is_one = %u\n",
5680                 prefix, prolog->instance_divisor_is_one);
5681         fprintf(f, "  %s.instance_divisor_is_fetched = %u\n",
5682                 prefix, prolog->instance_divisor_is_fetched);
5683         fprintf(f, "  %s.ls_vgpr_fix = %u\n",
5684                 prefix, prolog->ls_vgpr_fix);
5685
5686         fprintf(f, "  mono.vs.fetch_opencode = %x\n", key->mono.vs_fetch_opencode);
5687         fprintf(f, "  mono.vs.fix_fetch = {");
5688         for (int i = 0; i < SI_MAX_ATTRIBS; i++) {
5689                 union si_vs_fix_fetch fix = key->mono.vs_fix_fetch[i];
5690                 if (i)
5691                         fprintf(f, ", ");
5692                 if (!fix.bits)
5693                         fprintf(f, "0");
5694                 else
5695                         fprintf(f, "%u.%u.%u.%u", fix.u.reverse, fix.u.log_size,
5696                                 fix.u.num_channels_m1, fix.u.format);
5697         }
5698         fprintf(f, "}\n");
5699 }
5700
5701 static void si_dump_shader_key(unsigned processor, const struct si_shader *shader,
5702                                FILE *f)
5703 {
5704         const struct si_shader_key *key = &shader->key;
5705
5706         fprintf(f, "SHADER KEY\n");
5707
5708         switch (processor) {
5709         case PIPE_SHADER_VERTEX:
5710                 si_dump_shader_key_vs(key, &key->part.vs.prolog,
5711                                       "part.vs.prolog", f);
5712                 fprintf(f, "  as_es = %u\n", key->as_es);
5713                 fprintf(f, "  as_ls = %u\n", key->as_ls);
5714                 fprintf(f, "  mono.u.vs_export_prim_id = %u\n",
5715                         key->mono.u.vs_export_prim_id);
5716                 break;
5717
5718         case PIPE_SHADER_TESS_CTRL:
5719                 if (shader->selector->screen->info.chip_class >= GFX9) {
5720                         si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog,
5721                                               "part.tcs.ls_prolog", f);
5722                 }
5723                 fprintf(f, "  part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode);
5724                 fprintf(f, "  mono.u.ff_tcs_inputs_to_copy = 0x%"PRIx64"\n", key->mono.u.ff_tcs_inputs_to_copy);
5725                 break;
5726
5727         case PIPE_SHADER_TESS_EVAL:
5728                 fprintf(f, "  as_es = %u\n", key->as_es);
5729                 fprintf(f, "  mono.u.vs_export_prim_id = %u\n",
5730                         key->mono.u.vs_export_prim_id);
5731                 break;
5732
5733         case PIPE_SHADER_GEOMETRY:
5734                 if (shader->is_gs_copy_shader)
5735                         break;
5736
5737                 if (shader->selector->screen->info.chip_class >= GFX9 &&
5738                     key->part.gs.es->type == PIPE_SHADER_VERTEX) {
5739                         si_dump_shader_key_vs(key, &key->part.gs.vs_prolog,
5740                                               "part.gs.vs_prolog", f);
5741                 }
5742                 fprintf(f, "  part.gs.prolog.tri_strip_adj_fix = %u\n", key->part.gs.prolog.tri_strip_adj_fix);
5743                 break;
5744
5745         case PIPE_SHADER_COMPUTE:
5746                 break;
5747
5748         case PIPE_SHADER_FRAGMENT:
5749                 fprintf(f, "  part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side);
5750                 fprintf(f, "  part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors);
5751                 fprintf(f, "  part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple);
5752                 fprintf(f, "  part.ps.prolog.force_persp_sample_interp = %u\n", key->part.ps.prolog.force_persp_sample_interp);
5753                 fprintf(f, "  part.ps.prolog.force_linear_sample_interp = %u\n", key->part.ps.prolog.force_linear_sample_interp);
5754                 fprintf(f, "  part.ps.prolog.force_persp_center_interp = %u\n", key->part.ps.prolog.force_persp_center_interp);
5755                 fprintf(f, "  part.ps.prolog.force_linear_center_interp = %u\n", key->part.ps.prolog.force_linear_center_interp);
5756                 fprintf(f, "  part.ps.prolog.bc_optimize_for_persp = %u\n", key->part.ps.prolog.bc_optimize_for_persp);
5757                 fprintf(f, "  part.ps.prolog.bc_optimize_for_linear = %u\n", key->part.ps.prolog.bc_optimize_for_linear);
5758                 fprintf(f, "  part.ps.epilog.spi_shader_col_format = 0x%x\n", key->part.ps.epilog.spi_shader_col_format);
5759                 fprintf(f, "  part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8);
5760                 fprintf(f, "  part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10);
5761                 fprintf(f, "  part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf);
5762                 fprintf(f, "  part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func);
5763                 fprintf(f, "  part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one);
5764                 fprintf(f, "  part.ps.epilog.poly_line_smoothing = %u\n", key->part.ps.epilog.poly_line_smoothing);
5765                 fprintf(f, "  part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color);
5766                 break;
5767
5768         default:
5769                 assert(0);
5770         }
5771
5772         if ((processor == PIPE_SHADER_GEOMETRY ||
5773              processor == PIPE_SHADER_TESS_EVAL ||
5774              processor == PIPE_SHADER_VERTEX) &&
5775             !key->as_es && !key->as_ls) {
5776                 fprintf(f, "  opt.kill_outputs = 0x%"PRIx64"\n", key->opt.kill_outputs);
5777                 fprintf(f, "  opt.clip_disable = %u\n", key->opt.clip_disable);
5778         }
5779 }
5780
5781 static void si_init_shader_ctx(struct si_shader_context *ctx,
5782                                struct si_screen *sscreen,
5783                                struct ac_llvm_compiler *compiler)
5784 {
5785         struct lp_build_tgsi_context *bld_base;
5786
5787         si_llvm_context_init(ctx, sscreen, compiler);
5788
5789         bld_base = &ctx->bld_base;
5790         bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;
5791
5792         bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID].emit = build_interp_intrinsic;
5793         bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE].emit = build_interp_intrinsic;
5794         bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET].emit = build_interp_intrinsic;
5795
5796         bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit;
5797
5798         bld_base->op_actions[TGSI_OPCODE_CLOCK].emit = clock_emit;
5799
5800         bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy;
5801         bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy;
5802         bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy;
5803         bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy;
5804
5805         bld_base->op_actions[TGSI_OPCODE_VOTE_ALL].emit = vote_all_emit;
5806         bld_base->op_actions[TGSI_OPCODE_VOTE_ANY].emit = vote_any_emit;
5807         bld_base->op_actions[TGSI_OPCODE_VOTE_EQ].emit = vote_eq_emit;
5808         bld_base->op_actions[TGSI_OPCODE_BALLOT].emit = ballot_emit;
5809         bld_base->op_actions[TGSI_OPCODE_READ_FIRST].intr_name = "llvm.amdgcn.readfirstlane";
5810         bld_base->op_actions[TGSI_OPCODE_READ_FIRST].emit = read_lane_emit;
5811         bld_base->op_actions[TGSI_OPCODE_READ_INVOC].intr_name = "llvm.amdgcn.readlane";
5812         bld_base->op_actions[TGSI_OPCODE_READ_INVOC].emit = read_lane_emit;
5813
5814         bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_tgsi_emit_vertex;
5815         bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_tgsi_emit_primitive;
5816         bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier;
5817 }
5818
5819 static void si_optimize_vs_outputs(struct si_shader_context *ctx)
5820 {
5821         struct si_shader *shader = ctx->shader;
5822         struct tgsi_shader_info *info = &shader->selector->info;
5823
5824         if ((ctx->type != PIPE_SHADER_VERTEX &&
5825              ctx->type != PIPE_SHADER_TESS_EVAL) ||
5826             shader->key.as_ls ||
5827             shader->key.as_es)
5828                 return;
5829
5830         ac_optimize_vs_outputs(&ctx->ac,
5831                                ctx->main_fn,
5832                                shader->info.vs_output_param_offset,
5833                                info->num_outputs,
5834                                &shader->info.nr_param_exports);
5835 }
5836
5837 static void si_init_exec_from_input(struct si_shader_context *ctx,
5838                                     unsigned param, unsigned bitoffset)
5839 {
5840         LLVMValueRef args[] = {
5841                 LLVMGetParam(ctx->main_fn, param),
5842                 LLVMConstInt(ctx->i32, bitoffset, 0),
5843         };
5844         ac_build_intrinsic(&ctx->ac,
5845                            "llvm.amdgcn.init.exec.from.input",
5846                            ctx->voidt, args, 2, AC_FUNC_ATTR_CONVERGENT);
5847 }
5848
5849 static bool si_vs_needs_prolog(const struct si_shader_selector *sel,
5850                                const struct si_vs_prolog_bits *key)
5851 {
5852         /* VGPR initialization fixup for Vega10 and Raven is always done in the
5853          * VS prolog. */
5854         return sel->vs_needs_prolog || key->ls_vgpr_fix;
5855 }
5856
5857 static bool si_compile_tgsi_main(struct si_shader_context *ctx)
5858 {
5859         struct si_shader *shader = ctx->shader;
5860         struct si_shader_selector *sel = shader->selector;
5861         struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
5862
5863         // TODO clean all this up!
5864         switch (ctx->type) {
5865         case PIPE_SHADER_VERTEX:
5866                 ctx->load_input = declare_input_vs;
5867                 if (shader->key.as_ls)
5868                         ctx->abi.emit_outputs = si_llvm_emit_ls_epilogue;
5869                 else if (shader->key.as_es)
5870                         ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
5871                 else
5872                         ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
5873                 bld_base->emit_epilogue = si_tgsi_emit_epilogue;
5874                 ctx->abi.load_base_vertex = get_base_vertex;
5875                 break;
5876         case PIPE_SHADER_TESS_CTRL:
5877                 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs;
5878                 ctx->abi.load_tess_varyings = si_nir_load_tcs_varyings;
5879                 bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = fetch_output_tcs;
5880                 bld_base->emit_store = store_output_tcs;
5881                 ctx->abi.store_tcs_outputs = si_nir_store_output_tcs;
5882                 ctx->abi.emit_outputs = si_llvm_emit_tcs_epilogue;
5883                 ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in;
5884                 bld_base->emit_epilogue = si_tgsi_emit_epilogue;
5885                 break;
5886         case PIPE_SHADER_TESS_EVAL:
5887                 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes;
5888                 ctx->abi.load_tess_varyings = si_nir_load_input_tes;
5889                 ctx->abi.load_tess_coord = si_load_tess_coord;
5890                 ctx->abi.load_tess_level = si_load_tess_level;
5891                 ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in;
5892                 if (shader->key.as_es)
5893                         ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
5894                 else
5895                         ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
5896                 bld_base->emit_epilogue = si_tgsi_emit_epilogue;
5897                 break;
5898         case PIPE_SHADER_GEOMETRY:
5899                 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs;
5900                 ctx->abi.load_inputs = si_nir_load_input_gs;
5901                 ctx->abi.emit_vertex = si_llvm_emit_vertex;
5902                 ctx->abi.emit_primitive = si_llvm_emit_primitive;
5903                 ctx->abi.emit_outputs = si_llvm_emit_gs_epilogue;
5904                 bld_base->emit_epilogue = si_tgsi_emit_gs_epilogue;
5905                 break;
5906         case PIPE_SHADER_FRAGMENT:
5907                 ctx->load_input = declare_input_fs;
5908                 ctx->abi.emit_outputs = si_llvm_return_fs_outputs;
5909                 bld_base->emit_epilogue = si_tgsi_emit_epilogue;
5910                 ctx->abi.lookup_interp_param = si_nir_lookup_interp_param;
5911                 ctx->abi.load_sample_position = load_sample_position;
5912                 ctx->abi.load_sample_mask_in = load_sample_mask_in;
5913                 ctx->abi.emit_kill = si_llvm_emit_kill;
5914                 break;
5915         case PIPE_SHADER_COMPUTE:
5916                 ctx->abi.load_local_group_size = get_block_size;
5917                 break;
5918         default:
5919                 assert(!"Unsupported shader type");
5920                 return false;
5921         }
5922
5923         ctx->abi.load_ubo = load_ubo;
5924         ctx->abi.load_ssbo = load_ssbo;
5925
5926         create_function(ctx);
5927         preload_ring_buffers(ctx);
5928
5929         /* For GFX9 merged shaders:
5930          * - Set EXEC for the first shader. If the prolog is present, set
5931          *   EXEC there instead.
5932          * - Add a barrier before the second shader.
5933          * - In the second shader, reset EXEC to ~0 and wrap the main part in
5934          *   an if-statement. This is required for correctness in geometry
5935          *   shaders, to ensure that empty GS waves do not send GS_EMIT and
5936          *   GS_CUT messages.
5937          *
5938          * For monolithic merged shaders, the first shader is wrapped in an
5939          * if-block together with its prolog in si_build_wrapper_function.
5940          */
5941         if (ctx->screen->info.chip_class >= GFX9) {
5942                 if (!shader->is_monolithic &&
5943                     sel->info.num_instructions > 1 && /* not empty shader */
5944                     (shader->key.as_es || shader->key.as_ls) &&
5945                     (ctx->type == PIPE_SHADER_TESS_EVAL ||
5946                      (ctx->type == PIPE_SHADER_VERTEX &&
5947                       !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog)))) {
5948                         si_init_exec_from_input(ctx,
5949                                                 ctx->param_merged_wave_info, 0);
5950                 } else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
5951                            ctx->type == PIPE_SHADER_GEOMETRY) {
5952                         if (!shader->is_monolithic)
5953                                 ac_init_exec_full_mask(&ctx->ac);
5954
5955                         LLVMValueRef num_threads = si_unpack_param(ctx, ctx->param_merged_wave_info, 8, 8);
5956                         LLVMValueRef ena =
5957                                 LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
5958                                             ac_get_thread_id(&ctx->ac), num_threads, "");
5959                         lp_build_if(&ctx->merged_wrap_if_state, &ctx->gallivm, ena);
5960
5961                         /* The barrier must execute for all shaders in a
5962                          * threadgroup.
5963                          *
5964                          * Execute the barrier inside the conditional block,
5965                          * so that empty waves can jump directly to s_endpgm,
5966                          * which will also signal the barrier.
5967                          *
5968                          * If the shader is TCS and the TCS epilog is present
5969                          * and contains a barrier, it will wait there and then
5970                          * reach s_endpgm.
5971                          */
5972                         si_llvm_emit_barrier(NULL, bld_base, NULL);
5973                 }
5974         }
5975
5976         if (ctx->type == PIPE_SHADER_TESS_CTRL &&
5977             sel->tcs_info.tessfactors_are_def_in_all_invocs) {
5978                 for (unsigned i = 0; i < 6; i++) {
5979                         ctx->invoc0_tess_factors[i] =
5980                                 ac_build_alloca_undef(&ctx->ac, ctx->i32, "");
5981                 }
5982         }
5983
5984         if (ctx->type == PIPE_SHADER_GEOMETRY) {
5985                 int i;
5986                 for (i = 0; i < 4; i++) {
5987                         ctx->gs_next_vertex[i] =
5988                                 ac_build_alloca(&ctx->ac, ctx->i32, "");
5989                 }
5990         }
5991
5992         if (sel->force_correct_derivs_after_kill) {
5993                 ctx->postponed_kill = ac_build_alloca_undef(&ctx->ac, ctx->i1, "");
5994                 /* true = don't kill. */
5995                 LLVMBuildStore(ctx->ac.builder, ctx->i1true,
5996                                ctx->postponed_kill);
5997         }
5998
5999         if (sel->tokens) {
6000                 if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) {
6001                         fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
6002                         return false;
6003                 }
6004         } else {
6005                 if (!si_nir_build_llvm(ctx, sel->nir)) {
6006                         fprintf(stderr, "Failed to translate shader from NIR to LLVM\n");
6007                         return false;
6008                 }
6009         }
6010
6011         si_llvm_build_ret(ctx, ctx->return_value);
6012         return true;
6013 }
6014
6015 /**
6016  * Compute the VS prolog key, which contains all the information needed to
6017  * build the VS prolog function, and set shader->info bits where needed.
6018  *
6019  * \param info             Shader info of the vertex shader.
6020  * \param num_input_sgprs  Number of input SGPRs for the vertex shader.
6021  * \param prolog_key       Key of the VS prolog
6022  * \param shader_out       The vertex shader, or the next shader if merging LS+HS or ES+GS.
6023  * \param key              Output shader part key.
6024  */
6025 static void si_get_vs_prolog_key(const struct tgsi_shader_info *info,
6026                                  unsigned num_input_sgprs,
6027                                  const struct si_vs_prolog_bits *prolog_key,
6028                                  struct si_shader *shader_out,
6029                                  union si_shader_part_key *key)
6030 {
6031         memset(key, 0, sizeof(*key));
6032         key->vs_prolog.states = *prolog_key;
6033         key->vs_prolog.num_input_sgprs = num_input_sgprs;
6034         key->vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
6035         key->vs_prolog.as_ls = shader_out->key.as_ls;
6036         key->vs_prolog.as_es = shader_out->key.as_es;
6037
6038         if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) {
6039                 key->vs_prolog.as_ls = 1;
6040                 key->vs_prolog.num_merged_next_stage_vgprs = 2;
6041         } else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY) {
6042                 key->vs_prolog.as_es = 1;
6043                 key->vs_prolog.num_merged_next_stage_vgprs = 5;
6044         }
6045
6046         /* Enable loading the InstanceID VGPR. */
6047         uint16_t input_mask = u_bit_consecutive(0, info->num_inputs);
6048
6049         if ((key->vs_prolog.states.instance_divisor_is_one |
6050              key->vs_prolog.states.instance_divisor_is_fetched) & input_mask)
6051                 shader_out->info.uses_instanceid = true;
6052 }
6053
6054 /**
6055  * Compute the PS prolog key, which contains all the information needed to
6056  * build the PS prolog function, and set related bits in shader->config.
6057  */
6058 static void si_get_ps_prolog_key(struct si_shader *shader,
6059                                  union si_shader_part_key *key,
6060                                  bool separate_prolog)
6061 {
6062         struct tgsi_shader_info *info = &shader->selector->info;
6063
6064         memset(key, 0, sizeof(*key));
6065         key->ps_prolog.states = shader->key.part.ps.prolog;
6066         key->ps_prolog.colors_read = info->colors_read;
6067         key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
6068         key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
6069         key->ps_prolog.wqm = info->uses_derivatives &&
6070                 (key->ps_prolog.colors_read ||
6071                  key->ps_prolog.states.force_persp_sample_interp ||
6072                  key->ps_prolog.states.force_linear_sample_interp ||
6073                  key->ps_prolog.states.force_persp_center_interp ||
6074                  key->ps_prolog.states.force_linear_center_interp ||
6075                  key->ps_prolog.states.bc_optimize_for_persp ||
6076                  key->ps_prolog.states.bc_optimize_for_linear);
6077         key->ps_prolog.ancillary_vgpr_index = shader->info.ancillary_vgpr_index;
6078
6079         if (info->colors_read) {
6080                 unsigned *color = shader->selector->color_attr_index;
6081
6082                 if (shader->key.part.ps.prolog.color_two_side) {
6083                         /* BCOLORs are stored after the last input. */
6084                         key->ps_prolog.num_interp_inputs = info->num_inputs;
6085                         key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
6086                         shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
6087                 }
6088
6089                 for (unsigned i = 0; i < 2; i++) {
6090                         unsigned interp = info->input_interpolate[color[i]];
6091                         unsigned location = info->input_interpolate_loc[color[i]];
6092
6093                         if (!(info->colors_read & (0xf << i*4)))
6094                                 continue;
6095
6096                         key->ps_prolog.color_attr_index[i] = color[i];
6097
6098                         if (shader->key.part.ps.prolog.flatshade_colors &&
6099                             interp == TGSI_INTERPOLATE_COLOR)
6100                                 interp = TGSI_INTERPOLATE_CONSTANT;
6101
6102                         switch (interp) {
6103                         case TGSI_INTERPOLATE_CONSTANT:
6104                                 key->ps_prolog.color_interp_vgpr_index[i] = -1;
6105                                 break;
6106                         case TGSI_INTERPOLATE_PERSPECTIVE:
6107                         case TGSI_INTERPOLATE_COLOR:
6108                                 /* Force the interpolation location for colors here. */
6109                                 if (shader->key.part.ps.prolog.force_persp_sample_interp)
6110                                         location = TGSI_INTERPOLATE_LOC_SAMPLE;
6111                                 if (shader->key.part.ps.prolog.force_persp_center_interp)
6112                                         location = TGSI_INTERPOLATE_LOC_CENTER;
6113
6114                                 switch (location) {
6115                                 case TGSI_INTERPOLATE_LOC_SAMPLE:
6116                                         key->ps_prolog.color_interp_vgpr_index[i] = 0;
6117                                         shader->config.spi_ps_input_ena |=
6118                                                 S_0286CC_PERSP_SAMPLE_ENA(1);
6119                                         break;
6120                                 case TGSI_INTERPOLATE_LOC_CENTER:
6121                                         key->ps_prolog.color_interp_vgpr_index[i] = 2;
6122                                         shader->config.spi_ps_input_ena |=
6123                                                 S_0286CC_PERSP_CENTER_ENA(1);
6124                                         break;
6125                                 case TGSI_INTERPOLATE_LOC_CENTROID:
6126                                         key->ps_prolog.color_interp_vgpr_index[i] = 4;
6127                                         shader->config.spi_ps_input_ena |=
6128                                                 S_0286CC_PERSP_CENTROID_ENA(1);
6129                                         break;
6130                                 default:
6131                                         assert(0);
6132                                 }
6133                                 break;
6134                         case TGSI_INTERPOLATE_LINEAR:
6135                                 /* Force the interpolation location for colors here. */
6136                                 if (shader->key.part.ps.prolog.force_linear_sample_interp)
6137                                         location = TGSI_INTERPOLATE_LOC_SAMPLE;
6138                                 if (shader->key.part.ps.prolog.force_linear_center_interp)
6139                                         location = TGSI_INTERPOLATE_LOC_CENTER;
6140
6141                                 /* The VGPR assignment for non-monolithic shaders
6142                                  * works because InitialPSInputAddr is set on the
6143                                  * main shader and PERSP_PULL_MODEL is never used.
6144                                  */
6145                                 switch (location) {
6146                                 case TGSI_INTERPOLATE_LOC_SAMPLE:
6147                                         key->ps_prolog.color_interp_vgpr_index[i] =
6148                                                 separate_prolog ? 6 : 9;
6149                                         shader->config.spi_ps_input_ena |=
6150                                                 S_0286CC_LINEAR_SAMPLE_ENA(1);
6151                                         break;
6152                                 case TGSI_INTERPOLATE_LOC_CENTER:
6153                                         key->ps_prolog.color_interp_vgpr_index[i] =
6154                                                 separate_prolog ? 8 : 11;
6155                                         shader->config.spi_ps_input_ena |=
6156                                                 S_0286CC_LINEAR_CENTER_ENA(1);
6157                                         break;
6158                                 case TGSI_INTERPOLATE_LOC_CENTROID:
6159                                         key->ps_prolog.color_interp_vgpr_index[i] =
6160                                                 separate_prolog ? 10 : 13;
6161                                         shader->config.spi_ps_input_ena |=
6162                                                 S_0286CC_LINEAR_CENTROID_ENA(1);
6163                                         break;
6164                                 default:
6165                                         assert(0);
6166                                 }
6167                                 break;
6168                         default:
6169                                 assert(0);
6170                         }
6171                 }
6172         }
6173 }
6174
6175 /**
6176  * Check whether a PS prolog is required based on the key.
6177  */
6178 static bool si_need_ps_prolog(const union si_shader_part_key *key)
6179 {
6180         return key->ps_prolog.colors_read ||
6181                key->ps_prolog.states.force_persp_sample_interp ||
6182                key->ps_prolog.states.force_linear_sample_interp ||
6183                key->ps_prolog.states.force_persp_center_interp ||
6184                key->ps_prolog.states.force_linear_center_interp ||
6185                key->ps_prolog.states.bc_optimize_for_persp ||
6186                key->ps_prolog.states.bc_optimize_for_linear ||
6187                key->ps_prolog.states.poly_stipple ||
6188                key->ps_prolog.states.samplemask_log_ps_iter;
6189 }
6190
6191 /**
6192  * Compute the PS epilog key, which contains all the information needed to
6193  * build the PS epilog function.
6194  */
6195 static void si_get_ps_epilog_key(struct si_shader *shader,
6196                                  union si_shader_part_key *key)
6197 {
6198         struct tgsi_shader_info *info = &shader->selector->info;
6199         memset(key, 0, sizeof(*key));
6200         key->ps_epilog.colors_written = info->colors_written;
6201         key->ps_epilog.writes_z = info->writes_z;
6202         key->ps_epilog.writes_stencil = info->writes_stencil;
6203         key->ps_epilog.writes_samplemask = info->writes_samplemask;
6204         key->ps_epilog.states = shader->key.part.ps.epilog;
6205 }
6206
6207 /**
6208  * Build the GS prolog function. Rotate the input vertices for triangle strips
6209  * with adjacency.
6210  */
6211 static void si_build_gs_prolog_function(struct si_shader_context *ctx,
6212                                         union si_shader_part_key *key)
6213 {
6214         unsigned num_sgprs, num_vgprs;
6215         struct si_function_info fninfo;
6216         LLVMBuilderRef builder = ctx->ac.builder;
6217         LLVMTypeRef returns[48];
6218         LLVMValueRef func, ret;
6219
6220         si_init_function_info(&fninfo);
6221
6222         if (ctx->screen->info.chip_class >= GFX9) {
6223                 if (key->gs_prolog.states.gfx9_prev_is_vs)
6224                         num_sgprs = 8 + GFX9_VSGS_NUM_USER_SGPR;
6225                 else
6226                         num_sgprs = 8 + GFX9_TESGS_NUM_USER_SGPR;
6227                 num_vgprs = 5; /* ES inputs are not needed by GS */
6228         } else {
6229                 num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
6230                 num_vgprs = 8;
6231         }
6232
6233         for (unsigned i = 0; i < num_sgprs; ++i) {
6234                 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6235                 returns[i] = ctx->i32;
6236         }
6237
6238         for (unsigned i = 0; i < num_vgprs; ++i) {
6239                 add_arg(&fninfo, ARG_VGPR, ctx->i32);
6240                 returns[num_sgprs + i] = ctx->f32;
6241         }
6242
6243         /* Create the function. */
6244         si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs,
6245                            &fninfo, 0);
6246         func = ctx->main_fn;
6247
6248         /* Set the full EXEC mask for the prolog, because we are only fiddling
6249          * with registers here. The main shader part will set the correct EXEC
6250          * mask.
6251          */
6252         if (ctx->screen->info.chip_class >= GFX9 && !key->gs_prolog.is_monolithic)
6253                 ac_init_exec_full_mask(&ctx->ac);
6254
6255         /* Copy inputs to outputs. This should be no-op, as the registers match,
6256          * but it will prevent the compiler from overwriting them unintentionally.
6257          */
6258         ret = ctx->return_value;
6259         for (unsigned i = 0; i < num_sgprs; i++) {
6260                 LLVMValueRef p = LLVMGetParam(func, i);
6261                 ret = LLVMBuildInsertValue(builder, ret, p, i, "");
6262         }
6263         for (unsigned i = 0; i < num_vgprs; i++) {
6264                 LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
6265                 p = ac_to_float(&ctx->ac, p);
6266                 ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, "");
6267         }
6268
6269         if (key->gs_prolog.states.tri_strip_adj_fix) {
6270                 /* Remap the input vertices for every other primitive. */
6271                 const unsigned gfx6_vtx_params[6] = {
6272                         num_sgprs,
6273                         num_sgprs + 1,
6274                         num_sgprs + 3,
6275                         num_sgprs + 4,
6276                         num_sgprs + 5,
6277                         num_sgprs + 6
6278                 };
6279                 const unsigned gfx9_vtx_params[3] = {
6280                         num_sgprs,
6281                         num_sgprs + 1,
6282                         num_sgprs + 4,
6283                 };
6284                 LLVMValueRef vtx_in[6], vtx_out[6];
6285                 LLVMValueRef prim_id, rotate;
6286
6287                 if (ctx->screen->info.chip_class >= GFX9) {
6288                         for (unsigned i = 0; i < 3; i++) {
6289                                 vtx_in[i*2] = si_unpack_param(ctx, gfx9_vtx_params[i], 0, 16);
6290                                 vtx_in[i*2+1] = si_unpack_param(ctx, gfx9_vtx_params[i], 16, 16);
6291                         }
6292                 } else {
6293                         for (unsigned i = 0; i < 6; i++)
6294                                 vtx_in[i] = LLVMGetParam(func, gfx6_vtx_params[i]);
6295                 }
6296
6297                 prim_id = LLVMGetParam(func, num_sgprs + 2);
6298                 rotate = LLVMBuildTrunc(builder, prim_id, ctx->i1, "");
6299
6300                 for (unsigned i = 0; i < 6; ++i) {
6301                         LLVMValueRef base, rotated;
6302                         base = vtx_in[i];
6303                         rotated = vtx_in[(i + 4) % 6];
6304                         vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, "");
6305                 }
6306
6307                 if (ctx->screen->info.chip_class >= GFX9) {
6308                         for (unsigned i = 0; i < 3; i++) {
6309                                 LLVMValueRef hi, out;
6310
6311                                 hi = LLVMBuildShl(builder, vtx_out[i*2+1],
6312                                                   LLVMConstInt(ctx->i32, 16, 0), "");
6313                                 out = LLVMBuildOr(builder, vtx_out[i*2], hi, "");
6314                                 out = ac_to_float(&ctx->ac, out);
6315                                 ret = LLVMBuildInsertValue(builder, ret, out,
6316                                                            gfx9_vtx_params[i], "");
6317                         }
6318                 } else {
6319                         for (unsigned i = 0; i < 6; i++) {
6320                                 LLVMValueRef out;
6321
6322                                 out = ac_to_float(&ctx->ac, vtx_out[i]);
6323                                 ret = LLVMBuildInsertValue(builder, ret, out,
6324                                                            gfx6_vtx_params[i], "");
6325                         }
6326                 }
6327         }
6328
6329         LLVMBuildRet(builder, ret);
6330 }
6331
6332 /**
6333  * Given a list of shader part functions, build a wrapper function that
6334  * runs them in sequence to form a monolithic shader.
6335  */
6336 static void si_build_wrapper_function(struct si_shader_context *ctx,
6337                                       LLVMValueRef *parts,
6338                                       unsigned num_parts,
6339                                       unsigned main_part,
6340                                       unsigned next_shader_first_part)
6341 {
6342         LLVMBuilderRef builder = ctx->ac.builder;
6343         /* PS epilog has one arg per color component; gfx9 merged shader
6344          * prologs need to forward 32 user SGPRs.
6345          */
6346         struct si_function_info fninfo;
6347         LLVMValueRef initial[64], out[64];
6348         LLVMTypeRef function_type;
6349         unsigned num_first_params;
6350         unsigned num_out, initial_num_out;
6351         MAYBE_UNUSED unsigned num_out_sgpr; /* used in debug checks */
6352         MAYBE_UNUSED unsigned initial_num_out_sgpr; /* used in debug checks */
6353         unsigned num_sgprs, num_vgprs;
6354         unsigned gprs;
6355         struct lp_build_if_state if_state;
6356
6357         si_init_function_info(&fninfo);
6358
6359         for (unsigned i = 0; i < num_parts; ++i) {
6360                 ac_add_function_attr(ctx->ac.context, parts[i], -1,
6361                                      AC_FUNC_ATTR_ALWAYSINLINE);
6362                 LLVMSetLinkage(parts[i], LLVMPrivateLinkage);
6363         }
6364
6365         /* The parameters of the wrapper function correspond to those of the
6366          * first part in terms of SGPRs and VGPRs, but we use the types of the
6367          * main part to get the right types. This is relevant for the
6368          * dereferenceable attribute on descriptor table pointers.
6369          */
6370         num_sgprs = 0;
6371         num_vgprs = 0;
6372
6373         function_type = LLVMGetElementType(LLVMTypeOf(parts[0]));
6374         num_first_params = LLVMCountParamTypes(function_type);
6375
6376         for (unsigned i = 0; i < num_first_params; ++i) {
6377                 LLVMValueRef param = LLVMGetParam(parts[0], i);
6378
6379                 if (ac_is_sgpr_param(param)) {
6380                         assert(num_vgprs == 0);
6381                         num_sgprs += ac_get_type_size(LLVMTypeOf(param)) / 4;
6382                 } else {
6383                         num_vgprs += ac_get_type_size(LLVMTypeOf(param)) / 4;
6384                 }
6385         }
6386
6387         gprs = 0;
6388         while (gprs < num_sgprs + num_vgprs) {
6389                 LLVMValueRef param = LLVMGetParam(parts[main_part], fninfo.num_params);
6390                 LLVMTypeRef type = LLVMTypeOf(param);
6391                 unsigned size = ac_get_type_size(type) / 4;
6392
6393                 add_arg(&fninfo, gprs < num_sgprs ? ARG_SGPR : ARG_VGPR, type);
6394
6395                 assert(ac_is_sgpr_param(param) == (gprs < num_sgprs));
6396                 assert(gprs + size <= num_sgprs + num_vgprs &&
6397                        (gprs >= num_sgprs || gprs + size <= num_sgprs));
6398
6399                 gprs += size;
6400         }
6401
6402         si_create_function(ctx, "wrapper", NULL, 0, &fninfo,
6403                            si_get_max_workgroup_size(ctx->shader));
6404
6405         if (is_merged_shader(ctx))
6406                 ac_init_exec_full_mask(&ctx->ac);
6407
6408         /* Record the arguments of the function as if they were an output of
6409          * a previous part.
6410          */
6411         num_out = 0;
6412         num_out_sgpr = 0;
6413
6414         for (unsigned i = 0; i < fninfo.num_params; ++i) {
6415                 LLVMValueRef param = LLVMGetParam(ctx->main_fn, i);
6416                 LLVMTypeRef param_type = LLVMTypeOf(param);
6417                 LLVMTypeRef out_type = i < fninfo.num_sgpr_params ? ctx->i32 : ctx->f32;
6418                 unsigned size = ac_get_type_size(param_type) / 4;
6419
6420                 if (size == 1) {
6421                         if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
6422                                 param = LLVMBuildPtrToInt(builder, param, ctx->i32, "");
6423                                 param_type = ctx->i32;
6424                         }
6425
6426                         if (param_type != out_type)
6427                                 param = LLVMBuildBitCast(builder, param, out_type, "");
6428                         out[num_out++] = param;
6429                 } else {
6430                         LLVMTypeRef vector_type = LLVMVectorType(out_type, size);
6431
6432                         if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
6433                                 param = LLVMBuildPtrToInt(builder, param, ctx->i64, "");
6434                                 param_type = ctx->i64;
6435                         }
6436
6437                         if (param_type != vector_type)
6438                                 param = LLVMBuildBitCast(builder, param, vector_type, "");
6439
6440                         for (unsigned j = 0; j < size; ++j)
6441                                 out[num_out++] = LLVMBuildExtractElement(
6442                                         builder, param, LLVMConstInt(ctx->i32, j, 0), "");
6443                 }
6444
6445                 if (i < fninfo.num_sgpr_params)
6446                         num_out_sgpr = num_out;
6447         }
6448
6449         memcpy(initial, out, sizeof(out));
6450         initial_num_out = num_out;
6451         initial_num_out_sgpr = num_out_sgpr;
6452
6453         /* Now chain the parts. */
6454         for (unsigned part = 0; part < num_parts; ++part) {
6455                 LLVMValueRef in[48];
6456                 LLVMValueRef ret;
6457                 LLVMTypeRef ret_type;
6458                 unsigned out_idx = 0;
6459                 unsigned num_params = LLVMCountParams(parts[part]);
6460
6461                 /* Merged shaders are executed conditionally depending
6462                  * on the number of enabled threads passed in the input SGPRs. */
6463                 if (is_merged_shader(ctx) && part == 0) {
6464                         LLVMValueRef ena, count = initial[3];
6465
6466                         count = LLVMBuildAnd(builder, count,
6467                                              LLVMConstInt(ctx->i32, 0x7f, 0), "");
6468                         ena = LLVMBuildICmp(builder, LLVMIntULT,
6469                                             ac_get_thread_id(&ctx->ac), count, "");
6470                         lp_build_if(&if_state, &ctx->gallivm, ena);
6471                 }
6472
6473                 /* Derive arguments for the next part from outputs of the
6474                  * previous one.
6475                  */
6476                 for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) {
6477                         LLVMValueRef param;
6478                         LLVMTypeRef param_type;
6479                         bool is_sgpr;
6480                         unsigned param_size;
6481                         LLVMValueRef arg = NULL;
6482
6483                         param = LLVMGetParam(parts[part], param_idx);
6484                         param_type = LLVMTypeOf(param);
6485                         param_size = ac_get_type_size(param_type) / 4;
6486                         is_sgpr = ac_is_sgpr_param(param);
6487
6488                         if (is_sgpr) {
6489                                 ac_add_function_attr(ctx->ac.context, parts[part],
6490                                                      param_idx + 1, AC_FUNC_ATTR_INREG);
6491                         } else if (out_idx < num_out_sgpr) {
6492                                 /* Skip returned SGPRs the current part doesn't
6493                                  * declare on the input. */
6494                                 out_idx = num_out_sgpr;
6495                         }
6496
6497                         assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out));
6498
6499                         if (param_size == 1)
6500                                 arg = out[out_idx];
6501                         else
6502                                 arg = ac_build_gather_values(&ctx->ac, &out[out_idx], param_size);
6503
6504                         if (LLVMTypeOf(arg) != param_type) {
6505                                 if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
6506                                         if (LLVMGetPointerAddressSpace(param_type) ==
6507                                             AC_ADDR_SPACE_CONST_32BIT) {
6508                                                 arg = LLVMBuildBitCast(builder, arg, ctx->i32, "");
6509                                                 arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
6510                                         } else {
6511                                                 arg = LLVMBuildBitCast(builder, arg, ctx->i64, "");
6512                                                 arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
6513                                         }
6514                                 } else {
6515                                         arg = LLVMBuildBitCast(builder, arg, param_type, "");
6516                                 }
6517                         }
6518
6519                         in[param_idx] = arg;
6520                         out_idx += param_size;
6521                 }
6522
6523                 ret = LLVMBuildCall(builder, parts[part], in, num_params, "");
6524
6525                 if (is_merged_shader(ctx) &&
6526                     part + 1 == next_shader_first_part) {
6527                         lp_build_endif(&if_state);
6528
6529                         /* The second half of the merged shader should use
6530                          * the inputs from the toplevel (wrapper) function,
6531                          * not the return value from the last call.
6532                          *
6533                          * That's because the last call was executed condi-
6534                          * tionally, so we can't consume it in the main
6535                          * block.
6536                          */
6537                         memcpy(out, initial, sizeof(initial));
6538                         num_out = initial_num_out;
6539                         num_out_sgpr = initial_num_out_sgpr;
6540                         continue;
6541                 }
6542
6543                 /* Extract the returned GPRs. */
6544                 ret_type = LLVMTypeOf(ret);
6545                 num_out = 0;
6546                 num_out_sgpr = 0;
6547
6548                 if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) {
6549                         assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind);
6550
6551                         unsigned ret_size = LLVMCountStructElementTypes(ret_type);
6552
6553                         for (unsigned i = 0; i < ret_size; ++i) {
6554                                 LLVMValueRef val =
6555                                         LLVMBuildExtractValue(builder, ret, i, "");
6556
6557                                 assert(num_out < ARRAY_SIZE(out));
6558                                 out[num_out++] = val;
6559
6560                                 if (LLVMTypeOf(val) == ctx->i32) {
6561                                         assert(num_out_sgpr + 1 == num_out);
6562                                         num_out_sgpr = num_out;
6563                                 }
6564                         }
6565                 }
6566         }
6567
6568         LLVMBuildRetVoid(builder);
6569 }
6570
6571 static bool si_should_optimize_less(struct ac_llvm_compiler *compiler,
6572                                     struct si_shader_selector *sel)
6573 {
6574         /* Whether "sel" should be compiled with fewer optimizations: needs
6575          * compiler->low_opt_passes and is limited to huge compute shaders
6576          * (a dEQP test has 2597 memory opcodes, mostly buffer stores). */
6577         if (compiler->low_opt_passes) {
6578                 /* Assume a slow CPU. */
6579                 assert(!sel->screen->info.has_dedicated_vram &&
6580                        sel->screen->info.chip_class <= VI);
6581                 return sel->type == PIPE_SHADER_COMPUTE &&
6582                        sel->info.num_memory_instructions > 1000;
6583         }
6584         return false;
6585 }
6586
     /**
      * Compile the main part of a shader (the whole shader if it is
      * monolithic) and fill in shader->binary, shader->config and parts of
      * shader->info.
      *
      * The selector's TGSI tokens or NIR are translated by
      * si_compile_tgsi_main(). For monolithic VS/TCS/GS/PS the required
      * prolog/epilog functions (and, on GFX9 and later, the LS or ES main
      * part) are built as well and chained by si_build_wrapper_function().
      * The module is then optimized and compiled by si_compile_llvm().
      *
      * For compute shaders the reported SGPR/VGPR usage is checked against
      * limits computed here; on violation the process is aborted unless the
      * SI_PASS_BAD_SHADERS option is set.
      *
      * \param sscreen   screen
      * \param compiler  LLVM compiler used for this compilation
      * \param shader    shader to compile; its selector and key are read and
      *                  its binary, config and info are written
      * \param debug     debug callback (tested against NULL before use here)
      * \return          0 on success, -1 if si_compile_tgsi_main() fails,
      *                  otherwise the non-zero result of si_compile_llvm()
      */
6587 int si_compile_tgsi_shader(struct si_screen *sscreen,
6588                            struct ac_llvm_compiler *compiler,
6589                            struct si_shader *shader,
6590                            struct pipe_debug_callback *debug)
6591 {
6592         struct si_shader_selector *sel = shader->selector;
6593         struct si_shader_context ctx;
6594         int r = -1;
6595
6596         /* Dump TGSI code before doing TGSI->LLVM conversion in case the
6597          * conversion fails. */
6598         if (si_can_dump_shader(sscreen, sel->info.processor) &&
6599             !(sscreen->debug_flags & DBG(NO_TGSI))) {
6600                 if (sel->tokens)
6601                         tgsi_dump(sel->tokens, 0);
6602                 else
6603                         nir_print_shader(sel->nir, stderr);
6604                 si_dump_streamout(&sel->so);
6605         }
6606
6607         si_init_shader_ctx(&ctx, sscreen, compiler);
6608         si_llvm_context_set_tgsi(&ctx, shader);
6609
             /* Every byte of the offset table starts as AC_EXP_PARAM_UNDEFINED. */
6610         memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
6611                sizeof(shader->info.vs_output_param_offset));
6612
6613         shader->info.uses_instanceid = sel->info.uses_instanceid;
6614
6615         if (!si_compile_tgsi_main(&ctx)) {
6616                 si_llvm_dispose(&ctx);
6617                 return -1;
6618         }
6619
             /* Monolithic shaders: build the extra parts next to the main
              * function and call them in order from a wrapper. After every
              * build call below the new function is read back from
              * ctx.main_fn, so the current main function is saved into
              * "parts" before the next build call.
              *
              * The wrapper receives the parts array, the number of parts, the
              * index of the main part and the index of the first part of the
              * second merged shader (callers without a merged shader pass 0).
              */
6620         if (shader->is_monolithic && ctx.type == PIPE_SHADER_VERTEX) {
6621                 LLVMValueRef parts[2];
6622                 bool need_prolog = sel->vs_needs_prolog;
6623
6624                 parts[1] = ctx.main_fn;
6625
6626                 if (need_prolog) {
6627                         union si_shader_part_key prolog_key;
6628                         si_get_vs_prolog_key(&sel->info,
6629                                              shader->info.num_input_sgprs,
6630                                              &shader->key.part.vs.prolog,
6631                                              shader, &prolog_key);
6632                         si_build_vs_prolog_function(&ctx, &prolog_key);
6633                         parts[0] = ctx.main_fn;
6634                 }
6635
                     /* parts[0] is only filled in when a prolog is needed;
                      * otherwise the array handed to the wrapper starts at
                      * parts[1]. */
6636                 si_build_wrapper_function(&ctx, parts + !need_prolog,
6637                                           1 + need_prolog, need_prolog, 0);
6638         } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
6639                 if (sscreen->info.chip_class >= GFX9) {
6640                         struct si_shader_selector *ls = shader->key.part.tcs.ls;
6641                         LLVMValueRef parts[4];
6642                         bool vs_needs_prolog =
6643                                 si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog);
6644
6645                         /* TCS main part */
6646                         parts[2] = ctx.main_fn;
6647
6648                         /* TCS epilog */
6649                         union si_shader_part_key tcs_epilog_key;
6650                         memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key));
6651                         tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
6652                         si_build_tcs_epilog_function(&ctx, &tcs_epilog_key);
6653                         parts[3] = ctx.main_fn;
6654
6655                         /* VS as LS main part */
                             /* A temporary shader object describes the VS
                              * compiled as LS; it takes the mono and opt keys
                              * of the TCS being compiled. */
6656                         struct si_shader shader_ls = {};
6657                         shader_ls.selector = ls;
6658                         shader_ls.key.as_ls = 1;
6659                         shader_ls.key.mono = shader->key.mono;
6660                         shader_ls.key.opt = shader->key.opt;
6661                         shader_ls.is_monolithic = true;
6662                         si_llvm_context_set_tgsi(&ctx, &shader_ls);
6663
6664                         if (!si_compile_tgsi_main(&ctx)) {
6665                                 si_llvm_dispose(&ctx);
6666                                 return -1;
6667                         }
6668                         shader->info.uses_instanceid |= ls->info.uses_instanceid;
6669                         parts[1] = ctx.main_fn;
6670
6671                         /* LS prolog */
6672                         if (vs_needs_prolog) {
6673                                 union si_shader_part_key vs_prolog_key;
6674                                 si_get_vs_prolog_key(&ls->info,
6675                                                      shader_ls.info.num_input_sgprs,
6676                                                      &shader->key.part.tcs.ls_prolog,
6677                                                      shader, &vs_prolog_key);
6678                                 vs_prolog_key.vs_prolog.is_monolithic = true;
6679                                 si_build_vs_prolog_function(&ctx, &vs_prolog_key);
6680                                 parts[0] = ctx.main_fn;
6681                         }
6682
6683                         /* Reset the shader context. */
6684                         ctx.shader = shader;
6685                         ctx.type = PIPE_SHADER_TESS_CTRL;
6686
                             /* Slot 0 (LS prolog) is optional. Without it the
                              * wrapper gets parts + 1, which also lowers the
                              * main-part index and the index of the first TCS
                              * part by one. */
6687                         si_build_wrapper_function(&ctx,
6688                                                   parts + !vs_needs_prolog,
6689                                                   4 - !vs_needs_prolog, vs_needs_prolog,
6690                                                   vs_needs_prolog ? 2 : 1);
6691                 } else {
6692                         LLVMValueRef parts[2];
6693                         union si_shader_part_key epilog_key;
6694
                             /* Order: TCS main part, then TCS epilog. */
6695                         parts[0] = ctx.main_fn;
6696
6697                         memset(&epilog_key, 0, sizeof(epilog_key));
6698                         epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
6699                         si_build_tcs_epilog_function(&ctx, &epilog_key);
6700                         parts[1] = ctx.main_fn;
6701
6702                         si_build_wrapper_function(&ctx, parts, 2, 0, 0);
6703                 }
6704         } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) {
6705                 if (ctx.screen->info.chip_class >= GFX9) {
6706                         struct si_shader_selector *es = shader->key.part.gs.es;
6707                         LLVMValueRef es_prolog = NULL;
6708                         LLVMValueRef es_main = NULL;
6709                         LLVMValueRef gs_prolog = NULL;
6710                         LLVMValueRef gs_main = ctx.main_fn;
6711
6712                         /* GS prolog */
6713                         union si_shader_part_key gs_prolog_key;
6714                         memset(&gs_prolog_key, 0, sizeof(gs_prolog_key));
6715                         gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
6716                         gs_prolog_key.gs_prolog.is_monolithic = true;
6717                         si_build_gs_prolog_function(&ctx, &gs_prolog_key);
6718                         gs_prolog = ctx.main_fn;
6719
6720                         /* ES main part */
                             /* As for LS above: a temporary shader object for
                              * the ES stage that takes the mono and opt keys
                              * of the GS being compiled. */
6721                         struct si_shader shader_es = {};
6722                         shader_es.selector = es;
6723                         shader_es.key.as_es = 1;
6724                         shader_es.key.mono = shader->key.mono;
6725                         shader_es.key.opt = shader->key.opt;
6726                         shader_es.is_monolithic = true;
6727                         si_llvm_context_set_tgsi(&ctx, &shader_es);
6728
6729                         if (!si_compile_tgsi_main(&ctx)) {
6730                                 si_llvm_dispose(&ctx);
6731                                 return -1;
6732                         }
6733                         shader->info.uses_instanceid |= es->info.uses_instanceid;
6734                         es_main = ctx.main_fn;
6735
6736                         /* ES prolog */
6737                         if (es->vs_needs_prolog) {
6738                                 union si_shader_part_key vs_prolog_key;
6739                                 si_get_vs_prolog_key(&es->info,
6740                                                      shader_es.info.num_input_sgprs,
6741                                                      &shader->key.part.gs.vs_prolog,
6742                                                      shader, &vs_prolog_key);
6743                                 vs_prolog_key.vs_prolog.is_monolithic = true;
6744                                 si_build_vs_prolog_function(&ctx, &vs_prolog_key);
6745                                 es_prolog = ctx.main_fn;
6746                         }
6747
6748                         /* Reset the shader context. */
6749                         ctx.shader = shader;
6750                         ctx.type = PIPE_SHADER_GEOMETRY;
6751
6752                         /* Prepare the array of shader parts. */
6753                         LLVMValueRef parts[4];
6754                         unsigned num_parts = 0, main_part, next_first_part;
6755
6756                         if (es_prolog)
6757                                 parts[num_parts++] = es_prolog;
6758
                             /* While appending, remember the index of the ES
                              * main part and of the first GS part. */
6759                         parts[main_part = num_parts++] = es_main;
6760                         parts[next_first_part = num_parts++] = gs_prolog;
6761                         parts[num_parts++] = gs_main;
6762
6763                         si_build_wrapper_function(&ctx, parts, num_parts,
6764                                                   main_part, next_first_part);
6765                 } else {
6766                         LLVMValueRef parts[2];
6767                         union si_shader_part_key prolog_key;
6768
                             /* Order: GS prolog, then GS main part (index 1). */
6769                         parts[1] = ctx.main_fn;
6770
6771                         memset(&prolog_key, 0, sizeof(prolog_key));
6772                         prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
6773                         si_build_gs_prolog_function(&ctx, &prolog_key);
6774                         parts[0] = ctx.main_fn;
6775
6776                         si_build_wrapper_function(&ctx, parts, 2, 1, 0);
6777                 }
6778         } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) {
6779                 LLVMValueRef parts[3];
6780                 union si_shader_part_key prolog_key;
6781                 union si_shader_part_key epilog_key;
6782                 bool need_prolog;
6783
6784                 si_get_ps_prolog_key(shader, &prolog_key, false);
6785                 need_prolog = si_need_ps_prolog(&prolog_key);
6786
                     /* Order: prolog (optional), main part, epilog. The main
                      * part and the epilog move down one slot when there is
                      * no prolog. */
6787                 parts[need_prolog ? 1 : 0] = ctx.main_fn;
6788
6789                 if (need_prolog) {
6790                         si_build_ps_prolog_function(&ctx, &prolog_key);
6791                         parts[0] = ctx.main_fn;
6792                 }
6793
6794                 si_get_ps_epilog_key(shader, &epilog_key);
6795                 si_build_ps_epilog_function(&ctx, &epilog_key);
6796                 parts[need_prolog ? 2 : 1] = ctx.main_fn;
6797
6798                 si_build_wrapper_function(&ctx, parts, need_prolog ? 3 : 2,
6799                                           need_prolog ? 1 : 0, 0);
6800         }
6801
6802         si_llvm_optimize_module(&ctx);
6803
6804         /* Post-optimization transformations and analysis. */
6805         si_optimize_vs_outputs(&ctx);
6806
             /* The private memory VGPR count is only computed when there is a
              * debug message callback or the shader can be dumped. */
6807         if ((debug && debug->debug_message) ||
6808             si_can_dump_shader(sscreen, ctx.type)) {
6809                 ctx.shader->config.private_mem_vgprs =
6810                         ac_count_scratch_private_memory(ctx.main_fn);
6811         }
6812
6813         /* Make sure the input is a pointer and not integer followed by inttoptr. */
6814         assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx.main_fn, 0))) ==
6815                LLVMPointerTypeKind);
6816
6817         /* Compile to bytecode. */
6818         r = si_compile_llvm(sscreen, &shader->binary, &shader->config, compiler,
6819                             ctx.ac.module, debug, ctx.type,
6820                             si_get_shader_name(shader, ctx.type),
6821                             si_should_optimize_less(compiler, shader->selector));
             /* The context is disposed whether or not compilation succeeded. */
6822         si_llvm_dispose(&ctx);
6823         if (r) {
6824                 fprintf(stderr, "LLVM failed to compile shader\n");
6825                 return r;
6826         }
6827
6828         /* Validate SGPR and VGPR usage for compute to detect compiler bugs.
6829          * LLVM 3.9svn has this bug.
6830          */
6831         if (sel->type == PIPE_SHADER_COMPUTE) {
6832                 unsigned wave_size = 64;
6833                 unsigned max_vgprs = 256;
6834                 unsigned max_sgprs = sscreen->info.chip_class >= VI ? 800 : 512;
6835                 unsigned max_sgprs_per_wave = 128;
6836                 unsigned max_block_threads = si_get_max_workgroup_size(shader);
                     /* Waves needed for the largest workgroup; the division
                      * by 4 converts waves per CU into waves per SIMD. */
6837                 unsigned min_waves_per_cu = DIV_ROUND_UP(max_block_threads, wave_size);
6838                 unsigned min_waves_per_simd = DIV_ROUND_UP(min_waves_per_cu, 4);
6839
                     /* The register budgets are split between the waves that
                      * must fit on one SIMD; SGPRs are additionally capped
                      * per wave. */
6840                 max_vgprs = max_vgprs / min_waves_per_simd;
6841                 max_sgprs = MIN2(max_sgprs / min_waves_per_simd, max_sgprs_per_wave);
6842
6843                 if (shader->config.num_sgprs > max_sgprs ||
6844                     shader->config.num_vgprs > max_vgprs) {
6845                         fprintf(stderr, "LLVM failed to compile a shader correctly: "
6846                                 "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
6847                                 shader->config.num_sgprs, shader->config.num_vgprs,
6848                                 max_sgprs, max_vgprs);
6849
6850                         /* Just terminate the process, because dependent
6851                          * shaders can hang due to bad input data, but use
6852                          * the env var to allow shader-db to work.
6853                          */
6854                         if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
6855                                 abort();
6856                 }
6857         }
6858
6859         /* Add the scratch offset to input SGPRs. */
             /* NOTE(review): ctx is still read here and below after
              * si_llvm_dispose(); this relies on ctx.type and whatever
              * is_merged_shader() reads staying valid after dispose - confirm. */
6860         if (shader->config.scratch_bytes_per_wave && !is_merged_shader(&ctx))
6861                 shader->info.num_input_sgprs += 1; /* scratch byte offset */
6862
6863         /* Calculate the number of fragment input VGPRs. */
             /* Each enabled field of spi_ps_input_addr adds its VGPR count in
              * the order below; the running total at FRONT_FACE and ANCILLARY
              * is recorded as the VGPR index of that input (-1 if disabled). */
6864         if (ctx.type == PIPE_SHADER_FRAGMENT) {
6865                 shader->info.num_input_vgprs = 0;
6866                 shader->info.face_vgpr_index = -1;
6867                 shader->info.ancillary_vgpr_index = -1;
6868
6869                 if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr))
6870                         shader->info.num_input_vgprs += 2;
6871                 if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr))
6872                         shader->info.num_input_vgprs += 2;
6873                 if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr))
6874                         shader->info.num_input_vgprs += 2;
6875                 if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr))
6876                         shader->info.num_input_vgprs += 3;
6877                 if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr))
6878                         shader->info.num_input_vgprs += 2;
6879                 if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr))
6880                         shader->info.num_input_vgprs += 2;
6881                 if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr))
6882                         shader->info.num_input_vgprs += 2;
6883                 if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr))
6884                         shader->info.num_input_vgprs += 1;
6885                 if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr))
6886                         shader->info.num_input_vgprs += 1;
6887                 if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr))
6888                         shader->info.num_input_vgprs += 1;
6889                 if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr))
6890                         shader->info.num_input_vgprs += 1;
6891                 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr))
6892                         shader->info.num_input_vgprs += 1;
6893                 if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) {
6894                         shader->info.face_vgpr_index = shader->info.num_input_vgprs;
6895                         shader->info.num_input_vgprs += 1;
6896                 }
6897                 if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr)) {
6898                         shader->info.ancillary_vgpr_index = shader->info.num_input_vgprs;
6899                         shader->info.num_input_vgprs += 1;
6900                 }
6901                 if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr))
6902                         shader->info.num_input_vgprs += 1;
6903                 if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr))
6904                         shader->info.num_input_vgprs += 1;
6905         }
6906
6907         si_calculate_max_simd_waves(shader);
6908         si_shader_dump_stats_for_shader_db(shader, debug);
6909         return 0;
6910 }
6911
6912 /**
6913  * Create, compile and return a shader part (prolog or epilog).
      *
      * The list is searched first for a part whose key compares equal; a new
      * part is only built and compiled when none is found. The whole
      * operation runs with sscreen->shader_parts_mutex held, and a newly
      * compiled part is pushed to the front of the list.
6914  *
6915  * \param sscreen       screen
6916  * \param list          list of shader parts of the same category
6917  * \param type          shader type
6918  * \param prolog        whether the part being requested is a prolog
6919  * \param key           shader part key
6920  * \param compiler      LLVM compiler used to compile a new part
6921  * \param debug         debug callback
6922  * \param build         the callback responsible for building the main function
      * \param name          name passed to si_compile_llvm() for a new part
6923  * \return              non-NULL on success, NULL if allocating or compiling
      *                      a new part fails
6924  */
6925 static struct si_shader_part *
6926 si_get_shader_part(struct si_screen *sscreen,
6927                    struct si_shader_part **list,
6928                    enum pipe_shader_type type,
6929                    bool prolog,
6930                    union si_shader_part_key *key,
6931                    struct ac_llvm_compiler *compiler,
6932                    struct pipe_debug_callback *debug,
6933                    void (*build)(struct si_shader_context *,
6934                                  union si_shader_part_key *),
6935                    const char *name)
6936 {
6937         struct si_shader_part *result;
6938
6939         mtx_lock(&sscreen->shader_parts_mutex);
6940
6941         /* Find existing. */
6942         for (result = *list; result; result = result->next) {
6943                 if (memcmp(&result->key, key, sizeof(*key)) == 0) {
6944                         mtx_unlock(&sscreen->shader_parts_mutex);
6945                         return result;
6946                 }
6947         }
6948
6949         /* Compile a new one. */
6950         result = CALLOC_STRUCT(si_shader_part);
             if (!result) {
                     /* Out of memory: fail like a compile error would instead
                      * of dereferencing NULL. No shader context exists yet,
                      * so only the mutex has to be released. */
                     mtx_unlock(&sscreen->shader_parts_mutex);
                     return NULL;
             }
6951         result->key = *key;
6952
6953         struct si_shader shader = {};
6954         struct si_shader_context ctx;
6955
6956         si_init_shader_ctx(&ctx, sscreen, compiler);
6957         ctx.shader = &shader;
6958         ctx.type = type;
6959
             /* Copy into the temporary shader the key bits that this part's
              * build callback is given through ctx.shader. */
6960         switch (type) {
6961         case PIPE_SHADER_VERTEX:
6962                 shader.key.as_ls = key->vs_prolog.as_ls;
6963                 shader.key.as_es = key->vs_prolog.as_es;
6964                 break;
6965         case PIPE_SHADER_TESS_CTRL:
6966                 assert(!prolog);
6967                 shader.key.part.tcs.epilog = key->tcs_epilog.states;
6968                 break;
6969         case PIPE_SHADER_GEOMETRY:
6970                 assert(prolog);
6971                 break;
6972         case PIPE_SHADER_FRAGMENT:
6973                 if (prolog)
6974                         shader.key.part.ps.prolog = key->ps_prolog.states;
6975                 else
6976                         shader.key.part.ps.epilog = key->ps_epilog.states;
6977                 break;
6978         default:
6979                 unreachable("bad shader part");
6980         }
6981
6982         build(&ctx, key);
6983
6984         /* Compile. */
6985         si_llvm_optimize_module(&ctx);
6986
6987         if (si_compile_llvm(sscreen, &result->binary, &result->config, compiler,
6988                             ctx.ac.module, debug, ctx.type, name, false)) {
6989                 FREE(result);
6990                 result = NULL;
6991                 goto out;
6992         }
6993
             /* Publish the new part at the head of the list. */
6994         result->next = *list;
6995         *list = result;
6996
6997 out:
6998         si_llvm_dispose(&ctx);
6999         mtx_unlock(&sscreen->shader_parts_mutex);
7000         return result;
7001 }
7002
7003 static LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx)
7004 {
7005         /* Read the RW-buffers SGPR (8 parameters further in for merged
7006          * shaders) and cast it to the v4i32 const32 array pointer type. */
7007         unsigned param = (is_merged_shader(ctx) ? 8 : 0) + SI_SGPR_RW_BUFFERS;
7008         LLVMValueRef sgpr = LLVMGetParam(ctx->main_fn, param);
7009         LLVMTypeRef list_type = ac_array_in_const32_addr_space(ctx->v4i32);
7010
7011         return LLVMBuildIntToPtr(ctx->ac.builder, sgpr, list_type, "");
7012 }
7013
7014 /**
7015  * Build the vertex shader prolog function.
7016  *
7017  * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
7018  * All inputs are returned unmodified. The vertex load indices are
7019  * stored after them, which will be used by the API VS for fetching inputs.
7020  *
7021  * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
7022  *   input_v0,
7023  *   input_v1,
7024  *   input_v2,
7025  *   input_v3,
7026  *   (VertexID + BaseVertex),
7027  *   (InstanceID + StartInstance),
7028  *   (InstanceID / 2 + StartInstance)
      *
      * \param ctx  shader context; the created function is read back from
      *             ctx->main_fn after si_create_function()
      * \param key  shader part key; only key->vs_prolog is read
7029  */
7030 static void si_build_vs_prolog_function(struct si_shader_context *ctx,
7031                                         union si_shader_part_key *key)
7032 {
7033         struct si_function_info fninfo;
7034         LLVMTypeRef *returns;
7035         LLVMValueRef ret, func;
7036         int num_returns, i;
             /* The VS's own VGPRs come after the merged next-stage VGPRs. */
7037         unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs;
7038         unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4;
             /* NOTE(review): fixed size 9 assumes
              * num_merged_next_stage_vgprs <= 5 - confirm. */
7039         LLVMValueRef input_vgprs[9];
7040         unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs +
7041                                       num_input_vgprs;
             /* User SGPR parameters start at index 8 when there are merged
              * next-stage VGPRs, otherwise at 0. */
7042         unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;
7043
7044         si_init_function_info(&fninfo);
7045
7046         /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
             /* One return slot per input register plus one per vertex load
              * index (last_input + 1 of them). */
7047         returns = alloca((num_all_input_regs + key->vs_prolog.last_input + 1) *
7048                          sizeof(LLVMTypeRef));
7049         num_returns = 0;
7050
7051         /* Declare input and output SGPRs. */
7052         for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
7053                 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7054                 returns[num_returns++] = ctx->i32;
7055         }
7056
7057         /* Preloaded VGPRs (outputs must be floats) */
7058         for (i = 0; i < num_input_vgprs; i++) {
7059                 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &input_vgprs[i]);
7060                 returns[num_returns++] = ctx->f32;
7061         }
7062
7063         /* Vertex load indices. */
7064         for (i = 0; i <= key->vs_prolog.last_input; i++)
7065                 returns[num_returns++] = ctx->f32;
7066
7067         /* Create the function. */
7068         si_create_function(ctx, "vs_prolog", returns, num_returns, &fninfo, 0);
7069         func = ctx->main_fn;
7070
7071         if (key->vs_prolog.num_merged_next_stage_vgprs) {
                     /* Only done for non-monolithic prologs; presumably sets
                      * up EXEC from input parameter 3 - confirm. */
7072                 if (!key->vs_prolog.is_monolithic)
7073                         si_init_exec_from_input(ctx, 3, 0);
7074
7075                 if (key->vs_prolog.as_ls &&
7076                     ctx->screen->has_ls_vgpr_init_bug) {
7077                         /* If there are no HS threads, SPI loads the LS VGPRs
7078                          * starting at VGPR 0. Shift them back to where they
7079                          * belong.
7080                          */
7081                         LLVMValueRef has_hs_threads =
7082                                 LLVMBuildICmp(ctx->ac.builder, LLVMIntNE,
7083                                     si_unpack_param(ctx, 3, 8, 8),
7084                                     ctx->i32_0, "");
7085
                             /* input_vgprs[5..2] take the value from two
                              * slots lower when has_hs_threads is false. Going
                              * from the top down reads each source before it
                              * is overwritten. */
7086                         for (i = 4; i > 0; --i) {
7087                                 input_vgprs[i + 1] =
7088                                         LLVMBuildSelect(ctx->ac.builder, has_hs_threads,
7089                                                         input_vgprs[i + 1],
7090                                                         input_vgprs[i - 1], "");
7091                         }
7092                 }
7093         }
7094
             /* InstanceID is the 3rd VS VGPR for LS and the 2nd otherwise. */
7095         ctx->abi.vertex_id = input_vgprs[first_vs_vgpr];
7096         ctx->abi.instance_id = input_vgprs[first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1)];
7097
7098         /* Copy inputs to outputs. This should be no-op, as the registers match,
7099          * but it will prevent the compiler from overwriting them unintentionally.
7100          */
7101         ret = ctx->return_value;
7102         for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
7103                 LLVMValueRef p = LLVMGetParam(func, i);
7104                 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, "");
7105         }
7106         for (i = 0; i < num_input_vgprs; i++) {
7107                 LLVMValueRef p = input_vgprs[i];
7108                 p = ac_to_float(&ctx->ac, p);
7109                 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p,
7110                                            key->vs_prolog.num_input_sgprs + i, "");
7111         }
7112
7113         /* Compute vertex load indices from instance divisors. */
7114         LLVMValueRef instance_divisor_constbuf = NULL;
7115
             /* The divisor constant buffer descriptor is only loaded when at
              * least one input has a fetched divisor. */
7116         if (key->vs_prolog.states.instance_divisor_is_fetched) {
7117                 LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
7118                 LLVMValueRef buf_index =
7119                         LLVMConstInt(ctx->i32, SI_VS_CONST_INSTANCE_DIVISORS, 0);
7120                 instance_divisor_constbuf =
7121                         ac_build_load_to_sgpr(&ctx->ac, list, buf_index);
7122         }
7123
7124         for (i = 0; i <= key->vs_prolog.last_input; i++) {
                     /* Bit i of each mask selects the mode of input i. */
7125                 bool divisor_is_one =
7126                         key->vs_prolog.states.instance_divisor_is_one & (1u << i);
7127                 bool divisor_is_fetched =
7128                         key->vs_prolog.states.instance_divisor_is_fetched & (1u << i);
7129                 LLVMValueRef index = NULL;
7130
7131                 if (divisor_is_one) {
7132                         index = ctx->abi.instance_id;
7133                 } else if (divisor_is_fetched) {
7134                         LLVMValueRef udiv_factors[4];
7135
                             /* Input i owns four consecutive dwords of the
                              * buffer, starting at byte offset i * 16. */
7136                         for (unsigned j = 0; j < 4; j++) {
7137                                 udiv_factors[j] =
7138                                         buffer_load_const(ctx, instance_divisor_constbuf,
7139                                                           LLVMConstInt(ctx->i32, i*16 + j*4, 0));
7140                                 udiv_factors[j] = ac_to_integer(&ctx->ac, udiv_factors[j]);
7141                         }
7142                         /* The faster NUW version doesn't work when InstanceID == UINT_MAX.
7143                          * Such InstanceID might not be achievable in a reasonable time though.
7144                          */
7145                         index = ac_build_fast_udiv_nuw(&ctx->ac, ctx->abi.instance_id,
7146                                                        udiv_factors[0], udiv_factors[1],
7147                                                        udiv_factors[2], udiv_factors[3]);
7148                 }
7149
7150                 if (divisor_is_one || divisor_is_fetched) {
7151                         /* Add StartInstance. */
7152                         index = LLVMBuildAdd(ctx->ac.builder, index,
7153                                              LLVMGetParam(ctx->main_fn, user_sgpr_base +
7154                                                           SI_SGPR_START_INSTANCE), "");
7155                 } else {
7156                         /* VertexID + BaseVertex */
7157                         index = LLVMBuildAdd(ctx->ac.builder,
7158                                              ctx->abi.vertex_id,
7159                                              LLVMGetParam(func, user_sgpr_base +
7160                                                                 SI_SGPR_BASE_VERTEX), "");
7161                 }
7162
                     /* The load index goes into the return slots after the
                      * copied inputs; fninfo.num_params is presumably the
                      * number of arguments declared above - confirm. */
7163                 index = ac_to_float(&ctx->ac, index);
7164                 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, index,
7165                                            fninfo.num_params + i, "");
7166         }
7167
7168         si_llvm_build_ret(ctx, ret);
7169 }
7170
7171 static bool si_get_vs_prolog(struct si_screen *sscreen,
7172                              struct ac_llvm_compiler *compiler,
7173                              struct si_shader *shader,
7174                              struct pipe_debug_callback *debug,
7175                              struct si_shader *main_part,
7176                              const struct si_vs_prolog_bits *key)
7177 {
7178         /* Store in shader->prolog the VS prolog that "key" requires for
7179          * main_part's selector, taken from sscreen->vs_prologs. Returns
7180          * true when no prolog is needed or one was obtained. */
7181         struct si_shader_selector *vs_sel = main_part->selector;
7182         union si_shader_part_key part_key;
7183
7184         if (si_vs_needs_prolog(vs_sel, key)) {
7185                 si_get_vs_prolog_key(&vs_sel->info, main_part->info.num_input_sgprs,
7186                                      key, shader, &part_key);
7187                 shader->prolog = si_get_shader_part(sscreen, &sscreen->vs_prologs,
7188                                                     PIPE_SHADER_VERTEX, true, &part_key,
7189                                                     compiler, debug, si_build_vs_prolog_function,
7190                                                     "Vertex Shader Prolog");
7191                 return shader->prolog != NULL;
7192         }
7193         return true;
7194 }
7195
7196 /**
7197  * Select and compile (or reuse) vertex shader parts (prolog & epilog).
7198  */
7199 static bool si_shader_select_vs_parts(struct si_screen *sscreen,
7200                                       struct ac_llvm_compiler *compiler,
7201                                       struct si_shader *shader,
7202                                       struct pipe_debug_callback *debug)
7203 {
7204         return si_get_vs_prolog(sscreen, compiler, shader, debug, shader,
7205                                 &shader->key.part.vs.prolog);
7206 }
7207
7208 /**
7209  * Compile the TCS epilog function. This writes tesselation factors to memory
7210  * based on the output primitive type of the tesselator (determined by TES).
7211  */
7212 static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
7213                                          union si_shader_part_key *key)
7214 {
7215         struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
7216         struct si_function_info fninfo;
7217         LLVMValueRef func;
7218
7219         si_init_function_info(&fninfo);
7220
7221         if (ctx->screen->info.chip_class >= GFX9) {
7222                 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7223                 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7224                 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
7225                 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* wave info */
7226                 ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
7227                 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7228                 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7229                 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7230                 add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
7231                 add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
7232                 add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
7233                 add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
7234                 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7235                 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7236                 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7237                 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7238                 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
7239                 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7240                 ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
7241         } else {
7242                 add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
7243                 add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
7244                 add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
7245                 add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
7246                 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
7247                 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7248                 ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
7249                 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7250                 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
7251                 ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
7252         }
7253
7254         add_arg(&fninfo, ARG_VGPR, ctx->i32); /* VGPR gap */
7255         add_arg(&fninfo, ARG_VGPR, ctx->i32); /* VGPR gap */
7256         unsigned tess_factors_idx =
7257                 add_arg(&fninfo, ARG_VGPR, ctx->i32); /* patch index within the wave (REL_PATCH_ID) */
7258         add_arg(&fninfo, ARG_VGPR, ctx->i32); /* invocation ID within the patch */
7259         add_arg(&fninfo, ARG_VGPR, ctx->i32); /* LDS offset where tess factors should be loaded from */
7260
7261         for (unsigned i = 0; i < 6; i++)
7262                 add_arg(&fninfo, ARG_VGPR, ctx->i32); /* tess factors */
7263
7264         /* Create the function. */
7265         si_create_function(ctx, "tcs_epilog", NULL, 0, &fninfo,
7266                            ctx->screen->info.chip_class >= CIK ? 128 : 64);
7267         ac_declare_lds_as_pointer(&ctx->ac);
7268         func = ctx->main_fn;
7269
7270         LLVMValueRef invoc0_tess_factors[6];
7271         for (unsigned i = 0; i < 6; i++)
7272                 invoc0_tess_factors[i] = LLVMGetParam(func, tess_factors_idx + 3 + i);
7273
7274         si_write_tess_factors(bld_base,
7275                               LLVMGetParam(func, tess_factors_idx),
7276                               LLVMGetParam(func, tess_factors_idx + 1),
7277                               LLVMGetParam(func, tess_factors_idx + 2),
7278                               invoc0_tess_factors, invoc0_tess_factors + 4);
7279
7280         LLVMBuildRetVoid(ctx->ac.builder);
7281 }
7282
7283 /**
7284  * Select and compile (or reuse) TCS parts (epilog).
7285  */
7286 static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
7287                                        struct ac_llvm_compiler *compiler,
7288                                        struct si_shader *shader,
7289                                        struct pipe_debug_callback *debug)
7290 {
7291         if (sscreen->info.chip_class >= GFX9) {
7292                 struct si_shader *ls_main_part =
7293                         shader->key.part.tcs.ls->main_shader_part_ls;
7294
7295                 if (!si_get_vs_prolog(sscreen, compiler, shader, debug, ls_main_part,
7296                                       &shader->key.part.tcs.ls_prolog))
7297                         return false;
7298
7299                 shader->previous_stage = ls_main_part;
7300         }
7301
7302         /* Get the epilog. */
7303         union si_shader_part_key epilog_key;
7304         memset(&epilog_key, 0, sizeof(epilog_key));
7305         epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
7306
7307         shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
7308                                             PIPE_SHADER_TESS_CTRL, false,
7309                                             &epilog_key, compiler, debug,
7310                                             si_build_tcs_epilog_function,
7311                                             "Tessellation Control Shader Epilog");
7312         return shader->epilog != NULL;
7313 }
7314
7315 /**
7316  * Select and compile (or reuse) GS parts (prolog).
7317  */
7318 static bool si_shader_select_gs_parts(struct si_screen *sscreen,
7319                                       struct ac_llvm_compiler *compiler,
7320                                       struct si_shader *shader,
7321                                       struct pipe_debug_callback *debug)
7322 {
7323         if (sscreen->info.chip_class >= GFX9) {
7324                 struct si_shader *es_main_part =
7325                         shader->key.part.gs.es->main_shader_part_es;
7326
7327                 if (shader->key.part.gs.es->type == PIPE_SHADER_VERTEX &&
7328                     !si_get_vs_prolog(sscreen, compiler, shader, debug, es_main_part,
7329                                       &shader->key.part.gs.vs_prolog))
7330                         return false;
7331
7332                 shader->previous_stage = es_main_part;
7333         }
7334
7335         if (!shader->key.part.gs.prolog.tri_strip_adj_fix)
7336                 return true;
7337
7338         union si_shader_part_key prolog_key;
7339         memset(&prolog_key, 0, sizeof(prolog_key));
7340         prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
7341
7342         shader->prolog2 = si_get_shader_part(sscreen, &sscreen->gs_prologs,
7343                                             PIPE_SHADER_GEOMETRY, true,
7344                                             &prolog_key, compiler, debug,
7345                                             si_build_gs_prolog_function,
7346                                             "Geometry Shader Prolog");
7347         return shader->prolog2 != NULL;
7348 }
7349
7350 /**
7351  * Build the pixel shader prolog function. This handles:
7352  * - two-side color selection and interpolation
7353  * - overriding interpolation parameters for the API PS
7354  * - polygon stippling
7355  *
7356  * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
7357  * overriden by other states. (e.g. per-sample interpolation)
7358  * Interpolated colors are stored after the preloaded VGPRs.
7359  */
7360 static void si_build_ps_prolog_function(struct si_shader_context *ctx,
7361                                         union si_shader_part_key *key)
7362 {
7363         struct si_function_info fninfo;
7364         LLVMValueRef ret, func;
7365         int num_returns, i, num_color_channels;
7366
7367         assert(si_need_ps_prolog(key));
7368
7369         si_init_function_info(&fninfo);
7370
7371         /* Declare inputs. */
7372         for (i = 0; i < key->ps_prolog.num_input_sgprs; i++)
7373                 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7374
7375         for (i = 0; i < key->ps_prolog.num_input_vgprs; i++)
7376                 add_arg(&fninfo, ARG_VGPR, ctx->f32);
7377
7378         /* Declare outputs (same as inputs + add colors if needed) */
7379         num_returns = fninfo.num_params;
7380         num_color_channels = util_bitcount(key->ps_prolog.colors_read);
7381         for (i = 0; i < num_color_channels; i++)
7382                 fninfo.types[num_returns++] = ctx->f32;
7383
7384         /* Create the function. */
7385         si_create_function(ctx, "ps_prolog", fninfo.types, num_returns,
7386                            &fninfo, 0);
7387         func = ctx->main_fn;
7388
7389         /* Copy inputs to outputs. This should be no-op, as the registers match,
7390          * but it will prevent the compiler from overwriting them unintentionally.
7391          */
7392         ret = ctx->return_value;
7393         for (i = 0; i < fninfo.num_params; i++) {
7394                 LLVMValueRef p = LLVMGetParam(func, i);
7395                 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, "");
7396         }
7397
7398         /* Polygon stippling. */
7399         if (key->ps_prolog.states.poly_stipple) {
7400                 /* POS_FIXED_PT is always last. */
7401                 unsigned pos = key->ps_prolog.num_input_sgprs +
7402                                key->ps_prolog.num_input_vgprs - 1;
7403                 LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
7404
7405                 si_llvm_emit_polygon_stipple(ctx, list, pos);
7406         }
7407
7408         if (key->ps_prolog.states.bc_optimize_for_persp ||
7409             key->ps_prolog.states.bc_optimize_for_linear) {
7410                 unsigned i, base = key->ps_prolog.num_input_sgprs;
7411                 LLVMValueRef center[2], centroid[2], tmp, bc_optimize;
7412
7413                 /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
7414                  * The hw doesn't compute CENTROID if the whole wave only
7415                  * contains fully-covered quads.
7416                  *
7417                  * PRIM_MASK is after user SGPRs.
7418                  */
7419                 bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
7420                 bc_optimize = LLVMBuildLShr(ctx->ac.builder, bc_optimize,
7421                                             LLVMConstInt(ctx->i32, 31, 0), "");
7422                 bc_optimize = LLVMBuildTrunc(ctx->ac.builder, bc_optimize,
7423                                              ctx->i1, "");
7424
7425                 if (key->ps_prolog.states.bc_optimize_for_persp) {
7426                         /* Read PERSP_CENTER. */
7427                         for (i = 0; i < 2; i++)
7428                                 center[i] = LLVMGetParam(func, base + 2 + i);
7429                         /* Read PERSP_CENTROID. */
7430                         for (i = 0; i < 2; i++)
7431                                 centroid[i] = LLVMGetParam(func, base + 4 + i);
7432                         /* Select PERSP_CENTROID. */
7433                         for (i = 0; i < 2; i++) {
7434                                 tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize,
7435                                                       center[i], centroid[i], "");
7436                                 ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7437                                                            tmp, base + 4 + i, "");
7438                         }
7439                 }
7440                 if (key->ps_prolog.states.bc_optimize_for_linear) {
7441                         /* Read LINEAR_CENTER. */
7442                         for (i = 0; i < 2; i++)
7443                                 center[i] = LLVMGetParam(func, base + 8 + i);
7444                         /* Read LINEAR_CENTROID. */
7445                         for (i = 0; i < 2; i++)
7446                                 centroid[i] = LLVMGetParam(func, base + 10 + i);
7447                         /* Select LINEAR_CENTROID. */
7448                         for (i = 0; i < 2; i++) {
7449                                 tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize,
7450                                                       center[i], centroid[i], "");
7451                                 ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7452                                                            tmp, base + 10 + i, "");
7453                         }
7454                 }
7455         }
7456
7457         /* Force per-sample interpolation. */
7458         if (key->ps_prolog.states.force_persp_sample_interp) {
7459                 unsigned i, base = key->ps_prolog.num_input_sgprs;
7460                 LLVMValueRef persp_sample[2];
7461
7462                 /* Read PERSP_SAMPLE. */
7463                 for (i = 0; i < 2; i++)
7464                         persp_sample[i] = LLVMGetParam(func, base + i);
7465                 /* Overwrite PERSP_CENTER. */
7466                 for (i = 0; i < 2; i++)
7467                         ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7468                                                    persp_sample[i], base + 2 + i, "");
7469                 /* Overwrite PERSP_CENTROID. */
7470                 for (i = 0; i < 2; i++)
7471                         ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7472                                                    persp_sample[i], base + 4 + i, "");
7473         }
7474         if (key->ps_prolog.states.force_linear_sample_interp) {
7475                 unsigned i, base = key->ps_prolog.num_input_sgprs;
7476                 LLVMValueRef linear_sample[2];
7477
7478                 /* Read LINEAR_SAMPLE. */
7479                 for (i = 0; i < 2; i++)
7480                         linear_sample[i] = LLVMGetParam(func, base + 6 + i);
7481                 /* Overwrite LINEAR_CENTER. */
7482                 for (i = 0; i < 2; i++)
7483                         ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7484                                                    linear_sample[i], base + 8 + i, "");
7485                 /* Overwrite LINEAR_CENTROID. */
7486                 for (i = 0; i < 2; i++)
7487                         ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7488                                                    linear_sample[i], base + 10 + i, "");
7489         }
7490
7491         /* Force center interpolation. */
7492         if (key->ps_prolog.states.force_persp_center_interp) {
7493                 unsigned i, base = key->ps_prolog.num_input_sgprs;
7494                 LLVMValueRef persp_center[2];
7495
7496                 /* Read PERSP_CENTER. */
7497                 for (i = 0; i < 2; i++)
7498                         persp_center[i] = LLVMGetParam(func, base + 2 + i);
7499                 /* Overwrite PERSP_SAMPLE. */
7500                 for (i = 0; i < 2; i++)
7501                         ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7502                                                    persp_center[i], base + i, "");
7503                 /* Overwrite PERSP_CENTROID. */
7504                 for (i = 0; i < 2; i++)
7505                         ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7506                                                    persp_center[i], base + 4 + i, "");
7507         }
7508         if (key->ps_prolog.states.force_linear_center_interp) {
7509                 unsigned i, base = key->ps_prolog.num_input_sgprs;
7510                 LLVMValueRef linear_center[2];
7511
7512                 /* Read LINEAR_CENTER. */
7513                 for (i = 0; i < 2; i++)
7514                         linear_center[i] = LLVMGetParam(func, base + 8 + i);
7515                 /* Overwrite LINEAR_SAMPLE. */
7516                 for (i = 0; i < 2; i++)
7517                         ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7518                                                    linear_center[i], base + 6 + i, "");
7519                 /* Overwrite LINEAR_CENTROID. */
7520                 for (i = 0; i < 2; i++)
7521                         ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7522                                                    linear_center[i], base + 10 + i, "");
7523         }
7524
7525         /* Interpolate colors. */
7526         unsigned color_out_idx = 0;
7527         for (i = 0; i < 2; i++) {
7528                 unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
7529                 unsigned face_vgpr = key->ps_prolog.num_input_sgprs +
7530                                      key->ps_prolog.face_vgpr_index;
7531                 LLVMValueRef interp[2], color[4];
7532                 LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL;
7533
7534                 if (!writemask)
7535                         continue;
7536
7537                 /* If the interpolation qualifier is not CONSTANT (-1). */
7538                 if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
7539                         unsigned interp_vgpr = key->ps_prolog.num_input_sgprs +
7540                                                key->ps_prolog.color_interp_vgpr_index[i];
7541
7542                         /* Get the (i,j) updated by bc_optimize handling. */
7543                         interp[0] = LLVMBuildExtractValue(ctx->ac.builder, ret,
7544                                                           interp_vgpr, "");
7545                         interp[1] = LLVMBuildExtractValue(ctx->ac.builder, ret,
7546                                                           interp_vgpr + 1, "");
7547                         interp_ij = ac_build_gather_values(&ctx->ac, interp, 2);
7548                 }
7549
7550                 /* Use the absolute location of the input. */
7551                 prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
7552
7553                 if (key->ps_prolog.states.color_two_side) {
7554                         face = LLVMGetParam(func, face_vgpr);
7555                         face = ac_to_integer(&ctx->ac, face);
7556                 }
7557
7558                 interp_fs_input(ctx,
7559                                 key->ps_prolog.color_attr_index[i],
7560                                 TGSI_SEMANTIC_COLOR, i,
7561                                 key->ps_prolog.num_interp_inputs,
7562                                 key->ps_prolog.colors_read, interp_ij,
7563                                 prim_mask, face, color);
7564
7565                 while (writemask) {
7566                         unsigned chan = u_bit_scan(&writemask);
7567                         ret = LLVMBuildInsertValue(ctx->ac.builder, ret, color[chan],
7568                                                    fninfo.num_params + color_out_idx++, "");
7569                 }
7570         }
7571
7572         /* Section 15.2.2 (Shader Inputs) of the OpenGL 4.5 (Core Profile) spec
7573          * says:
7574          *
7575          *    "When per-sample shading is active due to the use of a fragment
7576          *     input qualified by sample or due to the use of the gl_SampleID
7577          *     or gl_SamplePosition variables, only the bit for the current
7578          *     sample is set in gl_SampleMaskIn. When state specifies multiple
7579          *     fragment shader invocations for a given fragment, the sample
7580          *     mask for any single fragment shader invocation may specify a
7581          *     subset of the covered samples for the fragment. In this case,
7582          *     the bit corresponding to each covered sample will be set in
7583          *     exactly one fragment shader invocation."
7584          *
7585          * The samplemask loaded by hardware is always the coverage of the
7586          * entire pixel/fragment, so mask bits out based on the sample ID.
7587          */
7588         if (key->ps_prolog.states.samplemask_log_ps_iter) {
7589                 /* The bit pattern matches that used by fixed function fragment
7590                  * processing. */
7591                 static const uint16_t ps_iter_masks[] = {
7592                         0xffff, /* not used */
7593                         0x5555,
7594                         0x1111,
7595                         0x0101,
7596                         0x0001,
7597                 };
7598                 assert(key->ps_prolog.states.samplemask_log_ps_iter < ARRAY_SIZE(ps_iter_masks));
7599
7600                 uint32_t ps_iter_mask = ps_iter_masks[key->ps_prolog.states.samplemask_log_ps_iter];
7601                 unsigned ancillary_vgpr = key->ps_prolog.num_input_sgprs +
7602                                           key->ps_prolog.ancillary_vgpr_index;
7603                 LLVMValueRef sampleid = si_unpack_param(ctx, ancillary_vgpr, 8, 4);
7604                 LLVMValueRef samplemask = LLVMGetParam(func, ancillary_vgpr + 1);
7605
7606                 samplemask = ac_to_integer(&ctx->ac, samplemask);
7607                 samplemask = LLVMBuildAnd(
7608                         ctx->ac.builder,
7609                         samplemask,
7610                         LLVMBuildShl(ctx->ac.builder,
7611                                      LLVMConstInt(ctx->i32, ps_iter_mask, false),
7612                                      sampleid, ""),
7613                         "");
7614                 samplemask = ac_to_float(&ctx->ac, samplemask);
7615
7616                 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, samplemask,
7617                                            ancillary_vgpr + 1, "");
7618         }
7619
7620         /* Tell LLVM to insert WQM instruction sequence when needed. */
7621         if (key->ps_prolog.wqm) {
7622                 LLVMAddTargetDependentFunctionAttr(func,
7623                                                    "amdgpu-ps-wqm-outputs", "");
7624         }
7625
7626         si_llvm_build_ret(ctx, ret);
7627 }
7628
7629 /**
7630  * Build the pixel shader epilog function. This handles everything that must be
7631  * emulated for pixel shader exports. (alpha-test, format conversions, etc)
7632  */
7633 static void si_build_ps_epilog_function(struct si_shader_context *ctx,
7634                                         union si_shader_part_key *key)
7635 {
7636         struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
7637         struct si_function_info fninfo;
7638         LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
7639         int i;
7640         struct si_ps_exports exp = {};
7641
7642         si_init_function_info(&fninfo);
7643
7644         /* Declare input SGPRs. */
7645         ctx->param_rw_buffers = add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
7646         ctx->param_bindless_samplers_and_images = add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
7647         ctx->param_const_and_shader_buffers = add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
7648         ctx->param_samplers_and_images = add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
7649         add_arg_checked(&fninfo, ARG_SGPR, ctx->f32, SI_PARAM_ALPHA_REF);
7650
7651         /* Declare input VGPRs. */
7652         unsigned required_num_params =
7653                      fninfo.num_sgpr_params +
7654                      util_bitcount(key->ps_epilog.colors_written) * 4 +
7655                      key->ps_epilog.writes_z +
7656                      key->ps_epilog.writes_stencil +
7657                      key->ps_epilog.writes_samplemask;
7658
7659         required_num_params = MAX2(required_num_params,
7660                                    fninfo.num_sgpr_params + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
7661
7662         while (fninfo.num_params < required_num_params)
7663                 add_arg(&fninfo, ARG_VGPR, ctx->f32);
7664
7665         /* Create the function. */
7666         si_create_function(ctx, "ps_epilog", NULL, 0, &fninfo, 0);
7667         /* Disable elimination of unused inputs. */
7668         ac_llvm_add_target_dep_function_attr(ctx->main_fn,
7669                                              "InitialPSInputAddr", 0xffffff);
7670
7671         /* Process colors. */
7672         unsigned vgpr = fninfo.num_sgpr_params;
7673         unsigned colors_written = key->ps_epilog.colors_written;
7674         int last_color_export = -1;
7675
7676         /* Find the last color export. */
7677         if (!key->ps_epilog.writes_z &&
7678             !key->ps_epilog.writes_stencil &&
7679             !key->ps_epilog.writes_samplemask) {
7680                 unsigned spi_format = key->ps_epilog.states.spi_shader_col_format;
7681
7682                 /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
7683                 if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
7684                         /* Just set this if any of the colorbuffers are enabled. */
7685                         if (spi_format &
7686                             ((1ull << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
7687                                 last_color_export = 0;
7688                 } else {
7689                         for (i = 0; i < 8; i++)
7690                                 if (colors_written & (1 << i) &&
7691                                     (spi_format >> (i * 4)) & 0xf)
7692                                         last_color_export = i;
7693                 }
7694         }
7695
7696         while (colors_written) {
7697                 LLVMValueRef color[4];
7698                 int mrt = u_bit_scan(&colors_written);
7699
7700                 for (i = 0; i < 4; i++)
7701                         color[i] = LLVMGetParam(ctx->main_fn, vgpr++);
7702
7703                 si_export_mrt_color(bld_base, color, mrt,
7704                                     fninfo.num_params - 1,
7705                                     mrt == last_color_export, &exp);
7706         }
7707
7708         /* Process depth, stencil, samplemask. */
7709         if (key->ps_epilog.writes_z)
7710                 depth = LLVMGetParam(ctx->main_fn, vgpr++);
7711         if (key->ps_epilog.writes_stencil)
7712                 stencil = LLVMGetParam(ctx->main_fn, vgpr++);
7713         if (key->ps_epilog.writes_samplemask)
7714                 samplemask = LLVMGetParam(ctx->main_fn, vgpr++);
7715
7716         if (depth || stencil || samplemask)
7717                 si_export_mrt_z(bld_base, depth, stencil, samplemask, &exp);
7718         else if (last_color_export == -1)
7719                 ac_build_export_null(&ctx->ac);
7720
7721         if (exp.num)
7722                 si_emit_ps_exports(ctx, &exp);
7723
7724         /* Compile. */
7725         LLVMBuildRetVoid(ctx->ac.builder);
7726 }
7727
7728 /**
7729  * Select and compile (or reuse) pixel shader parts (prolog & epilog).
7730  */
7731 static bool si_shader_select_ps_parts(struct si_screen *sscreen,
7732                                       struct ac_llvm_compiler *compiler,
7733                                       struct si_shader *shader,
7734                                       struct pipe_debug_callback *debug)
7735 {
7736         union si_shader_part_key prolog_key;
7737         union si_shader_part_key epilog_key;
7738
7739         /* Get the prolog. */
7740         si_get_ps_prolog_key(shader, &prolog_key, true);
7741
7742         /* The prolog is a no-op if these aren't set. */
7743         if (si_need_ps_prolog(&prolog_key)) {
7744                 shader->prolog =
7745                         si_get_shader_part(sscreen, &sscreen->ps_prologs,
7746                                            PIPE_SHADER_FRAGMENT, true,
7747                                            &prolog_key, compiler, debug,
7748                                            si_build_ps_prolog_function,
7749                                            "Fragment Shader Prolog");
7750                 if (!shader->prolog)
7751                         return false;
7752         }
7753
7754         /* Get the epilog. */
7755         si_get_ps_epilog_key(shader, &epilog_key);
7756
7757         shader->epilog =
7758                 si_get_shader_part(sscreen, &sscreen->ps_epilogs,
7759                                    PIPE_SHADER_FRAGMENT, false,
7760                                    &epilog_key, compiler, debug,
7761                                    si_build_ps_epilog_function,
7762                                    "Fragment Shader Epilog");
7763         if (!shader->epilog)
7764                 return false;
7765
7766         /* Enable POS_FIXED_PT if polygon stippling is enabled. */
7767         if (shader->key.part.ps.prolog.poly_stipple) {
7768                 shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
7769                 assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
7770         }
7771
7772         /* Set up the enable bits for per-sample shading if needed. */
7773         if (shader->key.part.ps.prolog.force_persp_sample_interp &&
7774             (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
7775              G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7776                 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
7777                 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
7778                 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
7779         }
7780         if (shader->key.part.ps.prolog.force_linear_sample_interp &&
7781             (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
7782              G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7783                 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
7784                 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
7785                 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
7786         }
7787         if (shader->key.part.ps.prolog.force_persp_center_interp &&
7788             (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
7789              G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7790                 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
7791                 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
7792                 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
7793         }
7794         if (shader->key.part.ps.prolog.force_linear_center_interp &&
7795             (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
7796              G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7797                 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
7798                 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
7799                 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
7800         }
7801
7802         /* POS_W_FLOAT requires that one of the perspective weights is enabled. */
7803         if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
7804             !(shader->config.spi_ps_input_ena & 0xf)) {
7805                 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
7806                 assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
7807         }
7808
7809         /* At least one pair of interpolation weights must be enabled. */
7810         if (!(shader->config.spi_ps_input_ena & 0x7f)) {
7811                 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
7812                 assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
7813         }
7814
7815         /* Samplemask fixup requires the sample ID. */
7816         if (shader->key.part.ps.prolog.samplemask_log_ps_iter) {
7817                 shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1);
7818                 assert(G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr));
7819         }
7820
7821         /* The sample mask input is always enabled, because the API shader always
7822          * passes it through to the epilog. Disable it here if it's unused.
7823          */
7824         if (!shader->key.part.ps.epilog.poly_line_smoothing &&
7825             !shader->selector->info.reads_samplemask)
7826                 shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;
7827
7828         return true;
7829 }
7830
7831 void si_multiwave_lds_size_workaround(struct si_screen *sscreen,
7832                                       unsigned *lds_size)
7833 {
7834         /* If tessellation is all offchip and on-chip GS isn't used, this
7835          * workaround is not needed.
7836          */
7837         return;
7838
7839         /* SPI barrier management bug:
7840          *   Make sure we have at least 4k of LDS in use to avoid the bug.
7841          *   It applies to workgroup sizes of more than one wavefront.
7842          */
7843         if (sscreen->info.family == CHIP_BONAIRE ||
7844             sscreen->info.family == CHIP_KABINI ||
7845             sscreen->info.family == CHIP_MULLINS)
7846                 *lds_size = MAX2(*lds_size, 8);
7847 }
7848
7849 static void si_fix_resource_usage(struct si_screen *sscreen,
7850                                   struct si_shader *shader)
7851 {
7852         unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
7853
7854         shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
7855
7856         if (shader->selector->type == PIPE_SHADER_COMPUTE &&
7857             si_get_max_workgroup_size(shader) > 64) {
7858                 si_multiwave_lds_size_workaround(sscreen,
7859                                                  &shader->config.lds_size);
7860         }
7861 }
7862
7863 int si_shader_create(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
7864                      struct si_shader *shader,
7865                      struct pipe_debug_callback *debug)
7866 {
7867         struct si_shader_selector *sel = shader->selector;
7868         struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key);
7869         int r;
7870
7871         /* LS, ES, VS are compiled on demand if the main part hasn't been
7872          * compiled for that stage.
7873          *
7874          * Vertex shaders are compiled on demand when a vertex fetch
7875          * workaround must be applied.
7876          */
7877         if (shader->is_monolithic) {
7878                 /* Monolithic shader (compiled as a whole, has many variants,
7879                  * may take a long time to compile).
7880                  */
7881                 r = si_compile_tgsi_shader(sscreen, compiler, shader, debug);
7882                 if (r)
7883                         return r;
7884         } else {
7885                 /* The shader consists of several parts:
7886                  *
7887                  * - the middle part is the user shader, it has 1 variant only
7888                  *   and it was compiled during the creation of the shader
7889                  *   selector
7890                  * - the prolog part is inserted at the beginning
7891                  * - the epilog part is inserted at the end
7892                  *
7893                  * The prolog and epilog have many (but simple) variants.
7894                  *
7895                  * Starting with gfx9, geometry and tessellation control
7896                  * shaders also contain the prolog and user shader parts of
7897                  * the previous shader stage.
7898                  */
7899
7900                 if (!mainp)
7901                         return -1;
7902
7903                 /* Copy the compiled TGSI shader data over. */
7904                 shader->is_binary_shared = true;
7905                 shader->binary = mainp->binary;
7906                 shader->config = mainp->config;
7907                 shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
7908                 shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
7909                 shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
7910                 shader->info.ancillary_vgpr_index = mainp->info.ancillary_vgpr_index;
7911                 memcpy(shader->info.vs_output_param_offset,
7912                        mainp->info.vs_output_param_offset,
7913                        sizeof(mainp->info.vs_output_param_offset));
7914                 shader->info.uses_instanceid = mainp->info.uses_instanceid;
7915                 shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
7916                 shader->info.nr_param_exports = mainp->info.nr_param_exports;
7917
7918                 /* Select prologs and/or epilogs. */
7919                 switch (sel->type) {
7920                 case PIPE_SHADER_VERTEX:
7921                         if (!si_shader_select_vs_parts(sscreen, compiler, shader, debug))
7922                                 return -1;
7923                         break;
7924                 case PIPE_SHADER_TESS_CTRL:
7925                         if (!si_shader_select_tcs_parts(sscreen, compiler, shader, debug))
7926                                 return -1;
7927                         break;
7928                 case PIPE_SHADER_TESS_EVAL:
7929                         break;
7930                 case PIPE_SHADER_GEOMETRY:
7931                         if (!si_shader_select_gs_parts(sscreen, compiler, shader, debug))
7932                                 return -1;
7933                         break;
7934                 case PIPE_SHADER_FRAGMENT:
7935                         if (!si_shader_select_ps_parts(sscreen, compiler, shader, debug))
7936                                 return -1;
7937
7938                         /* Make sure we have at least as many VGPRs as there
7939                          * are allocated inputs.
7940                          */
7941                         shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
7942                                                         shader->info.num_input_vgprs);
7943                         break;
7944                 }
7945
7946                 /* Update SGPR and VGPR counts. */
7947                 if (shader->prolog) {
7948                         shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
7949                                                         shader->prolog->config.num_sgprs);
7950                         shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
7951                                                         shader->prolog->config.num_vgprs);
7952                 }
7953                 if (shader->previous_stage) {
7954                         shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
7955                                                         shader->previous_stage->config.num_sgprs);
7956                         shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
7957                                                         shader->previous_stage->config.num_vgprs);
7958                         shader->config.spilled_sgprs =
7959                                 MAX2(shader->config.spilled_sgprs,
7960                                      shader->previous_stage->config.spilled_sgprs);
7961                         shader->config.spilled_vgprs =
7962                                 MAX2(shader->config.spilled_vgprs,
7963                                      shader->previous_stage->config.spilled_vgprs);
7964                         shader->config.private_mem_vgprs =
7965                                 MAX2(shader->config.private_mem_vgprs,
7966                                      shader->previous_stage->config.private_mem_vgprs);
7967                         shader->config.scratch_bytes_per_wave =
7968                                 MAX2(shader->config.scratch_bytes_per_wave,
7969                                      shader->previous_stage->config.scratch_bytes_per_wave);
7970                         shader->info.uses_instanceid |=
7971                                 shader->previous_stage->info.uses_instanceid;
7972                 }
7973                 if (shader->prolog2) {
7974                         shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
7975                                                         shader->prolog2->config.num_sgprs);
7976                         shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
7977                                                         shader->prolog2->config.num_vgprs);
7978                 }
7979                 if (shader->epilog) {
7980                         shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
7981                                                         shader->epilog->config.num_sgprs);
7982                         shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
7983                                                         shader->epilog->config.num_vgprs);
7984                 }
7985                 si_calculate_max_simd_waves(shader);
7986         }
7987
7988         si_fix_resource_usage(sscreen, shader);
7989         si_shader_dump(sscreen, shader, debug, sel->info.processor,
7990                        stderr, true);
7991
7992         /* Upload. */
7993         r = si_shader_binary_upload(sscreen, shader);
7994         if (r) {
7995                 fprintf(stderr, "LLVM failed to upload shader\n");
7996                 return r;
7997         }
7998
7999         return 0;
8000 }
8001
8002 void si_shader_destroy(struct si_shader *shader)
8003 {
8004         if (shader->scratch_bo)
8005                 si_resource_reference(&shader->scratch_bo, NULL);
8006
8007         si_resource_reference(&shader->bo, NULL);
8008
8009         if (!shader->is_binary_shared)
8010                 ac_shader_binary_clean(&shader->binary);
8011
8012         free(shader->shader_log);
8013 }