src/gallium/drivers/radeonsi/si_shader.c

   1 /*
   2  * Copyright 2012 Advanced Micro Devices, Inc.
   3  * All Rights Reserved.
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * on the rights to use, copy, modify, merge, publish, distribute, sub
   9  * license, and/or sell copies of the Software, and to permit persons to whom
  10  * the Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice (including the next
  13  * paragraph) shall be included in all copies or substantial portions of the
  14  * Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
  20  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  21  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  22  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  */
  24
  25 #include "gallivm/lp_bld_const.h"
  26 #include "gallivm/lp_bld_gather.h"
  27 #include "gallivm/lp_bld_intr.h"
  28 #include "gallivm/lp_bld_logic.h"
  29 #include "gallivm/lp_bld_arit.h"
  30 #include "gallivm/lp_bld_flow.h"
  31 #include "gallivm/lp_bld_misc.h"
  32 #include "util/u_memory.h"
  33 #include "util/u_string.h"
  34 #include "tgsi/tgsi_build.h"
  35 #include "tgsi/tgsi_util.h"
  36 #include "tgsi/tgsi_dump.h"
  37
  38 #include "ac_binary.h"
  39 #include "ac_llvm_util.h"
  40 #include "ac_exp_param.h"
  41 #include "ac_shader_util.h"
  42 #include "si_shader_internal.h"
  43 #include "si_pipe.h"
  44 #include "sid.h"
  45
  46 #include "compiler/nir/nir.h"
  47
  48 static const char *scratch_rsrc_dword0_symbol =
  49         "SCRATCH_RSRC_DWORD0";
  50
  51 static const char *scratch_rsrc_dword1_symbol =
  52         "SCRATCH_RSRC_DWORD1";
  53
  54 struct si_shader_output_values
  55 {
  56         LLVMValueRef values[4];
  57         unsigned semantic_name;
  58         unsigned semantic_index;
  59         ubyte vertex_stream[4];
  60 };
  61
  62 /**
  63  * Used to collect types and other info about arguments of the LLVM function
  64  * before the function is created.
  65  */
  66 struct si_function_info {
  67         LLVMTypeRef types[100];
  68         LLVMValueRef *assign[100];
  69         unsigned num_sgpr_params;
  70         unsigned num_params;
  71 };
  72
  73 enum si_arg_regfile {
  74         ARG_SGPR,
  75         ARG_VGPR
  76 };
  77
  78 static void si_init_shader_ctx(struct si_shader_context *ctx,
  79                                struct si_screen *sscreen,
  80                                struct si_compiler *compiler);
  81
  82 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
  83                                  struct lp_build_tgsi_context *bld_base,
  84                                  struct lp_build_emit_data *emit_data);
  85
  86 static void si_dump_shader_key(unsigned processor, const struct si_shader *shader,
  87                                FILE *f);
  88
  89 static void si_build_vs_prolog_function(struct si_shader_context *ctx,
  90                                         union si_shader_part_key *key);
  91 static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
  92                                          union si_shader_part_key *key);
  93 static void si_build_ps_prolog_function(struct si_shader_context *ctx,
  94                                         union si_shader_part_key *key);
  95 static void si_build_ps_epilog_function(struct si_shader_context *ctx,
  96                                         union si_shader_part_key *key);
  97
  98 /* Ideally pass the sample mask input to the PS epilog as v14, which
  99  * is its usual location, so that the shader doesn't have to add v_mov.
 100  */
 101 #define PS_EPILOG_SAMPLEMASK_MIN_LOC 14
 102
 103 static bool llvm_type_is_64bit(struct si_shader_context *ctx,
 104                                LLVMTypeRef type)
 105 {
 106         if (type == ctx->ac.i64 || type == ctx->ac.f64)
 107                 return true;
 108
 109         return false;
 110 }
 111
 112 static bool is_merged_shader(struct si_shader *shader)
 113 {
 114         if (shader->selector->screen->info.chip_class <= VI)
 115                 return false;
 116
 117         return shader->key.as_ls ||
 118                shader->key.as_es ||
 119                shader->selector->type == PIPE_SHADER_TESS_CTRL ||
 120                shader->selector->type == PIPE_SHADER_GEOMETRY;
 121 }
 122
 123 static void si_init_function_info(struct si_function_info *fninfo)
 124 {
 125         fninfo->num_params = 0;
 126         fninfo->num_sgpr_params = 0;
 127 }
 128
 129 static unsigned add_arg_assign(struct si_function_info *fninfo,
 130                         enum si_arg_regfile regfile, LLVMTypeRef type,
 131                         LLVMValueRef *assign)
 132 {
 133         assert(regfile != ARG_SGPR || fninfo->num_sgpr_params == fninfo->num_params);
 134
 135         unsigned idx = fninfo->num_params++;
 136         assert(idx < ARRAY_SIZE(fninfo->types));
 137
 138         if (regfile == ARG_SGPR)
 139                 fninfo->num_sgpr_params = fninfo->num_params;
 140
 141         fninfo->types[idx] = type;
 142         fninfo->assign[idx] = assign;
 143         return idx;
 144 }
 145
 146 static unsigned add_arg(struct si_function_info *fninfo,
 147                         enum si_arg_regfile regfile, LLVMTypeRef type)
 148 {
 149         return add_arg_assign(fninfo, regfile, type, NULL);
 150 }
 151
 152 static void add_arg_assign_checked(struct si_function_info *fninfo,
 153                                    enum si_arg_regfile regfile, LLVMTypeRef type,
 154                                    LLVMValueRef *assign, unsigned idx)
 155 {
 156         MAYBE_UNUSED unsigned actual = add_arg_assign(fninfo, regfile, type, assign);
 157         assert(actual == idx);
 158 }
 159
 160 static void add_arg_checked(struct si_function_info *fninfo,
 161                             enum si_arg_regfile regfile, LLVMTypeRef type,
 162                             unsigned idx)
 163 {
 164         add_arg_assign_checked(fninfo, regfile, type, NULL, idx);
 165 }
 166
 167 /**
 168  * Returns a unique index for a per-patch semantic name and index. The index
 169  * must be less than 32, so that a 32-bit bitmask of used inputs or outputs
 170  * can be calculated.
 171  */
 172 unsigned si_shader_io_get_unique_index_patch(unsigned semantic_name, unsigned index)
 173 {
 174         switch (semantic_name) {
 175         case TGSI_SEMANTIC_TESSOUTER:
 176                 return 0;
 177         case TGSI_SEMANTIC_TESSINNER:
 178                 return 1;
 179         case TGSI_SEMANTIC_PATCH:
 180                 assert(index < 30);
 181                 return 2 + index;
 182
 183         default:
 184                 assert(!"invalid semantic name");
 185                 return 0;
 186         }
 187 }
 188
 189 /**
 190  * Returns a unique index for a semantic name and index. The index must be
 191  * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
 192  * calculated.
 193  */
 194 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index,
 195                                        unsigned is_varying)
 196 {
 197         switch (semantic_name) {
 198         case TGSI_SEMANTIC_POSITION:
 199                 return 0;
 200         case TGSI_SEMANTIC_GENERIC:
 201                 /* Since some shader stages use the the highest used IO index
 202                  * to determine the size to allocate for inputs/outputs
 203                  * (in LDS, tess and GS rings). GENERIC should be placed right
 204                  * after POSITION to make that size as small as possible.
 205                  */
 206                 if (index < SI_MAX_IO_GENERIC)
 207                         return 1 + index;
 208
 209                 assert(!"invalid generic index");
 210                 return 0;
 211         case TGSI_SEMANTIC_PSIZE:
 212                 return SI_MAX_IO_GENERIC + 1;
 213         case TGSI_SEMANTIC_CLIPDIST:
 214                 assert(index <= 1);
 215                 return SI_MAX_IO_GENERIC + 2 + index;
 216         case TGSI_SEMANTIC_FOG:
 217                 return SI_MAX_IO_GENERIC + 4;
 218         case TGSI_SEMANTIC_LAYER:
 219                 return SI_MAX_IO_GENERIC + 5;
 220         case TGSI_SEMANTIC_VIEWPORT_INDEX:
 221                 return SI_MAX_IO_GENERIC + 6;
 222         case TGSI_SEMANTIC_PRIMID:
 223                 return SI_MAX_IO_GENERIC + 7;
 224         case TGSI_SEMANTIC_COLOR:
 225                 assert(index < 2);
 226                 return SI_MAX_IO_GENERIC + 8 + index;
 227         case TGSI_SEMANTIC_BCOLOR:
 228                 assert(index < 2);
 229                 /* If it's a varying, COLOR and BCOLOR alias. */
 230                 if (is_varying)
 231                         return SI_MAX_IO_GENERIC + 8 + index;
 232                 else
 233                         return SI_MAX_IO_GENERIC + 10 + index;
 234         case TGSI_SEMANTIC_TEXCOORD:
 235                 assert(index < 8);
 236                 STATIC_ASSERT(SI_MAX_IO_GENERIC + 12 + 8 <= 63);
 237                 return SI_MAX_IO_GENERIC + 12 + index;
 238         case TGSI_SEMANTIC_CLIPVERTEX:
 239                 return 63;
 240         default:
 241                 fprintf(stderr, "invalid semantic name = %u\n", semantic_name);
 242                 assert(!"invalid semantic name");
 243                 return 0;
 244         }
 245 }
 246
 247 /**
 248  * Get the value of a shader input parameter and extract a bitfield.
 249  */
 250 static LLVMValueRef unpack_llvm_param(struct si_shader_context *ctx,
 251                                       LLVMValueRef value, unsigned rshift,
 252                                       unsigned bitwidth)
 253 {
 254         if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
 255                 value = ac_to_integer(&ctx->ac, value);
 256
 257         if (rshift)
 258                 value = LLVMBuildLShr(ctx->ac.builder, value,
 259                                       LLVMConstInt(ctx->i32, rshift, 0), "");
 260
 261         if (rshift + bitwidth < 32) {
 262                 unsigned mask = (1 << bitwidth) - 1;
 263                 value = LLVMBuildAnd(ctx->ac.builder, value,
 264                                      LLVMConstInt(ctx->i32, mask, 0), "");
 265         }
 266
 267         return value;
 268 }
 269
 270 LLVMValueRef si_unpack_param(struct si_shader_context *ctx,
 271                              unsigned param, unsigned rshift,
 272                              unsigned bitwidth)
 273 {
 274         LLVMValueRef value = LLVMGetParam(ctx->main_fn, param);
 275
 276         return unpack_llvm_param(ctx, value, rshift, bitwidth);
 277 }
 278
 279 static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx)
 280 {
 281         switch (ctx->type) {
 282         case PIPE_SHADER_TESS_CTRL:
 283                 return unpack_llvm_param(ctx, ctx->abi.tcs_rel_ids, 0, 8);
 284
 285         case PIPE_SHADER_TESS_EVAL:
 286                 return LLVMGetParam(ctx->main_fn,
 287                                     ctx->param_tes_rel_patch_id);
 288
 289         default:
 290                 assert(0);
 291                 return NULL;
 292         }
 293 }
 294
 295 /* Tessellation shaders pass outputs to the next shader using LDS.
 296  *
 297  * LS outputs = TCS inputs
 298  * TCS outputs = TES inputs
 299  *
 300  * The LDS layout is:
 301  * - TCS inputs for patch 0
 302  * - TCS inputs for patch 1
 303  * - TCS inputs for patch 2             = get_tcs_in_current_patch_offset (if RelPatchID==2)
 304  * - ...
 305  * - TCS outputs for patch 0            = get_tcs_out_patch0_offset
 306  * - Per-patch TCS outputs for patch 0  = get_tcs_out_patch0_patch_data_offset
 307  * - TCS outputs for patch 1
 308  * - Per-patch TCS outputs for patch 1
 309  * - TCS outputs for patch 2            = get_tcs_out_current_patch_offset (if RelPatchID==2)
 310  * - Per-patch TCS outputs for patch 2  = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
 311  * - ...
 312  *
 313  * All three shaders VS(LS), TCS, TES share the same LDS space.
 314  */
 315
 316 static LLVMValueRef
 317 get_tcs_in_patch_stride(struct si_shader_context *ctx)
 318 {
 319         return si_unpack_param(ctx, ctx->param_vs_state_bits, 8, 13);
 320 }
 321
 322 static unsigned get_tcs_out_vertex_dw_stride_constant(struct si_shader_context *ctx)
 323 {
 324         assert(ctx->type == PIPE_SHADER_TESS_CTRL);
 325
 326         if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy)
 327                 return util_last_bit64(ctx->shader->key.mono.u.ff_tcs_inputs_to_copy) * 4;
 328
 329         return util_last_bit64(ctx->shader->selector->outputs_written) * 4;
 330 }
 331
 332 static LLVMValueRef get_tcs_out_vertex_dw_stride(struct si_shader_context *ctx)
 333 {
 334         unsigned stride = get_tcs_out_vertex_dw_stride_constant(ctx);
 335
 336         return LLVMConstInt(ctx->i32, stride, 0);
 337 }
 338
 339 static LLVMValueRef get_tcs_out_patch_stride(struct si_shader_context *ctx)
 340 {
 341         if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy)
 342                 return si_unpack_param(ctx, ctx->param_tcs_out_lds_layout, 0, 13);
 343
 344         const struct tgsi_shader_info *info = &ctx->shader->selector->info;
 345         unsigned tcs_out_vertices = info->properties[TGSI_PROPERTY_TCS_VERTICES_OUT];
 346         unsigned vertex_dw_stride = get_tcs_out_vertex_dw_stride_constant(ctx);
 347         unsigned num_patch_outputs = util_last_bit64(ctx->shader->selector->patch_outputs_written);
 348         unsigned patch_dw_stride = tcs_out_vertices * vertex_dw_stride +
 349                                    num_patch_outputs * 4;
 350         return LLVMConstInt(ctx->i32, patch_dw_stride, 0);
 351 }
 352
 353 static LLVMValueRef
 354 get_tcs_out_patch0_offset(struct si_shader_context *ctx)
 355 {
 356         return lp_build_mul_imm(&ctx->bld_base.uint_bld,
 357                                 si_unpack_param(ctx,
 358                                              ctx->param_tcs_out_lds_offsets,
 359                                              0, 16),
 360                                 4);
 361 }
 362
 363 static LLVMValueRef
 364 get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
 365 {
 366         return lp_build_mul_imm(&ctx->bld_base.uint_bld,
 367                                 si_unpack_param(ctx,
 368                                              ctx->param_tcs_out_lds_offsets,
 369                                              16, 16),
 370                                 4);
 371 }
 372
 373 static LLVMValueRef
 374 get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
 375 {
 376         LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
 377         LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
 378
 379         return LLVMBuildMul(ctx->ac.builder, patch_stride, rel_patch_id, "");
 380 }
 381
 382 static LLVMValueRef
 383 get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
 384 {
 385         LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
 386         LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
 387         LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
 388
 389         return LLVMBuildAdd(ctx->ac.builder, patch0_offset,
 390                             LLVMBuildMul(ctx->ac.builder, patch_stride,
 391                                          rel_patch_id, ""),
 392                             "");
 393 }
 394
 395 static LLVMValueRef
 396 get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
 397 {
 398         LLVMValueRef patch0_patch_data_offset =
 399                 get_tcs_out_patch0_patch_data_offset(ctx);
 400         LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
 401         LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
 402
 403         return LLVMBuildAdd(ctx->ac.builder, patch0_patch_data_offset,
 404                             LLVMBuildMul(ctx->ac.builder, patch_stride,
 405                                          rel_patch_id, ""),
 406                             "");
 407 }
 408
 409 static LLVMValueRef get_num_tcs_out_vertices(struct si_shader_context *ctx)
 410 {
 411         unsigned tcs_out_vertices =
 412                 ctx->shader->selector ?
 413                 ctx->shader->selector->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT] : 0;
 414
 415         /* If !tcs_out_vertices, it's either the fixed-func TCS or the TCS epilog. */
 416         if (ctx->type == PIPE_SHADER_TESS_CTRL && tcs_out_vertices)
 417                 return LLVMConstInt(ctx->i32, tcs_out_vertices, 0);
 418
 419         return si_unpack_param(ctx, ctx->param_tcs_offchip_layout, 6, 6);
 420 }
 421
 422 static LLVMValueRef get_tcs_in_vertex_dw_stride(struct si_shader_context *ctx)
 423 {
 424         unsigned stride;
 425
 426         switch (ctx->type) {
 427         case PIPE_SHADER_VERTEX:
 428                 stride = util_last_bit64(ctx->shader->selector->outputs_written);
 429                 return LLVMConstInt(ctx->i32, stride * 4, 0);
 430
 431         case PIPE_SHADER_TESS_CTRL:
 432                 if (ctx->screen->info.chip_class >= GFX9 &&
 433                     ctx->shader->is_monolithic) {
 434                         stride = util_last_bit64(ctx->shader->key.part.tcs.ls->outputs_written);
 435                         return LLVMConstInt(ctx->i32, stride * 4, 0);
 436                 }
 437                 return si_unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
 438
 439         default:
 440                 assert(0);
 441                 return NULL;
 442         }
 443 }
 444
 445 static LLVMValueRef get_instance_index_for_fetch(
 446         struct si_shader_context *ctx,
 447         unsigned param_start_instance, LLVMValueRef divisor)
 448 {
 449         LLVMValueRef result = ctx->abi.instance_id;
 450
 451         /* The division must be done before START_INSTANCE is added. */
 452         if (divisor != ctx->i32_1)
 453                 result = LLVMBuildUDiv(ctx->ac.builder, result, divisor, "");
 454
 455         return LLVMBuildAdd(ctx->ac.builder, result,
 456                             LLVMGetParam(ctx->main_fn, param_start_instance), "");
 457 }
 458
 459 /* Bitcast <4 x float> to <2 x double>, extract the component, and convert
 460  * to float. */
 461 static LLVMValueRef extract_double_to_float(struct si_shader_context *ctx,
 462                                             LLVMValueRef vec4,
 463                                             unsigned double_index)
 464 {
 465         LLVMBuilderRef builder = ctx->ac.builder;
 466         LLVMTypeRef f64 = LLVMDoubleTypeInContext(ctx->ac.context);
 467         LLVMValueRef dvec2 = LLVMBuildBitCast(builder, vec4,
 468                                               LLVMVectorType(f64, 2), "");
 469         LLVMValueRef index = LLVMConstInt(ctx->i32, double_index, 0);
 470         LLVMValueRef value = LLVMBuildExtractElement(builder, dvec2, index, "");
 471         return LLVMBuildFPTrunc(builder, value, ctx->f32, "");
 472 }
 473
 474 static LLVMValueRef unpack_sint16(struct si_shader_context *ctx,
 475                                  LLVMValueRef i32, unsigned index)
 476 {
 477         assert(index <= 1);
 478
 479         if (index == 1)
 480                 return LLVMBuildAShr(ctx->ac.builder, i32,
 481                                      LLVMConstInt(ctx->i32, 16, 0), "");
 482
 483         return LLVMBuildSExt(ctx->ac.builder,
 484                              LLVMBuildTrunc(ctx->ac.builder, i32,
 485                                             ctx->ac.i16, ""),
 486                              ctx->i32, "");
 487 }
 488
 489 void si_llvm_load_input_vs(
 490         struct si_shader_context *ctx,
 491         unsigned input_index,
 492         LLVMValueRef out[4])
 493 {
 494         const struct tgsi_shader_info *info = &ctx->shader->selector->info;
 495         unsigned vs_blit_property = info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS];
 496
 497         if (vs_blit_property) {
 498                 LLVMValueRef vertex_id = ctx->abi.vertex_id;
 499                 LLVMValueRef sel_x1 = LLVMBuildICmp(ctx->ac.builder,
 500                                                     LLVMIntULE, vertex_id,
 501                                                     ctx->i32_1, "");
 502                 /* Use LLVMIntNE, because we have 3 vertices and only
 503                  * the middle one should use y2.
 504                  */
 505                 LLVMValueRef sel_y1 = LLVMBuildICmp(ctx->ac.builder,
 506                                                     LLVMIntNE, vertex_id,
 507                                                     ctx->i32_1, "");
 508
 509                 if (input_index == 0) {
 510                         /* Position: */
 511                         LLVMValueRef x1y1 = LLVMGetParam(ctx->main_fn,
 512                                                          ctx->param_vs_blit_inputs);
 513                         LLVMValueRef x2y2 = LLVMGetParam(ctx->main_fn,
 514                                                          ctx->param_vs_blit_inputs + 1);
 515
 516                         LLVMValueRef x1 = unpack_sint16(ctx, x1y1, 0);
 517                         LLVMValueRef y1 = unpack_sint16(ctx, x1y1, 1);
 518                         LLVMValueRef x2 = unpack_sint16(ctx, x2y2, 0);
 519                         LLVMValueRef y2 = unpack_sint16(ctx, x2y2, 1);
 520
 521                         LLVMValueRef x = LLVMBuildSelect(ctx->ac.builder, sel_x1,
 522                                                          x1, x2, "");
 523                         LLVMValueRef y = LLVMBuildSelect(ctx->ac.builder, sel_y1,
 524                                                          y1, y2, "");
 525
 526                         out[0] = LLVMBuildSIToFP(ctx->ac.builder, x, ctx->f32, "");
 527                         out[1] = LLVMBuildSIToFP(ctx->ac.builder, y, ctx->f32, "");
 528                         out[2] = LLVMGetParam(ctx->main_fn,
 529                                               ctx->param_vs_blit_inputs + 2);
 530                         out[3] = ctx->ac.f32_1;
 531                         return;
 532                 }
 533
 534                 /* Color or texture coordinates: */
 535                 assert(input_index == 1);
 536
 537                 if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
 538                         for (int i = 0; i < 4; i++) {
 539                                 out[i] = LLVMGetParam(ctx->main_fn,
 540                                                       ctx->param_vs_blit_inputs + 3 + i);
 541                         }
 542                 } else {
 543                         assert(vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD);
 544                         LLVMValueRef x1 = LLVMGetParam(ctx->main_fn,
 545                                                        ctx->param_vs_blit_inputs + 3);
 546                         LLVMValueRef y1 = LLVMGetParam(ctx->main_fn,
 547                                                        ctx->param_vs_blit_inputs + 4);
 548                         LLVMValueRef x2 = LLVMGetParam(ctx->main_fn,
 549                                                        ctx->param_vs_blit_inputs + 5);
 550                         LLVMValueRef y2 = LLVMGetParam(ctx->main_fn,
 551                                                        ctx->param_vs_blit_inputs + 6);
 552
 553                         out[0] = LLVMBuildSelect(ctx->ac.builder, sel_x1,
 554                                                  x1, x2, "");
 555                         out[1] = LLVMBuildSelect(ctx->ac.builder, sel_y1,
 556                                                  y1, y2, "");
 557                         out[2] = LLVMGetParam(ctx->main_fn,
 558                                               ctx->param_vs_blit_inputs + 7);
 559                         out[3] = LLVMGetParam(ctx->main_fn,
 560                                               ctx->param_vs_blit_inputs + 8);
 561                 }
 562                 return;
 563         }
 564
 565         unsigned chan;
 566         unsigned fix_fetch;
 567         unsigned num_fetches;
 568         unsigned fetch_stride;
 569         unsigned num_channels;
 570
 571         LLVMValueRef t_list_ptr;
 572         LLVMValueRef t_offset;
 573         LLVMValueRef t_list;
 574         LLVMValueRef vertex_index;
 575         LLVMValueRef input[3];
 576
 577         /* Load the T list */
 578         t_list_ptr = LLVMGetParam(ctx->main_fn, ctx->param_vertex_buffers);
 579
 580         t_offset = LLVMConstInt(ctx->i32, input_index, 0);
 581
 582         t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset);
 583
 584         vertex_index = LLVMGetParam(ctx->main_fn,
 585                                     ctx->param_vertex_index0 +
 586                                     input_index);
 587
 588         fix_fetch = ctx->shader->key.mono.vs_fix_fetch[input_index];
 589
 590         /* Do multiple loads for special formats. */
 591         switch (fix_fetch) {
 592         case SI_FIX_FETCH_RGB_64_FLOAT:
 593                 num_fetches = 3; /* 3 2-dword loads */
 594                 fetch_stride = 8;
 595                 num_channels = 2;
 596                 break;
 597         case SI_FIX_FETCH_RGBA_64_FLOAT:
 598                 num_fetches = 2; /* 2 4-dword loads */
 599                 fetch_stride = 16;
 600                 num_channels = 4;
 601                 break;
 602         case SI_FIX_FETCH_RGB_8:
 603         case SI_FIX_FETCH_RGB_8_INT:
 604                 num_fetches = 3;
 605                 fetch_stride = 1;
 606                 num_channels = 1;
 607                 break;
 608         case SI_FIX_FETCH_RGB_16:
 609         case SI_FIX_FETCH_RGB_16_INT:
 610                 num_fetches = 3;
 611                 fetch_stride = 2;
 612                 num_channels = 1;
 613                 break;
 614         default:
 615                 num_fetches = 1;
 616                 fetch_stride = 0;
 617                 num_channels = util_last_bit(info->input_usage_mask[input_index]);
 618         }
 619
 620         for (unsigned i = 0; i < num_fetches; i++) {
 621                 LLVMValueRef voffset = LLVMConstInt(ctx->i32, fetch_stride * i, 0);
 622
 623                 input[i] = ac_build_buffer_load_format(&ctx->ac, t_list,
 624                                                        vertex_index, voffset,
 625                                                        num_channels, false, true);
 626                 input[i] = ac_build_expand_to_vec4(&ctx->ac, input[i], num_channels);
 627         }
 628
 629         /* Break up the vec4 into individual components */
 630         for (chan = 0; chan < 4; chan++) {
 631                 LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);
 632                 out[chan] = LLVMBuildExtractElement(ctx->ac.builder,
 633                                                     input[0], llvm_chan, "");
 634         }
 635
 636         switch (fix_fetch) {
 637         case SI_FIX_FETCH_A2_SNORM:
 638         case SI_FIX_FETCH_A2_SSCALED:
 639         case SI_FIX_FETCH_A2_SINT: {
 640                 /* The hardware returns an unsigned value; convert it to a
 641                  * signed one.
 642                  */
 643                 LLVMValueRef tmp = out[3];
 644                 LLVMValueRef c30 = LLVMConstInt(ctx->i32, 30, 0);
 645
 646                 /* First, recover the sign-extended signed integer value. */
 647                 if (fix_fetch == SI_FIX_FETCH_A2_SSCALED)
 648                         tmp = LLVMBuildFPToUI(ctx->ac.builder, tmp, ctx->i32, "");
 649                 else
 650                         tmp = ac_to_integer(&ctx->ac, tmp);
 651
 652                 /* For the integer-like cases, do a natural sign extension.
 653                  *
 654                  * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
 655                  * and happen to contain 0, 1, 2, 3 as the two LSBs of the
 656                  * exponent.
 657                  */
 658                 tmp = LLVMBuildShl(ctx->ac.builder, tmp,
 659                                    fix_fetch == SI_FIX_FETCH_A2_SNORM ?
 660                                    LLVMConstInt(ctx->i32, 7, 0) : c30, "");
 661                 tmp = LLVMBuildAShr(ctx->ac.builder, tmp, c30, "");
 662
 663                 /* Convert back to the right type. */
 664                 if (fix_fetch == SI_FIX_FETCH_A2_SNORM) {
 665                         LLVMValueRef clamp;
 666                         LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
 667                         tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->f32, "");
 668                         clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, tmp, neg_one, "");
 669                         tmp = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, tmp, "");
 670                 } else if (fix_fetch == SI_FIX_FETCH_A2_SSCALED) {
 671                         tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->f32, "");
 672                 }
 673
 674                 out[3] = tmp;
 675                 break;
 676         }
 677         case SI_FIX_FETCH_RGBA_32_UNORM:
 678         case SI_FIX_FETCH_RGBX_32_UNORM:
 679                 for (chan = 0; chan < 4; chan++) {
 680                         out[chan] = ac_to_integer(&ctx->ac, out[chan]);
 681                         out[chan] = LLVMBuildUIToFP(ctx->ac.builder,
 682                                                     out[chan], ctx->f32, "");
 683                         out[chan] = LLVMBuildFMul(ctx->ac.builder, out[chan],
 684                                                   LLVMConstReal(ctx->f32, 1.0 / UINT_MAX), "");
 685                 }
 686                 /* RGBX UINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
 687                 if (fix_fetch == SI_FIX_FETCH_RGBX_32_UNORM)
 688                         out[3] = LLVMConstReal(ctx->f32, 1);
 689                 break;
 690         case SI_FIX_FETCH_RGBA_32_SNORM:
 691         case SI_FIX_FETCH_RGBX_32_SNORM:
 692         case SI_FIX_FETCH_RGBA_32_FIXED:
 693         case SI_FIX_FETCH_RGBX_32_FIXED: {
 694                 double scale;
 695                 if (fix_fetch >= SI_FIX_FETCH_RGBA_32_FIXED)
 696                         scale = 1.0 / 0x10000;
 697                 else
 698                         scale = 1.0 / INT_MAX;
 699
 700                 for (chan = 0; chan < 4; chan++) {
 701                         out[chan] = ac_to_integer(&ctx->ac, out[chan]);
 702                         out[chan] = LLVMBuildSIToFP(ctx->ac.builder,
 703                                                     out[chan], ctx->f32, "");
 704                         out[chan] = LLVMBuildFMul(ctx->ac.builder, out[chan],
 705                                                   LLVMConstReal(ctx->f32, scale), "");
 706                 }
 707                 /* RGBX SINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
 708                 if (fix_fetch == SI_FIX_FETCH_RGBX_32_SNORM ||
 709                     fix_fetch == SI_FIX_FETCH_RGBX_32_FIXED)
 710                         out[3] = LLVMConstReal(ctx->f32, 1);
 711                 break;
 712         }
 713         case SI_FIX_FETCH_RGBA_32_USCALED:
 714                 for (chan = 0; chan < 4; chan++) {
 715                         out[chan] = ac_to_integer(&ctx->ac, out[chan]);
 716                         out[chan] = LLVMBuildUIToFP(ctx->ac.builder,
 717                                                     out[chan], ctx->f32, "");
 718                 }
 719                 break;
 720         case SI_FIX_FETCH_RGBA_32_SSCALED:
 721                 for (chan = 0; chan < 4; chan++) {
 722                         out[chan] = ac_to_integer(&ctx->ac, out[chan]);
 723                         out[chan] = LLVMBuildSIToFP(ctx->ac.builder,
 724                                                     out[chan], ctx->f32, "");
 725                 }
 726                 break;
 727         case SI_FIX_FETCH_RG_64_FLOAT:
 728                 for (chan = 0; chan < 2; chan++)
 729                         out[chan] = extract_double_to_float(ctx, input[0], chan);
 730
 731                 out[2] = LLVMConstReal(ctx->f32, 0);
 732                 out[3] = LLVMConstReal(ctx->f32, 1);
 733                 break;
 734         case SI_FIX_FETCH_RGB_64_FLOAT:
 735                 for (chan = 0; chan < 3; chan++)
 736                         out[chan] = extract_double_to_float(ctx, input[chan], 0);
 737
 738                 out[3] = LLVMConstReal(ctx->f32, 1);
 739                 break;
 740         case SI_FIX_FETCH_RGBA_64_FLOAT:
 741                 for (chan = 0; chan < 4; chan++) {
 742                         out[chan] = extract_double_to_float(ctx, input[chan / 2],
 743                                                             chan % 2);
 744                 }
 745                 break;
 746         case SI_FIX_FETCH_RGB_8:
 747         case SI_FIX_FETCH_RGB_8_INT:
 748         case SI_FIX_FETCH_RGB_16:
 749         case SI_FIX_FETCH_RGB_16_INT:
 750                 for (chan = 0; chan < 3; chan++) {
 751                         out[chan] = LLVMBuildExtractElement(ctx->ac.builder,
 752                                                             input[chan],
 753                                                             ctx->i32_0, "");
 754                 }
 755                 if (fix_fetch == SI_FIX_FETCH_RGB_8 ||
 756                     fix_fetch == SI_FIX_FETCH_RGB_16) {
 757                         out[3] = LLVMConstReal(ctx->f32, 1);
 758                 } else {
 759                         out[3] = ac_to_float(&ctx->ac, ctx->i32_1);
 760                 }
 761                 break;
 762         }
 763 }
 764
 /**
  * Load vertex shader input number input_index into out[] by forwarding to
  * si_llvm_load_input_vs(). The decl argument is not used by this function.
  */
 765 static void declare_input_vs(
 766         struct si_shader_context *ctx,
 767         unsigned input_index,
 768         const struct tgsi_full_declaration *decl,
 769         LLVMValueRef out[4])
 770 {
 771         si_llvm_load_input_vs(ctx, input_index, out);
 772 }
 773
 /**
  * Return the primitive ID for the shader stage selected by ctx->type.
  *
  * Only swizzle 0 carries a value; every other swizzle yields the i32
  * constant 0. Stages other than VS, TCS, TES and GS hit the assert and,
  * when asserts are compiled out, also return the i32 constant 0.
  */
 774 static LLVMValueRef get_primitive_id(struct si_shader_context *ctx,
 775                                      unsigned swizzle)
 776 {
 777         if (swizzle > 0)
 778                 return ctx->i32_0;
 779
 780         switch (ctx->type) {
 781         case PIPE_SHADER_VERTEX:
                     /* VS reads it from one of the main function's arguments. */
 782                 return LLVMGetParam(ctx->main_fn,
 783                                     ctx->param_vs_prim_id);
 784         case PIPE_SHADER_TESS_CTRL:
                     /* The tessellation stages return the patch ID kept in the ABI. */
 785                 return ctx->abi.tcs_patch_id;
 786         case PIPE_SHADER_TESS_EVAL:
 787                 return ctx->abi.tes_patch_id;
 788         case PIPE_SHADER_GEOMETRY:
 789                 return ctx->abi.gs_prim_id;
 790         default:
 791                 assert(0);
 792                 return ctx->i32_0;
 793         }
 794 }
 795
 796 /**
 797  * Return the value of tgsi_ind_register for indexing.
 798  * This is the indirect index with the constant offset added to it.
      *
      * \param ind        indirect register to read; TGSI_FILE_ADDRESS is loaded
      *                   from ctx->addrs, any other file is read through that
      *                   file's fetch function as a signed value
      * \param addr_mul   factor applied to the indirect value before rel_index
      *                   is added (no multiply is emitted when it is 1)
      * \param rel_index  constant offset added to the scaled indirect value
      * \return the i32 value "indirect * addr_mul + rel_index"
 799  */
 800 LLVMValueRef si_get_indirect_index(struct si_shader_context *ctx,
 801                                    const struct tgsi_ind_register *ind,
 802                                    unsigned addr_mul,
 803                                    int rel_index)
 804 {
 805         LLVMValueRef result;
 806
 807         if (ind->File == TGSI_FILE_ADDRESS) {
                     /* Address registers are kept as pointers in ctx->addrs. */
 808                 result = ctx->addrs[ind->Index][ind->Swizzle];
 809                 result = LLVMBuildLoad(ctx->ac.builder, result, "");
 810         } else {
                     /* Describe the register as a plain source operand so that
                      * the regular fetch function for its file can read it. */
 811                 struct tgsi_full_src_register src = {};
 812
 813                 src.Register.File = ind->File;
 814                 src.Register.Index = ind->Index;
 815
 816                 /* Set the second index to 0 for constants. */
 817                 if (ind->File == TGSI_FILE_CONSTANT)
 818                         src.Register.Dimension = 1;
 819
 820                 result = ctx->bld_base.emit_fetch_funcs[ind->File](&ctx->bld_base, &src,
 821                                                                    TGSI_TYPE_SIGNED,
 822                                                                    ind->Swizzle);
 823                 result = ac_to_integer(&ctx->ac, result);
 824         }
 825
             /* Scale first, then add the constant part. */
 826         if (addr_mul != 1)
 827                 result = LLVMBuildMul(ctx->ac.builder, result,
 828                                       LLVMConstInt(ctx->i32, addr_mul, 0), "");
 829         result = LLVMBuildAdd(ctx->ac.builder, result,
 830                               LLVMConstInt(ctx->i32, rel_index, 0), "");
 831         return result;
 832 }
 833
 834 /**
 835  * Like si_get_indirect_index, but restricts the return value to a (possibly
 836  * undefined) value inside [0..num).
      *
      * The index is computed with addr_mul = 1 and is then handed, together
      * with num, to si_llvm_bound_index(), which performs the restriction.
 837  */
 838 LLVMValueRef si_get_bounded_indirect_index(struct si_shader_context *ctx,
 839                                            const struct tgsi_ind_register *ind,
 840                                            int rel_index, unsigned num)
 841 {
 842         LLVMValueRef result = si_get_indirect_index(ctx, ind, 1, rel_index);
 843
 844         return si_llvm_bound_index(ctx, result, num);
 845 }
 846
 /**
  * Build an address from a base address plus optional vertex and parameter
  * offsets:
  *
  *   base_addr + vertex_index * vertex_dw_stride   (only if vertex_dw_stride)
  *             + param_index * 4                   (only if param_index)
  *             + unique_index * 4
  *
  * unique_index is looked up from name[input_index] / index[input_index] with
  * si_shader_io_get_unique_index_patch() when is_patch is set and with
  * si_shader_io_get_unique_index() otherwise. The factor 4 presumably steps
  * over one 4-dword slot per parameter (TODO confirm).
  *
  * \param vertex_dw_stride  per-vertex stride, or NULL to skip the vertex term
  *                          (vertex_index is not read in that case)
  * \param base_addr         address the offsets are added to
  * \param vertex_index      vertex number, used only with vertex_dw_stride
  * \param param_index       extra run-time parameter offset, or NULL
  * \param input_index       slot used to index name[] and index[]
  * \param name              semantic name table
  * \param index             semantic index table
  * \param is_patch          selects the per-patch unique-index lookup
  */
 847 static LLVMValueRef get_dw_address_from_generic_indices(struct si_shader_context *ctx,
 848                                                         LLVMValueRef vertex_dw_stride,
 849                                                         LLVMValueRef base_addr,
 850                                                         LLVMValueRef vertex_index,
 851                                                         LLVMValueRef param_index,
 852                                                         unsigned input_index,
 853                                                         ubyte *name,
 854                                                         ubyte *index,
 855                                                         bool is_patch)
 856 {
 857         if (vertex_dw_stride) {
 858                 base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr,
 859                                          LLVMBuildMul(ctx->ac.builder, vertex_index,
 860                                                       vertex_dw_stride, ""), "");
 861         }
 862
 863         if (param_index) {
 864                 base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr,
 865                                          LLVMBuildMul(ctx->ac.builder, param_index,
 866                                                       LLVMConstInt(ctx->i32, 4, 0), ""), "");
 867         }
 868
 869         int param = is_patch ?
 870                 si_shader_io_get_unique_index_patch(name[input_index],
 871                                                     index[input_index]) :
 872                 si_shader_io_get_unique_index(name[input_index],
 873                                               index[input_index], false);
 874
 875         /* Add the base address of the element. */
 876         return LLVMBuildAdd(ctx->ac.builder, base_addr,
 877                             LLVMConstInt(ctx->i32, param * 4, 0), "");
 878 }
 879
 880 /**
 881  * Calculate a dword address given an input or output register and a stride.
      *
      * The register is taken from src when src is non-NULL and from dst
      * otherwise. Only TGSI_FILE_INPUT and TGSI_FILE_OUTPUT are handled; any
      * other register file trips the assert and returns NULL.
      *
      * \param vertex_dw_stride  forwarded to get_dw_address_from_generic_indices()
      *                          (may be NULL, as fetch_output_tcs() does for
      *                          registers without a vertex dimension)
      * \param base_addr         address the register offset is added to
 882  */
 883 static LLVMValueRef get_dw_address(struct si_shader_context *ctx,
 884                                    const struct tgsi_full_dst_register *dst,
 885                                    const struct tgsi_full_src_register *src,
 886                                    LLVMValueRef vertex_dw_stride,
 887                                    LLVMValueRef base_addr)
 888 {
 889         struct tgsi_shader_info *info = &ctx->shader->selector->info;
 890         ubyte *name, *index, *array_first;
 891         int input_index;
 892         struct tgsi_full_dst_register reg;
 893         LLVMValueRef vertex_index = NULL;
 894         LLVMValueRef ind_index = NULL;
 895
 896         /* Set the register description. The address computation is the same
 897          * for sources and destinations. */
 898         if (src) {
                     /* Only the fields read below are copied; the remaining
                      * members of reg are left uninitialized on this path. */
 899                 reg.Register.File = src->Register.File;
 900                 reg.Register.Index = src->Register.Index;
 901                 reg.Register.Indirect = src->Register.Indirect;
 902                 reg.Register.Dimension = src->Register.Dimension;
 903                 reg.Indirect = src->Indirect;
 904                 reg.Dimension = src->Dimension;
 905                 reg.DimIndirect = src->DimIndirect;
 906         } else
 907                 reg = *dst;
 908
 909         /* If the register is 2-dimensional (e.g. an array of vertices
 910          * in a primitive), calculate the base address of the vertex. */
 911         if (reg.Register.Dimension) {
 912                 if (reg.Dimension.Indirect)
 913                         vertex_index = si_get_indirect_index(ctx, &reg.DimIndirect,
 914                                                       1, reg.Dimension.Index);
 915                 else
 916                         vertex_index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0);
 917         }
 918
 919         /* Get information about the register. */
 920         if (reg.Register.File == TGSI_FILE_INPUT) {
 921                 name = info->input_semantic_name;
 922                 index = info->input_semantic_index;
 923                 array_first = info->input_array_first;
 924         } else if (reg.Register.File == TGSI_FILE_OUTPUT) {
 925                 name = info->output_semantic_name;
 926                 index = info->output_semantic_index;
 927                 array_first = info->output_array_first;
 928         } else {
 929                 assert(0);
 930                 return NULL;
 931         }
 932
 933         if (reg.Register.Indirect) {
 934                 /* Add the relative address of the element. */
                     /* With an ArrayID the semantic lookup uses the array's first
                      * slot, and the distance from that slot to Register.Index
                      * becomes the constant part of the indirect index. */
 935                 if (reg.Indirect.ArrayID)
 936                         input_index = array_first[reg.Indirect.ArrayID];
 937                 else
 938                         input_index = reg.Register.Index;
 939
 940                 ind_index = si_get_indirect_index(ctx, &reg.Indirect,
 941                                                   1, reg.Register.Index - input_index);
 942         } else {
 943                 input_index = reg.Register.Index;
 944         }
 945
             /* Registers without a vertex dimension are passed on as per-patch. */
 946         return get_dw_address_from_generic_indices(ctx, vertex_dw_stride,
 947                                                    base_addr, vertex_index,
 948                                                    ind_index, input_index,
 949                                                    name, index,
 950                                                    !reg.Register.Dimension);
 951 }
 952
 953 /* The offchip buffer layout for TCS->TES is
 954  *
 955  * - attribute 0 of patch 0 vertex 0
 956  * - attribute 0 of patch 0 vertex 1
 957  * - attribute 0 of patch 0 vertex 2
 958  *   ...
 959  * - attribute 0 of patch 1 vertex 0
 960  * - attribute 0 of patch 1 vertex 1
 961  *   ...
 962  * - attribute 1 of patch 0 vertex 0
 963  * - attribute 1 of patch 0 vertex 1
 964  *   ...
 965  * - per patch attribute 0 of patch 0
 966  * - per patch attribute 0 of patch 1
 967  *   ...
 968  *
 969  * Note that every attribute has 4 components.
      *
      * This function returns the element index in that layout multiplied by 16
      * (presumably 4 components of 4 bytes each - TODO confirm).
      *
      * With a non-NULL vertex_index the element is
      *   rel_patch_id * vertices_per_patch + vertex_index
      *     + param_index * (vertices_per_patch * num_patches)
      * With a NULL vertex_index (per-patch attribute) it is
      *   rel_patch_id + param_index * num_patches
      * and the patch data offset unpacked from param_tcs_offchip_layout is
      * added after the multiplication by 16.
      *
      * param_index must not be NULL; it is always used.
 970  */
 971 static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
 972                                                LLVMValueRef rel_patch_id,
 973                                                LLVMValueRef vertex_index,
 974                                                LLVMValueRef param_index)
 975 {
 976         LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
 977         LLVMValueRef param_stride, constant16;
 978
 979         vertices_per_patch = get_num_tcs_out_vertices(ctx);
             /* NOTE(review): the two trailing arguments of si_unpack_param()
              * look like (first bit, bit count) - confirm against its definition. */
 980         num_patches = si_unpack_param(ctx, ctx->param_tcs_offchip_layout, 0, 6);
 981         total_vertices = LLVMBuildMul(ctx->ac.builder, vertices_per_patch,
 982                                       num_patches, "");
 983
 984         constant16 = LLVMConstInt(ctx->i32, 16, 0);
 985         if (vertex_index) {
                     /* Per-vertex attribute: one attribute spans all vertices of
                      * all patches. */
 986                 base_addr = LLVMBuildMul(ctx->ac.builder, rel_patch_id,
 987                                          vertices_per_patch, "");
 988
 989                 base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr,
 990                                          vertex_index, "");
 991
 992                 param_stride = total_vertices;
 993         } else {
                     /* Per-patch attribute: one attribute spans all patches. */
 994                 base_addr = rel_patch_id;
 995                 param_stride = num_patches;
 996         }
 997
 998         base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr,
 999                                  LLVMBuildMul(ctx->ac.builder, param_index,
1000                                               param_stride, ""), "");
1001
1002         base_addr = LLVMBuildMul(ctx->ac.builder, base_addr, constant16, "");
1003
1004         if (!vertex_index) {
                     /* Per-patch data starts at a separate offset in the buffer. */
1005                 LLVMValueRef patch_data_offset =
1006                            si_unpack_param(ctx, ctx->param_tcs_offchip_layout, 12, 20);
1007
1008                 base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr,
1009                                          patch_data_offset, "");
1010         }
1011         return base_addr;
1012 }
1013
1014 /* This is a generic helper that can be shared by the NIR and TGSI backends */
     /*
      * It resolves name[param_base] / index[param_base] to a unique parameter
      * index (per-patch lookup when is_patch is set), adds the optional run-time
      * param_index to it, and asks get_tcs_tes_buffer_address() for the address
      * of that parameter in the patch returned by get_rel_patch_id().
      *
      * vertex_index and param_index may both be NULL; a NULL param_index means
      * "no run-time offset".
      */
1015 static LLVMValueRef get_tcs_tes_buffer_address_from_generic_indices(
1016                                         struct si_shader_context *ctx,
1017                                         LLVMValueRef vertex_index,
1018                                         LLVMValueRef param_index,
1019                                         unsigned param_base,
1020                                         ubyte *name,
1021                                         ubyte *index,
1022                                         bool is_patch)
1023 {
1024         unsigned param_index_base;
1025
1026         param_index_base = is_patch ?
1027                 si_shader_io_get_unique_index_patch(name[param_base], index[param_base]) :
1028                 si_shader_io_get_unique_index(name[param_base], index[param_base], false);
1029
1030         if (param_index) {
1031                 param_index = LLVMBuildAdd(ctx->ac.builder, param_index,
1032                                            LLVMConstInt(ctx->i32, param_index_base, 0),
1033                                            "");
1034         } else {
1035                 param_index = LLVMConstInt(ctx->i32, param_index_base, 0);
1036         }
1037
1038         return get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx),
1039                                           vertex_index, param_index);
1040 }
1041
/**
 * Compute the TCS->TES buffer address of a TGSI register.
 *
 * The register is taken from src when src is non-NULL; otherwise dst is
 * converted with tgsi_full_src_register_from_dst(). Only TGSI_FILE_INPUT and
 * TGSI_FILE_OUTPUT are handled; any other register file trips the assert and
 * returns NULL.
 */
1042 static LLVMValueRef get_tcs_tes_buffer_address_from_reg(
1043                                        struct si_shader_context *ctx,
1044                                        const struct tgsi_full_dst_register *dst,
1045                                        const struct tgsi_full_src_register *src)
1046 {
1047         struct tgsi_shader_info *info = &ctx->shader->selector->info;
1048         ubyte *name, *index, *array_first;
1049         struct tgsi_full_src_register reg;
1050         LLVMValueRef vertex_index = NULL;
1051         LLVMValueRef param_index = NULL;
1052         unsigned param_base;
1053
1054         reg = src ? *src : tgsi_full_src_register_from_dst(dst);
1055
             /* A second dimension selects the vertex, either indirectly or by
              * a constant index. */
1056         if (reg.Register.Dimension) {
1057
1058                 if (reg.Dimension.Indirect)
1059                         vertex_index = si_get_indirect_index(ctx, &reg.DimIndirect,
1060                                                              1, reg.Dimension.Index);
1061                 else
1062                         vertex_index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0);
1063         }
1064
1065         /* Get information about the register. */
1066         if (reg.Register.File == TGSI_FILE_INPUT) {
1067                 name = info->input_semantic_name;
1068                 index = info->input_semantic_index;
1069                 array_first = info->input_array_first;
1070         } else if (reg.Register.File == TGSI_FILE_OUTPUT) {
1071                 name = info->output_semantic_name;
1072                 index = info->output_semantic_index;
1073                 array_first = info->output_array_first;
1074         } else {
1075                 assert(0);
1076                 return NULL;
1077         }
1078
1079         if (reg.Register.Indirect) {
                     /* With an ArrayID the semantic lookup uses the array's first
                      * slot, and the distance from that slot to Register.Index
                      * becomes the constant part of the indirect index. */
1080                 if (reg.Indirect.ArrayID)
1081                         param_base = array_first[reg.Indirect.ArrayID];
1082                 else
1083                         param_base = reg.Register.Index;
1084
1085                 param_index = si_get_indirect_index(ctx, &reg.Indirect,
1086                                                     1, reg.Register.Index - param_base);
1087
1088         } else {
1089                 param_base = reg.Register.Index;
1090         }
1091
             /* Registers without a vertex dimension are passed on as per-patch. */
1092         return get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index,
1093                                                                param_index, param_base,
1094                                                                name, index, !reg.Register.Dimension);
1095 }
1096
/**
 * Load from a buffer through ac_build_buffer_load().
 *
 * \param type           element type of the result
 * \param swizzle        component to return; ~0 returns all four components
 *                       as a 4-element vector of type
 * \param buffer         buffer resource to load from
 * \param offset, base   forwarded to ac_build_buffer_load() (base first, then
 *                       offset). NOTE(review): the callers visible in this
 *                       file pass their own "base" value as offset and their
 *                       computed address as base - check the argument roles
 *                       before changing either side.
 * \param can_speculate  forwarded to ac_build_buffer_load()
 *
 * For a type that is not 64-bit, four channels are loaded and the requested
 * component is extracted. For a 64-bit type, two single-channel loads at
 * constant offsets swizzle * 4 and swizzle * 4 + 4 are combined with
 * si_llvm_emit_fetch_64bit().
 */
1097 static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base,
1098                                 LLVMTypeRef type, unsigned swizzle,
1099                                 LLVMValueRef buffer, LLVMValueRef offset,
1100                                 LLVMValueRef base, bool can_speculate)
1101 {
1102         struct si_shader_context *ctx = si_shader_context(bld_base);
1103         LLVMValueRef value, value2;
1104         LLVMTypeRef vec_type = LLVMVectorType(type, 4);
1105
             /* Whole-vector load. */
1106         if (swizzle == ~0) {
1107                 value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
1108                                              0, 1, 0, can_speculate, false);
1109
1110                 return LLVMBuildBitCast(ctx->ac.builder, value, vec_type, "");
1111         }
1112
             /* Single component that is not 64-bit: load four, pick one. */
1113         if (!llvm_type_is_64bit(ctx, type)) {
1114                 value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
1115                                              0, 1, 0, can_speculate, false);
1116
1117                 value = LLVMBuildBitCast(ctx->ac.builder, value, vec_type, "");
1118                 return LLVMBuildExtractElement(ctx->ac.builder, value,
1119                                     LLVMConstInt(ctx->i32, swizzle, 0), "");
1120         }
1121
             /* 64-bit component: load the two halves separately. */
1122         value = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
1123                                   swizzle * 4, 1, 0, can_speculate, false);
1124
1125         value2 = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
1126                                    swizzle * 4 + 4, 1, 0, can_speculate, false);
1127
1128         return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
1129 }
1130
1131 /**
1132  * Load from LDS.
1133  *
1134  * \param type          output value type
1135  * \param swizzle       offset (typically 0..3); it can be ~0, which loads a vec4
1136  * \param dw_addr       address in dwords
      *
      * A swizzle of ~0 is served by four recursive single-component loads that
      * are gathered into one value. A 64-bit type is served by two recursive i32
      * loads at swizzle and swizzle + 1 that are combined with
      * si_llvm_emit_fetch_64bit(). Everything else is one ac_lds_load() at
      * dw_addr + swizzle, bitcast to type.
1137  */
1138 static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
1139                              LLVMTypeRef type, unsigned swizzle,
1140                              LLVMValueRef dw_addr)
1141 {
1142         struct si_shader_context *ctx = si_shader_context(bld_base);
1143         LLVMValueRef value;
1144
1145         if (swizzle == ~0) {
1146                 LLVMValueRef values[TGSI_NUM_CHANNELS];
1147
1148                 for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++)
1149                         values[chan] = lds_load(bld_base, type, chan, dw_addr);
1150
1151                 return lp_build_gather_values(&ctx->gallivm, values,
1152                                               TGSI_NUM_CHANNELS);
1153         }
1154
1155         /* Split 64-bit loads. */
1156         if (llvm_type_is_64bit(ctx, type)) {
1157                 LLVMValueRef lo, hi;
1158
1159                 lo = lds_load(bld_base, ctx->i32, swizzle, dw_addr);
1160                 hi = lds_load(bld_base, ctx->i32, swizzle + 1, dw_addr);
1161                 return si_llvm_emit_fetch_64bit(bld_base, type, lo, hi);
1162         }
1163
             /* The swizzle is a dword offset on top of dw_addr. */
1164         dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
1165                             LLVMConstInt(ctx->i32, swizzle, 0));
1166
1167         value = ac_lds_load(&ctx->ac, dw_addr);
1168
1169         return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
1170 }
1171
1172 /**
1173  * Store to LDS.
1174  *
1175  * \param dw_offset_imm constant dword offset added to dw_addr (typically 0..3)
1176  * \param dw_addr       address in dwords
1177  * \param value         value to store
1178  */
1179 static void lds_store(struct si_shader_context *ctx,
1180                       unsigned dw_offset_imm, LLVMValueRef dw_addr,
1181                       LLVMValueRef value)
1182 {
1183         dw_addr = lp_build_add(&ctx->bld_base.uint_bld, dw_addr,
1184                             LLVMConstInt(ctx->i32, dw_offset_imm, 0));
1185
1186         ac_lds_store(&ctx->ac, dw_addr, value);
1187 }
1188
/**
 * Selects the ring for which get_tess_ring_descriptor() builds a descriptor:
 * the tess factor ring, or the off-chip ring as seen from TCS or from TES.
 */
1189 enum si_tess_ring {
1190         TCS_FACTOR_RING,
1191         TESS_OFFCHIP_RING_TCS,
1192         TESS_OFFCHIP_RING_TES,
1193 };
1194
/**
 * Build a four-dword buffer descriptor for one of the tessellation rings.
 *
 * The low address dword comes from a shader argument: param_tes_offchip_addr
 * for TESS_OFFCHIP_RING_TES, param_tcs_out_lds_layout for the two TCS-side
 * rings. On the TCS side only the bits selected by 0xfff80000 are kept, and
 * for TCS_FACTOR_RING the screen's tess_offchip_ring_size is added to reach
 * the factor ring.
 *
 * \return the four descriptor dwords gathered by ac_build_gather_values()
 */
1195 static LLVMValueRef get_tess_ring_descriptor(struct si_shader_context *ctx,
1196                                              enum si_tess_ring ring)
1197 {
1198         LLVMBuilderRef builder = ctx->ac.builder;
1199         unsigned param = ring == TESS_OFFCHIP_RING_TES ? ctx->param_tes_offchip_addr :
1200                                                          ctx->param_tcs_out_lds_layout;
1201         LLVMValueRef addr = LLVMGetParam(ctx->main_fn, param);
1202
1203         /* TCS only receives high 13 bits of the address. */
1204         if (ring == TESS_OFFCHIP_RING_TCS || ring == TCS_FACTOR_RING) {
1205                 addr = LLVMBuildAnd(builder, addr,
1206                                     LLVMConstInt(ctx->i32, 0xfff80000, 0), "");
1207         }
1208
             /* The factor ring address is the masked address plus the off-chip
              * ring size. */
1209         if (ring == TCS_FACTOR_RING) {
1210                 unsigned tf_offset = ctx->screen->tess_offchip_ring_size;
1211                 addr = LLVMBuildAdd(builder, addr,
1212                                     LLVMConstInt(ctx->i32, tf_offset, 0), "");
1213         }
1214
             /* dword 0: low address bits; dword 1: high address bits taken from
              * the screen's address32_hi; dword 2: all ones (presumably the
              * size field, i.e. no range limit - TODO confirm); dword 3: XYZW
              * passthrough selects with a 32-bit float format. */
1215         LLVMValueRef desc[4];
1216         desc[0] = addr;
1217         desc[1] = LLVMConstInt(ctx->i32,
1218                                S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0);
1219         desc[2] = LLVMConstInt(ctx->i32, 0xffffffff, 0);
1220         desc[3] = LLVMConstInt(ctx->i32,
1221                                S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
1222                                S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
1223                                S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
1224                                S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
1225                                S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
1226                                S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32), 0);
1227
1228         return ac_build_gather_values(&ctx->ac, desc, 4);
1229 }
1230
/**
 * Fetch a TCS input register from LDS.
 *
 * The dword address is built by get_dw_address() from the current input
 * patch offset and the per-vertex input stride; the value is then read with
 * lds_load() using the LLVM type that corresponds to the TGSI type.
 */
1231 static LLVMValueRef fetch_input_tcs(
1232         struct lp_build_tgsi_context *bld_base,
1233         const struct tgsi_full_src_register *reg,
1234         enum tgsi_opcode_type type, unsigned swizzle)
1235 {
1236         struct si_shader_context *ctx = si_shader_context(bld_base);
1237         LLVMValueRef dw_addr, stride;
1238
1239         stride = get_tcs_in_vertex_dw_stride(ctx);
1240         dw_addr = get_tcs_in_current_patch_offset(ctx);
1241         dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
1242
1243         return lds_load(bld_base, tgsi2llvmtype(bld_base, type), swizzle, dw_addr);
1244 }
1245
/**
 * Load a TCS input or output varying from LDS (NIR entry point, going by the
 * function name and the ac_shader_abi argument).
 *
 * \param type             type of each loaded component
 * \param vertex_index     vertex to read; forwarded to
 *                         get_dw_address_from_generic_indices()
 * \param param_index      optional run-time parameter offset, or NULL
 * \param const_index      constant parameter offset, added to param_index
 * \param driver_location  divided by 4 and then used to index the semantic
 *                         name/index tables
 * \param component        first component to load
 * \param num_components   number of components to load
 * \param is_patch         per-patch output (only affects the output path's
 *                         base address/stride and the unique-index lookup)
 * \param load_input       true reads TCS inputs, false reads TCS outputs
 *
 * location and is_compact are not used by this function.
 */
1246 static LLVMValueRef si_nir_load_tcs_varyings(struct ac_shader_abi *abi,
1247                                              LLVMTypeRef type,
1248                                              LLVMValueRef vertex_index,
1249                                              LLVMValueRef param_index,
1250                                              unsigned const_index,
1251                                              unsigned location,
1252                                              unsigned driver_location,
1253                                              unsigned component,
1254                                              unsigned num_components,
1255                                              bool is_patch,
1256                                              bool is_compact,
1257                                              bool load_input)
1258 {
1259         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1260         struct tgsi_shader_info *info = &ctx->shader->selector->info;
1261         struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
1262         LLVMValueRef dw_addr, stride;
1263
             /* NOTE(review): the incoming value appears to be counted in
              * components rather than slots - confirm against the caller. */
1264         driver_location = driver_location / 4;
1265
             /* Pick the LDS region: inputs, per-patch outputs (no vertex
              * stride) or per-vertex outputs. */
1266         if (load_input) {
1267                 stride = get_tcs_in_vertex_dw_stride(ctx);
1268                 dw_addr = get_tcs_in_current_patch_offset(ctx);
1269         } else {
1270                 if (is_patch) {
1271                         stride = NULL;
1272                         dw_addr = get_tcs_out_current_patch_data_offset(ctx);
1273                 } else {
1274                         stride = get_tcs_out_vertex_dw_stride(ctx);
1275                         dw_addr = get_tcs_out_current_patch_offset(ctx);
1276                 }
1277         }
1278
1279         if (param_index) {
1280                 /* Add the constant index to the indirect index */
1281                 param_index = LLVMBuildAdd(ctx->ac.builder, param_index,
1282                                            LLVMConstInt(ctx->i32, const_index, 0), "");
1283         } else {
1284                 param_index = LLVMConstInt(ctx->i32, const_index, 0);
1285         }
1286
1287         ubyte *names;
1288         ubyte *indices;
1289         if (load_input) {
1290                 names = info->input_semantic_name;
1291                 indices = info->input_semantic_index;
1292         } else {
1293                 names = info->output_semantic_name;
1294                 indices = info->output_semantic_index;
1295         }
1296
1297         dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr,
1298                                                       vertex_index, param_index,
1299                                                       driver_location,
1300                                                       names, indices,
1301                                                       is_patch);
1302
             /* value[] has four entries and is filled starting at index
              * component, so component + num_components must not exceed 4. */
1303         LLVMValueRef value[4];
1304         for (unsigned i = 0; i < num_components; i++) {
                     /* A 64-bit component occupies two dwords. */
1305                 unsigned offset = i;
1306                 if (llvm_type_is_64bit(ctx, type))
1307                         offset *= 2;
1308
1309                 offset += component;
1310                 value[i + component] = lds_load(bld_base, type, offset, dw_addr);
1311         }
1312
1313         return ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
1314 }
1315
/**
 * Fetch a TCS output register from LDS.
 *
 * A register with a vertex dimension is addressed from the current output
 * patch offset with the per-vertex output stride. A register without one is
 * addressed from the current patch data offset with no vertex stride.
 */
1316 static LLVMValueRef fetch_output_tcs(
1317                 struct lp_build_tgsi_context *bld_base,
1318                 const struct tgsi_full_src_register *reg,
1319                 enum tgsi_opcode_type type, unsigned swizzle)
1320 {
1321         struct si_shader_context *ctx = si_shader_context(bld_base);
1322         LLVMValueRef dw_addr, stride;
1323
1324         if (reg->Register.Dimension) {
1325                 stride = get_tcs_out_vertex_dw_stride(ctx);
1326                 dw_addr = get_tcs_out_current_patch_offset(ctx);
1327                 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
1328         } else {
1329                 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
1330                 dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr);
1331         }
1332
1333         return lds_load(bld_base, tgsi2llvmtype(bld_base, type), swizzle, dw_addr);
1334 }
1335
/**
 * Fetch a TES input register from the off-chip tessellation ring.
 *
 * The address inside the ring comes from
 * get_tcs_tes_buffer_address_from_reg(); the param_tcs_offchip_offset shader
 * argument is passed alongside it, and the load is marked as speculatable.
 */
1336 static LLVMValueRef fetch_input_tes(
1337         struct lp_build_tgsi_context *bld_base,
1338         const struct tgsi_full_src_register *reg,
1339         enum tgsi_opcode_type type, unsigned swizzle)
1340 {
1341         struct si_shader_context *ctx = si_shader_context(bld_base);
1342         LLVMValueRef base, addr;
1343
1344         base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
1345         addr = get_tcs_tes_buffer_address_from_reg(ctx, NULL, reg);
1346
             /* Note the order: base fills buffer_load()'s "offset" parameter
              * and addr fills its "base" parameter. */
1347         return buffer_load(bld_base, tgsi2llvmtype(bld_base, type), swizzle,
1348                            ctx->tess_offchip_ring, base, addr, true);
1349 }
1350
/**
 * Load a TES input from the off-chip tessellation ring (NIR entry point,
 * going by the function name and the ac_shader_abi argument).
 *
 * \param type             type of each loaded component
 * \param vertex_index     vertex to read, or NULL; forwarded to
 *                         get_tcs_tes_buffer_address_from_generic_indices()
 * \param param_index      optional run-time parameter offset, or NULL
 * \param const_index      constant parameter offset, added to param_index
 * \param driver_location  divided by 4 and then used to index the input
 *                         semantic name/index tables
 * \param component        first component to load
 * \param num_components   number of components to load
 * \param is_patch         selects the per-patch unique-index lookup
 *
 * location, is_compact and load_input are not used by this function.
 */
1351 LLVMValueRef si_nir_load_input_tes(struct ac_shader_abi *abi,
1352                                    LLVMTypeRef type,
1353                                    LLVMValueRef vertex_index,
1354                                    LLVMValueRef param_index,
1355                                    unsigned const_index,
1356                                    unsigned location,
1357                                    unsigned driver_location,
1358                                    unsigned component,
1359                                    unsigned num_components,
1360                                    bool is_patch,
1361                                    bool is_compact,
1362                                    bool load_input)
1363 {
1364         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1365         struct tgsi_shader_info *info = &ctx->shader->selector->info;
1366         LLVMValueRef base, addr;
1367
             /* NOTE(review): the incoming value appears to be counted in
              * components rather than slots - confirm against the caller. */
1368         driver_location = driver_location / 4;
1369
1370         base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
1371
1372         if (param_index) {
1373                 /* Add the constant index to the indirect index */
1374                 param_index = LLVMBuildAdd(ctx->ac.builder, param_index,
1375                                            LLVMConstInt(ctx->i32, const_index, 0), "");
1376         } else {
1377                 param_index = LLVMConstInt(ctx->i32, const_index, 0);
1378         }
1379
1380         addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index,
1381                                                                param_index, driver_location,
1382                                                                info->input_semantic_name,
1383                                                                info->input_semantic_index,
1384                                                                is_patch);
1385
1386         /* TODO: This will generate rather ordinary llvm code, although it
1387          * should be easy for the optimiser to fix up. In future we might want
1388          * to refactor buffer_load(), but for now this maximises code sharing
1389          * between the NIR and TGSI backends.
1390          */
             /* value[] has four entries and is filled starting at index
              * component, so component + num_components must not exceed 4. */
1391         LLVMValueRef value[4];
1392         for (unsigned i = 0; i < num_components; i++) {
                     /* A 64-bit component occupies two dwords. */
1393                 unsigned offset = i;
1394                 if (llvm_type_is_64bit(ctx, type))
1395                         offset *= 2;
1396
1397                 offset += component;
1398                 value[i + component] = buffer_load(&ctx->bld_base, type, offset,
1399                                                    ctx->tess_offchip_ring, base, addr, true);
1400         }
1401
1402         return ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
1403 }
1404
1405 static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
1406                              const struct tgsi_full_instruction *inst,
1407                              const struct tgsi_opcode_info *info,
1408                              unsigned index,
1409                              LLVMValueRef dst[4])
1410 {
1411         struct si_shader_context *ctx = si_shader_context(bld_base);
1412         const struct tgsi_full_dst_register *reg = &inst->Dst[index];
1413         const struct tgsi_shader_info *sh_info = &ctx->shader->selector->info;
1414         unsigned chan_index;
1415         LLVMValueRef dw_addr, stride;
1416         LLVMValueRef buffer, base, buf_addr;
1417         LLVMValueRef values[4];
1418         bool skip_lds_store;
1419         bool is_tess_factor = false, is_tess_inner = false;
1420
1421         /* Only handle per-patch and per-vertex outputs here.
1422          * Vectors will be lowered to scalars and this function will be called again.
1423          */
1424         if (reg->Register.File != TGSI_FILE_OUTPUT ||
1425             (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) {
1426                 si_llvm_emit_store(bld_base, inst, info, index, dst);
1427                 return;
1428         }
1429
1430         if (reg->Register.Dimension) {
1431                 stride = get_tcs_out_vertex_dw_stride(ctx);
1432                 dw_addr = get_tcs_out_current_patch_offset(ctx);
1433                 dw_addr = get_dw_address(ctx, reg, NULL, stride, dw_addr);
1434                 skip_lds_store = !sh_info->reads_pervertex_outputs;
1435         } else {
1436                 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
1437                 dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr);
1438                 skip_lds_store = !sh_info->reads_perpatch_outputs;
1439
1440                 if (!reg->Register.Indirect) {
1441                         int name = sh_info->output_semantic_name[reg->Register.Index];
1442
1443                         /* Always write tess factors into LDS for the TCS epilog. */
1444                         if (name == TGSI_SEMANTIC_TESSINNER ||
1445                             name == TGSI_SEMANTIC_TESSOUTER) {
1446                                 /* The epilog doesn't read LDS if invocation 0 defines tess factors. */
1447                                 skip_lds_store = !sh_info->reads_tessfactor_outputs &&
1448                                                  ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs;
1449                                 is_tess_factor = true;
1450                                 is_tess_inner = name == TGSI_SEMANTIC_TESSINNER;
1451                         }
1452                 }
1453         }
1454
1455         buffer = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS);
1456
1457         base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
1458         buf_addr = get_tcs_tes_buffer_address_from_reg(ctx, reg, NULL);
1459
1460         uint32_t writemask = reg->Register.WriteMask;
1461         while (writemask) {
1462                 chan_index = u_bit_scan(&writemask);
1463                 LLVMValueRef value = dst[chan_index];
1464
1465                 if (inst->Instruction.Saturate)
1466                         value = ac_build_clamp(&ctx->ac, value);
1467
1468                 /* Skip LDS stores if there is no LDS read of this output. */
1469                 if (!skip_lds_store)
1470                         lds_store(ctx, chan_index, dw_addr, value);
1471
1472                 value = ac_to_integer(&ctx->ac, value);
1473                 values[chan_index] = value;
1474
1475                 if (reg->Register.WriteMask != 0xF && !is_tess_factor) {
1476                         ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1,
1477                                                     buf_addr, base,
1478                                                     4 * chan_index, 1, 0, true, false);
1479                 }
1480
1481                 /* Write tess factors into VGPRs for the epilog. */
1482                 if (is_tess_factor &&
1483                     ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs) {
1484                         if (!is_tess_inner) {
1485                                 LLVMBuildStore(ctx->ac.builder, value, /* outer */
1486                                                ctx->invoc0_tess_factors[chan_index]);
1487                         } else if (chan_index < 2) {
1488                                 LLVMBuildStore(ctx->ac.builder, value, /* inner */
1489                                                ctx->invoc0_tess_factors[4 + chan_index]);
1490                         }
1491                 }
1492         }
1493
1494         if (reg->Register.WriteMask == 0xF && !is_tess_factor) {
1495                 LLVMValueRef value = lp_build_gather_values(&ctx->gallivm,
1496                                                             values, 4);
1497                 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buf_addr,
1498                                             base, 0, 1, 0, true, false);
1499         }
1500 }
1501
1502 static void si_nir_store_output_tcs(struct ac_shader_abi *abi,
1503                                     const struct nir_variable *var,
1504                                     LLVMValueRef vertex_index,
1505                                     LLVMValueRef param_index,
1506                                     unsigned const_index,
1507                                     LLVMValueRef src,
1508                                     unsigned writemask)
1509 {
1510         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1511         struct tgsi_shader_info *info = &ctx->shader->selector->info;
1512         const unsigned component = var->data.location_frac;
1513         const bool is_patch = var->data.patch;
1514         unsigned driver_location = var->data.driver_location;
1515         LLVMValueRef dw_addr, stride;
1516         LLVMValueRef buffer, base, addr;
1517         LLVMValueRef values[4];
1518         bool skip_lds_store;
1519         bool is_tess_factor = false, is_tess_inner = false;
1520
1521         driver_location = driver_location / 4;
1522
1523         if (param_index) {
1524                 /* Add the constant index to the indirect index */
1525                 param_index = LLVMBuildAdd(ctx->ac.builder, param_index,
1526                                            LLVMConstInt(ctx->i32, const_index, 0), "");
1527         } else {
1528                 if (const_index != 0)
1529                         param_index = LLVMConstInt(ctx->i32, const_index, 0);
1530         }
1531
1532         if (!is_patch) {
1533                 stride = get_tcs_out_vertex_dw_stride(ctx);
1534                 dw_addr = get_tcs_out_current_patch_offset(ctx);
1535                 dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr,
1536                                                               vertex_index, param_index,
1537                                                               driver_location,
1538                                                               info->output_semantic_name,
1539                                                               info->output_semantic_index,
1540                                                               is_patch);
1541
1542                 skip_lds_store = !info->reads_pervertex_outputs;
1543         } else {
1544                 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
1545                 dw_addr = get_dw_address_from_generic_indices(ctx, NULL, dw_addr,
1546                                                               vertex_index, param_index,
1547                                                               driver_location,
1548                                                               info->output_semantic_name,
1549                                                               info->output_semantic_index,
1550                                                               is_patch);
1551
1552                 skip_lds_store = !info->reads_perpatch_outputs;
1553
1554                 if (!param_index) {
1555                         int name = info->output_semantic_name[driver_location];
1556
1557                         /* Always write tess factors into LDS for the TCS epilog. */
1558                         if (name == TGSI_SEMANTIC_TESSINNER ||
1559                             name == TGSI_SEMANTIC_TESSOUTER) {
1560                                 /* The epilog doesn't read LDS if invocation 0 defines tess factors. */
1561                                 skip_lds_store = !info->reads_tessfactor_outputs &&
1562                                                  ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs;
1563                                 is_tess_factor = true;
1564                                 is_tess_inner = name == TGSI_SEMANTIC_TESSINNER;
1565                         }
1566                 }
1567         }
1568
1569         buffer = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS);
1570
1571         base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
1572
1573         addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index,
1574                                                                param_index, driver_location,
1575                                                                info->output_semantic_name,
1576                                                                info->output_semantic_index,
1577                                                                is_patch);
1578
1579         for (unsigned chan = 0; chan < 4; chan++) {
1580                 if (!(writemask & (1 << chan)))
1581                         continue;
1582                 LLVMValueRef value = ac_llvm_extract_elem(&ctx->ac, src, chan - component);
1583
1584                 /* Skip LDS stores if there is no LDS read of this output. */
1585                 if (!skip_lds_store)
1586                         lds_store(ctx, chan, dw_addr, value);
1587
1588                 value = ac_to_integer(&ctx->ac, value);
1589                 values[chan] = value;
1590
1591                 if (writemask != 0xF && !is_tess_factor) {
1592                         ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1,
1593                                                     addr, base,
1594                                                     4 * chan, 1, 0, true, false);
1595                 }
1596
1597                 /* Write tess factors into VGPRs for the epilog. */
1598                 if (is_tess_factor &&
1599                     ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs) {
1600                         if (!is_tess_inner) {
1601                                 LLVMBuildStore(ctx->ac.builder, value, /* outer */
1602                                                ctx->invoc0_tess_factors[chan]);
1603                         } else if (chan < 2) {
1604                                 LLVMBuildStore(ctx->ac.builder, value, /* inner */
1605                                                ctx->invoc0_tess_factors[4 + chan]);
1606                         }
1607                 }
1608         }
1609
1610         if (writemask == 0xF && !is_tess_factor) {
1611                 LLVMValueRef value = lp_build_gather_values(&ctx->gallivm,
1612                                                             values, 4);
1613                 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, addr,
1614                                             base, 0, 1, 0, true, false);
1615         }
1616 }
1617
1618 LLVMValueRef si_llvm_load_input_gs(struct ac_shader_abi *abi,
1619                                    unsigned input_index,
1620                                    unsigned vtx_offset_param,
1621                                    LLVMTypeRef type,
1622                                    unsigned swizzle)
1623 {
1624         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1625         struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
1626         struct si_shader *shader = ctx->shader;
1627         struct lp_build_context *uint = &ctx->bld_base.uint_bld;
1628         LLVMValueRef vtx_offset, soffset;
1629         struct tgsi_shader_info *info = &shader->selector->info;
1630         unsigned semantic_name = info->input_semantic_name[input_index];
1631         unsigned semantic_index = info->input_semantic_index[input_index];
1632         unsigned param;
1633         LLVMValueRef value;
1634
1635         param = si_shader_io_get_unique_index(semantic_name, semantic_index, false);
1636
1637         /* GFX9 has the ESGS ring in LDS. */
1638         if (ctx->screen->info.chip_class >= GFX9) {
1639                 unsigned index = vtx_offset_param;
1640
1641                 switch (index / 2) {
1642                 case 0:
1643                         vtx_offset = si_unpack_param(ctx, ctx->param_gs_vtx01_offset,
1644                                                   index % 2 ? 16 : 0, 16);
1645                         break;
1646                 case 1:
1647                         vtx_offset = si_unpack_param(ctx, ctx->param_gs_vtx23_offset,
1648                                                   index % 2 ? 16 : 0, 16);
1649                         break;
1650                 case 2:
1651                         vtx_offset = si_unpack_param(ctx, ctx->param_gs_vtx45_offset,
1652                                                   index % 2 ? 16 : 0, 16);
1653                         break;
1654                 default:
1655                         assert(0);
1656                         return NULL;
1657                 }
1658
1659                 vtx_offset = LLVMBuildAdd(ctx->ac.builder, vtx_offset,
1660                                           LLVMConstInt(ctx->i32, param * 4, 0), "");
1661                 return lds_load(bld_base, type, swizzle, vtx_offset);
1662         }
1663
1664         /* GFX6: input load from the ESGS ring in memory. */
1665         if (swizzle == ~0) {
1666                 LLVMValueRef values[TGSI_NUM_CHANNELS];
1667                 unsigned chan;
1668                 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1669                         values[chan] = si_llvm_load_input_gs(abi, input_index, vtx_offset_param,
1670                                                              type, chan);
1671                 }
1672                 return lp_build_gather_values(&ctx->gallivm, values,
1673                                               TGSI_NUM_CHANNELS);
1674         }
1675
1676         /* Get the vertex offset parameter on GFX6. */
1677         LLVMValueRef gs_vtx_offset = ctx->gs_vtx_offset[vtx_offset_param];
1678
1679         vtx_offset = lp_build_mul_imm(uint, gs_vtx_offset, 4);
1680
1681         soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle) * 256, 0);
1682
1683         value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->i32_0,
1684                                      vtx_offset, soffset, 0, 1, 0, true, false);
1685         if (llvm_type_is_64bit(ctx, type)) {
1686                 LLVMValueRef value2;
1687                 soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle + 1) * 256, 0);
1688
1689                 value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1,
1690                                               ctx->i32_0, vtx_offset, soffset,
1691                                               0, 1, 0, true, false);
1692                 return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
1693         }
1694         return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
1695 }
1696
1697 static LLVMValueRef si_nir_load_input_gs(struct ac_shader_abi *abi,
1698                                          unsigned location,
1699                                          unsigned driver_location,
1700                                          unsigned component,
1701                                          unsigned num_components,
1702                                          unsigned vertex_index,
1703                                          unsigned const_index,
1704                                          LLVMTypeRef type)
1705 {
1706         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1707
1708         LLVMValueRef value[4];
1709         for (unsigned i = 0; i < num_components; i++) {
1710                 unsigned offset = i;
1711                 if (llvm_type_is_64bit(ctx, type))
1712                         offset *= 2;
1713
1714                 offset += component;
1715                 value[i + component] = si_llvm_load_input_gs(&ctx->abi, driver_location  / 4,
1716                                                              vertex_index, type, offset);
1717         }
1718
1719         return ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
1720 }
1721
1722 static LLVMValueRef fetch_input_gs(
1723         struct lp_build_tgsi_context *bld_base,
1724         const struct tgsi_full_src_register *reg,
1725         enum tgsi_opcode_type type,
1726         unsigned swizzle)
1727 {
1728         struct si_shader_context *ctx = si_shader_context(bld_base);
1729         struct tgsi_shader_info *info = &ctx->shader->selector->info;
1730
1731         unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
1732         if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
1733                 return get_primitive_id(ctx, swizzle);
1734
1735         if (!reg->Register.Dimension)
1736                 return NULL;
1737
1738         return si_llvm_load_input_gs(&ctx->abi, reg->Register.Index,
1739                                      reg->Dimension.Index,
1740                                      tgsi2llvmtype(bld_base, type),
1741                                      swizzle);
1742 }
1743
1744 static int lookup_interp_param_index(unsigned interpolate, unsigned location)
1745 {
1746         switch (interpolate) {
1747         case TGSI_INTERPOLATE_CONSTANT:
1748                 return 0;
1749
1750         case TGSI_INTERPOLATE_LINEAR:
1751                 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1752                         return SI_PARAM_LINEAR_SAMPLE;
1753                 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1754                         return SI_PARAM_LINEAR_CENTROID;
1755                 else
1756                         return SI_PARAM_LINEAR_CENTER;
1757                 break;
1758         case TGSI_INTERPOLATE_COLOR:
1759         case TGSI_INTERPOLATE_PERSPECTIVE:
1760                 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1761                         return SI_PARAM_PERSP_SAMPLE;
1762                 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1763                         return SI_PARAM_PERSP_CENTROID;
1764                 else
1765                         return SI_PARAM_PERSP_CENTER;
1766                 break;
1767         default:
1768                 fprintf(stderr, "Warning: Unhandled interpolation mode.\n");
1769                 return -1;
1770         }
1771 }
1772
1773 static LLVMValueRef si_build_fs_interp(struct si_shader_context *ctx,
1774                                        unsigned attr_index, unsigned chan,
1775                                        LLVMValueRef prim_mask,
1776                                        LLVMValueRef i, LLVMValueRef j)
1777 {
1778         if (i || j) {
1779                 return ac_build_fs_interp(&ctx->ac,
1780                                           LLVMConstInt(ctx->i32, chan, 0),
1781                                           LLVMConstInt(ctx->i32, attr_index, 0),
1782                                           prim_mask, i, j);
1783         }
1784         return ac_build_fs_interp_mov(&ctx->ac,
1785                                       LLVMConstInt(ctx->i32, 2, 0), /* P0 */
1786                                       LLVMConstInt(ctx->i32, chan, 0),
1787                                       LLVMConstInt(ctx->i32, attr_index, 0),
1788                                       prim_mask);
1789 }
1790
1791 /**
1792  * Interpolate a fragment shader input.
1793  *
1794  * @param ctx           context
1795  * @param input_index           index of the input in hardware
1796  * @param semantic_name         TGSI_SEMANTIC_*
1797  * @param semantic_index        semantic index
1798  * @param num_interp_inputs     number of all interpolated inputs (= BCOLOR offset)
1799  * @param colors_read_mask      color components read (4 bits for each color, 8 bits in total)
1800  * @param interp_param          interpolation weights (i,j)
1801  * @param prim_mask             SI_PARAM_PRIM_MASK
1802  * @param face                  SI_PARAM_FRONT_FACE
1803  * @param result                the return value (4 components)
1804  */
1805 static void interp_fs_input(struct si_shader_context *ctx,
1806                             unsigned input_index,
1807                             unsigned semantic_name,
1808                             unsigned semantic_index,
1809                             unsigned num_interp_inputs,
1810                             unsigned colors_read_mask,
1811                             LLVMValueRef interp_param,
1812                             LLVMValueRef prim_mask,
1813                             LLVMValueRef face,
1814                             LLVMValueRef result[4])
1815 {
1816         LLVMValueRef i = NULL, j = NULL;
1817         unsigned chan;
1818
1819         /* fs.constant returns the param from the middle vertex, so it's not
1820          * really useful for flat shading. It's meant to be used for custom
1821          * interpolation (but the intrinsic can't fetch from the other two
1822          * vertices).
1823          *
1824          * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
1825          * to do the right thing. The only reason we use fs.constant is that
1826          * fs.interp cannot be used on integers, because they can be equal
1827          * to NaN.
1828          *
1829          * When interp is false we will use fs.constant or for newer llvm,
1830          * amdgcn.interp.mov.
1831          */
1832         bool interp = interp_param != NULL;
1833
1834         if (interp) {
1835                 interp_param = LLVMBuildBitCast(ctx->ac.builder, interp_param,
1836                                                 LLVMVectorType(ctx->f32, 2), "");
1837
1838                 i = LLVMBuildExtractElement(ctx->ac.builder, interp_param,
1839                                                 ctx->i32_0, "");
1840                 j = LLVMBuildExtractElement(ctx->ac.builder, interp_param,
1841                                                 ctx->i32_1, "");
1842         }
1843
1844         if (semantic_name == TGSI_SEMANTIC_COLOR &&
1845             ctx->shader->key.part.ps.prolog.color_two_side) {
1846                 LLVMValueRef is_face_positive;
1847
1848                 /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
1849                  * otherwise it's at offset "num_inputs".
1850                  */
1851                 unsigned back_attr_offset = num_interp_inputs;
1852                 if (semantic_index == 1 && colors_read_mask & 0xf)
1853                         back_attr_offset += 1;
1854
1855                 is_face_positive = LLVMBuildICmp(ctx->ac.builder, LLVMIntNE,
1856                                                  face, ctx->i32_0, "");
1857
1858                 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1859                         LLVMValueRef front, back;
1860
1861                         front = si_build_fs_interp(ctx,
1862                                                    input_index, chan,
1863                                                    prim_mask, i, j);
1864                         back = si_build_fs_interp(ctx,
1865                                                   back_attr_offset, chan,
1866                                                   prim_mask, i, j);
1867
1868                         result[chan] = LLVMBuildSelect(ctx->ac.builder,
1869                                                 is_face_positive,
1870                                                 front,
1871                                                 back,
1872                                                 "");
1873                 }
1874         } else if (semantic_name == TGSI_SEMANTIC_FOG) {
1875                 result[0] = si_build_fs_interp(ctx, input_index,
1876                                                0, prim_mask, i, j);
1877                 result[1] =
1878                 result[2] = LLVMConstReal(ctx->f32, 0.0f);
1879                 result[3] = LLVMConstReal(ctx->f32, 1.0f);
1880         } else {
1881                 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1882                         result[chan] = si_build_fs_interp(ctx,
1883                                                           input_index, chan,
1884                                                           prim_mask, i, j);
1885                 }
1886         }
1887 }
1888
1889 void si_llvm_load_input_fs(
1890         struct si_shader_context *ctx,
1891         unsigned input_index,
1892         LLVMValueRef out[4])
1893 {
1894         struct lp_build_context *base = &ctx->bld_base.base;
1895         struct si_shader *shader = ctx->shader;
1896         struct tgsi_shader_info *info = &shader->selector->info;
1897         LLVMValueRef main_fn = ctx->main_fn;
1898         LLVMValueRef interp_param = NULL;
1899         int interp_param_idx;
1900         enum tgsi_semantic semantic_name = info->input_semantic_name[input_index];
1901         unsigned semantic_index = info->input_semantic_index[input_index];
1902         enum tgsi_interpolate_mode interp_mode = info->input_interpolate[input_index];
1903         enum tgsi_interpolate_loc interp_loc = info->input_interpolate_loc[input_index];
1904
1905         /* Get colors from input VGPRs (set by the prolog). */
1906         if (semantic_name == TGSI_SEMANTIC_COLOR) {
1907                 unsigned colors_read = shader->selector->info.colors_read;
1908                 unsigned mask = colors_read >> (semantic_index * 4);
1909                 unsigned offset = SI_PARAM_POS_FIXED_PT + 1 +
1910                                   (semantic_index ? util_bitcount(colors_read & 0xf) : 0);
1911
1912                 out[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef;
1913                 out[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef;
1914                 out[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef;
1915                 out[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef;
1916                 return;
1917         }
1918
1919         interp_param_idx = lookup_interp_param_index(interp_mode, interp_loc);
1920         if (interp_param_idx == -1)
1921                 return;
1922         else if (interp_param_idx) {
1923                 interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
1924         }
1925
1926         interp_fs_input(ctx, input_index, semantic_name,
1927                         semantic_index, 0, /* this param is unused */
1928                         shader->selector->info.colors_read, interp_param,
1929                         ctx->abi.prim_mask,
1930                         LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE),
1931                         &out[0]);
1932 }
1933
1934 static void declare_input_fs(
1935         struct si_shader_context *ctx,
1936         unsigned input_index,
1937         const struct tgsi_full_declaration *decl,
1938         LLVMValueRef out[4])
1939 {
1940         si_llvm_load_input_fs(ctx, input_index, out);
1941 }
1942
1943 LLVMValueRef si_get_sample_id(struct si_shader_context *ctx)
1944 {
1945         return si_unpack_param(ctx, SI_PARAM_ANCILLARY, 8, 4);
1946 }
1947
1948 static LLVMValueRef get_base_vertex(struct ac_shader_abi *abi)
1949 {
1950         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1951
1952         /* For non-indexed draws, the base vertex set by the driver
1953          * (for direct draws) or the CP (for indirect draws) is the
1954          * first vertex ID, but GLSL expects 0 to be returned.
1955          */
1956         LLVMValueRef vs_state = LLVMGetParam(ctx->main_fn,
1957                                              ctx->param_vs_state_bits);
1958         LLVMValueRef indexed;
1959
1960         indexed = LLVMBuildLShr(ctx->ac.builder, vs_state, ctx->i32_1, "");
1961         indexed = LLVMBuildTrunc(ctx->ac.builder, indexed, ctx->i1, "");
1962
1963         return LLVMBuildSelect(ctx->ac.builder, indexed, ctx->abi.base_vertex,
1964                                ctx->i32_0, "");
1965 }
1966
1967 static LLVMValueRef get_block_size(struct ac_shader_abi *abi)
1968 {
1969         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1970
1971         LLVMValueRef values[3];
1972         LLVMValueRef result;
1973         unsigned i;
1974         unsigned *properties = ctx->shader->selector->info.properties;
1975
1976         if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) {
1977                 unsigned sizes[3] = {
1978                         properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
1979                         properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
1980                         properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
1981                 };
1982
1983                 for (i = 0; i < 3; ++i)
1984                         values[i] = LLVMConstInt(ctx->i32, sizes[i], 0);
1985
1986                 result = lp_build_gather_values(&ctx->gallivm, values, 3);
1987         } else {
1988                 result = LLVMGetParam(ctx->main_fn, ctx->param_block_size);
1989         }
1990
1991         return result;
1992 }
1993
1994 /**
1995  * Load a dword from a constant buffer.
1996  */
1997 static LLVMValueRef buffer_load_const(struct si_shader_context *ctx,
1998                                       LLVMValueRef resource,
1999                                       LLVMValueRef offset)
2000 {
2001         return ac_build_buffer_load(&ctx->ac, resource, 1, NULL, offset, NULL,
2002                                     0, 0, 0, true, true);
2003 }
2004
2005 static LLVMValueRef load_sample_position(struct ac_shader_abi *abi, LLVMValueRef sample_id)
2006 {
2007         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
2008         struct lp_build_context *uint_bld = &ctx->bld_base.uint_bld;
2009         LLVMValueRef desc = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
2010         LLVMValueRef buf_index = LLVMConstInt(ctx->i32, SI_PS_CONST_SAMPLE_POSITIONS, 0);
2011         LLVMValueRef resource = ac_build_load_to_sgpr(&ctx->ac, desc, buf_index);
2012
2013         /* offset = sample_id * 8  (8 = 2 floats containing samplepos.xy) */
2014         LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, sample_id, 8);
2015         LLVMValueRef offset1 = LLVMBuildAdd(ctx->ac.builder, offset0, LLVMConstInt(ctx->i32, 4, 0), "");
2016
2017         LLVMValueRef pos[4] = {
2018                 buffer_load_const(ctx, resource, offset0),
2019                 buffer_load_const(ctx, resource, offset1),
2020                 LLVMConstReal(ctx->f32, 0),
2021                 LLVMConstReal(ctx->f32, 0)
2022         };
2023
2024         return lp_build_gather_values(&ctx->gallivm, pos, 4);
2025 }
2026
2027 static LLVMValueRef load_sample_mask_in(struct ac_shader_abi *abi)
2028 {
2029         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
2030         return ac_to_integer(&ctx->ac, abi->sample_coverage);
2031 }
2032
2033 static LLVMValueRef si_load_tess_coord(struct ac_shader_abi *abi)
2034 {
2035         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
2036         struct lp_build_context *bld = &ctx->bld_base.base;
2037
2038         LLVMValueRef coord[4] = {
2039                 LLVMGetParam(ctx->main_fn, ctx->param_tes_u),
2040                 LLVMGetParam(ctx->main_fn, ctx->param_tes_v),
2041                 ctx->ac.f32_0,
2042                 ctx->ac.f32_0
2043         };
2044
2045         /* For triangles, the vector should be (u, v, 1-u-v). */
2046         if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
2047             PIPE_PRIM_TRIANGLES)
2048                 coord[2] = lp_build_sub(bld, ctx->ac.f32_1,
2049                                         lp_build_add(bld, coord[0], coord[1]));
2050
2051         return lp_build_gather_values(&ctx->gallivm, coord, 4);
2052 }
2053
2054 static LLVMValueRef load_tess_level(struct si_shader_context *ctx,
2055                                     unsigned semantic_name)
2056 {
2057         LLVMValueRef base, addr;
2058
2059         int param = si_shader_io_get_unique_index_patch(semantic_name, 0);
2060
2061         base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
2062         addr = get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), NULL,
2063                                           LLVMConstInt(ctx->i32, param, 0));
2064
2065         return buffer_load(&ctx->bld_base, ctx->f32,
2066                            ~0, ctx->tess_offchip_ring, base, addr, true);
2067
2068 }
2069
2070 static LLVMValueRef si_load_tess_level(struct ac_shader_abi *abi,
2071                                        unsigned varying_id)
2072 {
2073         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
2074         unsigned semantic_name;
2075
2076         switch (varying_id) {
2077         case VARYING_SLOT_TESS_LEVEL_INNER:
2078                 semantic_name = TGSI_SEMANTIC_TESSINNER;
2079                 break;
2080         case VARYING_SLOT_TESS_LEVEL_OUTER:
2081                 semantic_name = TGSI_SEMANTIC_TESSOUTER;
2082                 break;
2083         default:
2084                 unreachable("unknown tess level");
2085         }
2086
2087         return load_tess_level(ctx, semantic_name);
2088
2089 }
2090
2091 static LLVMValueRef si_load_patch_vertices_in(struct ac_shader_abi *abi)
2092 {
2093         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
2094         if (ctx->type == PIPE_SHADER_TESS_CTRL)
2095                 return si_unpack_param(ctx, ctx->param_tcs_out_lds_layout, 13, 6);
2096         else if (ctx->type == PIPE_SHADER_TESS_EVAL)
2097                 return get_num_tcs_out_vertices(ctx);
2098         else
2099                 unreachable("invalid shader stage for TGSI_SEMANTIC_VERTICESIN");
2100 }
2101
2102 void si_load_system_value(struct si_shader_context *ctx,
2103                           unsigned index,
2104                           const struct tgsi_full_declaration *decl)
2105 {
2106         LLVMValueRef value = 0;
2107
2108         assert(index < RADEON_LLVM_MAX_SYSTEM_VALUES);
2109
2110         switch (decl->Semantic.Name) {
2111         case TGSI_SEMANTIC_INSTANCEID:
2112                 value = ctx->abi.instance_id;
2113                 break;
2114
2115         case TGSI_SEMANTIC_VERTEXID:
2116                 value = LLVMBuildAdd(ctx->ac.builder,
2117                                      ctx->abi.vertex_id,
2118                                      ctx->abi.base_vertex, "");
2119                 break;
2120
2121         case TGSI_SEMANTIC_VERTEXID_NOBASE:
2122                 /* Unused. Clarify the meaning in indexed vs. non-indexed
2123                  * draws if this is ever used again. */
2124                 assert(false);
2125                 break;
2126
2127         case TGSI_SEMANTIC_BASEVERTEX:
2128                 value = get_base_vertex(&ctx->abi);
2129                 break;
2130
2131         case TGSI_SEMANTIC_BASEINSTANCE:
2132                 value = ctx->abi.start_instance;
2133                 break;
2134
2135         case TGSI_SEMANTIC_DRAWID:
2136                 value = ctx->abi.draw_id;
2137                 break;
2138
2139         case TGSI_SEMANTIC_INVOCATIONID:
2140                 if (ctx->type == PIPE_SHADER_TESS_CTRL)
2141                         value = unpack_llvm_param(ctx, ctx->abi.tcs_rel_ids, 8, 5);
2142                 else if (ctx->type == PIPE_SHADER_GEOMETRY)
2143                         value = ctx->abi.gs_invocation_id;
2144                 else
2145                         assert(!"INVOCATIONID not implemented");
2146                 break;
2147
2148         case TGSI_SEMANTIC_POSITION:
2149         {
2150                 LLVMValueRef pos[4] = {
2151                         LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT),
2152                         LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT),
2153                         LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Z_FLOAT),
2154                         lp_build_emit_llvm_unary(&ctx->bld_base, TGSI_OPCODE_RCP,
2155                                                  LLVMGetParam(ctx->main_fn,
2156                                                               SI_PARAM_POS_W_FLOAT)),
2157                 };
2158                 value = lp_build_gather_values(&ctx->gallivm, pos, 4);
2159                 break;
2160         }
2161
2162         case TGSI_SEMANTIC_FACE:
2163                 value = ctx->abi.front_face;
2164                 break;
2165
2166         case TGSI_SEMANTIC_SAMPLEID:
2167                 value = si_get_sample_id(ctx);
2168                 break;
2169
2170         case TGSI_SEMANTIC_SAMPLEPOS: {
2171                 LLVMValueRef pos[4] = {
2172                         LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT),
2173                         LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT),
2174                         LLVMConstReal(ctx->f32, 0),
2175                         LLVMConstReal(ctx->f32, 0)
2176                 };
2177                 pos[0] = lp_build_emit_llvm_unary(&ctx->bld_base,
2178                                                   TGSI_OPCODE_FRC, pos[0]);
2179                 pos[1] = lp_build_emit_llvm_unary(&ctx->bld_base,
2180                                                   TGSI_OPCODE_FRC, pos[1]);
2181                 value = lp_build_gather_values(&ctx->gallivm, pos, 4);
2182                 break;
2183         }
2184
2185         case TGSI_SEMANTIC_SAMPLEMASK:
2186                 /* This can only occur with the OpenGL Core profile, which
2187                  * doesn't support smoothing.
2188                  */
2189                 value = LLVMGetParam(ctx->main_fn, SI_PARAM_SAMPLE_COVERAGE);
2190                 break;
2191
2192         case TGSI_SEMANTIC_TESSCOORD:
2193                 value = si_load_tess_coord(&ctx->abi);
2194                 break;
2195
2196         case TGSI_SEMANTIC_VERTICESIN:
2197                 value = si_load_patch_vertices_in(&ctx->abi);
2198                 break;
2199
2200         case TGSI_SEMANTIC_TESSINNER:
2201         case TGSI_SEMANTIC_TESSOUTER:
2202                 value = load_tess_level(ctx, decl->Semantic.Name);
2203                 break;
2204
2205         case TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI:
2206         case TGSI_SEMANTIC_DEFAULT_TESSINNER_SI:
2207         {
2208                 LLVMValueRef buf, slot, val[4];
2209                 int i, offset;
2210
2211                 slot = LLVMConstInt(ctx->i32, SI_HS_CONST_DEFAULT_TESS_LEVELS, 0);
2212                 buf = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
2213                 buf = ac_build_load_to_sgpr(&ctx->ac, buf, slot);
2214                 offset = decl->Semantic.Name == TGSI_SEMANTIC_DEFAULT_TESSINNER_SI ? 4 : 0;
2215
2216                 for (i = 0; i < 4; i++)
2217                         val[i] = buffer_load_const(ctx, buf,
2218                                                    LLVMConstInt(ctx->i32, (offset + i) * 4, 0));
2219                 value = lp_build_gather_values(&ctx->gallivm, val, 4);
2220                 break;
2221         }
2222
2223         case TGSI_SEMANTIC_PRIMID:
2224                 value = get_primitive_id(ctx, 0);
2225                 break;
2226
2227         case TGSI_SEMANTIC_GRID_SIZE:
2228                 value = ctx->abi.num_work_groups;
2229                 break;
2230
2231         case TGSI_SEMANTIC_BLOCK_SIZE:
2232                 value = get_block_size(&ctx->abi);
2233                 break;
2234
2235         case TGSI_SEMANTIC_BLOCK_ID:
2236         {
2237                 LLVMValueRef values[3];
2238
2239                 for (int i = 0; i < 3; i++) {
2240                         values[i] = ctx->i32_0;
2241                         if (ctx->abi.workgroup_ids[i]) {
2242                                 values[i] = ctx->abi.workgroup_ids[i];
2243                         }
2244                 }
2245                 value = lp_build_gather_values(&ctx->gallivm, values, 3);
2246                 break;
2247         }
2248
2249         case TGSI_SEMANTIC_THREAD_ID:
2250                 value = ctx->abi.local_invocation_ids;
2251                 break;
2252
2253         case TGSI_SEMANTIC_HELPER_INVOCATION:
2254                 value = lp_build_intrinsic(ctx->ac.builder,
2255                                            "llvm.amdgcn.ps.live",
2256                                            ctx->i1, NULL, 0,
2257                                            LP_FUNC_ATTR_READNONE);
2258                 value = LLVMBuildNot(ctx->ac.builder, value, "");
2259                 value = LLVMBuildSExt(ctx->ac.builder, value, ctx->i32, "");
2260                 break;
2261
2262         case TGSI_SEMANTIC_SUBGROUP_SIZE:
2263                 value = LLVMConstInt(ctx->i32, 64, 0);
2264                 break;
2265
2266         case TGSI_SEMANTIC_SUBGROUP_INVOCATION:
2267                 value = ac_get_thread_id(&ctx->ac);
2268                 break;
2269
2270         case TGSI_SEMANTIC_SUBGROUP_EQ_MASK:
2271         {
2272                 LLVMValueRef id = ac_get_thread_id(&ctx->ac);
2273                 id = LLVMBuildZExt(ctx->ac.builder, id, ctx->i64, "");
2274                 value = LLVMBuildShl(ctx->ac.builder, LLVMConstInt(ctx->i64, 1, 0), id, "");
2275                 value = LLVMBuildBitCast(ctx->ac.builder, value, ctx->v2i32, "");
2276                 break;
2277         }
2278
2279         case TGSI_SEMANTIC_SUBGROUP_GE_MASK:
2280         case TGSI_SEMANTIC_SUBGROUP_GT_MASK:
2281         case TGSI_SEMANTIC_SUBGROUP_LE_MASK:
2282         case TGSI_SEMANTIC_SUBGROUP_LT_MASK:
2283         {
2284                 LLVMValueRef id = ac_get_thread_id(&ctx->ac);
2285                 if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_GT_MASK ||
2286                     decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK) {
2287                         /* All bits set except LSB */
2288                         value = LLVMConstInt(ctx->i64, -2, 0);
2289                 } else {
2290                         /* All bits set */
2291                         value = LLVMConstInt(ctx->i64, -1, 0);
2292                 }
2293                 id = LLVMBuildZExt(ctx->ac.builder, id, ctx->i64, "");
2294                 value = LLVMBuildShl(ctx->ac.builder, value, id, "");
2295                 if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK ||
2296                     decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LT_MASK)
2297                         value = LLVMBuildNot(ctx->ac.builder, value, "");
2298                 value = LLVMBuildBitCast(ctx->ac.builder, value, ctx->v2i32, "");
2299                 break;
2300         }
2301
2302         default:
2303                 assert(!"unknown system value");
2304                 return;
2305         }
2306
2307         ctx->system_values[index] = value;
2308 }
2309
2310 void si_declare_compute_memory(struct si_shader_context *ctx)
2311 {
2312         struct si_shader_selector *sel = ctx->shader->selector;
2313
2314         LLVMTypeRef i8p = LLVMPointerType(ctx->i8, AC_LOCAL_ADDR_SPACE);
2315         LLVMValueRef var;
2316
2317         assert(!ctx->ac.lds);
2318
2319         var = LLVMAddGlobalInAddressSpace(ctx->ac.module,
2320                                           LLVMArrayType(ctx->i8, sel->local_size),
2321                                           "compute_lds",
2322                                           AC_LOCAL_ADDR_SPACE);
2323         LLVMSetAlignment(var, 4);
2324
2325         ctx->ac.lds = LLVMBuildBitCast(ctx->ac.builder, var, i8p, "");
2326 }
2327
2328 void si_tgsi_declare_compute_memory(struct si_shader_context *ctx,
2329                                     const struct tgsi_full_declaration *decl)
2330 {
2331         assert(decl->Declaration.MemType == TGSI_MEMORY_TYPE_SHARED);
2332         assert(decl->Range.First == decl->Range.Last);
2333
2334         si_declare_compute_memory(ctx);
2335 }
2336
2337 static LLVMValueRef load_const_buffer_desc_fast_path(struct si_shader_context *ctx)
2338 {
2339         LLVMValueRef ptr =
2340                 LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers);
2341         struct si_shader_selector *sel = ctx->shader->selector;
2342
2343         /* Do the bounds checking with a descriptor, because
2344          * doing computation and manual bounds checking of 64-bit
2345          * addresses generates horrible VALU code with very high
2346          * VGPR usage and very low SIMD occupancy.
2347          */
2348         ptr = LLVMBuildPtrToInt(ctx->ac.builder, ptr, ctx->ac.intptr, "");
2349
2350         LLVMValueRef desc0, desc1;
2351         if (HAVE_32BIT_POINTERS) {
2352                 desc0 = ptr;
2353                 desc1 = LLVMConstInt(ctx->i32,
2354                                      S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0);
2355         } else {
2356                 ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ctx->v2i32, "");
2357                 desc0 = LLVMBuildExtractElement(ctx->ac.builder, ptr, ctx->i32_0, "");
2358                 desc1 = LLVMBuildExtractElement(ctx->ac.builder, ptr, ctx->i32_1, "");
2359                 /* Mask out all bits except BASE_ADDRESS_HI. */
2360                 desc1 = LLVMBuildAnd(ctx->ac.builder, desc1,
2361                                      LLVMConstInt(ctx->i32, ~C_008F04_BASE_ADDRESS_HI, 0), "");
2362         }
2363
2364         LLVMValueRef desc_elems[] = {
2365                 desc0,
2366                 desc1,
2367                 LLVMConstInt(ctx->i32, (sel->info.const_file_max[0] + 1) * 16, 0),
2368                 LLVMConstInt(ctx->i32,
2369                         S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
2370                         S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
2371                         S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
2372                         S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
2373                         S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
2374                         S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32), 0)
2375         };
2376
2377         return ac_build_gather_values(&ctx->ac, desc_elems, 4);
2378 }
2379
2380 static LLVMValueRef load_const_buffer_desc(struct si_shader_context *ctx, int i)
2381 {
2382         LLVMValueRef list_ptr = LLVMGetParam(ctx->main_fn,
2383                                              ctx->param_const_and_shader_buffers);
2384
2385         return ac_build_load_to_sgpr(&ctx->ac, list_ptr,
2386                                      LLVMConstInt(ctx->i32, si_get_constbuf_slot(i), 0));
2387 }
2388
2389 static LLVMValueRef load_ubo(struct ac_shader_abi *abi, LLVMValueRef index)
2390 {
2391         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
2392         struct si_shader_selector *sel = ctx->shader->selector;
2393
2394         LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers);
2395
2396         if (sel->info.const_buffers_declared == 1 &&
2397             sel->info.shader_buffers_declared == 0) {
2398                 return load_const_buffer_desc_fast_path(ctx);
2399         }
2400
2401         index = si_llvm_bound_index(ctx, index, ctx->num_const_buffers);
2402         index = LLVMBuildAdd(ctx->ac.builder, index,
2403                              LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS, 0), "");
2404
2405         return ac_build_load_to_sgpr(&ctx->ac, ptr, index);
2406 }
2407
2408 static LLVMValueRef
2409 load_ssbo(struct ac_shader_abi *abi, LLVMValueRef index, bool write)
2410 {
2411         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
2412         LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn,
2413                                              ctx->param_const_and_shader_buffers);
2414
2415         index = si_llvm_bound_index(ctx, index, ctx->num_shader_buffers);
2416         index = LLVMBuildSub(ctx->ac.builder,
2417                              LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS - 1, 0),
2418                              index, "");
2419
2420         return ac_build_load_to_sgpr(&ctx->ac, rsrc_ptr, index);
2421 }
2422
2423 static LLVMValueRef fetch_constant(
2424         struct lp_build_tgsi_context *bld_base,
2425         const struct tgsi_full_src_register *reg,
2426         enum tgsi_opcode_type type,
2427         unsigned swizzle)
2428 {
2429         struct si_shader_context *ctx = si_shader_context(bld_base);
2430         struct si_shader_selector *sel = ctx->shader->selector;
2431         const struct tgsi_ind_register *ireg = &reg->Indirect;
2432         unsigned buf, idx;
2433
2434         LLVMValueRef addr, bufp;
2435
2436         if (swizzle == LP_CHAN_ALL) {
2437                 unsigned chan;
2438                 LLVMValueRef values[4];
2439                 for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
2440                         values[chan] = fetch_constant(bld_base, reg, type, chan);
2441
2442                 return lp_build_gather_values(&ctx->gallivm, values, 4);
2443         }
2444
2445         /* Split 64-bit loads. */
2446         if (tgsi_type_is_64bit(type)) {
2447                 LLVMValueRef lo, hi;
2448
2449                 lo = fetch_constant(bld_base, reg, TGSI_TYPE_UNSIGNED, swizzle);
2450                 hi = fetch_constant(bld_base, reg, TGSI_TYPE_UNSIGNED, swizzle + 1);
2451                 return si_llvm_emit_fetch_64bit(bld_base, tgsi2llvmtype(bld_base, type),
2452                                                 lo, hi);
2453         }
2454
2455         idx = reg->Register.Index * 4 + swizzle;
2456         if (reg->Register.Indirect) {
2457                 addr = si_get_indirect_index(ctx, ireg, 16, idx * 4);
2458         } else {
2459                 addr = LLVMConstInt(ctx->i32, idx * 4, 0);
2460         }
2461
2462         /* Fast path when user data SGPRs point to constant buffer 0 directly. */
2463         if (sel->info.const_buffers_declared == 1 &&
2464             sel->info.shader_buffers_declared == 0) {
2465
2466                 /* This enables use of s_load_dword and flat_load_dword for const buffer 0
2467                  * loads, and up to x4 load opcode merging. However, it leads to horrible
2468                  * code reducing SIMD wave occupancy from 8 to 2 in many cases.
2469                  *
2470                  * Using s_buffer_load_dword (x1) seems to be the best option right now.
2471                  *
2472                  * LLVM 5.0 on SI doesn't insert a required s_nop between SALU setting
2473                  * a descriptor and s_buffer_load_dword using it, so we can't expand
2474                  * the pointer into a full descriptor like below. We have to use
2475                  * s_load_dword instead. The only case when LLVM 5.0 would select
2476                  * s_buffer_load_dword (that we have to prevent) is when we use use
2477                  * a literal offset where we don't need bounds checking.
2478                  */
2479                 if (ctx->screen->info.chip_class == SI && HAVE_LLVM < 0x0600 &&
2480                     !reg->Register.Indirect) {
2481                         LLVMValueRef ptr =
2482                                 LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers);
2483
2484                         addr = LLVMBuildLShr(ctx->ac.builder, addr, LLVMConstInt(ctx->i32, 2, 0), "");
2485                         LLVMValueRef result = ac_build_load_invariant(&ctx->ac, ptr, addr);
2486                         return bitcast(bld_base, type, result);
2487                 }
2488
2489                 LLVMValueRef desc = load_const_buffer_desc_fast_path(ctx);
2490                 LLVMValueRef result = buffer_load_const(ctx, desc, addr);
2491                 return bitcast(bld_base, type, result);
2492         }
2493
2494         assert(reg->Register.Dimension);
2495         buf = reg->Dimension.Index;
2496
2497         if (reg->Dimension.Indirect) {
2498                 LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers);
2499                 LLVMValueRef index;
2500                 index = si_get_bounded_indirect_index(ctx, &reg->DimIndirect,
2501                                                       reg->Dimension.Index,
2502                                                       ctx->num_const_buffers);
2503                 index = LLVMBuildAdd(ctx->ac.builder, index,
2504                                      LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS, 0), "");
2505                 bufp = ac_build_load_to_sgpr(&ctx->ac, ptr, index);
2506         } else
2507                 bufp = load_const_buffer_desc(ctx, buf);
2508
2509         return bitcast(bld_base, type, buffer_load_const(ctx, bufp, addr));
2510 }
2511
2512 /* Initialize arguments for the shader export intrinsic */
2513 static void si_llvm_init_export_args(struct si_shader_context *ctx,
2514                                      LLVMValueRef *values,
2515                                      unsigned target,
2516                                      struct ac_export_args *args)
2517 {
2518         LLVMValueRef f32undef = LLVMGetUndef(ctx->ac.f32);
2519         unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR;
2520         unsigned chan;
2521         bool is_int8, is_int10;
2522
2523         /* Default is 0xf. Adjusted below depending on the format. */
2524         args->enabled_channels = 0xf; /* writemask */
2525
2526         /* Specify whether the EXEC mask represents the valid mask */
2527         args->valid_mask = 0;
2528
2529         /* Specify whether this is the last export */
2530         args->done = 0;
2531
2532         /* Specify the target we are exporting */
2533         args->target = target;
2534
2535         if (ctx->type == PIPE_SHADER_FRAGMENT) {
2536                 const struct si_shader_key *key = &ctx->shader->key;
2537                 unsigned col_formats = key->part.ps.epilog.spi_shader_col_format;
2538                 int cbuf = target - V_008DFC_SQ_EXP_MRT;
2539
2540                 assert(cbuf >= 0 && cbuf < 8);
2541                 spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
2542                 is_int8 = (key->part.ps.epilog.color_is_int8 >> cbuf) & 0x1;
2543                 is_int10 = (key->part.ps.epilog.color_is_int10 >> cbuf) & 0x1;
2544         }
2545
2546         args->compr = false;
2547         args->out[0] = f32undef;
2548         args->out[1] = f32undef;
2549         args->out[2] = f32undef;
2550         args->out[3] = f32undef;
2551
2552         LLVMValueRef (*packf)(struct ac_llvm_context *ctx, LLVMValueRef args[2]) = NULL;
2553         LLVMValueRef (*packi)(struct ac_llvm_context *ctx, LLVMValueRef args[2],
2554                               unsigned bits, bool hi) = NULL;
2555
2556         switch (spi_shader_col_format) {
2557         case V_028714_SPI_SHADER_ZERO:
2558                 args->enabled_channels = 0; /* writemask */
2559                 args->target = V_008DFC_SQ_EXP_NULL;
2560                 break;
2561
2562         case V_028714_SPI_SHADER_32_R:
2563                 args->enabled_channels = 1; /* writemask */
2564                 args->out[0] = values[0];
2565                 break;
2566
2567         case V_028714_SPI_SHADER_32_GR:
2568                 args->enabled_channels = 0x3; /* writemask */
2569                 args->out[0] = values[0];
2570                 args->out[1] = values[1];
2571                 break;
2572
2573         case V_028714_SPI_SHADER_32_AR:
2574                 args->enabled_channels = 0x9; /* writemask */
2575                 args->out[0] = values[0];
2576                 args->out[3] = values[3];
2577                 break;
2578
2579         case V_028714_SPI_SHADER_FP16_ABGR:
2580                 packf = ac_build_cvt_pkrtz_f16;
2581                 break;
2582
2583         case V_028714_SPI_SHADER_UNORM16_ABGR:
2584                 packf = ac_build_cvt_pknorm_u16;
2585                 break;
2586
2587         case V_028714_SPI_SHADER_SNORM16_ABGR:
2588                 packf = ac_build_cvt_pknorm_i16;
2589                 break;
2590
2591         case V_028714_SPI_SHADER_UINT16_ABGR:
2592                 packi = ac_build_cvt_pk_u16;
2593                 break;
2594
2595         case V_028714_SPI_SHADER_SINT16_ABGR:
2596                 packi = ac_build_cvt_pk_i16;
2597                 break;
2598
2599         case V_028714_SPI_SHADER_32_ABGR:
2600                 memcpy(&args->out[0], values, sizeof(values[0]) * 4);
2601                 break;
2602         }
2603
2604         /* Pack f16 or norm_i16/u16. */
2605         if (packf) {
2606                 for (chan = 0; chan < 2; chan++) {
2607                         LLVMValueRef pack_args[2] = {
2608                                 values[2 * chan],
2609                                 values[2 * chan + 1]
2610                         };
2611                         LLVMValueRef packed;
2612
2613                         packed = packf(&ctx->ac, pack_args);
2614                         args->out[chan] = ac_to_float(&ctx->ac, packed);
2615                 }
2616                 args->compr = 1; /* COMPR flag */
2617         }
2618         /* Pack i16/u16. */
2619         if (packi) {
2620                 for (chan = 0; chan < 2; chan++) {
2621                         LLVMValueRef pack_args[2] = {
2622                                 ac_to_integer(&ctx->ac, values[2 * chan]),
2623                                 ac_to_integer(&ctx->ac, values[2 * chan + 1])
2624                         };
2625                         LLVMValueRef packed;
2626
2627                         packed = packi(&ctx->ac, pack_args,
2628                                        is_int8 ? 8 : is_int10 ? 10 : 16,
2629                                        chan == 1);
2630                         args->out[chan] = ac_to_float(&ctx->ac, packed);
2631                 }
2632                 args->compr = 1; /* COMPR flag */
2633         }
2634 }
2635
2636 static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
2637                           LLVMValueRef alpha)
2638 {
2639         struct si_shader_context *ctx = si_shader_context(bld_base);
2640
2641         if (ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_NEVER) {
2642                 static LLVMRealPredicate cond_map[PIPE_FUNC_ALWAYS + 1] = {
2643                         [PIPE_FUNC_LESS] = LLVMRealOLT,
2644                         [PIPE_FUNC_EQUAL] = LLVMRealOEQ,
2645                         [PIPE_FUNC_LEQUAL] = LLVMRealOLE,
2646                         [PIPE_FUNC_GREATER] = LLVMRealOGT,
2647                         [PIPE_FUNC_NOTEQUAL] = LLVMRealONE,
2648                         [PIPE_FUNC_GEQUAL] = LLVMRealOGE,
2649                 };
2650                 LLVMRealPredicate cond = cond_map[ctx->shader->key.part.ps.epilog.alpha_func];
2651                 assert(cond);
2652
2653                 LLVMValueRef alpha_ref = LLVMGetParam(ctx->main_fn,
2654                                 SI_PARAM_ALPHA_REF);
2655                 LLVMValueRef alpha_pass =
2656                         LLVMBuildFCmp(ctx->ac.builder, cond, alpha, alpha_ref, "");
2657                 ac_build_kill_if_false(&ctx->ac, alpha_pass);
2658         } else {
2659                 ac_build_kill_if_false(&ctx->ac, LLVMConstInt(ctx->i1, 0, 0));
2660         }
2661 }
2662
2663 static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
2664                                                   LLVMValueRef alpha,
2665                                                   unsigned samplemask_param)
2666 {
2667         struct si_shader_context *ctx = si_shader_context(bld_base);
2668         LLVMValueRef coverage;
2669
2670         /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
2671         coverage = LLVMGetParam(ctx->main_fn,
2672                                 samplemask_param);
2673         coverage = ac_to_integer(&ctx->ac, coverage);
2674
2675         coverage = lp_build_intrinsic(ctx->ac.builder, "llvm.ctpop.i32",
2676                                    ctx->i32,
2677                                    &coverage, 1, LP_FUNC_ATTR_READNONE);
2678
2679         coverage = LLVMBuildUIToFP(ctx->ac.builder, coverage,
2680                                    ctx->f32, "");
2681
2682         coverage = LLVMBuildFMul(ctx->ac.builder, coverage,
2683                                  LLVMConstReal(ctx->f32,
2684                                         1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
2685
2686         return LLVMBuildFMul(ctx->ac.builder, alpha, coverage, "");
2687 }
2688
2689 static void si_llvm_emit_clipvertex(struct si_shader_context *ctx,
2690                                     struct ac_export_args *pos, LLVMValueRef *out_elts)
2691 {
2692         unsigned reg_index;
2693         unsigned chan;
2694         unsigned const_chan;
2695         LLVMValueRef base_elt;
2696         LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
2697         LLVMValueRef constbuf_index = LLVMConstInt(ctx->i32,
2698                                                    SI_VS_CONST_CLIP_PLANES, 0);
2699         LLVMValueRef const_resource = ac_build_load_to_sgpr(&ctx->ac, ptr, constbuf_index);
2700
2701         for (reg_index = 0; reg_index < 2; reg_index ++) {
2702                 struct ac_export_args *args = &pos[2 + reg_index];
2703
2704                 args->out[0] =
2705                 args->out[1] =
2706                 args->out[2] =
2707                 args->out[3] = LLVMConstReal(ctx->f32, 0.0f);
2708
2709                 /* Compute dot products of position and user clip plane vectors */
2710                 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2711                         for (const_chan = 0; const_chan < TGSI_NUM_CHANNELS; const_chan++) {
2712                                 LLVMValueRef addr =
2713                                         LLVMConstInt(ctx->i32, ((reg_index * 4 + chan) * 4 +
2714                                                                 const_chan) * 4, 0);
2715                                 base_elt = buffer_load_const(ctx, const_resource,
2716                                                              addr);
2717                                 args->out[chan] =
2718                                         lp_build_add(&ctx->bld_base.base, args->out[chan],
2719                                                      lp_build_mul(&ctx->bld_base.base, base_elt,
2720                                                                   out_elts[const_chan]));
2721                         }
2722                 }
2723
2724                 args->enabled_channels = 0xf;
2725                 args->valid_mask = 0;
2726                 args->done = 0;
2727                 args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index;
2728                 args->compr = 0;
2729         }
2730 }
2731
2732 static void si_dump_streamout(struct pipe_stream_output_info *so)
2733 {
2734         unsigned i;
2735
2736         if (so->num_outputs)
2737                 fprintf(stderr, "STREAMOUT\n");
2738
2739         for (i = 0; i < so->num_outputs; i++) {
2740                 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
2741                                 so->output[i].start_component;
2742                 fprintf(stderr, "  %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
2743                         i, so->output[i].output_buffer,
2744                         so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
2745                         so->output[i].register_index,
2746                         mask & 1 ? "x" : "",
2747                         mask & 2 ? "y" : "",
2748                         mask & 4 ? "z" : "",
2749                         mask & 8 ? "w" : "");
2750         }
2751 }
2752
2753 static void emit_streamout_output(struct si_shader_context *ctx,
2754                                   LLVMValueRef const *so_buffers,
2755                                   LLVMValueRef const *so_write_offsets,
2756                                   struct pipe_stream_output *stream_out,
2757                                   struct si_shader_output_values *shader_out)
2758 {
2759         unsigned buf_idx = stream_out->output_buffer;
2760         unsigned start = stream_out->start_component;
2761         unsigned num_comps = stream_out->num_components;
2762         LLVMValueRef out[4];
2763
2764         assert(num_comps && num_comps <= 4);
2765         if (!num_comps || num_comps > 4)
2766                 return;
2767
2768         /* Load the output as int. */
2769         for (int j = 0; j < num_comps; j++) {
2770                 assert(stream_out->stream == shader_out->vertex_stream[start + j]);
2771
2772                 out[j] = ac_to_integer(&ctx->ac, shader_out->values[start + j]);
2773         }
2774
2775         /* Pack the output. */
2776         LLVMValueRef vdata = NULL;
2777
2778         switch (num_comps) {
2779         case 1: /* as i32 */
2780                 vdata = out[0];
2781                 break;
2782         case 2: /* as v2i32 */
2783         case 3: /* as v4i32 (aligned to 4) */
2784         case 4: /* as v4i32 */
2785                 vdata = LLVMGetUndef(LLVMVectorType(ctx->i32, util_next_power_of_two(num_comps)));
2786                 for (int j = 0; j < num_comps; j++) {
2787                         vdata = LLVMBuildInsertElement(ctx->ac.builder, vdata, out[j],
2788                                                        LLVMConstInt(ctx->i32, j, 0), "");
2789                 }
2790                 break;
2791         }
2792
2793         ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx],
2794                                     vdata, num_comps,
2795                                     so_write_offsets[buf_idx],
2796                                     ctx->i32_0,
2797                                     stream_out->dst_offset * 4, 1, 1, true, false);
2798 }
2799
2800 /**
2801  * Write streamout data to buffers for vertex stream @p stream (different
2802  * vertex streams can occur for GS copy shaders).
2803  */
2804 static void si_llvm_emit_streamout(struct si_shader_context *ctx,
2805                                    struct si_shader_output_values *outputs,
2806                                    unsigned noutput, unsigned stream)
2807 {
2808         struct si_shader_selector *sel = ctx->shader->selector;
2809         struct pipe_stream_output_info *so = &sel->so;
2810         LLVMBuilderRef builder = ctx->ac.builder;
2811         int i;
2812         struct lp_build_if_state if_ctx;
2813
2814         /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
2815         LLVMValueRef so_vtx_count =
2816                 si_unpack_param(ctx, ctx->param_streamout_config, 16, 7);
2817
2818         LLVMValueRef tid = ac_get_thread_id(&ctx->ac);
2819
2820         /* can_emit = tid < so_vtx_count; */
2821         LLVMValueRef can_emit =
2822                 LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");
2823
2824         /* Emit the streamout code conditionally. This actually avoids
2825          * out-of-bounds buffer access. The hw tells us via the SGPR
2826          * (so_vtx_count) which threads are allowed to emit streamout data. */
2827         lp_build_if(&if_ctx, &ctx->gallivm, can_emit);
2828         {
2829                 /* The buffer offset is computed as follows:
2830                  *   ByteOffset = streamout_offset[buffer_id]*4 +
2831                  *                (streamout_write_index + thread_id)*stride[buffer_id] +
2832                  *                attrib_offset
2833                  */
2834
2835                 LLVMValueRef so_write_index =
2836                         LLVMGetParam(ctx->main_fn,
2837                                      ctx->param_streamout_write_index);
2838
2839                 /* Compute (streamout_write_index + thread_id). */
2840                 so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");
2841
2842                 /* Load the descriptor and compute the write offset for each
2843                  * enabled buffer. */
2844                 LLVMValueRef so_write_offset[4] = {};
2845                 LLVMValueRef so_buffers[4];
2846                 LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
2847                                                     ctx->param_rw_buffers);
2848
2849                 for (i = 0; i < 4; i++) {
2850                         if (!so->stride[i])
2851                                 continue;
2852
2853                         LLVMValueRef offset = LLVMConstInt(ctx->i32,
2854                                                            SI_VS_STREAMOUT_BUF0 + i, 0);
2855
2856                         so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
2857
2858                         LLVMValueRef so_offset = LLVMGetParam(ctx->main_fn,
2859                                                               ctx->param_streamout_offset[i]);
2860                         so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), "");
2861
2862                         so_write_offset[i] = LLVMBuildMul(builder, so_write_index,
2863                                                           LLVMConstInt(ctx->i32, so->stride[i]*4, 0), "");
2864                         so_write_offset[i] = LLVMBuildAdd(builder, so_write_offset[i], so_offset, "");
2865                 }
2866
2867                 /* Write streamout data. */
2868                 for (i = 0; i < so->num_outputs; i++) {
2869                         unsigned reg = so->output[i].register_index;
2870
2871                         if (reg >= noutput)
2872                                 continue;
2873
2874                         if (stream != so->output[i].stream)
2875                                 continue;
2876
2877                         emit_streamout_output(ctx, so_buffers, so_write_offset,
2878                                               &so->output[i], &outputs[reg]);
2879                 }
2880         }
2881         lp_build_endif(&if_ctx);
2882 }
2883
2884 static void si_export_param(struct si_shader_context *ctx, unsigned index,
2885                             LLVMValueRef *values)
2886 {
2887         struct ac_export_args args;
2888
2889         si_llvm_init_export_args(ctx, values,
2890                                  V_008DFC_SQ_EXP_PARAM + index, &args);
2891         ac_build_export(&ctx->ac, &args);
2892 }
2893
2894 static void si_build_param_exports(struct si_shader_context *ctx,
2895                                    struct si_shader_output_values *outputs,
2896                                    unsigned noutput)
2897 {
2898         struct si_shader *shader = ctx->shader;
2899         unsigned param_count = 0;
2900
2901         for (unsigned i = 0; i < noutput; i++) {
2902                 unsigned semantic_name = outputs[i].semantic_name;
2903                 unsigned semantic_index = outputs[i].semantic_index;
2904
2905                 if (outputs[i].vertex_stream[0] != 0 &&
2906                     outputs[i].vertex_stream[1] != 0 &&
2907                     outputs[i].vertex_stream[2] != 0 &&
2908                     outputs[i].vertex_stream[3] != 0)
2909                         continue;
2910
2911                 switch (semantic_name) {
2912                 case TGSI_SEMANTIC_LAYER:
2913                 case TGSI_SEMANTIC_VIEWPORT_INDEX:
2914                 case TGSI_SEMANTIC_CLIPDIST:
2915                 case TGSI_SEMANTIC_COLOR:
2916                 case TGSI_SEMANTIC_BCOLOR:
2917                 case TGSI_SEMANTIC_PRIMID:
2918                 case TGSI_SEMANTIC_FOG:
2919                 case TGSI_SEMANTIC_TEXCOORD:
2920                 case TGSI_SEMANTIC_GENERIC:
2921                         break;
2922                 default:
2923                         continue;
2924                 }
2925
2926                 if ((semantic_name != TGSI_SEMANTIC_GENERIC ||
2927                      semantic_index < SI_MAX_IO_GENERIC) &&
2928                     shader->key.opt.kill_outputs &
2929                     (1ull << si_shader_io_get_unique_index(semantic_name,
2930                                                            semantic_index, true)))
2931                         continue;
2932
2933                 si_export_param(ctx, param_count, outputs[i].values);
2934
2935                 assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
2936                 shader->info.vs_output_param_offset[i] = param_count++;
2937         }
2938
2939         shader->info.nr_param_exports = param_count;
2940 }
2941
2942 /* Generate export instructions for hardware VS shader stage */
2943 static void si_llvm_export_vs(struct si_shader_context *ctx,
2944                               struct si_shader_output_values *outputs,
2945                               unsigned noutput)
2946 {
2947         struct si_shader *shader = ctx->shader;
2948         struct ac_export_args pos_args[4] = {};
2949         LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
2950         unsigned pos_idx;
2951         int i;
2952
2953         /* Build position exports. */
2954         for (i = 0; i < noutput; i++) {
2955                 switch (outputs[i].semantic_name) {
2956                 case TGSI_SEMANTIC_POSITION:
2957                         si_llvm_init_export_args(ctx, outputs[i].values,
2958                                                  V_008DFC_SQ_EXP_POS, &pos_args[0]);
2959                         break;
2960                 case TGSI_SEMANTIC_PSIZE:
2961                         psize_value = outputs[i].values[0];
2962                         break;
2963                 case TGSI_SEMANTIC_LAYER:
2964                         layer_value = outputs[i].values[0];
2965                         break;
2966                 case TGSI_SEMANTIC_VIEWPORT_INDEX:
2967                         viewport_index_value = outputs[i].values[0];
2968                         break;
2969                 case TGSI_SEMANTIC_EDGEFLAG:
2970                         edgeflag_value = outputs[i].values[0];
2971                         break;
2972                 case TGSI_SEMANTIC_CLIPDIST:
2973                         if (!shader->key.opt.clip_disable) {
2974                                 unsigned index = 2 + outputs[i].semantic_index;
2975                                 si_llvm_init_export_args(ctx, outputs[i].values,
2976                                                          V_008DFC_SQ_EXP_POS + index,
2977                                                          &pos_args[index]);
2978                         }
2979                         break;
2980                 case TGSI_SEMANTIC_CLIPVERTEX:
2981                         if (!shader->key.opt.clip_disable) {
2982                                 si_llvm_emit_clipvertex(ctx, pos_args,
2983                                                         outputs[i].values);
2984                         }
2985                         break;
2986                 }
2987         }
2988
2989         /* We need to add the position output manually if it's missing. */
2990         if (!pos_args[0].out[0]) {
2991                 pos_args[0].enabled_channels = 0xf; /* writemask */
2992                 pos_args[0].valid_mask = 0; /* EXEC mask */
2993                 pos_args[0].done = 0; /* last export? */
2994                 pos_args[0].target = V_008DFC_SQ_EXP_POS;
2995                 pos_args[0].compr = 0; /* COMPR flag */
2996                 pos_args[0].out[0] = ctx->ac.f32_0; /* X */
2997                 pos_args[0].out[1] = ctx->ac.f32_0; /* Y */
2998                 pos_args[0].out[2] = ctx->ac.f32_0; /* Z */
2999                 pos_args[0].out[3] = ctx->ac.f32_1;  /* W */
3000         }
3001
3002         /* Write the misc vector (point size, edgeflag, layer, viewport). */
3003         if (shader->selector->info.writes_psize ||
3004             shader->selector->info.writes_edgeflag ||
3005             shader->selector->info.writes_viewport_index ||
3006             shader->selector->info.writes_layer) {
3007                 pos_args[1].enabled_channels = shader->selector->info.writes_psize |
3008                                                (shader->selector->info.writes_edgeflag << 1) |
3009                                                (shader->selector->info.writes_layer << 2);
3010
3011                 pos_args[1].valid_mask = 0; /* EXEC mask */
3012                 pos_args[1].done = 0; /* last export? */
3013                 pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
3014                 pos_args[1].compr = 0; /* COMPR flag */
3015                 pos_args[1].out[0] = ctx->ac.f32_0; /* X */
3016                 pos_args[1].out[1] = ctx->ac.f32_0; /* Y */
3017                 pos_args[1].out[2] = ctx->ac.f32_0; /* Z */
3018                 pos_args[1].out[3] = ctx->ac.f32_0; /* W */
3019
3020                 if (shader->selector->info.writes_psize)
3021                         pos_args[1].out[0] = psize_value;
3022
3023                 if (shader->selector->info.writes_edgeflag) {
3024                         /* The output is a float, but the hw expects an integer
3025                          * with the first bit containing the edge flag. */
3026                         edgeflag_value = LLVMBuildFPToUI(ctx->ac.builder,
3027                                                          edgeflag_value,
3028                                                          ctx->i32, "");
3029                         edgeflag_value = ac_build_umin(&ctx->ac,
3030                                                       edgeflag_value,
3031                                                       ctx->i32_1);
3032
3033                         /* The LLVM intrinsic expects a float. */
3034                         pos_args[1].out[1] = ac_to_float(&ctx->ac, edgeflag_value);
3035                 }
3036
3037                 if (ctx->screen->info.chip_class >= GFX9) {
3038                         /* GFX9 has the layer in out.z[10:0] and the viewport
3039                          * index in out.z[19:16].
3040                          */
3041                         if (shader->selector->info.writes_layer)
3042                                 pos_args[1].out[2] = layer_value;
3043
3044                         if (shader->selector->info.writes_viewport_index) {
3045                                 LLVMValueRef v = viewport_index_value;
3046
3047                                 v = ac_to_integer(&ctx->ac, v);
3048                                 v = LLVMBuildShl(ctx->ac.builder, v,
3049                                                  LLVMConstInt(ctx->i32, 16, 0), "");
3050                                 v = LLVMBuildOr(ctx->ac.builder, v,
3051                                                 ac_to_integer(&ctx->ac,  pos_args[1].out[2]), "");
3052                                 pos_args[1].out[2] = ac_to_float(&ctx->ac, v);
3053                                 pos_args[1].enabled_channels |= 1 << 2;
3054                         }
3055                 } else {
3056                         if (shader->selector->info.writes_layer)
3057                                 pos_args[1].out[2] = layer_value;
3058
3059                         if (shader->selector->info.writes_viewport_index) {
3060                                 pos_args[1].out[3] = viewport_index_value;
3061                                 pos_args[1].enabled_channels |= 1 << 3;
3062                         }
3063                 }
3064         }
3065
3066         for (i = 0; i < 4; i++)
3067                 if (pos_args[i].out[0])
3068                         shader->info.nr_pos_exports++;
3069
3070         pos_idx = 0;
3071         for (i = 0; i < 4; i++) {
3072                 if (!pos_args[i].out[0])
3073                         continue;
3074
3075                 /* Specify the target we are exporting */
3076                 pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;
3077
3078                 if (pos_idx == shader->info.nr_pos_exports)
3079                         /* Specify that this is the last export */
3080                         pos_args[i].done = 1;
3081
3082                 ac_build_export(&ctx->ac, &pos_args[i]);
3083         }
3084
3085         /* Build parameter exports. */
3086         si_build_param_exports(ctx, outputs, noutput);
3087 }
3088
3089 /**
3090  * Forward all outputs from the vertex shader to the TES. This is only used
3091  * for the fixed function TCS.
3092  */
3093 static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
3094 {
3095         struct si_shader_context *ctx = si_shader_context(bld_base);
3096         LLVMValueRef invocation_id, buffer, buffer_offset;
3097         LLVMValueRef lds_vertex_stride, lds_vertex_offset, lds_base;
3098         uint64_t inputs;
3099
3100         invocation_id = unpack_llvm_param(ctx, ctx->abi.tcs_rel_ids, 8, 5);
3101         buffer = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS);
3102         buffer_offset = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
3103
3104         lds_vertex_stride = get_tcs_in_vertex_dw_stride(ctx);
3105         lds_vertex_offset = LLVMBuildMul(ctx->ac.builder, invocation_id,
3106                                          lds_vertex_stride, "");
3107         lds_base = get_tcs_in_current_patch_offset(ctx);
3108         lds_base = LLVMBuildAdd(ctx->ac.builder, lds_base, lds_vertex_offset, "");
3109
3110         inputs = ctx->shader->key.mono.u.ff_tcs_inputs_to_copy;
3111         while (inputs) {
3112                 unsigned i = u_bit_scan64(&inputs);
3113
3114                 LLVMValueRef lds_ptr = LLVMBuildAdd(ctx->ac.builder, lds_base,
3115                                             LLVMConstInt(ctx->i32, 4 * i, 0),
3116                                              "");
3117
3118                 LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx,
3119                                               get_rel_patch_id(ctx),
3120                                               invocation_id,
3121                                               LLVMConstInt(ctx->i32, i, 0));
3122
3123                 LLVMValueRef value = lds_load(bld_base, ctx->ac.i32, ~0,
3124                                               lds_ptr);
3125
3126                 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buffer_addr,
3127                                             buffer_offset, 0, 1, 0, true, false);
3128         }
3129 }
3130
3131 static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
3132                                   LLVMValueRef rel_patch_id,
3133                                   LLVMValueRef invocation_id,
3134                                   LLVMValueRef tcs_out_current_patch_data_offset,
3135                                   LLVMValueRef invoc0_tf_outer[4],
3136                                   LLVMValueRef invoc0_tf_inner[2])
3137 {
3138         struct si_shader_context *ctx = si_shader_context(bld_base);
3139         struct si_shader *shader = ctx->shader;
3140         unsigned tess_inner_index, tess_outer_index;
3141         LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
3142         LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4];
3143         unsigned stride, outer_comps, inner_comps, i, offset;
3144         struct lp_build_if_state if_ctx, inner_if_ctx;
3145
3146         /* Add a barrier before loading tess factors from LDS. */
3147         if (!shader->key.part.tcs.epilog.invoc0_tess_factors_are_def)
3148                 si_llvm_emit_barrier(NULL, bld_base, NULL);
3149
3150         /* Do this only for invocation 0, because the tess levels are per-patch,
3151          * not per-vertex.
3152          *
3153          * This can't jump, because invocation 0 executes this. It should
3154          * at least mask out the loads and stores for other invocations.
3155          */
3156         lp_build_if(&if_ctx, &ctx->gallivm,
3157                     LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ,
3158                                   invocation_id, ctx->i32_0, ""));
3159
3160         /* Determine the layout of one tess factor element in the buffer. */
3161         switch (shader->key.part.tcs.epilog.prim_mode) {
3162         case PIPE_PRIM_LINES:
3163                 stride = 2; /* 2 dwords, 1 vec2 store */
3164                 outer_comps = 2;
3165                 inner_comps = 0;
3166                 break;
3167         case PIPE_PRIM_TRIANGLES:
3168                 stride = 4; /* 4 dwords, 1 vec4 store */
3169                 outer_comps = 3;
3170                 inner_comps = 1;
3171                 break;
3172         case PIPE_PRIM_QUADS:
3173                 stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
3174                 outer_comps = 4;
3175                 inner_comps = 2;
3176                 break;
3177         default:
3178                 assert(0);
3179                 return;
3180         }
3181
3182         for (i = 0; i < 4; i++) {
3183                 inner[i] = LLVMGetUndef(ctx->i32);
3184                 outer[i] = LLVMGetUndef(ctx->i32);
3185         }
3186
3187         if (shader->key.part.tcs.epilog.invoc0_tess_factors_are_def) {
3188                 /* Tess factors are in VGPRs. */
3189                 for (i = 0; i < outer_comps; i++)
3190                         outer[i] = out[i] = invoc0_tf_outer[i];
3191                 for (i = 0; i < inner_comps; i++)
3192                         inner[i] = out[outer_comps+i] = invoc0_tf_inner[i];
3193         } else {
3194                 /* Load tess_inner and tess_outer from LDS.
3195                  * Any invocation can write them, so we can't get them from a temporary.
3196                  */
3197                 tess_inner_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0);
3198                 tess_outer_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0);
3199
3200                 lds_base = tcs_out_current_patch_data_offset;
3201                 lds_inner = LLVMBuildAdd(ctx->ac.builder, lds_base,
3202                                          LLVMConstInt(ctx->i32,
3203                                                       tess_inner_index * 4, 0), "");
3204                 lds_outer = LLVMBuildAdd(ctx->ac.builder, lds_base,
3205                                          LLVMConstInt(ctx->i32,
3206                                                       tess_outer_index * 4, 0), "");
3207
3208                 for (i = 0; i < outer_comps; i++) {
3209                         outer[i] = out[i] =
3210                                 lds_load(bld_base, ctx->ac.i32, i, lds_outer);
3211                 }
3212                 for (i = 0; i < inner_comps; i++) {
3213                         inner[i] = out[outer_comps+i] =
3214                                 lds_load(bld_base, ctx->ac.i32, i, lds_inner);
3215                 }
3216         }
3217
3218         if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) {
3219                 /* For isolines, the hardware expects tess factors in the
3220                  * reverse order from what GLSL / TGSI specify.
3221                  */
3222                 LLVMValueRef tmp = out[0];
3223                 out[0] = out[1];
3224                 out[1] = tmp;
3225         }
3226
3227         /* Convert the outputs to vectors for stores. */
3228         vec0 = lp_build_gather_values(&ctx->gallivm, out, MIN2(stride, 4));
3229         vec1 = NULL;
3230
3231         if (stride > 4)
3232                 vec1 = lp_build_gather_values(&ctx->gallivm, out+4, stride - 4);
3233
3234         /* Get the buffer. */
3235         buffer = get_tess_ring_descriptor(ctx, TCS_FACTOR_RING);
3236
3237         /* Get the offset. */
3238         tf_base = LLVMGetParam(ctx->main_fn,
3239                                ctx->param_tcs_factor_offset);
3240         byteoffset = LLVMBuildMul(ctx->ac.builder, rel_patch_id,
3241                                   LLVMConstInt(ctx->i32, 4 * stride, 0), "");
3242
3243         lp_build_if(&inner_if_ctx, &ctx->gallivm,
3244                     LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ,
3245                                   rel_patch_id, ctx->i32_0, ""));
3246
3247         /* Store the dynamic HS control word. */
3248         offset = 0;
3249         if (ctx->screen->info.chip_class <= VI) {
3250                 ac_build_buffer_store_dword(&ctx->ac, buffer,
3251                                             LLVMConstInt(ctx->i32, 0x80000000, 0),
3252                                             1, ctx->i32_0, tf_base,
3253                                             offset, 1, 0, true, false);
3254                 offset += 4;
3255         }
3256
3257         lp_build_endif(&inner_if_ctx);
3258
3259         /* Store the tessellation factors. */
3260         ac_build_buffer_store_dword(&ctx->ac, buffer, vec0,
3261                                     MIN2(stride, 4), byteoffset, tf_base,
3262                                     offset, 1, 0, true, false);
3263         offset += 16;
3264         if (vec1)
3265                 ac_build_buffer_store_dword(&ctx->ac, buffer, vec1,
3266                                             stride - 4, byteoffset, tf_base,
3267                                             offset, 1, 0, true, false);
3268
3269         /* Store the tess factors into the offchip buffer if TES reads them. */
3270         if (shader->key.part.tcs.epilog.tes_reads_tess_factors) {
3271                 LLVMValueRef buf, base, inner_vec, outer_vec, tf_outer_offset;
3272                 LLVMValueRef tf_inner_offset;
3273                 unsigned param_outer, param_inner;
3274
3275                 buf = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS);
3276                 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
3277
3278                 param_outer = si_shader_io_get_unique_index_patch(
3279                                       TGSI_SEMANTIC_TESSOUTER, 0);
3280                 tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
3281                                         LLVMConstInt(ctx->i32, param_outer, 0));
3282
3283                 outer_vec = lp_build_gather_values(&ctx->gallivm, outer,
3284                                                    util_next_power_of_two(outer_comps));
3285
3286                 ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec,
3287                                             outer_comps, tf_outer_offset,
3288                                             base, 0, 1, 0, true, false);
3289                 if (inner_comps) {
3290                         param_inner = si_shader_io_get_unique_index_patch(
3291                                               TGSI_SEMANTIC_TESSINNER, 0);
3292                         tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
3293                                         LLVMConstInt(ctx->i32, param_inner, 0));
3294
3295                         inner_vec = inner_comps == 1 ? inner[0] :
3296                                     lp_build_gather_values(&ctx->gallivm, inner, inner_comps);
3297                         ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec,
3298                                                     inner_comps, tf_inner_offset,
3299                                                     base, 0, 1, 0, true, false);
3300                 }
3301         }
3302
3303         lp_build_endif(&if_ctx);
3304 }
3305
3306 static LLVMValueRef
3307 si_insert_input_ret(struct si_shader_context *ctx, LLVMValueRef ret,
3308                     unsigned param, unsigned return_index)
3309 {
3310         return LLVMBuildInsertValue(ctx->ac.builder, ret,
3311                                     LLVMGetParam(ctx->main_fn, param),
3312                                     return_index, "");
3313 }
3314
3315 static LLVMValueRef
3316 si_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueRef ret,
3317                           unsigned param, unsigned return_index)
3318 {
3319         LLVMBuilderRef builder = ctx->ac.builder;
3320         LLVMValueRef p = LLVMGetParam(ctx->main_fn, param);
3321
3322         return LLVMBuildInsertValue(builder, ret,
3323                                     ac_to_float(&ctx->ac, p),
3324                                     return_index, "");
3325 }
3326
3327 static LLVMValueRef
3328 si_insert_input_ptr(struct si_shader_context *ctx, LLVMValueRef ret,
3329                     unsigned param, unsigned return_index)
3330 {
3331         LLVMBuilderRef builder = ctx->ac.builder;
3332         LLVMValueRef ptr, lo, hi;
3333
3334         if (HAVE_32BIT_POINTERS) {
3335                 ptr = LLVMGetParam(ctx->main_fn, param);
3336                 ptr = LLVMBuildPtrToInt(builder, ptr, ctx->i32, "");
3337                 return LLVMBuildInsertValue(builder, ret, ptr, return_index, "");
3338         }
3339
3340         ptr = LLVMGetParam(ctx->main_fn, param);
3341         ptr = LLVMBuildPtrToInt(builder, ptr, ctx->i64, "");
3342         ptr = LLVMBuildBitCast(builder, ptr, ctx->v2i32, "");
3343         lo = LLVMBuildExtractElement(builder, ptr, ctx->i32_0, "");
3344         hi = LLVMBuildExtractElement(builder, ptr, ctx->i32_1, "");
3345         ret = LLVMBuildInsertValue(builder, ret, lo, return_index, "");
3346         return LLVMBuildInsertValue(builder, ret, hi, return_index + 1, "");
3347 }
3348
3349 /* This only writes the tessellation factor levels. */
3350 static void si_llvm_emit_tcs_epilogue(struct ac_shader_abi *abi,
3351                                       unsigned max_outputs,
3352                                       LLVMValueRef *addrs)
3353 {
3354         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
3355         struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
3356         LLVMBuilderRef builder = ctx->ac.builder;
3357         LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;
3358
3359         si_copy_tcs_inputs(bld_base);
3360
3361         rel_patch_id = get_rel_patch_id(ctx);
3362         invocation_id = unpack_llvm_param(ctx, ctx->abi.tcs_rel_ids, 8, 5);
3363         tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);
3364
3365         if (ctx->screen->info.chip_class >= GFX9) {
3366                 LLVMBasicBlockRef blocks[2] = {
3367                         LLVMGetInsertBlock(builder),
3368                         ctx->merged_wrap_if_state.entry_block
3369                 };
3370                 LLVMValueRef values[2];
3371
3372                 lp_build_endif(&ctx->merged_wrap_if_state);
3373
3374                 values[0] = rel_patch_id;
3375                 values[1] = LLVMGetUndef(ctx->i32);
3376                 rel_patch_id = ac_build_phi(&ctx->ac, ctx->i32, 2, values, blocks);
3377
3378                 values[0] = tf_lds_offset;
3379                 values[1] = LLVMGetUndef(ctx->i32);
3380                 tf_lds_offset = ac_build_phi(&ctx->ac, ctx->i32, 2, values, blocks);
3381
3382                 values[0] = invocation_id;
3383                 values[1] = ctx->i32_1; /* cause the epilog to skip threads */
3384                 invocation_id = ac_build_phi(&ctx->ac, ctx->i32, 2, values, blocks);
3385         }
3386
3387         /* Return epilog parameters from this function. */
3388         LLVMValueRef ret = ctx->return_value;
3389         unsigned vgpr;
3390
3391         if (ctx->screen->info.chip_class >= GFX9) {
3392                 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
3393                                           8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
3394                 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_layout,
3395                                           8 + GFX9_SGPR_TCS_OUT_LAYOUT);
3396                 /* Tess offchip and tess factor offsets are at the beginning. */
3397                 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2);
3398                 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4);
3399                 vgpr = 8 + GFX9_SGPR_TCS_OUT_LAYOUT + 1;
3400         } else {
3401                 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
3402                                           GFX6_SGPR_TCS_OFFCHIP_LAYOUT);
3403                 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_layout,
3404                                           GFX6_SGPR_TCS_OUT_LAYOUT);
3405                 /* Tess offchip and tess factor offsets are after user SGPRs. */
3406                 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset,
3407                                           GFX6_TCS_NUM_USER_SGPR);
3408                 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset,
3409                                           GFX6_TCS_NUM_USER_SGPR + 1);
3410                 vgpr = GFX6_TCS_NUM_USER_SGPR + 2;
3411         }
3412
3413         /* VGPRs */
3414         rel_patch_id = ac_to_float(&ctx->ac, rel_patch_id);
3415         invocation_id = ac_to_float(&ctx->ac, invocation_id);
3416         tf_lds_offset = ac_to_float(&ctx->ac, tf_lds_offset);
3417
3418         /* Leave a hole corresponding to the two input VGPRs. This ensures that
3419          * the invocation_id output does not alias the tcs_rel_ids input,
3420          * which saves a V_MOV on gfx9.
3421          */
3422         vgpr += 2;
3423
3424         ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
3425         ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
3426
3427         if (ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs) {
3428                 vgpr++; /* skip the tess factor LDS offset */
3429                 for (unsigned i = 0; i < 6; i++) {
3430                         LLVMValueRef value =
3431                                 LLVMBuildLoad(builder, ctx->invoc0_tess_factors[i], "");
3432                         value = ac_to_float(&ctx->ac, value);
3433                         ret = LLVMBuildInsertValue(builder, ret, value, vgpr++, "");
3434                 }
3435         } else {
3436                 ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
3437         }
3438         ctx->return_value = ret;
3439 }
3440
3441 /* Pass TCS inputs from LS to TCS on GFX9. */
3442 static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx)
3443 {
3444         LLVMValueRef ret = ctx->return_value;
3445
3446         ret = si_insert_input_ptr(ctx, ret, 0, 0);
3447         if (HAVE_32BIT_POINTERS)
3448                 ret = si_insert_input_ptr(ctx, ret, 1, 1);
3449         ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2);
3450         ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
3451         ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4);
3452         ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);
3453
3454         ret = si_insert_input_ptr(ctx, ret, ctx->param_rw_buffers,
3455                                   8 + SI_SGPR_RW_BUFFERS);
3456         ret = si_insert_input_ptr(ctx, ret,
3457                                   ctx->param_bindless_samplers_and_images,
3458                                   8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
3459
3460         ret = si_insert_input_ret(ctx, ret, ctx->param_vs_state_bits,
3461                                   8 + SI_SGPR_VS_STATE_BITS);
3462
3463 #if !HAVE_32BIT_POINTERS
3464         ret = si_insert_input_ptr(ctx, ret, ctx->param_vs_state_bits + 1,
3465                                   8 + GFX9_SGPR_2ND_SAMPLERS_AND_IMAGES);
3466 #endif
3467
3468         ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
3469                                   8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
3470         ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_offsets,
3471                                   8 + GFX9_SGPR_TCS_OUT_OFFSETS);
3472         ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_layout,
3473                                   8 + GFX9_SGPR_TCS_OUT_LAYOUT);
3474
3475         unsigned vgpr = 8 + GFX9_TCS_NUM_USER_SGPR;
3476         ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
3477                                    ac_to_float(&ctx->ac, ctx->abi.tcs_patch_id),
3478                                    vgpr++, "");
3479         ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
3480                                    ac_to_float(&ctx->ac, ctx->abi.tcs_rel_ids),
3481                                    vgpr++, "");
3482         ctx->return_value = ret;
3483 }
3484
3485 /* Pass GS inputs from ES to GS on GFX9. */
3486 static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
3487 {
3488         LLVMValueRef ret = ctx->return_value;
3489
3490         ret = si_insert_input_ptr(ctx, ret, 0, 0);
3491         if (HAVE_32BIT_POINTERS)
3492                 ret = si_insert_input_ptr(ctx, ret, 1, 1);
3493         ret = si_insert_input_ret(ctx, ret, ctx->param_gs2vs_offset, 2);
3494         ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
3495         ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);
3496
3497         ret = si_insert_input_ptr(ctx, ret, ctx->param_rw_buffers,
3498                                   8 + SI_SGPR_RW_BUFFERS);
3499         ret = si_insert_input_ptr(ctx, ret,
3500                                   ctx->param_bindless_samplers_and_images,
3501                                   8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
3502
3503 #if !HAVE_32BIT_POINTERS
3504         ret = si_insert_input_ptr(ctx, ret, ctx->param_vs_state_bits + 1,
3505                                   8 + GFX9_SGPR_2ND_SAMPLERS_AND_IMAGES);
3506 #endif
3507
3508         unsigned vgpr;
3509         if (ctx->type == PIPE_SHADER_VERTEX)
3510                 vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR;
3511         else
3512                 vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR;
3513
3514         for (unsigned i = 0; i < 5; i++) {
3515                 unsigned param = ctx->param_gs_vtx01_offset + i;
3516                 ret = si_insert_input_ret_float(ctx, ret, param, vgpr++);
3517         }
3518         ctx->return_value = ret;
3519 }
3520
3521 static void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi,
3522                                      unsigned max_outputs,
3523                                      LLVMValueRef *addrs)
3524 {
3525         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
3526         struct si_shader *shader = ctx->shader;
3527         struct tgsi_shader_info *info = &shader->selector->info;
3528         unsigned i, chan;
3529         LLVMValueRef vertex_id = LLVMGetParam(ctx->main_fn,
3530                                               ctx->param_rel_auto_id);
3531         LLVMValueRef vertex_dw_stride = get_tcs_in_vertex_dw_stride(ctx);
3532         LLVMValueRef base_dw_addr = LLVMBuildMul(ctx->ac.builder, vertex_id,
3533                                                  vertex_dw_stride, "");
3534
3535         /* Write outputs to LDS. The next shader (TCS aka HS) will read
3536          * its inputs from it. */
3537         for (i = 0; i < info->num_outputs; i++) {
3538                 unsigned name = info->output_semantic_name[i];
3539                 unsigned index = info->output_semantic_index[i];
3540
3541                 /* The ARB_shader_viewport_layer_array spec contains the
3542                  * following issue:
3543                  *
3544                  *    2) What happens if gl_ViewportIndex or gl_Layer is
3545                  *    written in the vertex shader and a geometry shader is
3546                  *    present?
3547                  *
3548                  *    RESOLVED: The value written by the last vertex processing
3549                  *    stage is used. If the last vertex processing stage
3550                  *    (vertex, tessellation evaluation or geometry) does not
3551                  *    statically assign to gl_ViewportIndex or gl_Layer, index
3552                  *    or layer zero is assumed.
3553                  *
3554                  * So writes to those outputs in VS-as-LS are simply ignored.
3555                  */
3556                 if (name == TGSI_SEMANTIC_LAYER ||
3557                     name == TGSI_SEMANTIC_VIEWPORT_INDEX)
3558                         continue;
3559
3560                 int param = si_shader_io_get_unique_index(name, index, false);
3561                 LLVMValueRef dw_addr = LLVMBuildAdd(ctx->ac.builder, base_dw_addr,
3562                                         LLVMConstInt(ctx->i32, param * 4, 0), "");
3563
3564                 for (chan = 0; chan < 4; chan++) {
3565                         if (!(info->output_usagemask[i] & (1 << chan)))
3566                                 continue;
3567
3568                         lds_store(ctx, chan, dw_addr,
3569                                   LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""));
3570                 }
3571         }
3572
3573         if (ctx->screen->info.chip_class >= GFX9)
3574                 si_set_ls_return_value_for_tcs(ctx);
3575 }
3576
3577 static void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi,
3578                                      unsigned max_outputs,
3579                                      LLVMValueRef *addrs)
3580 {
3581         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
3582         struct si_shader *es = ctx->shader;
3583         struct tgsi_shader_info *info = &es->selector->info;
3584         LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
3585                                             ctx->param_es2gs_offset);
3586         LLVMValueRef lds_base = NULL;
3587         unsigned chan;
3588         int i;
3589
3590         if (ctx->screen->info.chip_class >= GFX9 && info->num_outputs) {
3591                 unsigned itemsize_dw = es->selector->esgs_itemsize / 4;
3592                 LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac);
3593                 LLVMValueRef wave_idx = si_unpack_param(ctx, ctx->param_merged_wave_info, 24, 4);
3594                 vertex_idx = LLVMBuildOr(ctx->ac.builder, vertex_idx,
3595                                          LLVMBuildMul(ctx->ac.builder, wave_idx,
3596                                                       LLVMConstInt(ctx->i32, 64, false), ""), "");
3597                 lds_base = LLVMBuildMul(ctx->ac.builder, vertex_idx,
3598                                         LLVMConstInt(ctx->i32, itemsize_dw, 0), "");
3599         }
3600
3601         for (i = 0; i < info->num_outputs; i++) {
3602                 int param;
3603
3604                 if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
3605                     info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
3606                         continue;
3607
3608                 param = si_shader_io_get_unique_index(info->output_semantic_name[i],
3609                                                       info->output_semantic_index[i], false);
3610
3611                 for (chan = 0; chan < 4; chan++) {
3612                         if (!(info->output_usagemask[i] & (1 << chan)))
3613                                 continue;
3614
3615                         LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
3616                         out_val = ac_to_integer(&ctx->ac, out_val);
3617
3618                         /* GFX9 has the ESGS ring in LDS. */
3619                         if (ctx->screen->info.chip_class >= GFX9) {
3620                                 lds_store(ctx, param * 4 + chan, lds_base, out_val);
3621                                 continue;
3622                         }
3623
3624                         ac_build_buffer_store_dword(&ctx->ac,
3625                                                     ctx->esgs_ring,
3626                                                     out_val, 1, NULL, soffset,
3627                                                     (4 * param + chan) * 4,
3628                                                     1, 1, true, true);
3629                 }
3630         }
3631
3632         if (ctx->screen->info.chip_class >= GFX9)
3633                 si_set_es_return_value_for_gs(ctx);
3634 }
3635
3636 static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx)
3637 {
3638         if (ctx->screen->info.chip_class >= GFX9)
3639                 return si_unpack_param(ctx, ctx->param_merged_wave_info, 16, 8);
3640         else
3641                 return LLVMGetParam(ctx->main_fn, ctx->param_gs_wave_id);
3642 }
3643
3644 static void emit_gs_epilogue(struct si_shader_context *ctx)
3645 {
3646         ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE,
3647                          si_get_gs_wave_id(ctx));
3648
3649         if (ctx->screen->info.chip_class >= GFX9)
3650                 lp_build_endif(&ctx->merged_wrap_if_state);
3651 }
3652
3653 static void si_llvm_emit_gs_epilogue(struct ac_shader_abi *abi,
3654                                      unsigned max_outputs,
3655                                      LLVMValueRef *addrs)
3656 {
3657         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
3658         struct tgsi_shader_info UNUSED *info = &ctx->shader->selector->info;
3659
3660         assert(info->num_outputs <= max_outputs);
3661
3662         emit_gs_epilogue(ctx);
3663 }
3664
/** TGSI entry point for the GS epilogue; forwards to emit_gs_epilogue(). */
static void si_tgsi_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
{
        emit_gs_epilogue(si_shader_context(bld_base));
}
3670
3671 static void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi,
3672                                      unsigned max_outputs,
3673                                      LLVMValueRef *addrs)
3674 {
3675         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
3676         struct tgsi_shader_info *info = &ctx->shader->selector->info;
3677         struct si_shader_output_values *outputs = NULL;
3678         int i,j;
3679
3680         assert(!ctx->shader->is_gs_copy_shader);
3681         assert(info->num_outputs <= max_outputs);
3682
3683         outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));
3684
3685         /* Vertex color clamping.
3686          *
3687          * This uses a state constant loaded in a user data SGPR and
3688          * an IF statement is added that clamps all colors if the constant
3689          * is true.
3690          */
3691         if (ctx->type == PIPE_SHADER_VERTEX) {
3692                 struct lp_build_if_state if_ctx;
3693                 LLVMValueRef cond = NULL;
3694                 LLVMValueRef addr, val;
3695
3696                 for (i = 0; i < info->num_outputs; i++) {
3697                         if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR &&
3698                             info->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR)
3699                                 continue;
3700
3701                         /* We've found a color. */
3702                         if (!cond) {
3703                                 /* The state is in the first bit of the user SGPR. */
3704                                 cond = LLVMGetParam(ctx->main_fn,
3705                                                     ctx->param_vs_state_bits);
3706                                 cond = LLVMBuildTrunc(ctx->ac.builder, cond,
3707                                                       ctx->i1, "");
3708                                 lp_build_if(&if_ctx, &ctx->gallivm, cond);
3709                         }
3710
3711                         for (j = 0; j < 4; j++) {
3712                                 addr = addrs[4 * i + j];
3713                                 val = LLVMBuildLoad(ctx->ac.builder, addr, "");
3714                                 val = ac_build_clamp(&ctx->ac, val);
3715                                 LLVMBuildStore(ctx->ac.builder, val, addr);
3716                         }
3717                 }
3718
3719                 if (cond)
3720                         lp_build_endif(&if_ctx);
3721         }
3722
3723         for (i = 0; i < info->num_outputs; i++) {
3724                 outputs[i].semantic_name = info->output_semantic_name[i];
3725                 outputs[i].semantic_index = info->output_semantic_index[i];
3726
3727                 for (j = 0; j < 4; j++) {
3728                         outputs[i].values[j] =
3729                                 LLVMBuildLoad(ctx->ac.builder,
3730                                               addrs[4 * i + j],
3731                                               "");
3732                         outputs[i].vertex_stream[j] =
3733                                 (info->output_streams[i] >> (2 * j)) & 3;
3734                 }
3735         }
3736
3737         if (ctx->shader->selector->so.num_outputs)
3738                 si_llvm_emit_streamout(ctx, outputs, i, 0);
3739
3740         /* Export PrimitiveID. */
3741         if (ctx->shader->key.mono.u.vs_export_prim_id) {
3742                 outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID;
3743                 outputs[i].semantic_index = 0;
3744                 outputs[i].values[0] = ac_to_float(&ctx->ac, get_primitive_id(ctx, 0));
3745                 for (j = 1; j < 4; j++)
3746                         outputs[i].values[j] = LLVMConstReal(ctx->f32, 0);
3747
3748                 memset(outputs[i].vertex_stream, 0,
3749                        sizeof(outputs[i].vertex_stream));
3750                 i++;
3751         }
3752
3753         si_llvm_export_vs(ctx, outputs, i);
3754         FREE(outputs);
3755 }
3756
3757 static void si_tgsi_emit_epilogue(struct lp_build_tgsi_context *bld_base)
3758 {
3759         struct si_shader_context *ctx = si_shader_context(bld_base);
3760
3761         ctx->abi.emit_outputs(&ctx->abi, RADEON_LLVM_MAX_OUTPUTS,
3762                               &ctx->outputs[0][0]);
3763 }
3764
3765 struct si_ps_exports {
3766         unsigned num;
3767         struct ac_export_args args[10];
3768 };
3769
3770 static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
3771                             LLVMValueRef depth, LLVMValueRef stencil,
3772                             LLVMValueRef samplemask, struct si_ps_exports *exp)
3773 {
3774         struct si_shader_context *ctx = si_shader_context(bld_base);
3775         struct ac_export_args args;
3776
3777         ac_export_mrt_z(&ctx->ac, depth, stencil, samplemask, &args);
3778
3779         memcpy(&exp->args[exp->num++], &args, sizeof(args));
3780 }
3781
3782 static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
3783                                 LLVMValueRef *color, unsigned index,
3784                                 unsigned samplemask_param,
3785                                 bool is_last, struct si_ps_exports *exp)
3786 {
3787         struct si_shader_context *ctx = si_shader_context(bld_base);
3788         int i;
3789
3790         /* Clamp color */
3791         if (ctx->shader->key.part.ps.epilog.clamp_color)
3792                 for (i = 0; i < 4; i++)
3793                         color[i] = ac_build_clamp(&ctx->ac, color[i]);
3794
3795         /* Alpha to one */
3796         if (ctx->shader->key.part.ps.epilog.alpha_to_one)
3797                 color[3] = ctx->ac.f32_1;
3798
3799         /* Alpha test */
3800         if (index == 0 &&
3801             ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
3802                 si_alpha_test(bld_base, color[3]);
3803
3804         /* Line & polygon smoothing */
3805         if (ctx->shader->key.part.ps.epilog.poly_line_smoothing)
3806                 color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3],
3807                                                          samplemask_param);
3808
3809         /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
3810         if (ctx->shader->key.part.ps.epilog.last_cbuf > 0) {
3811                 struct ac_export_args args[8];
3812                 int c, last = -1;
3813
3814                 /* Get the export arguments, also find out what the last one is. */
3815                 for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
3816                         si_llvm_init_export_args(ctx, color,
3817                                                  V_008DFC_SQ_EXP_MRT + c, &args[c]);
3818                         if (args[c].enabled_channels)
3819                                 last = c;
3820                 }
3821
3822                 /* Emit all exports. */
3823                 for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
3824                         if (is_last && last == c) {
3825                                 args[c].valid_mask = 1; /* whether the EXEC mask is valid */
3826                                 args[c].done = 1; /* DONE bit */
3827                         } else if (!args[c].enabled_channels)
3828                                 continue; /* unnecessary NULL export */
3829
3830                         memcpy(&exp->args[exp->num++], &args[c], sizeof(args[c]));
3831                 }
3832         } else {
3833                 struct ac_export_args args;
3834
3835                 /* Export */
3836                 si_llvm_init_export_args(ctx, color, V_008DFC_SQ_EXP_MRT + index,
3837                                          &args);
3838                 if (is_last) {
3839                         args.valid_mask = 1; /* whether the EXEC mask is valid */
3840                         args.done = 1; /* DONE bit */
3841                 } else if (!args.enabled_channels)
3842                         return; /* unnecessary NULL export */
3843
3844                 memcpy(&exp->args[exp->num++], &args, sizeof(args));
3845         }
3846 }
3847
3848 static void si_emit_ps_exports(struct si_shader_context *ctx,
3849                                struct si_ps_exports *exp)
3850 {
3851         for (unsigned i = 0; i < exp->num; i++)
3852                 ac_build_export(&ctx->ac, &exp->args[i]);
3853 }
3854
3855 /**
3856  * Return PS outputs in this order:
3857  *
3858  * v[0:3] = color0.xyzw
3859  * v[4:7] = color1.xyzw
3860  * ...
3861  * vN+0 = Depth
3862  * vN+1 = Stencil
3863  * vN+2 = SampleMask
3864  * vN+3 = SampleMaskIn (used for OpenGL smoothing)
3865  *
3866  * The alpha-ref SGPR is returned via its original location.
3867  */
3868 static void si_llvm_return_fs_outputs(struct ac_shader_abi *abi,
3869                                       unsigned max_outputs,
3870                                       LLVMValueRef *addrs)
3871 {
3872         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
3873         struct si_shader *shader = ctx->shader;
3874         struct tgsi_shader_info *info = &shader->selector->info;
3875         LLVMBuilderRef builder = ctx->ac.builder;
3876         unsigned i, j, first_vgpr, vgpr;
3877
3878         LLVMValueRef color[8][4] = {};
3879         LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
3880         LLVMValueRef ret;
3881
3882         if (ctx->postponed_kill)
3883                 ac_build_kill_if_false(&ctx->ac, LLVMBuildLoad(builder, ctx->postponed_kill, ""));
3884
3885         /* Read the output values. */
3886         for (i = 0; i < info->num_outputs; i++) {
3887                 unsigned semantic_name = info->output_semantic_name[i];
3888                 unsigned semantic_index = info->output_semantic_index[i];
3889
3890                 switch (semantic_name) {
3891                 case TGSI_SEMANTIC_COLOR:
3892                         assert(semantic_index < 8);
3893                         for (j = 0; j < 4; j++) {
3894                                 LLVMValueRef ptr = addrs[4 * i + j];
3895                                 LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
3896                                 color[semantic_index][j] = result;
3897                         }
3898                         break;
3899                 case TGSI_SEMANTIC_POSITION:
3900                         depth = LLVMBuildLoad(builder,
3901                                               addrs[4 * i + 2], "");
3902                         break;
3903                 case TGSI_SEMANTIC_STENCIL:
3904                         stencil = LLVMBuildLoad(builder,
3905                                                 addrs[4 * i + 1], "");
3906                         break;
3907                 case TGSI_SEMANTIC_SAMPLEMASK:
3908                         samplemask = LLVMBuildLoad(builder,
3909                                                    addrs[4 * i + 0], "");
3910                         break;
3911                 default:
3912                         fprintf(stderr, "Warning: SI unhandled fs output type:%d\n",
3913                                 semantic_name);
3914                 }
3915         }
3916
3917         /* Fill the return structure. */
3918         ret = ctx->return_value;
3919
3920         /* Set SGPRs. */
3921         ret = LLVMBuildInsertValue(builder, ret,
3922                                    ac_to_integer(&ctx->ac,
3923                                                  LLVMGetParam(ctx->main_fn,
3924                                                               SI_PARAM_ALPHA_REF)),
3925                                    SI_SGPR_ALPHA_REF, "");
3926
3927         /* Set VGPRs */
3928         first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
3929         for (i = 0; i < ARRAY_SIZE(color); i++) {
3930                 if (!color[i][0])
3931                         continue;
3932
3933                 for (j = 0; j < 4; j++)
3934                         ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
3935         }
3936         if (depth)
3937                 ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
3938         if (stencil)
3939                 ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
3940         if (samplemask)
3941                 ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");
3942
3943         /* Add the input sample mask for smoothing at the end. */
3944         if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
3945                 vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
3946         ret = LLVMBuildInsertValue(builder, ret,
3947                                    LLVMGetParam(ctx->main_fn,
3948                                                 SI_PARAM_SAMPLE_COVERAGE), vgpr++, "");
3949
3950         ctx->return_value = ret;
3951 }
3952
3953 static void membar_emit(
3954                 const struct lp_build_tgsi_action *action,
3955                 struct lp_build_tgsi_context *bld_base,
3956                 struct lp_build_emit_data *emit_data)
3957 {
3958         struct si_shader_context *ctx = si_shader_context(bld_base);
3959         LLVMValueRef src0 = lp_build_emit_fetch(bld_base, emit_data->inst, 0, 0);
3960         unsigned flags = LLVMConstIntGetZExtValue(src0);
3961         unsigned waitcnt = NOOP_WAITCNT;
3962
3963         if (flags & TGSI_MEMBAR_THREAD_GROUP)
3964                 waitcnt &= VM_CNT & LGKM_CNT;
3965
3966         if (flags & (TGSI_MEMBAR_ATOMIC_BUFFER |
3967                      TGSI_MEMBAR_SHADER_BUFFER |
3968                      TGSI_MEMBAR_SHADER_IMAGE))
3969                 waitcnt &= VM_CNT;
3970
3971         if (flags & TGSI_MEMBAR_SHARED)
3972                 waitcnt &= LGKM_CNT;
3973
3974         if (waitcnt != NOOP_WAITCNT)
3975                 ac_build_waitcnt(&ctx->ac, waitcnt);
3976 }
3977
3978 static void clock_emit(
3979                 const struct lp_build_tgsi_action *action,
3980                 struct lp_build_tgsi_context *bld_base,
3981                 struct lp_build_emit_data *emit_data)
3982 {
3983         struct si_shader_context *ctx = si_shader_context(bld_base);
3984         LLVMValueRef tmp = ac_build_shader_clock(&ctx->ac);
3985
3986         emit_data->output[0] =
3987                 LLVMBuildExtractElement(ctx->ac.builder, tmp, ctx->i32_0, "");
3988         emit_data->output[1] =
3989                 LLVMBuildExtractElement(ctx->ac.builder, tmp, ctx->i32_1, "");
3990 }
3991
3992 static void si_llvm_emit_ddxy(
3993         const struct lp_build_tgsi_action *action,
3994         struct lp_build_tgsi_context *bld_base,
3995         struct lp_build_emit_data *emit_data)
3996 {
3997         struct si_shader_context *ctx = si_shader_context(bld_base);
3998         unsigned opcode = emit_data->info->opcode;
3999         LLVMValueRef val;
4000         int idx;
4001         unsigned mask;
4002
4003         if (opcode == TGSI_OPCODE_DDX_FINE)
4004                 mask = AC_TID_MASK_LEFT;
4005         else if (opcode == TGSI_OPCODE_DDY_FINE)
4006                 mask = AC_TID_MASK_TOP;
4007         else
4008                 mask = AC_TID_MASK_TOP_LEFT;
4009
4010         /* for DDX we want to next X pixel, DDY next Y pixel. */
4011         idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;
4012
4013         val = ac_to_integer(&ctx->ac, emit_data->args[0]);
4014         val = ac_build_ddxy(&ctx->ac, mask, idx, val);
4015         emit_data->output[emit_data->chan] = val;
4016 }
4017
4018 /*
4019  * this takes an I,J coordinate pair,
4020  * and works out the X and Y derivatives.
4021  * it returns DDX(I), DDX(J), DDY(I), DDY(J).
4022  */
4023 static LLVMValueRef si_llvm_emit_ddxy_interp(
4024         struct lp_build_tgsi_context *bld_base,
4025         LLVMValueRef interp_ij)
4026 {
4027         struct si_shader_context *ctx = si_shader_context(bld_base);
4028         LLVMValueRef result[4], a;
4029         unsigned i;
4030
4031         for (i = 0; i < 2; i++) {
4032                 a = LLVMBuildExtractElement(ctx->ac.builder, interp_ij,
4033                                             LLVMConstInt(ctx->i32, i, 0), "");
4034                 result[i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDX, a);
4035                 result[2+i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDY, a);
4036         }
4037
4038         return lp_build_gather_values(&ctx->gallivm, result, 4);
4039 }
4040
4041 static void interp_fetch_args(
4042         struct lp_build_tgsi_context *bld_base,
4043         struct lp_build_emit_data *emit_data)
4044 {
4045         struct si_shader_context *ctx = si_shader_context(bld_base);
4046         const struct tgsi_full_instruction *inst = emit_data->inst;
4047
4048         if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
4049                 /* offset is in second src, first two channels */
4050                 emit_data->args[0] = lp_build_emit_fetch(bld_base,
4051                                                          emit_data->inst, 1,
4052                                                          TGSI_CHAN_X);
4053                 emit_data->args[1] = lp_build_emit_fetch(bld_base,
4054                                                          emit_data->inst, 1,
4055                                                          TGSI_CHAN_Y);
4056                 emit_data->arg_count = 2;
4057         } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
4058                 LLVMValueRef sample_position;
4059                 LLVMValueRef sample_id;
4060                 LLVMValueRef halfval = LLVMConstReal(ctx->f32, 0.5f);
4061
4062                 /* fetch sample ID, then fetch its sample position,
4063                  * and place into first two channels.
4064                  */
4065                 sample_id = lp_build_emit_fetch(bld_base,
4066                                                 emit_data->inst, 1, TGSI_CHAN_X);
4067                 sample_id = ac_to_integer(&ctx->ac, sample_id);
4068
4069                 /* Section 8.13.2 (Interpolation Functions) of the OpenGL Shading
4070                  * Language 4.50 spec says about interpolateAtSample:
4071                  *
4072                  *    "Returns the value of the input interpolant variable at
4073                  *     the location of sample number sample. If multisample
4074                  *     buffers are not available, the input variable will be
4075                  *     evaluated at the center of the pixel. If sample sample
4076                  *     does not exist, the position used to interpolate the
4077                  *     input variable is undefined."
4078                  *
4079                  * This means that sample_id values outside of the valid are
4080                  * in fact valid input, and the usual mechanism for loading the
4081                  * sample position doesn't work.
4082                  */
4083                 if (ctx->shader->key.mono.u.ps.interpolate_at_sample_force_center) {
4084                         LLVMValueRef center[4] = {
4085                                 LLVMConstReal(ctx->f32, 0.5),
4086                                 LLVMConstReal(ctx->f32, 0.5),
4087                                 ctx->ac.f32_0,
4088                                 ctx->ac.f32_0,
4089                         };
4090
4091                         sample_position = lp_build_gather_values(&ctx->gallivm, center, 4);
4092                 } else {
4093                         sample_position = load_sample_position(&ctx->abi, sample_id);
4094                 }
4095
4096                 emit_data->args[0] = LLVMBuildExtractElement(ctx->ac.builder,
4097                                                              sample_position,
4098                                                              ctx->i32_0, "");
4099
4100                 emit_data->args[0] = LLVMBuildFSub(ctx->ac.builder, emit_data->args[0], halfval, "");
4101                 emit_data->args[1] = LLVMBuildExtractElement(ctx->ac.builder,
4102                                                              sample_position,
4103                                                              ctx->i32_1, "");
4104                 emit_data->args[1] = LLVMBuildFSub(ctx->ac.builder, emit_data->args[1], halfval, "");
4105                 emit_data->arg_count = 2;
4106         }
4107 }
4108
4109 static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
4110                                 struct lp_build_tgsi_context *bld_base,
4111                                 struct lp_build_emit_data *emit_data)
4112 {
4113         struct si_shader_context *ctx = si_shader_context(bld_base);
4114         struct si_shader *shader = ctx->shader;
4115         const struct tgsi_shader_info *info = &shader->selector->info;
4116         LLVMValueRef interp_param;
4117         const struct tgsi_full_instruction *inst = emit_data->inst;
4118         const struct tgsi_full_src_register *input = &inst->Src[0];
4119         int input_base, input_array_size;
4120         int chan;
4121         int i;
4122         LLVMValueRef prim_mask = ctx->abi.prim_mask;
4123         LLVMValueRef array_idx;
4124         int interp_param_idx;
4125         unsigned interp;
4126         unsigned location;
4127
4128         assert(input->Register.File == TGSI_FILE_INPUT);
4129
4130         if (input->Register.Indirect) {
4131                 unsigned array_id = input->Indirect.ArrayID;
4132
4133                 if (array_id) {
4134                         input_base = info->input_array_first[array_id];
4135                         input_array_size = info->input_array_last[array_id] - input_base + 1;
4136                 } else {
4137                         input_base = inst->Src[0].Register.Index;
4138                         input_array_size = info->num_inputs - input_base;
4139                 }
4140
4141                 array_idx = si_get_indirect_index(ctx, &input->Indirect,
4142                                                   1, input->Register.Index - input_base);
4143         } else {
4144                 input_base = inst->Src[0].Register.Index;
4145                 input_array_size = 1;
4146                 array_idx = ctx->i32_0;
4147         }
4148
4149         interp = shader->selector->info.input_interpolate[input_base];
4150
4151         if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
4152             inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE)
4153                 location = TGSI_INTERPOLATE_LOC_CENTER;
4154         else
4155                 location = TGSI_INTERPOLATE_LOC_CENTROID;
4156
4157         interp_param_idx = lookup_interp_param_index(interp, location);
4158         if (interp_param_idx == -1)
4159                 return;
4160         else if (interp_param_idx)
4161                 interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
4162         else
4163                 interp_param = NULL;
4164
4165         if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
4166             inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
4167                 LLVMValueRef ij_out[2];
4168                 LLVMValueRef ddxy_out = si_llvm_emit_ddxy_interp(bld_base, interp_param);
4169
4170                 /*
4171                  * take the I then J parameters, and the DDX/Y for it, and
4172                  * calculate the IJ inputs for the interpolator.
4173                  * temp1 = ddx * offset/sample.x + I;
4174                  * interp_param.I = ddy * offset/sample.y + temp1;
4175                  * temp1 = ddx * offset/sample.x + J;
4176                  * interp_param.J = ddy * offset/sample.y + temp1;
4177                  */
4178                 for (i = 0; i < 2; i++) {
4179                         LLVMValueRef ix_ll = LLVMConstInt(ctx->i32, i, 0);
4180                         LLVMValueRef iy_ll = LLVMConstInt(ctx->i32, i + 2, 0);
4181                         LLVMValueRef ddx_el = LLVMBuildExtractElement(ctx->ac.builder,
4182                                                                       ddxy_out, ix_ll, "");
4183                         LLVMValueRef ddy_el = LLVMBuildExtractElement(ctx->ac.builder,
4184                                                                       ddxy_out, iy_ll, "");
4185                         LLVMValueRef interp_el = LLVMBuildExtractElement(ctx->ac.builder,
4186                                                                          interp_param, ix_ll, "");
4187                         LLVMValueRef temp1, temp2;
4188
4189                         interp_el = ac_to_float(&ctx->ac, interp_el);
4190
4191                         temp1 = LLVMBuildFMul(ctx->ac.builder, ddx_el, emit_data->args[0], "");
4192
4193                         temp1 = LLVMBuildFAdd(ctx->ac.builder, temp1, interp_el, "");
4194
4195                         temp2 = LLVMBuildFMul(ctx->ac.builder, ddy_el, emit_data->args[1], "");
4196
4197                         ij_out[i] = LLVMBuildFAdd(ctx->ac.builder, temp2, temp1, "");
4198                 }
4199                 interp_param = lp_build_gather_values(&ctx->gallivm, ij_out, 2);
4200         }
4201
4202         if (interp_param)
4203                 interp_param = ac_to_float(&ctx->ac, interp_param);
4204
4205         for (chan = 0; chan < 4; chan++) {
4206                 LLVMValueRef gather = LLVMGetUndef(LLVMVectorType(ctx->f32, input_array_size));
4207                 unsigned schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan);
4208
4209                 for (unsigned idx = 0; idx < input_array_size; ++idx) {
4210                         LLVMValueRef v, i = NULL, j = NULL;
4211
4212                         if (interp_param) {
4213                                 i = LLVMBuildExtractElement(
4214                                         ctx->ac.builder, interp_param, ctx->i32_0, "");
4215                                 j = LLVMBuildExtractElement(
4216                                         ctx->ac.builder, interp_param, ctx->i32_1, "");
4217                         }
4218                         v = si_build_fs_interp(ctx, input_base + idx, schan,
4219                                                prim_mask, i, j);
4220
4221                         gather = LLVMBuildInsertElement(ctx->ac.builder,
4222                                 gather, v, LLVMConstInt(ctx->i32, idx, false), "");
4223                 }
4224
4225                 emit_data->output[chan] = LLVMBuildExtractElement(
4226                         ctx->ac.builder, gather, array_idx, "");
4227         }
4228 }
4229
4230 static void vote_all_emit(
4231         const struct lp_build_tgsi_action *action,
4232         struct lp_build_tgsi_context *bld_base,
4233         struct lp_build_emit_data *emit_data)
4234 {
4235         struct si_shader_context *ctx = si_shader_context(bld_base);
4236
4237         LLVMValueRef tmp = ac_build_vote_all(&ctx->ac, emit_data->args[0]);
4238         emit_data->output[emit_data->chan] =
4239                 LLVMBuildSExt(ctx->ac.builder, tmp, ctx->i32, "");
4240 }
4241
4242 static void vote_any_emit(
4243         const struct lp_build_tgsi_action *action,
4244         struct lp_build_tgsi_context *bld_base,
4245         struct lp_build_emit_data *emit_data)
4246 {
4247         struct si_shader_context *ctx = si_shader_context(bld_base);
4248
4249         LLVMValueRef tmp = ac_build_vote_any(&ctx->ac, emit_data->args[0]);
4250         emit_data->output[emit_data->chan] =
4251                 LLVMBuildSExt(ctx->ac.builder, tmp, ctx->i32, "");
4252 }
4253
4254 static void vote_eq_emit(
4255         const struct lp_build_tgsi_action *action,
4256         struct lp_build_tgsi_context *bld_base,
4257         struct lp_build_emit_data *emit_data)
4258 {
4259         struct si_shader_context *ctx = si_shader_context(bld_base);
4260
4261         LLVMValueRef tmp = ac_build_vote_eq(&ctx->ac, emit_data->args[0]);
4262         emit_data->output[emit_data->chan] =
4263                 LLVMBuildSExt(ctx->ac.builder, tmp, ctx->i32, "");
4264 }
4265
4266 static void ballot_emit(
4267         const struct lp_build_tgsi_action *action,
4268         struct lp_build_tgsi_context *bld_base,
4269         struct lp_build_emit_data *emit_data)
4270 {
4271         struct si_shader_context *ctx = si_shader_context(bld_base);
4272         LLVMBuilderRef builder = ctx->ac.builder;
4273         LLVMValueRef tmp;
4274
4275         tmp = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
4276         tmp = ac_build_ballot(&ctx->ac, tmp);
4277         tmp = LLVMBuildBitCast(builder, tmp, ctx->v2i32, "");
4278
4279         emit_data->output[0] = LLVMBuildExtractElement(builder, tmp, ctx->i32_0, "");
4280         emit_data->output[1] = LLVMBuildExtractElement(builder, tmp, ctx->i32_1, "");
4281 }
4282
4283 static void read_invoc_fetch_args(
4284         struct lp_build_tgsi_context *bld_base,
4285         struct lp_build_emit_data *emit_data)
4286 {
4287         emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst,
4288                                                  0, emit_data->src_chan);
4289
4290         /* Always read the source invocation (= lane) from the X channel. */
4291         emit_data->args[1] = lp_build_emit_fetch(bld_base, emit_data->inst,
4292                                                  1, TGSI_CHAN_X);
4293         emit_data->arg_count = 2;
4294 }
4295
4296 static void read_lane_emit(
4297         const struct lp_build_tgsi_action *action,
4298         struct lp_build_tgsi_context *bld_base,
4299         struct lp_build_emit_data *emit_data)
4300 {
4301         struct si_shader_context *ctx = si_shader_context(bld_base);
4302
4303         /* We currently have no other way to prevent LLVM from lifting the icmp
4304          * calls to a dominating basic block.
4305          */
4306         ac_build_optimization_barrier(&ctx->ac, &emit_data->args[0]);
4307
4308         for (unsigned i = 0; i < emit_data->arg_count; ++i)
4309                 emit_data->args[i] = ac_to_integer(&ctx->ac, emit_data->args[i]);
4310
4311         emit_data->output[emit_data->chan] =
4312                 ac_build_intrinsic(&ctx->ac, action->intr_name,
4313                                    ctx->i32, emit_data->args, emit_data->arg_count,
4314                                    AC_FUNC_ATTR_READNONE |
4315                                    AC_FUNC_ATTR_CONVERGENT);
4316 }
4317
4318 static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base,
4319                                        struct lp_build_emit_data *emit_data)
4320 {
4321         struct si_shader_context *ctx = si_shader_context(bld_base);
4322         struct tgsi_src_register src0 = emit_data->inst->Src[0].Register;
4323         LLVMValueRef imm;
4324         unsigned stream;
4325
4326         assert(src0.File == TGSI_FILE_IMMEDIATE);
4327
4328         imm = ctx->imms[src0.Index * TGSI_NUM_CHANNELS + src0.SwizzleX];
4329         stream = LLVMConstIntGetZExtValue(imm) & 0x3;
4330         return stream;
4331 }
4332
4333 /* Emit one vertex from the geometry shader */
4334 static void si_llvm_emit_vertex(struct ac_shader_abi *abi,
4335                                 unsigned stream,
4336                                 LLVMValueRef *addrs)
4337 {
4338         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
4339         struct tgsi_shader_info *info = &ctx->shader->selector->info;
4340         struct lp_build_context *uint = &ctx->bld_base.uint_bld;
4341         struct si_shader *shader = ctx->shader;
4342         struct lp_build_if_state if_state;
4343         LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
4344                                             ctx->param_gs2vs_offset);
4345         LLVMValueRef gs_next_vertex;
4346         LLVMValueRef can_emit;
4347         unsigned chan, offset;
4348         int i;
4349
4350         /* Write vertex attribute values to GSVS ring */
4351         gs_next_vertex = LLVMBuildLoad(ctx->ac.builder,
4352                                        ctx->gs_next_vertex[stream],
4353                                        "");
4354
4355         /* If this thread has already emitted the declared maximum number of
4356          * vertices, skip the write: excessive vertex emissions are not
4357          * supposed to have any effect.
4358          *
4359          * If the shader has no writes to memory, kill it instead. This skips
4360          * further memory loads and may allow LLVM to skip to the end
4361          * altogether.
4362          */
4363         can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex,
4364                                  LLVMConstInt(ctx->i32,
4365                                               shader->selector->gs_max_out_vertices, 0), "");
4366
4367         bool use_kill = !info->writes_memory;
4368         if (use_kill) {
4369                 ac_build_kill_if_false(&ctx->ac, can_emit);
4370         } else {
4371                 lp_build_if(&if_state, &ctx->gallivm, can_emit);
4372         }
4373
4374         offset = 0;
4375         for (i = 0; i < info->num_outputs; i++) {
4376                 for (chan = 0; chan < 4; chan++) {
4377                         if (!(info->output_usagemask[i] & (1 << chan)) ||
4378                             ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
4379                                 continue;
4380
4381                         LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
4382                         LLVMValueRef voffset =
4383                                 LLVMConstInt(ctx->i32, offset *
4384                                              shader->selector->gs_max_out_vertices, 0);
4385                         offset++;
4386
4387                         voffset = lp_build_add(uint, voffset, gs_next_vertex);
4388                         voffset = lp_build_mul_imm(uint, voffset, 4);
4389
4390                         out_val = ac_to_integer(&ctx->ac, out_val);
4391
4392                         ac_build_buffer_store_dword(&ctx->ac,
4393                                                     ctx->gsvs_ring[stream],
4394                                                     out_val, 1,
4395                                                     voffset, soffset, 0,
4396                                                     1, 1, true, true);
4397                 }
4398         }
4399
4400         gs_next_vertex = lp_build_add(uint, gs_next_vertex,
4401                                       ctx->i32_1);
4402
4403         LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex[stream]);
4404
4405         /* Signal vertex emission */
4406         ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
4407                          si_get_gs_wave_id(ctx));
4408         if (!use_kill)
4409                 lp_build_endif(&if_state);
4410 }
4411
4412 /* Emit one vertex from the geometry shader */
4413 static void si_tgsi_emit_vertex(
4414         const struct lp_build_tgsi_action *action,
4415         struct lp_build_tgsi_context *bld_base,
4416         struct lp_build_emit_data *emit_data)
4417 {
4418         struct si_shader_context *ctx = si_shader_context(bld_base);
4419         unsigned stream = si_llvm_get_stream(bld_base, emit_data);
4420
4421         si_llvm_emit_vertex(&ctx->abi, stream, ctx->outputs[0]);
4422 }
4423
4424 /* Cut one primitive from the geometry shader */
4425 static void si_llvm_emit_primitive(struct ac_shader_abi *abi,
4426                                    unsigned stream)
4427 {
4428         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
4429
4430         /* Signal primitive cut */
4431         ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
4432                          si_get_gs_wave_id(ctx));
4433 }
4434
4435 /* Cut one primitive from the geometry shader */
4436 static void si_tgsi_emit_primitive(
4437         const struct lp_build_tgsi_action *action,
4438         struct lp_build_tgsi_context *bld_base,
4439         struct lp_build_emit_data *emit_data)
4440 {
4441         struct si_shader_context *ctx = si_shader_context(bld_base);
4442
4443         si_llvm_emit_primitive(&ctx->abi, si_llvm_get_stream(bld_base, emit_data));
4444 }
4445
4446 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
4447                                  struct lp_build_tgsi_context *bld_base,
4448                                  struct lp_build_emit_data *emit_data)
4449 {
4450         struct si_shader_context *ctx = si_shader_context(bld_base);
4451
4452         /* SI only (thanks to a hw bug workaround):
4453          * The real barrier instruction isn’t needed, because an entire patch
4454          * always fits into a single wave.
4455          */
4456         if (ctx->screen->info.chip_class == SI &&
4457             ctx->type == PIPE_SHADER_TESS_CTRL) {
4458                 ac_build_waitcnt(&ctx->ac, LGKM_CNT & VM_CNT);
4459                 return;
4460         }
4461
4462         lp_build_intrinsic(ctx->ac.builder,
4463                            "llvm.amdgcn.s.barrier",
4464                            ctx->voidt, NULL, 0, LP_FUNC_ATTR_CONVERGENT);
4465 }
4466
4467 static const struct lp_build_tgsi_action interp_action = {
4468         .fetch_args = interp_fetch_args,
4469         .emit = build_interp_intrinsic,
4470 };
4471
4472 static void si_create_function(struct si_shader_context *ctx,
4473                                const char *name,
4474                                LLVMTypeRef *returns, unsigned num_returns,
4475                                struct si_function_info *fninfo,
4476                                unsigned max_workgroup_size)
4477 {
4478         int i;
4479
4480         si_llvm_create_func(ctx, name, returns, num_returns,
4481                             fninfo->types, fninfo->num_params);
4482         ctx->return_value = LLVMGetUndef(ctx->return_type);
4483
4484         for (i = 0; i < fninfo->num_sgpr_params; ++i) {
4485                 LLVMValueRef P = LLVMGetParam(ctx->main_fn, i);
4486
4487                 /* The combination of:
4488                  * - noalias
4489                  * - dereferenceable
4490                  * - invariant.load
4491                  * allows the optimization passes to move loads and reduces
4492                  * SGPR spilling significantly.
4493                  */
4494                 lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_INREG);
4495
4496                 if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
4497                         lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_NOALIAS);
4498                         ac_add_attr_dereferenceable(P, UINT64_MAX);
4499                 }
4500         }
4501
4502         for (i = 0; i < fninfo->num_params; ++i) {
4503                 if (fninfo->assign[i])
4504                         *fninfo->assign[i] = LLVMGetParam(ctx->main_fn, i);
4505         }
4506
4507         if (ctx->screen->info.address32_hi) {
4508                 ac_llvm_add_target_dep_function_attr(ctx->main_fn,
4509                                                      "amdgpu-32bit-address-high-bits",
4510                                                      ctx->screen->info.address32_hi);
4511         }
4512
4513         if (max_workgroup_size) {
4514                 ac_llvm_add_target_dep_function_attr(ctx->main_fn,
4515                                                      "amdgpu-max-work-group-size",
4516                                                      max_workgroup_size);
4517         }
4518         LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
4519                                            "no-signed-zeros-fp-math",
4520                                            "true");
4521
4522         if (ctx->screen->debug_flags & DBG(UNSAFE_MATH)) {
4523                 /* These were copied from some LLVM test. */
4524                 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
4525                                                    "less-precise-fpmad",
4526                                                    "true");
4527                 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
4528                                                    "no-infs-fp-math",
4529                                                    "true");
4530                 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
4531                                                    "no-nans-fp-math",
4532                                                    "true");
4533                 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
4534                                                    "unsafe-fp-math",
4535                                                    "true");
4536         }
4537 }
4538
4539 static void declare_streamout_params(struct si_shader_context *ctx,
4540                                      struct pipe_stream_output_info *so,
4541                                      struct si_function_info *fninfo)
4542 {
4543         int i;
4544
4545         /* Streamout SGPRs. */
4546         if (so->num_outputs) {
4547                 if (ctx->type != PIPE_SHADER_TESS_EVAL)
4548                         ctx->param_streamout_config = add_arg(fninfo, ARG_SGPR, ctx->ac.i32);
4549                 else
4550                         ctx->param_streamout_config = fninfo->num_params - 1;
4551
4552                 ctx->param_streamout_write_index = add_arg(fninfo, ARG_SGPR, ctx->ac.i32);
4553         }
4554         /* A streamout buffer offset is loaded if the stride is non-zero. */
4555         for (i = 0; i < 4; i++) {
4556                 if (!so->stride[i])
4557                         continue;
4558
4559                 ctx->param_streamout_offset[i] = add_arg(fninfo, ARG_SGPR, ctx->ac.i32);
4560         }
4561 }
4562
4563 static unsigned si_get_max_workgroup_size(const struct si_shader *shader)
4564 {
4565         switch (shader->selector->type) {
4566         case PIPE_SHADER_TESS_CTRL:
4567                 /* Return this so that LLVM doesn't remove s_barrier
4568                  * instructions on chips where we use s_barrier. */
4569                 return shader->selector->screen->info.chip_class >= CIK ? 128 : 64;
4570
4571         case PIPE_SHADER_GEOMETRY:
4572                 return shader->selector->screen->info.chip_class >= GFX9 ? 128 : 64;
4573
4574         case PIPE_SHADER_COMPUTE:
4575                 break; /* see below */
4576
4577         default:
4578                 return 0;
4579         }
4580
4581         const unsigned *properties = shader->selector->info.properties;
4582         unsigned max_work_group_size =
4583                        properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
4584                        properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
4585                        properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
4586
4587         if (!max_work_group_size) {
4588                 /* This is a variable group size compute shader,
4589                  * compile it for the maximum possible group size.
4590                  */
4591                 max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
4592         }
4593         return max_work_group_size;
4594 }
4595
4596 static void declare_const_and_shader_buffers(struct si_shader_context *ctx,
4597                                              struct si_function_info *fninfo,
4598                                              bool assign_params)
4599 {
4600         LLVMTypeRef const_shader_buf_type;
4601
4602         if (ctx->shader->selector->info.const_buffers_declared == 1 &&
4603             ctx->shader->selector->info.shader_buffers_declared == 0)
4604                 const_shader_buf_type = ctx->f32;
4605         else
4606                 const_shader_buf_type = ctx->v4i32;
4607
4608         unsigned const_and_shader_buffers =
4609                 add_arg(fninfo, ARG_SGPR,
4610                         ac_array_in_const32_addr_space(const_shader_buf_type));
4611
4612         if (assign_params)
4613                 ctx->param_const_and_shader_buffers = const_and_shader_buffers;
4614 }
4615
4616 static void declare_samplers_and_images(struct si_shader_context *ctx,
4617                                         struct si_function_info *fninfo,
4618                                         bool assign_params)
4619 {
4620         unsigned samplers_and_images =
4621                 add_arg(fninfo, ARG_SGPR,
4622                         ac_array_in_const32_addr_space(ctx->v8i32));
4623
4624         if (assign_params)
4625                 ctx->param_samplers_and_images = samplers_and_images;
4626 }
4627
/*
 * Declare both per-stage descriptor pointers, in this order: constant and
 * shader buffers first, then samplers and images. assign_params is
 * forwarded to both helpers.
 */
static void declare_per_stage_desc_pointers(struct si_shader_context *ctx,
                                            struct si_function_info *fninfo,
                                            bool assign_params)
{
        /* 1st argument: constant and shader buffers. */
        declare_const_and_shader_buffers(ctx, fninfo, assign_params);

        /* 2nd argument: samplers and images. */
        declare_samplers_and_images(ctx, fninfo, assign_params);
}
4635
4636 static void declare_global_desc_pointers(struct si_shader_context *ctx,
4637                                          struct si_function_info *fninfo)
4638 {
4639         ctx->param_rw_buffers = add_arg(fninfo, ARG_SGPR,
4640                 ac_array_in_const32_addr_space(ctx->v4i32));
4641         ctx->param_bindless_samplers_and_images = add_arg(fninfo, ARG_SGPR,
4642                 ac_array_in_const32_addr_space(ctx->v8i32));
4643 }
4644
4645 static void declare_vs_specific_input_sgprs(struct si_shader_context *ctx,
4646                                             struct si_function_info *fninfo)
4647 {
4648         add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.base_vertex);
4649         add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.start_instance);
4650         add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.draw_id);
4651         ctx->param_vs_state_bits = add_arg(fninfo, ARG_SGPR, ctx->i32);
4652 }
4653
4654 static void declare_vs_input_vgprs(struct si_shader_context *ctx,
4655                                    struct si_function_info *fninfo,
4656                                    unsigned *num_prolog_vgprs)
4657 {
4658         struct si_shader *shader = ctx->shader;
4659
4660         add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.vertex_id);
4661         if (shader->key.as_ls) {
4662                 ctx->param_rel_auto_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
4663                 add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.instance_id);
4664         } else {
4665                 add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.instance_id);
4666                 ctx->param_vs_prim_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
4667         }
4668         add_arg(fninfo, ARG_VGPR, ctx->i32); /* unused */
4669
4670         if (!shader->is_gs_copy_shader) {
4671                 /* Vertex load indices. */
4672                 ctx->param_vertex_index0 = fninfo->num_params;
4673                 for (unsigned i = 0; i < shader->selector->info.num_inputs; i++)
4674                         add_arg(fninfo, ARG_VGPR, ctx->i32);
4675                 *num_prolog_vgprs += shader->selector->info.num_inputs;
4676         }
4677 }
4678
4679 static void declare_tes_input_vgprs(struct si_shader_context *ctx,
4680                                     struct si_function_info *fninfo)
4681 {
4682         ctx->param_tes_u = add_arg(fninfo, ARG_VGPR, ctx->f32);
4683         ctx->param_tes_v = add_arg(fninfo, ARG_VGPR, ctx->f32);
4684         ctx->param_tes_rel_patch_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
4685         add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.tes_patch_id);
4686 }
4687
4688 enum {
4689         /* Convenient merged shader definitions. */
4690         SI_SHADER_MERGED_VERTEX_TESSCTRL = PIPE_SHADER_TYPES,
4691         SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY,
4692 };
4693
4694 static void create_function(struct si_shader_context *ctx)
4695 {
             /* Build the "main" LLVM function for the shader in ctx.
              *
              * Depending on the shader stage (and, on GFX9+, on whether the
              * stage is part of a merged LS-HS or ES-GS shader), this declares
              * the function arguments in fninfo - SGPR arguments first, then
              * VGPR arguments - and fills "returns" with the return value types
              * for stages that hand registers over to a following part.
              * It then creates the function with si_create_function() and
              * derives shader->info.num_input_sgprs / num_input_vgprs from the
              * declared argument types.
              *
              * The order of the add_arg*() / declare_*() calls defines the
              * argument layout; do not reorder them.
              */
4696         struct si_shader *shader = ctx->shader;
4697         struct si_function_info fninfo;
4698         LLVMTypeRef returns[16+32*4];
4699         unsigned i, num_return_sgprs;
4700         unsigned num_returns = 0;
4701         unsigned num_prolog_vgprs = 0;
4702         unsigned type = ctx->type;
4703         unsigned vs_blit_property =
4704                 shader->selector->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS];
4705
4706         si_init_function_info(&fninfo);
4707
4708         /* Set MERGED shaders. */
4709         if (ctx->screen->info.chip_class >= GFX9) {
4710                 if (shader->key.as_ls || type == PIPE_SHADER_TESS_CTRL)
4711                         type = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
4712                 else if (shader->key.as_es || type == PIPE_SHADER_GEOMETRY)
4713                         type = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
4714         }
4715
4716         LLVMTypeRef v3i32 = LLVMVectorType(ctx->i32, 3);
4717
             /* From here on "type" selects the argument layout, while
              * ctx->type still holds the API stage being compiled.
              */
4718         switch (type) {
4719         case PIPE_SHADER_VERTEX:
4720                 declare_global_desc_pointers(ctx, &fninfo);
4721
4722                 if (vs_blit_property) {
4723                         ctx->param_vs_blit_inputs = fninfo.num_params;
4724                         add_arg(&fninfo, ARG_SGPR, ctx->i32); /* i16 x1, y1 */
4725                         add_arg(&fninfo, ARG_SGPR, ctx->i32); /* i16 x2, y2 */
4726                         add_arg(&fninfo, ARG_SGPR, ctx->f32); /* depth */
4727
4728                         if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
4729                                 add_arg(&fninfo, ARG_SGPR, ctx->f32); /* color0 */
4730                                 add_arg(&fninfo, ARG_SGPR, ctx->f32); /* color1 */
4731                                 add_arg(&fninfo, ARG_SGPR, ctx->f32); /* color2 */
4732                                 add_arg(&fninfo, ARG_SGPR, ctx->f32); /* color3 */
4733                         } else if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD) {
4734                                 add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.x1 */
4735                                 add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.y1 */
4736                                 add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.x2 */
4737                                 add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.y2 */
4738                                 add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.z */
4739                                 add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.w */
4740                         }
4741
4742                         /* VGPRs */
4743                         declare_vs_input_vgprs(ctx, &fninfo, &num_prolog_vgprs);
                             /* A blit VS gets none of the regular VS SGPRs
                              * declared below, so leave the switch here.
                              */
4744                         break;
4745                 }
4746
4747                 declare_per_stage_desc_pointers(ctx, &fninfo, true);
4748                 declare_vs_specific_input_sgprs(ctx, &fninfo);
4749                 ctx->param_vertex_buffers = add_arg(&fninfo, ARG_SGPR,
4750                         ac_array_in_const32_addr_space(ctx->v4i32));
4751
4752                 if (shader->key.as_es) {
4753                         ctx->param_es2gs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4754                 } else if (shader->key.as_ls) {
4755                         /* no extra parameters */
4756                 } else {
4757                         if (shader->is_gs_copy_shader) {
                                     /* Truncate the argument list so that
                                      * param_rw_buffers is the last SGPR;
                                      * everything declared after it above is
                                      * dropped for the GS copy shader.
                                      */
4758                                 fninfo.num_params = ctx->param_rw_buffers + 1;
4759                                 fninfo.num_sgpr_params = fninfo.num_params;
4760                         }
4761
4762                         /* The locations of the other parameters are assigned dynamically. */
4763                         declare_streamout_params(ctx, &shader->selector->so,
4764                                                  &fninfo);
4765                 }
4766
4767                 /* VGPRs */
4768                 declare_vs_input_vgprs(ctx, &fninfo, &num_prolog_vgprs);
4769                 break;
4770
4771         case PIPE_SHADER_TESS_CTRL: /* SI-CI-VI */
4772                 declare_global_desc_pointers(ctx, &fninfo);
4773                 declare_per_stage_desc_pointers(ctx, &fninfo, true);
4774                 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4775                 ctx->param_tcs_out_lds_offsets = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4776                 ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4777                 ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4778                 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4779                 ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4780
4781                 /* VGPRs */
4782                 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.tcs_patch_id);
4783                 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.tcs_rel_ids);
4784
4785                 /* param_tcs_offchip_offset and param_tcs_factor_offset are
4786                  * placed after the user SGPRs.
4787                  */
4788                 for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
4789                         returns[num_returns++] = ctx->i32; /* SGPRs */
4790                 for (i = 0; i < 11; i++)
4791                         returns[num_returns++] = ctx->f32; /* VGPRs */
4792                 break;
4793
4794         case SI_SHADER_MERGED_VERTEX_TESSCTRL:
4795                 /* Merged stages have 8 system SGPRs at the beginning. */
4796                 /* SPI_SHADER_USER_DATA_ADDR_LO/HI_HS */
4797                 if (HAVE_32BIT_POINTERS) {
4798                         declare_per_stage_desc_pointers(ctx, &fninfo,
4799                                                         ctx->type == PIPE_SHADER_TESS_CTRL);
4800                 } else {
4801                         declare_const_and_shader_buffers(ctx, &fninfo,
4802                                                          ctx->type == PIPE_SHADER_TESS_CTRL);
4803                 }
4804                 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4805                 ctx->param_merged_wave_info = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4806                 ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4807                 ctx->param_merged_scratch_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4808                 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4809                 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4810
4811                 declare_global_desc_pointers(ctx, &fninfo);
4812                 declare_per_stage_desc_pointers(ctx, &fninfo,
4813                                                 ctx->type == PIPE_SHADER_VERTEX);
4814                 declare_vs_specific_input_sgprs(ctx, &fninfo);
4815
4816                 if (!HAVE_32BIT_POINTERS) {
4817                         declare_samplers_and_images(ctx, &fninfo,
4818                                                     ctx->type == PIPE_SHADER_TESS_CTRL);
4819                 }
4820                 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4821                 ctx->param_tcs_out_lds_offsets = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4822                 ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4823                 if (!HAVE_32BIT_POINTERS) /* Align to 2 dwords. */
4824                         add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4825                 ctx->param_vertex_buffers = add_arg(&fninfo, ARG_SGPR,
4826                         ac_array_in_const32_addr_space(ctx->v4i32));
4827
4828                 /* VGPRs (first TCS, then VS) */
4829                 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.tcs_patch_id);
4830                 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.tcs_rel_ids);
4831
4832                 if (ctx->type == PIPE_SHADER_VERTEX) {
4833                         declare_vs_input_vgprs(ctx, &fninfo,
4834                                                &num_prolog_vgprs);
4835
4836                         /* LS return values are inputs to the TCS main shader part. */
4837                         for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
4838                                 returns[num_returns++] = ctx->i32; /* SGPRs */
4839                         for (i = 0; i < 2; i++)
4840                                 returns[num_returns++] = ctx->f32; /* VGPRs */
4841                 } else {
4842                         /* TCS return values are inputs to the TCS epilog.
4843                          *
4844                          * param_tcs_offchip_offset, param_tcs_factor_offset,
4845                          * param_tcs_offchip_layout, and param_rw_buffers
4846                          * should be passed to the epilog.
4847                          */
                             /* Note the inclusive bound: SGPR slots 0 through
                              * 8 + GFX9_SGPR_TCS_OUT_LAYOUT are all returned.
                              */
4848                         for (i = 0; i <= 8 + GFX9_SGPR_TCS_OUT_LAYOUT; i++)
4849                                 returns[num_returns++] = ctx->i32; /* SGPRs */
4850                         for (i = 0; i < 11; i++)
4851                                 returns[num_returns++] = ctx->f32; /* VGPRs */
4852                 }
4853                 break;
4854
4855         case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
4856                 /* Merged stages have 8 system SGPRs at the beginning. */
4857                 /* SPI_SHADER_USER_DATA_ADDR_LO/HI_GS */
4858                 if (HAVE_32BIT_POINTERS) {
4859                         declare_per_stage_desc_pointers(ctx, &fninfo,
4860                                                         ctx->type == PIPE_SHADER_GEOMETRY);
4861                 } else {
4862                         declare_const_and_shader_buffers(ctx, &fninfo,
4863                                                          ctx->type == PIPE_SHADER_GEOMETRY);
4864                 }
4865                 ctx->param_gs2vs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4866                 ctx->param_merged_wave_info = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4867                 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4868                 ctx->param_merged_scratch_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4869                 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_PGM_LO/HI_GS << 8) */
4870                 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */
4871
4872                 declare_global_desc_pointers(ctx, &fninfo);
4873                 declare_per_stage_desc_pointers(ctx, &fninfo,
4874                                                 (ctx->type == PIPE_SHADER_VERTEX ||
4875                                                  ctx->type == PIPE_SHADER_TESS_EVAL));
4876                 if (ctx->type == PIPE_SHADER_VERTEX) {
4877                         declare_vs_specific_input_sgprs(ctx, &fninfo);
4878                 } else {
4879                         ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4880                         ctx->param_tes_offchip_addr = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4881                         if (!HAVE_32BIT_POINTERS) {
4882                                 /* Declare as many input SGPRs as the VS has. */
4883                                 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4884                                 ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4885                         }
4886                 }
4887
4888                 if (!HAVE_32BIT_POINTERS) {
4889                         declare_samplers_and_images(ctx, &fninfo,
4890                                                     ctx->type == PIPE_SHADER_GEOMETRY);
4891                 }
4892                 if (ctx->type == PIPE_SHADER_VERTEX) {
4893                         ctx->param_vertex_buffers = add_arg(&fninfo, ARG_SGPR,
4894                                 ac_array_in_const32_addr_space(ctx->v4i32));
4895                 }
4896
4897                 /* VGPRs (first GS, then VS/TES) */
4898                 ctx->param_gs_vtx01_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4899                 ctx->param_gs_vtx23_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4900                 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.gs_prim_id);
4901                 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.gs_invocation_id);
4902                 ctx->param_gs_vtx45_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4903
4904                 if (ctx->type == PIPE_SHADER_VERTEX) {
4905                         declare_vs_input_vgprs(ctx, &fninfo,
4906                                                &num_prolog_vgprs);
4907                 } else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
4908                         declare_tes_input_vgprs(ctx, &fninfo);
4909                 }
4910
                     /* Only the ES half (VS or TES) declares return values;
                      * when ctx->type is GEOMETRY, num_returns stays 0.
                      */
4911                 if (ctx->type == PIPE_SHADER_VERTEX ||
4912                     ctx->type == PIPE_SHADER_TESS_EVAL) {
4913                         unsigned num_user_sgprs;
4914
4915                         if (ctx->type == PIPE_SHADER_VERTEX)
4916                                 num_user_sgprs = GFX9_VSGS_NUM_USER_SGPR;
4917                         else
4918                                 num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR;
4919
4920                         /* ES return values are inputs to GS. */
4921                         for (i = 0; i < 8 + num_user_sgprs; i++)
4922                                 returns[num_returns++] = ctx->i32; /* SGPRs */
4923                         for (i = 0; i < 5; i++)
4924                                 returns[num_returns++] = ctx->f32; /* VGPRs */
4925                 }
4926                 break;
4927
4928         case PIPE_SHADER_TESS_EVAL:
4929                 declare_global_desc_pointers(ctx, &fninfo);
4930                 declare_per_stage_desc_pointers(ctx, &fninfo, true);
4931                 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4932                 ctx->param_tes_offchip_addr = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4933
4934                 if (shader->key.as_es) {
4935                         ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4936                         add_arg(&fninfo, ARG_SGPR, ctx->i32);
4937                         ctx->param_es2gs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4938                 } else {
4939                         add_arg(&fninfo, ARG_SGPR, ctx->i32);
4940                         declare_streamout_params(ctx, &shader->selector->so,
4941                                                  &fninfo);
4942                         ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4943                 }
4944
4945                 /* VGPRs */
4946                 declare_tes_input_vgprs(ctx, &fninfo);
4947                 break;
4948
4949         case PIPE_SHADER_GEOMETRY:
4950                 declare_global_desc_pointers(ctx, &fninfo);
4951                 declare_per_stage_desc_pointers(ctx, &fninfo, true);
4952                 ctx->param_gs2vs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4953                 ctx->param_gs_wave_id = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4954
4955                 /* VGPRs */
4956                 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[0]);
4957                 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[1]);
4958                 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.gs_prim_id);
4959                 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[2]);
4960                 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[3]);
4961                 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[4]);
4962                 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[5]);
4963                 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.gs_invocation_id);
4964                 break;
4965
4966         case PIPE_SHADER_FRAGMENT:
4967                 declare_global_desc_pointers(ctx, &fninfo);
4968                 declare_per_stage_desc_pointers(ctx, &fninfo, true);
4969                 add_arg_checked(&fninfo, ARG_SGPR, ctx->f32, SI_PARAM_ALPHA_REF);
4970                 add_arg_assign_checked(&fninfo, ARG_SGPR, ctx->i32,
4971                                        &ctx->abi.prim_mask, SI_PARAM_PRIM_MASK);
4972
4973                 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_SAMPLE);
4974                 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_CENTER);
4975                 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_CENTROID);
4976                 add_arg_checked(&fninfo, ARG_VGPR, v3i32, SI_PARAM_PERSP_PULL_MODEL);
4977                 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_SAMPLE);
4978                 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_CENTER);
4979                 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_CENTROID);
4980                 add_arg_checked(&fninfo, ARG_VGPR, ctx->f32, SI_PARAM_LINE_STIPPLE_TEX);
4981                 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4982                                        &ctx->abi.frag_pos[0], SI_PARAM_POS_X_FLOAT);
4983                 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4984                                        &ctx->abi.frag_pos[1], SI_PARAM_POS_Y_FLOAT);
4985                 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4986                                        &ctx->abi.frag_pos[2], SI_PARAM_POS_Z_FLOAT);
4987                 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4988                                        &ctx->abi.frag_pos[3], SI_PARAM_POS_W_FLOAT);
4989                 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->i32,
4990                                        &ctx->abi.front_face, SI_PARAM_FRONT_FACE);
                     /* Dword position of the front-face VGPR: the VGPR args
                      * declared before it take 3*2 + 3 + 3*2 + 1 + 4 = 20
                      * dwords. Keep this (and 21 below) in sync with the
                      * declarations above.
                      */
4991                 shader->info.face_vgpr_index = 20;
4992                 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->i32,
4993                                        &ctx->abi.ancillary, SI_PARAM_ANCILLARY);
4994                 shader->info.ancillary_vgpr_index = 21;
4995                 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4996                                        &ctx->abi.sample_coverage, SI_PARAM_SAMPLE_COVERAGE);
4997                 add_arg_checked(&fninfo, ARG_VGPR, ctx->i32, SI_PARAM_POS_FIXED_PT);
4998
4999                 /* Color inputs from the prolog. */
5000                 if (shader->selector->info.colors_read) {
5001                         unsigned num_color_elements =
5002                                 util_bitcount(shader->selector->info.colors_read);
5003
5004                         assert(fninfo.num_params + num_color_elements <= ARRAY_SIZE(fninfo.types));
5005                         for (i = 0; i < num_color_elements; i++)
5006                                 add_arg(&fninfo, ARG_VGPR, ctx->f32);
5007
5008                         num_prolog_vgprs += num_color_elements;
5009                 }
5010
5011                 /* Outputs for the epilog. */
5012                 num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
5013                 num_returns =
5014                         num_return_sgprs +
5015                         util_bitcount(shader->selector->info.colors_written) * 4 +
5016                         shader->selector->info.writes_z +
5017                         shader->selector->info.writes_stencil +
5018                         shader->selector->info.writes_samplemask +
5019                         1 /* SampleMaskIn */;
5020
                     /* Make the return list long enough to contain the slot
                      * at PS_EPILOG_SAMPLEMASK_MIN_LOC after the SGPRs.
                      */
5021                 num_returns = MAX2(num_returns,
5022                                    num_return_sgprs +
5023                                    PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
5024
                     /* The first num_return_sgprs return values are i32,
                      * all remaining ones are f32.
                      */
5025                 for (i = 0; i < num_return_sgprs; i++)
5026                         returns[i] = ctx->i32;
5027                 for (; i < num_returns; i++)
5028                         returns[i] = ctx->f32;
5029                 break;
5030
5031         case PIPE_SHADER_COMPUTE:
5032                 declare_global_desc_pointers(ctx, &fninfo);
5033                 declare_per_stage_desc_pointers(ctx, &fninfo, true);
5034                 if (shader->selector->info.uses_grid_size)
5035                         add_arg_assign(&fninfo, ARG_SGPR, v3i32, &ctx->abi.num_work_groups);
5036                 if (shader->selector->info.uses_block_size)
5037                         ctx->param_block_size = add_arg(&fninfo, ARG_SGPR, v3i32);
5038
                     /* Workgroup id components that the shader does not use
                      * get no SGPR and are left as NULL.
                      */
5039                 for (i = 0; i < 3; i++) {
5040                         ctx->abi.workgroup_ids[i] = NULL;
5041                         if (shader->selector->info.uses_block_id[i])
5042                                 add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &ctx->abi.workgroup_ids[i]);
5043                 }
5044
5045                 add_arg_assign(&fninfo, ARG_VGPR, v3i32, &ctx->abi.local_invocation_ids);
5046                 break;
5047         default:
5048                 assert(0 && "unimplemented shader");
5049                 return;
5050         }
5051
5052         si_create_function(ctx, "main", returns, num_returns, &fninfo,
5053                            si_get_max_workgroup_size(shader));
5054
5055         /* Reserve register locations for VGPR inputs the PS prolog may need. */
5056         if (ctx->type == PIPE_SHADER_FRAGMENT &&
5057             ctx->separate_prolog) {
5058                 ac_llvm_add_target_dep_function_attr(ctx->main_fn,
5059                                                      "InitialPSInputAddr",
5060                                                      S_0286D0_PERSP_SAMPLE_ENA(1) |
5061                                                      S_0286D0_PERSP_CENTER_ENA(1) |
5062                                                      S_0286D0_PERSP_CENTROID_ENA(1) |
5063                                                      S_0286D0_LINEAR_SAMPLE_ENA(1) |
5064                                                      S_0286D0_LINEAR_CENTER_ENA(1) |
5065                                                      S_0286D0_LINEAR_CENTROID_ENA(1) |
5066                                                      S_0286D0_FRONT_FACE_ENA(1) |
5067                                                      S_0286D0_ANCILLARY_ENA(1) |
5068                                                      S_0286D0_POS_FIXED_PT_ENA(1));
5069         }
5070
             /* Count the input registers from the declared argument types.
              * ac_get_type_size() / 4 is presumably the size in dwords (a byte
              * size divided by 4) - confirm against ac_get_type_size.
              */
5071         shader->info.num_input_sgprs = 0;
5072         shader->info.num_input_vgprs = 0;
5073
5074         for (i = 0; i < fninfo.num_sgpr_params; ++i)
5075                 shader->info.num_input_sgprs += ac_get_type_size(fninfo.types[i]) / 4;
5076
             /* i continues from num_sgpr_params: the remaining params are VGPRs. */
5077         for (; i < fninfo.num_params; ++i)
5078                 shader->info.num_input_vgprs += ac_get_type_size(fninfo.types[i]) / 4;
5079
             /* VGPR arguments that were declared on behalf of the prolog
              * (accumulated in num_prolog_vgprs) are not counted as inputs.
              */
5080         assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
5081         shader->info.num_input_vgprs -= num_prolog_vgprs;
5082
5083         if (shader->key.as_ls ||
5084             ctx->type == PIPE_SHADER_TESS_CTRL ||
5085             /* GFX9 has the ESGS ring buffer in LDS. */
5086             type == SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY)
5087                 ac_declare_lds_as_pointer(&ctx->ac);
5088 }
5089
5090 /**
5091  * Load ESGS and GSVS ring buffer resource descriptors and save the variables
5092  * for later use.
      *
      * All descriptors are loaded from the buffer list passed in the
      * param_rw_buffers argument of the main function:
      * - ctx->esgs_ring is set only on chips up to VI, for ES and GS shaders.
      * - ctx->gsvs_ring[0] is set for the GS copy shader (descriptor used as
      *   loaded).
      * - ctx->gsvs_ring[stream] is set for geometry shaders, once per stream
      *   that has output components, using a patched copy of the descriptor.
      * - ctx->tess_offchip_ring is set for tessellation evaluation shaders.
5093  */
5094 static void preload_ring_buffers(struct si_shader_context *ctx)
5095 {
5096         LLVMBuilderRef builder = ctx->ac.builder;
5097
5098         LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
5099                                             ctx->param_rw_buffers);
5100
5101         if (ctx->screen->info.chip_class <= VI &&
5102             (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY)) {
                     /* The GS and the ES use different slots for the ESGS ring. */
5103                 unsigned ring =
5104                         ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS
5105                                                              : SI_ES_RING_ESGS;
5106                 LLVMValueRef offset = LLVMConstInt(ctx->i32, ring, 0);
5107
5108                 ctx->esgs_ring =
5109                         ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
5110         }
5111
5112         if (ctx->shader->is_gs_copy_shader) {
5113                 LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);
5114
5115                 ctx->gsvs_ring[0] =
5116                         ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
5117         } else if (ctx->type == PIPE_SHADER_GEOMETRY) {
5118                 const struct si_shader_selector *sel = ctx->shader->selector;
5119                 LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);
5120                 LLVMValueRef base_ring;
5121
5122                 base_ring = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
5123
5124                 /* The conceptual layout of the GSVS ring is
5125                  *   v0c0 .. vLc0 v0c1 .. vLc1 ..
5126                  * but the real memory layout is swizzled across
5127                  * threads:
5128                  *   t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
5129                  *   t16v0c0 ..
5130                  * Override the buffer descriptor accordingly.
5131                  */
5132                 LLVMTypeRef v2i64 = LLVMVectorType(ctx->i64, 2);
                     /* Running byte offset of the current stream from the start
                      * of the ring; it only advances for streams that are used.
                      */
5133                 uint64_t stream_offset = 0;
5134
5135                 for (unsigned stream = 0; stream < 4; ++stream) {
5136                         unsigned num_components;
5137                         unsigned stride;
5138                         unsigned num_records;
5139                         LLVMValueRef ring, tmp;
5140
5141                         num_components = sel->info.num_stream_output_components[stream];
                             /* Unused streams get no descriptor; gsvs_ring[stream]
                              * is left untouched.
                              */
5142                         if (!num_components)
5143                                 continue;
5144
                             /* 4 bytes per component, for all output vertices. */
5145                         stride = 4 * num_components * sel->gs_max_out_vertices;
5146
5147                         /* Limit on the stride field for <= CIK. */
5148                         assert(stride < (1 << 14));
5149
5150                         num_records = 64;
5151
                             /* View the descriptor as 2 x i64 so that the stream
                              * offset can be added to its low 64 bits (dwords 0-1,
                              * presumably the base address) with a single add.
                              */
5152                         ring = LLVMBuildBitCast(builder, base_ring, v2i64, "");
5153                         tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_0, "");
5154                         tmp = LLVMBuildAdd(builder, tmp,
5155                                            LLVMConstInt(ctx->i64,
5156                                                         stream_offset, 0), "");
                             /* The next used stream starts stride * 64 bytes later
                              * (64 matches num_records above).
                              */
5157                         stream_offset += stride * 64;
5158
                             /* Back to 4 x i32 to patch dwords 1, 2 and 3. */
5159                         ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_0, "");
5160                         ring = LLVMBuildBitCast(builder, ring, ctx->v4i32, "");
                             /* Dword 1: OR in the stride and enable swizzling. */
5161                         tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_1, "");
5162                         tmp = LLVMBuildOr(builder, tmp,
5163                                 LLVMConstInt(ctx->i32,
5164                                              S_008F04_STRIDE(stride) |
5165                                              S_008F04_SWIZZLE_ENABLE(1), 0), "");
5166                         ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_1, "");
                             /* Dword 2: replaced by num_records. */
5167                         ring = LLVMBuildInsertElement(builder, ring,
5168                                         LLVMConstInt(ctx->i32, num_records, 0),
5169                                         LLVMConstInt(ctx->i32, 2, 0), "");
                             /* Dword 3: replaced entirely by the format and
                              * swizzle settings below.
                              */
5170                         ring = LLVMBuildInsertElement(builder, ring,
5171                                 LLVMConstInt(ctx->i32,
5172                                              S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
5173                                              S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
5174                                              S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
5175                                              S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
5176                                              S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
5177                                              S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
5178                                              S_008F0C_ELEMENT_SIZE(1) | /* element_size = 4 (bytes) */
5179                                              S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
5180                                              S_008F0C_ADD_TID_ENABLE(1),
5181                                              0),
5182                                 LLVMConstInt(ctx->i32, 3, 0), "");
5183
5184                         ctx->gsvs_ring[stream] = ring;
5185                 }
5186         } else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
5187                 ctx->tess_offchip_ring = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TES);
5188         }
5189 }
5190
5191 static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
5192                                          LLVMValueRef param_rw_buffers,
5193                                          unsigned param_pos_fixed_pt)
5194 {
             /* Emit code that kills the current fragment when its bit in the
              * polygon stipple pattern is 0.
              *
              * param_rw_buffers:   pointer value from which the stipple buffer
              *                     descriptor (slot SI_PS_CONST_POLY_STIPPLE) is
              *                     loaded.
              * param_pos_fixed_pt: parameter holding the packed fixed-point
              *                     fragment position, passed to si_unpack_param()
              *                     (presumably as a parameter index - confirm).
              */
5195         LLVMBuilderRef builder = ctx->ac.builder;
5196         LLVMValueRef slot, desc, offset, row, bit, address[2];
5197
5198         /* Use the fixed-point gl_FragCoord input.
5199          * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
5200          * per coordinate to get the repeating effect.
5201          */
             /* address[0] is taken from bit 0 and address[1] from bit 16,
              * 5 bits each (presumably x and y respectively).
              */
5202         address[0] = si_unpack_param(ctx, param_pos_fixed_pt, 0, 5);
5203         address[1] = si_unpack_param(ctx, param_pos_fixed_pt, 16, 5);
5204
5205         /* Load the buffer descriptor. */
5206         slot = LLVMConstInt(ctx->i32, SI_PS_CONST_POLY_STIPPLE, 0);
5207         desc = ac_build_load_to_sgpr(&ctx->ac, param_rw_buffers, slot);
5208
5209         /* The stipple pattern is 32x32, each row has 32 bits. */
             /* Hence the byte offset of a row is address[1] * 4. */
5210         offset = LLVMBuildMul(builder, address[1],
5211                               LLVMConstInt(ctx->i32, 4, 0), "");
5212         row = buffer_load_const(ctx, desc, offset);
5213         row = ac_to_integer(&ctx->ac, row);
             /* Shift the wanted bit down to bit 0 and truncate to i1 to
              * isolate it; a 0 bit kills the fragment.
              */
5214         bit = LLVMBuildLShr(builder, row, address[0], "");
5215         bit = LLVMBuildTrunc(builder, bit, ctx->i1, "");
5216         ac_build_kill_if_false(&ctx->ac, bit);
5217 }
5218
5219 void si_shader_binary_read_config(struct ac_shader_binary *binary,
5220                                   struct si_shader_config *conf,
5221                                   unsigned symbol_offset)
5222 {
5223         unsigned i;
5224         const unsigned char *config =
5225                 ac_shader_binary_config_start(binary, symbol_offset);
5226         bool really_needs_scratch = false;
5227
5228         /* LLVM adds SGPR spills to the scratch size.
5229          * Find out if we really need the scratch buffer.
5230          */
5231         for (i = 0; i < binary->reloc_count; i++) {
5232                 const struct ac_shader_reloc *reloc = &binary->relocs[i];
5233
5234                 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
5235                     !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
5236                         really_needs_scratch = true;
5237                         break;
5238                 }
5239         }
5240
5241         /* XXX: We may be able to emit some of these values directly rather than
5242          * extracting fields to be emitted later.
5243          */
5244
5245         for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
5246                 unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
5247                 unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
5248                 switch (reg) {
5249                 case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
5250                 case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
5251                 case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
5252                 case R_00B428_SPI_SHADER_PGM_RSRC1_HS:
5253                 case R_00B848_COMPUTE_PGM_RSRC1:
5254                         conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
5255                         conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
5256                         conf->float_mode =  G_00B028_FLOAT_MODE(value);
5257                         conf->rsrc1 = value;
5258                         break;
5259                 case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
5260                         conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
5261                         break;
5262                 case R_00B84C_COMPUTE_PGM_RSRC2:
5263                         conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
5264                         conf->rsrc2 = value;
5265                         break;
5266                 case R_0286CC_SPI_PS_INPUT_ENA:
5267                         conf->spi_ps_input_ena = value;
5268                         break;
5269                 case R_0286D0_SPI_PS_INPUT_ADDR:
5270                         conf->spi_ps_input_addr = value;
5271                         break;
5272                 case R_0286E8_SPI_TMPRING_SIZE:
5273                 case R_00B860_COMPUTE_TMPRING_SIZE:
5274                         /* WAVESIZE is in units of 256 dwords. */
5275                         if (really_needs_scratch)
5276                                 conf->scratch_bytes_per_wave =
5277                                         G_00B860_WAVESIZE(value) * 256 * 4;
5278                         break;
5279                 case 0x4: /* SPILLED_SGPRS */
5280                         conf->spilled_sgprs = value;
5281                         break;
5282                 case 0x8: /* SPILLED_VGPRS */
5283                         conf->spilled_vgprs = value;
5284                         break;
5285                 default:
5286                         {
5287                                 static bool printed;
5288
5289                                 if (!printed) {
5290                                         fprintf(stderr, "Warning: LLVM emitted unknown "
5291                                                 "config register: 0x%x\n", reg);
5292                                         printed = true;
5293                                 }
5294                         }
5295                         break;
5296                 }
5297         }
5298
5299         if (!conf->spi_ps_input_addr)
5300                 conf->spi_ps_input_addr = conf->spi_ps_input_ena;
5301 }
5302
5303 void si_shader_apply_scratch_relocs(struct si_shader *shader,
5304                                     uint64_t scratch_va)
5305 {
5306         unsigned i;
5307         uint32_t scratch_rsrc_dword0 = scratch_va;
5308         uint32_t scratch_rsrc_dword1 =
5309                 S_008F04_BASE_ADDRESS_HI(scratch_va >> 32);
5310
5311         /* Enable scratch coalescing. */
5312         scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1);
5313
5314         for (i = 0 ; i < shader->binary.reloc_count; i++) {
5315                 const struct ac_shader_reloc *reloc =
5316                                         &shader->binary.relocs[i];
5317                 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
5318                         util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
5319                         &scratch_rsrc_dword0, 4);
5320                 } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
5321                         util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
5322                         &scratch_rsrc_dword1, 4);
5323                 }
5324         }
5325 }
5326
5327 /* For the UMR disassembler. */
5328 #define DEBUGGER_END_OF_CODE_MARKER     0xbf9f0000 /* invalid instruction */
5329 #define DEBUGGER_NUM_MARKERS            5
5330
5331 static unsigned si_get_shader_binary_size(const struct si_shader *shader)
5332 {
5333         unsigned size = shader->binary.code_size;
5334
5335         if (shader->prolog)
5336                 size += shader->prolog->binary.code_size;
5337         if (shader->previous_stage)
5338                 size += shader->previous_stage->binary.code_size;
5339         if (shader->prolog2)
5340                 size += shader->prolog2->binary.code_size;
5341         if (shader->epilog)
5342                 size += shader->epilog->binary.code_size;
5343         return size + DEBUGGER_NUM_MARKERS * 4;
5344 }
5345
5346 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
5347 {
5348         const struct ac_shader_binary *prolog =
5349                 shader->prolog ? &shader->prolog->binary : NULL;
5350         const struct ac_shader_binary *previous_stage =
5351                 shader->previous_stage ? &shader->previous_stage->binary : NULL;
5352         const struct ac_shader_binary *prolog2 =
5353                 shader->prolog2 ? &shader->prolog2->binary : NULL;
5354         const struct ac_shader_binary *epilog =
5355                 shader->epilog ? &shader->epilog->binary : NULL;
5356         const struct ac_shader_binary *mainb = &shader->binary;
5357         unsigned bo_size = si_get_shader_binary_size(shader) +
5358                            (!epilog ? mainb->rodata_size : 0);
5359         unsigned char *ptr;
5360
5361         assert(!prolog || !prolog->rodata_size);
5362         assert(!previous_stage || !previous_stage->rodata_size);
5363         assert(!prolog2 || !prolog2->rodata_size);
5364         assert((!prolog && !previous_stage && !prolog2 && !epilog) ||
5365                !mainb->rodata_size);
5366         assert(!epilog || !epilog->rodata_size);
5367
5368         r600_resource_reference(&shader->bo, NULL);
5369         shader->bo = si_aligned_buffer_create(&sscreen->b,
5370                                               sscreen->cpdma_prefetch_writes_memory ?
5371                                                 0 : SI_RESOURCE_FLAG_READ_ONLY,
5372                                               PIPE_USAGE_IMMUTABLE,
5373                                               align(bo_size, SI_CPDMA_ALIGNMENT),
5374                                               256);
5375         if (!shader->bo)
5376                 return -ENOMEM;
5377
5378         /* Upload. */
5379         ptr = sscreen->ws->buffer_map(shader->bo->buf, NULL,
5380                                         PIPE_TRANSFER_READ_WRITE |
5381                                         PIPE_TRANSFER_UNSYNCHRONIZED);
5382
5383         /* Don't use util_memcpy_cpu_to_le32. LLVM binaries are
5384          * endian-independent. */
5385         if (prolog) {
5386                 memcpy(ptr, prolog->code, prolog->code_size);
5387                 ptr += prolog->code_size;
5388         }
5389         if (previous_stage) {
5390                 memcpy(ptr, previous_stage->code, previous_stage->code_size);
5391                 ptr += previous_stage->code_size;
5392         }
5393         if (prolog2) {
5394                 memcpy(ptr, prolog2->code, prolog2->code_size);
5395                 ptr += prolog2->code_size;
5396         }
5397
5398         memcpy(ptr, mainb->code, mainb->code_size);
5399         ptr += mainb->code_size;
5400
5401         if (epilog) {
5402                 memcpy(ptr, epilog->code, epilog->code_size);
5403                 ptr += epilog->code_size;
5404         } else if (mainb->rodata_size > 0) {
5405                 memcpy(ptr, mainb->rodata, mainb->rodata_size);
5406                 ptr += mainb->rodata_size;
5407         }
5408
5409         /* Add end-of-code markers for the UMR disassembler. */
5410         uint32_t *ptr32 = (uint32_t*)ptr;
5411         for (unsigned i = 0; i < DEBUGGER_NUM_MARKERS; i++)
5412                 ptr32[i] = DEBUGGER_END_OF_CODE_MARKER;
5413
5414         sscreen->ws->buffer_unmap(shader->bo->buf);
5415         return 0;
5416 }
5417
5418 static void si_shader_dump_disassembly(const struct ac_shader_binary *binary,
5419                                        struct pipe_debug_callback *debug,
5420                                        const char *name, FILE *file)
5421 {
5422         char *line, *p;
5423         unsigned i, count;
5424
5425         if (binary->disasm_string) {
5426                 fprintf(file, "Shader %s disassembly:\n", name);
5427                 fprintf(file, "%s", binary->disasm_string);
5428
5429                 if (debug && debug->debug_message) {
5430                         /* Very long debug messages are cut off, so send the
5431                          * disassembly one line at a time. This causes more
5432                          * overhead, but on the plus side it simplifies
5433                          * parsing of resulting logs.
5434                          */
5435                         pipe_debug_message(debug, SHADER_INFO,
5436                                            "Shader Disassembly Begin");
5437
5438                         line = binary->disasm_string;
5439                         while (*line) {
5440                                 p = util_strchrnul(line, '\n');
5441                                 count = p - line;
5442
5443                                 if (count) {
5444                                         pipe_debug_message(debug, SHADER_INFO,
5445                                                            "%.*s", count, line);
5446                                 }
5447
5448                                 if (!*p)
5449                                         break;
5450                                 line = p + 1;
5451                         }
5452
5453                         pipe_debug_message(debug, SHADER_INFO,
5454                                            "Shader Disassembly End");
5455                 }
5456         } else {
5457                 fprintf(file, "Shader %s binary:\n", name);
5458                 for (i = 0; i < binary->code_size; i += 4) {
5459                         fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i,
5460                                 binary->code[i + 3], binary->code[i + 2],
5461                                 binary->code[i + 1], binary->code[i]);
5462                 }
5463         }
5464 }
5465
5466 static void si_calculate_max_simd_waves(struct si_shader *shader)
5467 {
5468         struct si_screen *sscreen = shader->selector->screen;
5469         struct si_shader_config *conf = &shader->config;
5470         unsigned num_inputs = shader->selector->info.num_inputs;
5471         unsigned lds_increment = sscreen->info.chip_class >= CIK ? 512 : 256;
5472         unsigned lds_per_wave = 0;
5473         unsigned max_simd_waves;
5474
5475         max_simd_waves = ac_get_max_simd_waves(sscreen->info.family);
5476
5477         /* Compute LDS usage for PS. */
5478         switch (shader->selector->type) {
5479         case PIPE_SHADER_FRAGMENT:
5480                 /* The minimum usage per wave is (num_inputs * 48). The maximum
5481                  * usage is (num_inputs * 48 * 16).
5482                  * We can get anything in between and it varies between waves.
5483                  *
5484                  * The 48 bytes per input for a single primitive is equal to
5485                  * 4 bytes/component * 4 components/input * 3 points.
5486                  *
5487                  * Other stages don't know the size at compile time or don't
5488                  * allocate LDS per wave, but instead they do it per thread group.
5489                  */
5490                 lds_per_wave = conf->lds_size * lds_increment +
5491                                align(num_inputs * 48, lds_increment);
5492                 break;
5493         case PIPE_SHADER_COMPUTE:
5494                 if (shader->selector) {
5495                         unsigned max_workgroup_size =
5496                                 si_get_max_workgroup_size(shader);
5497                         lds_per_wave = (conf->lds_size * lds_increment) /
5498                                        DIV_ROUND_UP(max_workgroup_size, 64);
5499                 }
5500                 break;
5501         }
5502
5503         /* Compute the per-SIMD wave counts. */
5504         if (conf->num_sgprs) {
5505                 if (sscreen->info.chip_class >= VI)
5506                         max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs);
5507                 else
5508                         max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs);
5509         }
5510
5511         if (conf->num_vgprs)
5512                 max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs);
5513
5514         /* LDS is 64KB per CU (4 SIMDs), which is 16KB per SIMD (usage above
5515          * 16KB makes some SIMDs unoccupied). */
5516         if (lds_per_wave)
5517                 max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);
5518
5519         conf->max_simd_waves = max_simd_waves;
5520 }
5521
5522 void si_shader_dump_stats_for_shader_db(const struct si_shader *shader,
5523                                         struct pipe_debug_callback *debug)
5524 {
5525         const struct si_shader_config *conf = &shader->config;
5526
5527         pipe_debug_message(debug, SHADER_INFO,
5528                            "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
5529                            "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
5530                            "Spilled VGPRs: %d PrivMem VGPRs: %d",
5531                            conf->num_sgprs, conf->num_vgprs,
5532                            si_get_shader_binary_size(shader),
5533                            conf->lds_size, conf->scratch_bytes_per_wave,
5534                            conf->max_simd_waves, conf->spilled_sgprs,
5535                            conf->spilled_vgprs, conf->private_mem_vgprs);
5536 }
5537
5538 static void si_shader_dump_stats(struct si_screen *sscreen,
5539                                  const struct si_shader *shader,
5540                                  unsigned processor,
5541                                  FILE *file,
5542                                  bool check_debug_option)
5543 {
5544         const struct si_shader_config *conf = &shader->config;
5545
5546         if (!check_debug_option ||
5547             si_can_dump_shader(sscreen, processor)) {
5548                 if (processor == PIPE_SHADER_FRAGMENT) {
5549                         fprintf(file, "*** SHADER CONFIG ***\n"
5550                                 "SPI_PS_INPUT_ADDR = 0x%04x\n"
5551                                 "SPI_PS_INPUT_ENA  = 0x%04x\n",
5552                                 conf->spi_ps_input_addr, conf->spi_ps_input_ena);
5553                 }
5554
5555                 fprintf(file, "*** SHADER STATS ***\n"
5556                         "SGPRS: %d\n"
5557                         "VGPRS: %d\n"
5558                         "Spilled SGPRs: %d\n"
5559                         "Spilled VGPRs: %d\n"
5560                         "Private memory VGPRs: %d\n"
5561                         "Code Size: %d bytes\n"
5562                         "LDS: %d blocks\n"
5563                         "Scratch: %d bytes per wave\n"
5564                         "Max Waves: %d\n"
5565                         "********************\n\n\n",
5566                         conf->num_sgprs, conf->num_vgprs,
5567                         conf->spilled_sgprs, conf->spilled_vgprs,
5568                         conf->private_mem_vgprs,
5569                         si_get_shader_binary_size(shader),
5570                         conf->lds_size, conf->scratch_bytes_per_wave,
5571                         conf->max_simd_waves);
5572         }
5573 }
5574
5575 const char *si_get_shader_name(const struct si_shader *shader, unsigned processor)
5576 {
5577         switch (processor) {
5578         case PIPE_SHADER_VERTEX:
5579                 if (shader->key.as_es)
5580                         return "Vertex Shader as ES";
5581                 else if (shader->key.as_ls)
5582                         return "Vertex Shader as LS";
5583                 else
5584                         return "Vertex Shader as VS";
5585         case PIPE_SHADER_TESS_CTRL:
5586                 return "Tessellation Control Shader";
5587         case PIPE_SHADER_TESS_EVAL:
5588                 if (shader->key.as_es)
5589                         return "Tessellation Evaluation Shader as ES";
5590                 else
5591                         return "Tessellation Evaluation Shader as VS";
5592         case PIPE_SHADER_GEOMETRY:
5593                 if (shader->is_gs_copy_shader)
5594                         return "GS Copy Shader as VS";
5595                 else
5596                         return "Geometry Shader";
5597         case PIPE_SHADER_FRAGMENT:
5598                 return "Pixel Shader";
5599         case PIPE_SHADER_COMPUTE:
5600                 return "Compute Shader";
5601         default:
5602                 return "Unknown Shader";
5603         }
5604 }
5605
5606 void si_shader_dump(struct si_screen *sscreen, const struct si_shader *shader,
5607                     struct pipe_debug_callback *debug, unsigned processor,
5608                     FILE *file, bool check_debug_option)
5609 {
5610         if (!check_debug_option ||
5611             si_can_dump_shader(sscreen, processor))
5612                 si_dump_shader_key(processor, shader, file);
5613
5614         if (!check_debug_option && shader->binary.llvm_ir_string) {
5615                 if (shader->previous_stage &&
5616                     shader->previous_stage->binary.llvm_ir_string) {
5617                         fprintf(file, "\n%s - previous stage - LLVM IR:\n\n",
5618                                 si_get_shader_name(shader, processor));
5619                         fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string);
5620                 }
5621
5622                 fprintf(file, "\n%s - main shader part - LLVM IR:\n\n",
5623                         si_get_shader_name(shader, processor));
5624                 fprintf(file, "%s\n", shader->binary.llvm_ir_string);
5625         }
5626
5627         if (!check_debug_option ||
5628             (si_can_dump_shader(sscreen, processor) &&
5629              !(sscreen->debug_flags & DBG(NO_ASM)))) {
5630                 fprintf(file, "\n%s:\n", si_get_shader_name(shader, processor));
5631
5632                 if (shader->prolog)
5633                         si_shader_dump_disassembly(&shader->prolog->binary,
5634                                                    debug, "prolog", file);
5635                 if (shader->previous_stage)
5636                         si_shader_dump_disassembly(&shader->previous_stage->binary,
5637                                                    debug, "previous stage", file);
5638                 if (shader->prolog2)
5639                         si_shader_dump_disassembly(&shader->prolog2->binary,
5640                                                    debug, "prolog2", file);
5641
5642                 si_shader_dump_disassembly(&shader->binary, debug, "main", file);
5643
5644                 if (shader->epilog)
5645                         si_shader_dump_disassembly(&shader->epilog->binary,
5646                                                    debug, "epilog", file);
5647                 fprintf(file, "\n");
5648         }
5649
5650         si_shader_dump_stats(sscreen, shader, processor, file,
5651                              check_debug_option);
5652 }
5653
5654 static int si_compile_llvm(struct si_screen *sscreen,
5655                            struct ac_shader_binary *binary,
5656                            struct si_shader_config *conf,
5657                            struct si_compiler *compiler,
5658                            LLVMModuleRef mod,
5659                            struct pipe_debug_callback *debug,
5660                            unsigned processor,
5661                            const char *name)
5662 {
5663         int r = 0;
5664         unsigned count = p_atomic_inc_return(&sscreen->num_compilations);
5665
5666         if (si_can_dump_shader(sscreen, processor)) {
5667                 fprintf(stderr, "radeonsi: Compiling shader %d\n", count);
5668
5669                 if (!(sscreen->debug_flags & (DBG(NO_IR) | DBG(PREOPT_IR)))) {
5670                         fprintf(stderr, "%s LLVM IR:\n\n", name);
5671                         ac_dump_module(mod);
5672                         fprintf(stderr, "\n");
5673                 }
5674         }
5675
5676         if (sscreen->record_llvm_ir) {
5677                 char *ir = LLVMPrintModuleToString(mod);
5678                 binary->llvm_ir_string = strdup(ir);
5679                 LLVMDisposeMessage(ir);
5680         }
5681
5682         if (!si_replace_shader(count, binary)) {
5683                 r = si_llvm_compile(mod, binary, compiler, debug);
5684                 if (r)
5685                         return r;
5686         }
5687
5688         si_shader_binary_read_config(binary, conf, 0);
5689
5690         /* Enable 64-bit and 16-bit denormals, because there is no performance
5691          * cost.
5692          *
5693          * If denormals are enabled, all floating-point output modifiers are
5694          * ignored.
5695          *
5696          * Don't enable denormals for 32-bit floats, because:
5697          * - Floating-point output modifiers would be ignored by the hw.
5698          * - Some opcodes don't support denormals, such as v_mad_f32. We would
5699          *   have to stop using those.
5700          * - SI & CI would be very slow.
5701          */
5702         conf->float_mode |= V_00B028_FP_64_DENORMS;
5703
5704         FREE(binary->config);
5705         FREE(binary->global_symbol_offsets);
5706         binary->config = NULL;
5707         binary->global_symbol_offsets = NULL;
5708
5709         /* Some shaders can't have rodata because their binaries can be
5710          * concatenated.
5711          */
5712         if (binary->rodata_size &&
5713             (processor == PIPE_SHADER_VERTEX ||
5714              processor == PIPE_SHADER_TESS_CTRL ||
5715              processor == PIPE_SHADER_TESS_EVAL ||
5716              processor == PIPE_SHADER_FRAGMENT)) {
5717                 fprintf(stderr, "radeonsi: The shader can't have rodata.");
5718                 return -EINVAL;
5719         }
5720
5721         return r;
5722 }
5723
5724 static void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret)
5725 {
5726         if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
5727                 LLVMBuildRetVoid(ctx->ac.builder);
5728         else
5729                 LLVMBuildRet(ctx->ac.builder, ret);
5730 }
5731
5732 /* Generate code for the hardware VS shader stage to go with a geometry shader */
5733 struct si_shader *
5734 si_generate_gs_copy_shader(struct si_screen *sscreen,
5735                            struct si_compiler *compiler,
5736                            struct si_shader_selector *gs_selector,
5737                            struct pipe_debug_callback *debug)
5738 {
5739         struct si_shader_context ctx;
5740         struct si_shader *shader;
5741         LLVMBuilderRef builder;
5742         struct lp_build_tgsi_context *bld_base = &ctx.bld_base;
5743         struct lp_build_context *uint = &bld_base->uint_bld;
5744         struct si_shader_output_values *outputs;
5745         struct tgsi_shader_info *gsinfo = &gs_selector->info;
5746         int i, r;
5747
5748         outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0]));
5749
5750         if (!outputs)
5751                 return NULL;
5752
5753         shader = CALLOC_STRUCT(si_shader);
5754         if (!shader) {
5755                 FREE(outputs);
5756                 return NULL;
5757         }
5758
5759         /* We can leave the fence as permanently signaled because the GS copy
5760          * shader only becomes visible globally after it has been compiled. */
5761         util_queue_fence_init(&shader->ready);
5762
5763         shader->selector = gs_selector;
5764         shader->is_gs_copy_shader = true;
5765
5766         si_init_shader_ctx(&ctx, sscreen, compiler);
5767         ctx.shader = shader;
5768         ctx.type = PIPE_SHADER_VERTEX;
5769
5770         builder = ctx.ac.builder;
5771
5772         create_function(&ctx);
5773         preload_ring_buffers(&ctx);
5774
5775         LLVMValueRef voffset =
5776                 lp_build_mul_imm(uint, ctx.abi.vertex_id, 4);
5777
5778         /* Fetch the vertex stream ID.*/
5779         LLVMValueRef stream_id;
5780
5781         if (gs_selector->so.num_outputs)
5782                 stream_id = si_unpack_param(&ctx, ctx.param_streamout_config, 24, 2);
5783         else
5784                 stream_id = ctx.i32_0;
5785
5786         /* Fill in output information. */
5787         for (i = 0; i < gsinfo->num_outputs; ++i) {
5788                 outputs[i].semantic_name = gsinfo->output_semantic_name[i];
5789                 outputs[i].semantic_index = gsinfo->output_semantic_index[i];
5790
5791                 for (int chan = 0; chan < 4; chan++) {
5792                         outputs[i].vertex_stream[chan] =
5793                                 (gsinfo->output_streams[i] >> (2 * chan)) & 3;
5794                 }
5795         }
5796
5797         LLVMBasicBlockRef end_bb;
5798         LLVMValueRef switch_inst;
5799
5800         end_bb = LLVMAppendBasicBlockInContext(ctx.ac.context, ctx.main_fn, "end");
5801         switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4);
5802
5803         for (int stream = 0; stream < 4; stream++) {
5804                 LLVMBasicBlockRef bb;
5805                 unsigned offset;
5806
5807                 if (!gsinfo->num_stream_output_components[stream])
5808                         continue;
5809
5810                 if (stream > 0 && !gs_selector->so.num_outputs)
5811                         continue;
5812
5813                 bb = LLVMInsertBasicBlockInContext(ctx.ac.context, end_bb, "out");
5814                 LLVMAddCase(switch_inst, LLVMConstInt(ctx.i32, stream, 0), bb);
5815                 LLVMPositionBuilderAtEnd(builder, bb);
5816
5817                 /* Fetch vertex data from GSVS ring */
5818                 offset = 0;
5819                 for (i = 0; i < gsinfo->num_outputs; ++i) {
5820                         for (unsigned chan = 0; chan < 4; chan++) {
5821                                 if (!(gsinfo->output_usagemask[i] & (1 << chan)) ||
5822                                     outputs[i].vertex_stream[chan] != stream) {
5823                                         outputs[i].values[chan] = ctx.bld_base.base.undef;
5824                                         continue;
5825                                 }
5826
5827                                 LLVMValueRef soffset = LLVMConstInt(ctx.i32,
5828                                         offset * gs_selector->gs_max_out_vertices * 16 * 4, 0);
5829                                 offset++;
5830
5831                                 outputs[i].values[chan] =
5832                                         ac_build_buffer_load(&ctx.ac,
5833                                                              ctx.gsvs_ring[0], 1,
5834                                                              ctx.i32_0, voffset,
5835                                                              soffset, 0, 1, 1,
5836                                                              true, false);
5837                         }
5838                 }
5839
5840                 /* Streamout and exports. */
5841                 if (gs_selector->so.num_outputs) {
5842                         si_llvm_emit_streamout(&ctx, outputs,
5843                                                gsinfo->num_outputs,
5844                                                stream);
5845                 }
5846
5847                 if (stream == 0)
5848                         si_llvm_export_vs(&ctx, outputs, gsinfo->num_outputs);
5849
5850                 LLVMBuildBr(builder, end_bb);
5851         }
5852
5853         LLVMPositionBuilderAtEnd(builder, end_bb);
5854
5855         LLVMBuildRetVoid(ctx.ac.builder);
5856
5857         ctx.type = PIPE_SHADER_GEOMETRY; /* override for shader dumping */
5858         si_llvm_optimize_module(&ctx);
5859
5860         r = si_compile_llvm(sscreen, &ctx.shader->binary,
5861                             &ctx.shader->config, ctx.compiler,
5862                             ctx.gallivm.module,
5863                             debug, PIPE_SHADER_GEOMETRY,
5864                             "GS Copy Shader");
5865         if (!r) {
5866                 if (si_can_dump_shader(sscreen, PIPE_SHADER_GEOMETRY))
5867                         fprintf(stderr, "GS Copy Shader:\n");
5868                 si_shader_dump(sscreen, ctx.shader, debug,
5869                                PIPE_SHADER_GEOMETRY, stderr, true);
5870                 r = si_shader_binary_upload(sscreen, ctx.shader);
5871         }
5872
5873         si_llvm_dispose(&ctx);
5874
5875         FREE(outputs);
5876
5877         if (r != 0) {
5878                 FREE(shader);
5879                 shader = NULL;
5880         }
5881         return shader;
5882 }
5883
5884 static void si_dump_shader_key_vs(const struct si_shader_key *key,
5885                                   const struct si_vs_prolog_bits *prolog,
5886                                   const char *prefix, FILE *f)
5887 {
5888         fprintf(f, "  %s.instance_divisor_is_one = %u\n",
5889                 prefix, prolog->instance_divisor_is_one);
5890         fprintf(f, "  %s.instance_divisor_is_fetched = %u\n",
5891                 prefix, prolog->instance_divisor_is_fetched);
5892         fprintf(f, "  %s.ls_vgpr_fix = %u\n",
5893                 prefix, prolog->ls_vgpr_fix);
5894
5895         fprintf(f, "  mono.vs.fix_fetch = {");
5896         for (int i = 0; i < SI_MAX_ATTRIBS; i++)
5897                 fprintf(f, !i ? "%u" : ", %u", key->mono.vs_fix_fetch[i]);
5898         fprintf(f, "}\n");
5899 }
5900
5901 static void si_dump_shader_key(unsigned processor, const struct si_shader *shader,
5902                                FILE *f)
5903 {
5904         const struct si_shader_key *key = &shader->key;
5905
5906         fprintf(f, "SHADER KEY\n");
5907
5908         switch (processor) {
5909         case PIPE_SHADER_VERTEX:
5910                 si_dump_shader_key_vs(key, &key->part.vs.prolog,
5911                                       "part.vs.prolog", f);
5912                 fprintf(f, "  as_es = %u\n", key->as_es);
5913                 fprintf(f, "  as_ls = %u\n", key->as_ls);
5914                 fprintf(f, "  mono.u.vs_export_prim_id = %u\n",
5915                         key->mono.u.vs_export_prim_id);
5916                 break;
5917
5918         case PIPE_SHADER_TESS_CTRL:
5919                 if (shader->selector->screen->info.chip_class >= GFX9) {
5920                         si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog,
5921                                               "part.tcs.ls_prolog", f);
5922                 }
5923                 fprintf(f, "  part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode);
5924                 fprintf(f, "  mono.u.ff_tcs_inputs_to_copy = 0x%"PRIx64"\n", key->mono.u.ff_tcs_inputs_to_copy);
5925                 break;
5926
5927         case PIPE_SHADER_TESS_EVAL:
5928                 fprintf(f, "  as_es = %u\n", key->as_es);
5929                 fprintf(f, "  mono.u.vs_export_prim_id = %u\n",
5930                         key->mono.u.vs_export_prim_id);
5931                 break;
5932
5933         case PIPE_SHADER_GEOMETRY:
5934                 if (shader->is_gs_copy_shader)
5935                         break;
5936
5937                 if (shader->selector->screen->info.chip_class >= GFX9 &&
5938                     key->part.gs.es->type == PIPE_SHADER_VERTEX) {
5939                         si_dump_shader_key_vs(key, &key->part.gs.vs_prolog,
5940                                               "part.gs.vs_prolog", f);
5941                 }
5942                 fprintf(f, "  part.gs.prolog.tri_strip_adj_fix = %u\n", key->part.gs.prolog.tri_strip_adj_fix);
5943                 break;
5944
5945         case PIPE_SHADER_COMPUTE:
5946                 break;
5947
5948         case PIPE_SHADER_FRAGMENT:
5949                 fprintf(f, "  part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side);
5950                 fprintf(f, "  part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors);
5951                 fprintf(f, "  part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple);
5952                 fprintf(f, "  part.ps.prolog.force_persp_sample_interp = %u\n", key->part.ps.prolog.force_persp_sample_interp);
5953                 fprintf(f, "  part.ps.prolog.force_linear_sample_interp = %u\n", key->part.ps.prolog.force_linear_sample_interp);
5954                 fprintf(f, "  part.ps.prolog.force_persp_center_interp = %u\n", key->part.ps.prolog.force_persp_center_interp);
5955                 fprintf(f, "  part.ps.prolog.force_linear_center_interp = %u\n", key->part.ps.prolog.force_linear_center_interp);
5956                 fprintf(f, "  part.ps.prolog.bc_optimize_for_persp = %u\n", key->part.ps.prolog.bc_optimize_for_persp);
5957                 fprintf(f, "  part.ps.prolog.bc_optimize_for_linear = %u\n", key->part.ps.prolog.bc_optimize_for_linear);
5958                 fprintf(f, "  part.ps.epilog.spi_shader_col_format = 0x%x\n", key->part.ps.epilog.spi_shader_col_format);
5959                 fprintf(f, "  part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8);
5960                 fprintf(f, "  part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10);
5961                 fprintf(f, "  part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf);
5962                 fprintf(f, "  part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func);
5963                 fprintf(f, "  part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one);
5964                 fprintf(f, "  part.ps.epilog.poly_line_smoothing = %u\n", key->part.ps.epilog.poly_line_smoothing);
5965                 fprintf(f, "  part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color);
5966                 break;
5967
5968         default:
5969                 assert(0);
5970         }
5971
5972         if ((processor == PIPE_SHADER_GEOMETRY ||
5973              processor == PIPE_SHADER_TESS_EVAL ||
5974              processor == PIPE_SHADER_VERTEX) &&
5975             !key->as_es && !key->as_ls) {
5976                 fprintf(f, "  opt.kill_outputs = 0x%"PRIx64"\n", key->opt.kill_outputs);
5977                 fprintf(f, "  opt.clip_disable = %u\n", key->opt.clip_disable);
5978         }
5979 }
5980
5981 static void si_init_shader_ctx(struct si_shader_context *ctx,
5982                                struct si_screen *sscreen,
5983                                struct si_compiler *compiler)
5984 {
5985         struct lp_build_tgsi_context *bld_base;
5986
5987         si_llvm_context_init(ctx, sscreen, compiler);
5988
5989         bld_base = &ctx->bld_base;
5990         bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;
5991
5992         bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action;
5993         bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE] = interp_action;
5994         bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET] = interp_action;
5995
5996         bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit;
5997
5998         bld_base->op_actions[TGSI_OPCODE_CLOCK].emit = clock_emit;
5999
6000         bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy;
6001         bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy;
6002         bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy;
6003         bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy;
6004
6005         bld_base->op_actions[TGSI_OPCODE_VOTE_ALL].emit = vote_all_emit;
6006         bld_base->op_actions[TGSI_OPCODE_VOTE_ANY].emit = vote_any_emit;
6007         bld_base->op_actions[TGSI_OPCODE_VOTE_EQ].emit = vote_eq_emit;
6008         bld_base->op_actions[TGSI_OPCODE_BALLOT].emit = ballot_emit;
6009         bld_base->op_actions[TGSI_OPCODE_READ_FIRST].intr_name = "llvm.amdgcn.readfirstlane";
6010         bld_base->op_actions[TGSI_OPCODE_READ_FIRST].emit = read_lane_emit;
6011         bld_base->op_actions[TGSI_OPCODE_READ_INVOC].intr_name = "llvm.amdgcn.readlane";
6012         bld_base->op_actions[TGSI_OPCODE_READ_INVOC].fetch_args = read_invoc_fetch_args;
6013         bld_base->op_actions[TGSI_OPCODE_READ_INVOC].emit = read_lane_emit;
6014
6015         bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_tgsi_emit_vertex;
6016         bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_tgsi_emit_primitive;
6017         bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier;
6018 }
6019
6020 static void si_optimize_vs_outputs(struct si_shader_context *ctx)
6021 {
6022         struct si_shader *shader = ctx->shader;
6023         struct tgsi_shader_info *info = &shader->selector->info;
6024
6025         if ((ctx->type != PIPE_SHADER_VERTEX &&
6026              ctx->type != PIPE_SHADER_TESS_EVAL) ||
6027             shader->key.as_ls ||
6028             shader->key.as_es)
6029                 return;
6030
6031         ac_optimize_vs_outputs(&ctx->ac,
6032                                ctx->main_fn,
6033                                shader->info.vs_output_param_offset,
6034                                info->num_outputs,
6035                                &shader->info.nr_param_exports);
6036 }
6037
6038 static void si_init_exec_from_input(struct si_shader_context *ctx,
6039                                     unsigned param, unsigned bitoffset)
6040 {
6041         LLVMValueRef args[] = {
6042                 LLVMGetParam(ctx->main_fn, param),
6043                 LLVMConstInt(ctx->i32, bitoffset, 0),
6044         };
6045         lp_build_intrinsic(ctx->ac.builder,
6046                            "llvm.amdgcn.init.exec.from.input",
6047                            ctx->voidt, args, 2, LP_FUNC_ATTR_CONVERGENT);
6048 }
6049
6050 static bool si_vs_needs_prolog(const struct si_shader_selector *sel,
6051                                const struct si_vs_prolog_bits *key)
6052 {
6053         /* VGPR initialization fixup for Vega10 and Raven is always done in the
6054          * VS prolog. */
6055         return sel->vs_needs_prolog || key->ls_vgpr_fix;
6056 }
6057
6058 static bool si_compile_tgsi_main(struct si_shader_context *ctx,
6059                                  bool is_monolithic)
6060 {
6061         struct si_shader *shader = ctx->shader;
6062         struct si_shader_selector *sel = shader->selector;
6063         struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
6064
6065         // TODO clean all this up!
6066         switch (ctx->type) {
6067         case PIPE_SHADER_VERTEX:
6068                 ctx->load_input = declare_input_vs;
6069                 if (shader->key.as_ls)
6070                         ctx->abi.emit_outputs = si_llvm_emit_ls_epilogue;
6071                 else if (shader->key.as_es)
6072                         ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
6073                 else
6074                         ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
6075                 bld_base->emit_epilogue = si_tgsi_emit_epilogue;
6076                 ctx->abi.load_base_vertex = get_base_vertex;
6077                 break;
6078         case PIPE_SHADER_TESS_CTRL:
6079                 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs;
6080                 ctx->abi.load_tess_varyings = si_nir_load_tcs_varyings;
6081                 bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = fetch_output_tcs;
6082                 bld_base->emit_store = store_output_tcs;
6083                 ctx->abi.store_tcs_outputs = si_nir_store_output_tcs;
6084                 ctx->abi.emit_outputs = si_llvm_emit_tcs_epilogue;
6085                 ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in;
6086                 bld_base->emit_epilogue = si_tgsi_emit_epilogue;
6087                 break;
6088         case PIPE_SHADER_TESS_EVAL:
6089                 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes;
6090                 ctx->abi.load_tess_varyings = si_nir_load_input_tes;
6091                 ctx->abi.load_tess_coord = si_load_tess_coord;
6092                 ctx->abi.load_tess_level = si_load_tess_level;
6093                 ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in;
6094                 if (shader->key.as_es)
6095                         ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
6096                 else
6097                         ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
6098                 bld_base->emit_epilogue = si_tgsi_emit_epilogue;
6099                 break;
6100         case PIPE_SHADER_GEOMETRY:
6101                 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs;
6102                 ctx->abi.load_inputs = si_nir_load_input_gs;
6103                 ctx->abi.emit_vertex = si_llvm_emit_vertex;
6104                 ctx->abi.emit_primitive = si_llvm_emit_primitive;
6105                 ctx->abi.emit_outputs = si_llvm_emit_gs_epilogue;
6106                 bld_base->emit_epilogue = si_tgsi_emit_gs_epilogue;
6107                 break;
6108         case PIPE_SHADER_FRAGMENT:
6109                 ctx->load_input = declare_input_fs;
6110                 ctx->abi.emit_outputs = si_llvm_return_fs_outputs;
6111                 bld_base->emit_epilogue = si_tgsi_emit_epilogue;
6112                 ctx->abi.lookup_interp_param = si_nir_lookup_interp_param;
6113                 ctx->abi.load_sample_position = load_sample_position;
6114                 ctx->abi.load_sample_mask_in = load_sample_mask_in;
6115                 ctx->abi.emit_kill = si_llvm_emit_kill;
6116                 break;
6117         case PIPE_SHADER_COMPUTE:
6118                 ctx->abi.load_local_group_size = get_block_size;
6119                 break;
6120         default:
6121                 assert(!"Unsupported shader type");
6122                 return false;
6123         }
6124
6125         ctx->abi.load_ubo = load_ubo;
6126         ctx->abi.load_ssbo = load_ssbo;
6127
6128         create_function(ctx);
6129         preload_ring_buffers(ctx);
6130
6131         /* For GFX9 merged shaders:
6132          * - Set EXEC for the first shader. If the prolog is present, set
6133          *   EXEC there instead.
6134          * - Add a barrier before the second shader.
6135          * - In the second shader, reset EXEC to ~0 and wrap the main part in
6136          *   an if-statement. This is required for correctness in geometry
6137          *   shaders, to ensure that empty GS waves do not send GS_EMIT and
6138          *   GS_CUT messages.
6139          *
6140          * For monolithic merged shaders, the first shader is wrapped in an
6141          * if-block together with its prolog in si_build_wrapper_function.
6142          */
6143         if (ctx->screen->info.chip_class >= GFX9) {
6144                 if (!is_monolithic &&
6145                     sel->info.num_instructions > 1 && /* not empty shader */
6146                     (shader->key.as_es || shader->key.as_ls) &&
6147                     (ctx->type == PIPE_SHADER_TESS_EVAL ||
6148                      (ctx->type == PIPE_SHADER_VERTEX &&
6149                       !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog)))) {
6150                         si_init_exec_from_input(ctx,
6151                                                 ctx->param_merged_wave_info, 0);
6152                 } else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
6153                            ctx->type == PIPE_SHADER_GEOMETRY) {
6154                         if (!is_monolithic)
6155                                 ac_init_exec_full_mask(&ctx->ac);
6156
6157                         /* The barrier must execute for all shaders in a
6158                          * threadgroup.
6159                          */
6160                         si_llvm_emit_barrier(NULL, bld_base, NULL);
6161
6162                         LLVMValueRef num_threads = si_unpack_param(ctx, ctx->param_merged_wave_info, 8, 8);
6163                         LLVMValueRef ena =
6164                                 LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
6165                                             ac_get_thread_id(&ctx->ac), num_threads, "");
6166                         lp_build_if(&ctx->merged_wrap_if_state, &ctx->gallivm, ena);
6167                 }
6168         }
6169
6170         if (ctx->type == PIPE_SHADER_TESS_CTRL &&
6171             sel->tcs_info.tessfactors_are_def_in_all_invocs) {
6172                 for (unsigned i = 0; i < 6; i++) {
6173                         ctx->invoc0_tess_factors[i] =
6174                                 lp_build_alloca_undef(&ctx->gallivm, ctx->i32, "");
6175                 }
6176         }
6177
6178         if (ctx->type == PIPE_SHADER_GEOMETRY) {
6179                 int i;
6180                 for (i = 0; i < 4; i++) {
6181                         ctx->gs_next_vertex[i] =
6182                                 lp_build_alloca(&ctx->gallivm,
6183                                                 ctx->i32, "");
6184                 }
6185         }
6186
6187         if (sel->force_correct_derivs_after_kill) {
6188                 ctx->postponed_kill = lp_build_alloca_undef(&ctx->gallivm, ctx->i1, "");
6189                 /* true = don't kill. */
6190                 LLVMBuildStore(ctx->ac.builder, LLVMConstInt(ctx->i1, 1, 0),
6191                                ctx->postponed_kill);
6192         }
6193
6194         if (sel->tokens) {
6195                 if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) {
6196                         fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
6197                         return false;
6198                 }
6199         } else {
6200                 if (!si_nir_build_llvm(ctx, sel->nir)) {
6201                         fprintf(stderr, "Failed to translate shader from NIR to LLVM\n");
6202                         return false;
6203                 }
6204         }
6205
6206         si_llvm_build_ret(ctx, ctx->return_value);
6207         return true;
6208 }
6209
6210 /**
6211  * Compute the VS prolog key, which contains all the information needed to
6212  * build the VS prolog function, and set shader->info bits where needed.
6213  *
6214  * \param info             Shader info of the vertex shader.
6215  * \param num_input_sgprs  Number of input SGPRs for the vertex shader.
6216  * \param prolog_key       Key of the VS prolog
6217  * \param shader_out       The vertex shader, or the next shader if merging LS+HS or ES+GS.
6218  * \param key              Output shader part key.
6219  */
6220 static void si_get_vs_prolog_key(const struct tgsi_shader_info *info,
6221                                  unsigned num_input_sgprs,
6222                                  const struct si_vs_prolog_bits *prolog_key,
6223                                  struct si_shader *shader_out,
6224                                  union si_shader_part_key *key)
6225 {
6226         memset(key, 0, sizeof(*key));
6227         key->vs_prolog.states = *prolog_key;
6228         key->vs_prolog.num_input_sgprs = num_input_sgprs;
6229         key->vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
6230         key->vs_prolog.as_ls = shader_out->key.as_ls;
6231         key->vs_prolog.as_es = shader_out->key.as_es;
6232
6233         if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) {
6234                 key->vs_prolog.as_ls = 1;
6235                 key->vs_prolog.num_merged_next_stage_vgprs = 2;
6236         } else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY) {
6237                 key->vs_prolog.as_es = 1;
6238                 key->vs_prolog.num_merged_next_stage_vgprs = 5;
6239         }
6240
6241         /* Enable loading the InstanceID VGPR. */
6242         uint16_t input_mask = u_bit_consecutive(0, info->num_inputs);
6243
6244         if ((key->vs_prolog.states.instance_divisor_is_one |
6245              key->vs_prolog.states.instance_divisor_is_fetched) & input_mask)
6246                 shader_out->info.uses_instanceid = true;
6247 }
6248
6249 /**
6250  * Compute the PS prolog key, which contains all the information needed to
6251  * build the PS prolog function, and set related bits in shader->config.
6252  */
6253 static void si_get_ps_prolog_key(struct si_shader *shader,
6254                                  union si_shader_part_key *key,
6255                                  bool separate_prolog)
6256 {
6257         struct tgsi_shader_info *info = &shader->selector->info;
6258
6259         memset(key, 0, sizeof(*key));
6260         key->ps_prolog.states = shader->key.part.ps.prolog;
6261         key->ps_prolog.colors_read = info->colors_read;
6262         key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
6263         key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
6264         key->ps_prolog.wqm = info->uses_derivatives &&
6265                 (key->ps_prolog.colors_read ||
6266                  key->ps_prolog.states.force_persp_sample_interp ||
6267                  key->ps_prolog.states.force_linear_sample_interp ||
6268                  key->ps_prolog.states.force_persp_center_interp ||
6269                  key->ps_prolog.states.force_linear_center_interp ||
6270                  key->ps_prolog.states.bc_optimize_for_persp ||
6271                  key->ps_prolog.states.bc_optimize_for_linear);
6272         key->ps_prolog.ancillary_vgpr_index = shader->info.ancillary_vgpr_index;
6273
6274         if (info->colors_read) {
6275                 unsigned *color = shader->selector->color_attr_index;
6276
6277                 if (shader->key.part.ps.prolog.color_two_side) {
6278                         /* BCOLORs are stored after the last input. */
6279                         key->ps_prolog.num_interp_inputs = info->num_inputs;
6280                         key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
6281                         shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
6282                 }
6283
6284                 for (unsigned i = 0; i < 2; i++) {
6285                         unsigned interp = info->input_interpolate[color[i]];
6286                         unsigned location = info->input_interpolate_loc[color[i]];
6287
6288                         if (!(info->colors_read & (0xf << i*4)))
6289                                 continue;
6290
6291                         key->ps_prolog.color_attr_index[i] = color[i];
6292
6293                         if (shader->key.part.ps.prolog.flatshade_colors &&
6294                             interp == TGSI_INTERPOLATE_COLOR)
6295                                 interp = TGSI_INTERPOLATE_CONSTANT;
6296
6297                         switch (interp) {
6298                         case TGSI_INTERPOLATE_CONSTANT:
6299                                 key->ps_prolog.color_interp_vgpr_index[i] = -1;
6300                                 break;
6301                         case TGSI_INTERPOLATE_PERSPECTIVE:
6302                         case TGSI_INTERPOLATE_COLOR:
6303                                 /* Force the interpolation location for colors here. */
6304                                 if (shader->key.part.ps.prolog.force_persp_sample_interp)
6305                                         location = TGSI_INTERPOLATE_LOC_SAMPLE;
6306                                 if (shader->key.part.ps.prolog.force_persp_center_interp)
6307                                         location = TGSI_INTERPOLATE_LOC_CENTER;
6308
6309                                 switch (location) {
6310                                 case TGSI_INTERPOLATE_LOC_SAMPLE:
6311                                         key->ps_prolog.color_interp_vgpr_index[i] = 0;
6312                                         shader->config.spi_ps_input_ena |=
6313                                                 S_0286CC_PERSP_SAMPLE_ENA(1);
6314                                         break;
6315                                 case TGSI_INTERPOLATE_LOC_CENTER:
6316                                         key->ps_prolog.color_interp_vgpr_index[i] = 2;
6317                                         shader->config.spi_ps_input_ena |=
6318                                                 S_0286CC_PERSP_CENTER_ENA(1);
6319                                         break;
6320                                 case TGSI_INTERPOLATE_LOC_CENTROID:
6321                                         key->ps_prolog.color_interp_vgpr_index[i] = 4;
6322                                         shader->config.spi_ps_input_ena |=
6323                                                 S_0286CC_PERSP_CENTROID_ENA(1);
6324                                         break;
6325                                 default:
6326                                         assert(0);
6327                                 }
6328                                 break;
6329                         case TGSI_INTERPOLATE_LINEAR:
6330                                 /* Force the interpolation location for colors here. */
6331                                 if (shader->key.part.ps.prolog.force_linear_sample_interp)
6332                                         location = TGSI_INTERPOLATE_LOC_SAMPLE;
6333                                 if (shader->key.part.ps.prolog.force_linear_center_interp)
6334                                         location = TGSI_INTERPOLATE_LOC_CENTER;
6335
6336                                 /* The VGPR assignment for non-monolithic shaders
6337                                  * works because InitialPSInputAddr is set on the
6338                                  * main shader and PERSP_PULL_MODEL is never used.
6339                                  */
6340                                 switch (location) {
6341                                 case TGSI_INTERPOLATE_LOC_SAMPLE:
6342                                         key->ps_prolog.color_interp_vgpr_index[i] =
6343                                                 separate_prolog ? 6 : 9;
6344                                         shader->config.spi_ps_input_ena |=
6345                                                 S_0286CC_LINEAR_SAMPLE_ENA(1);
6346                                         break;
6347                                 case TGSI_INTERPOLATE_LOC_CENTER:
6348                                         key->ps_prolog.color_interp_vgpr_index[i] =
6349                                                 separate_prolog ? 8 : 11;
6350                                         shader->config.spi_ps_input_ena |=
6351                                                 S_0286CC_LINEAR_CENTER_ENA(1);
6352                                         break;
6353                                 case TGSI_INTERPOLATE_LOC_CENTROID:
6354                                         key->ps_prolog.color_interp_vgpr_index[i] =
6355                                                 separate_prolog ? 10 : 13;
6356                                         shader->config.spi_ps_input_ena |=
6357                                                 S_0286CC_LINEAR_CENTROID_ENA(1);
6358                                         break;
6359                                 default:
6360                                         assert(0);
6361                                 }
6362                                 break;
6363                         default:
6364                                 assert(0);
6365                         }
6366                 }
6367         }
6368 }
6369
6370 /**
6371  * Check whether a PS prolog is required based on the key.
6372  */
6373 static bool si_need_ps_prolog(const union si_shader_part_key *key)
6374 {
6375         return key->ps_prolog.colors_read ||
6376                key->ps_prolog.states.force_persp_sample_interp ||
6377                key->ps_prolog.states.force_linear_sample_interp ||
6378                key->ps_prolog.states.force_persp_center_interp ||
6379                key->ps_prolog.states.force_linear_center_interp ||
6380                key->ps_prolog.states.bc_optimize_for_persp ||
6381                key->ps_prolog.states.bc_optimize_for_linear ||
6382                key->ps_prolog.states.poly_stipple ||
6383                key->ps_prolog.states.samplemask_log_ps_iter;
6384 }
6385
6386 /**
6387  * Compute the PS epilog key, which contains all the information needed to
6388  * build the PS epilog function.
6389  */
6390 static void si_get_ps_epilog_key(struct si_shader *shader,
6391                                  union si_shader_part_key *key)
6392 {
6393         struct tgsi_shader_info *info = &shader->selector->info;
6394         memset(key, 0, sizeof(*key));
6395         key->ps_epilog.colors_written = info->colors_written;
6396         key->ps_epilog.writes_z = info->writes_z;
6397         key->ps_epilog.writes_stencil = info->writes_stencil;
6398         key->ps_epilog.writes_samplemask = info->writes_samplemask;
6399         key->ps_epilog.states = shader->key.part.ps.epilog;
6400 }
6401
6402 /**
6403  * Build the GS prolog function. Rotate the input vertices for triangle strips
6404  * with adjacency.
6405  */
6406 static void si_build_gs_prolog_function(struct si_shader_context *ctx,
6407                                         union si_shader_part_key *key)
6408 {
6409         unsigned num_sgprs, num_vgprs;
6410         struct si_function_info fninfo;
6411         LLVMBuilderRef builder = ctx->ac.builder;
6412         LLVMTypeRef returns[48];
6413         LLVMValueRef func, ret;
6414
6415         si_init_function_info(&fninfo);
6416
6417         if (ctx->screen->info.chip_class >= GFX9) {
6418                 if (key->gs_prolog.states.gfx9_prev_is_vs)
6419                         num_sgprs = 8 + GFX9_VSGS_NUM_USER_SGPR;
6420                 else
6421                         num_sgprs = 8 + GFX9_TESGS_NUM_USER_SGPR;
6422                 num_vgprs = 5; /* ES inputs are not needed by GS */
6423         } else {
6424                 num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
6425                 num_vgprs = 8;
6426         }
6427
6428         for (unsigned i = 0; i < num_sgprs; ++i) {
6429                 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6430                 returns[i] = ctx->i32;
6431         }
6432
6433         for (unsigned i = 0; i < num_vgprs; ++i) {
6434                 add_arg(&fninfo, ARG_VGPR, ctx->i32);
6435                 returns[num_sgprs + i] = ctx->f32;
6436         }
6437
6438         /* Create the function. */
6439         si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs,
6440                            &fninfo, 0);
6441         func = ctx->main_fn;
6442
6443         /* Set the full EXEC mask for the prolog, because we are only fiddling
6444          * with registers here. The main shader part will set the correct EXEC
6445          * mask.
6446          */
6447         if (ctx->screen->info.chip_class >= GFX9 && !key->gs_prolog.is_monolithic)
6448                 ac_init_exec_full_mask(&ctx->ac);
6449
6450         /* Copy inputs to outputs. This should be no-op, as the registers match,
6451          * but it will prevent the compiler from overwriting them unintentionally.
6452          */
6453         ret = ctx->return_value;
6454         for (unsigned i = 0; i < num_sgprs; i++) {
6455                 LLVMValueRef p = LLVMGetParam(func, i);
6456                 ret = LLVMBuildInsertValue(builder, ret, p, i, "");
6457         }
6458         for (unsigned i = 0; i < num_vgprs; i++) {
6459                 LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
6460                 p = ac_to_float(&ctx->ac, p);
6461                 ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, "");
6462         }
6463
6464         if (key->gs_prolog.states.tri_strip_adj_fix) {
6465                 /* Remap the input vertices for every other primitive. */
6466                 const unsigned gfx6_vtx_params[6] = {
6467                         num_sgprs,
6468                         num_sgprs + 1,
6469                         num_sgprs + 3,
6470                         num_sgprs + 4,
6471                         num_sgprs + 5,
6472                         num_sgprs + 6
6473                 };
6474                 const unsigned gfx9_vtx_params[3] = {
6475                         num_sgprs,
6476                         num_sgprs + 1,
6477                         num_sgprs + 4,
6478                 };
6479                 LLVMValueRef vtx_in[6], vtx_out[6];
6480                 LLVMValueRef prim_id, rotate;
6481
6482                 if (ctx->screen->info.chip_class >= GFX9) {
6483                         for (unsigned i = 0; i < 3; i++) {
6484                                 vtx_in[i*2] = si_unpack_param(ctx, gfx9_vtx_params[i], 0, 16);
6485                                 vtx_in[i*2+1] = si_unpack_param(ctx, gfx9_vtx_params[i], 16, 16);
6486                         }
6487                 } else {
6488                         for (unsigned i = 0; i < 6; i++)
6489                                 vtx_in[i] = LLVMGetParam(func, gfx6_vtx_params[i]);
6490                 }
6491
6492                 prim_id = LLVMGetParam(func, num_sgprs + 2);
6493                 rotate = LLVMBuildTrunc(builder, prim_id, ctx->i1, "");
6494
6495                 for (unsigned i = 0; i < 6; ++i) {
6496                         LLVMValueRef base, rotated;
6497                         base = vtx_in[i];
6498                         rotated = vtx_in[(i + 4) % 6];
6499                         vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, "");
6500                 }
6501
6502                 if (ctx->screen->info.chip_class >= GFX9) {
6503                         for (unsigned i = 0; i < 3; i++) {
6504                                 LLVMValueRef hi, out;
6505
6506                                 hi = LLVMBuildShl(builder, vtx_out[i*2+1],
6507                                                   LLVMConstInt(ctx->i32, 16, 0), "");
6508                                 out = LLVMBuildOr(builder, vtx_out[i*2], hi, "");
6509                                 out = ac_to_float(&ctx->ac, out);
6510                                 ret = LLVMBuildInsertValue(builder, ret, out,
6511                                                            gfx9_vtx_params[i], "");
6512                         }
6513                 } else {
6514                         for (unsigned i = 0; i < 6; i++) {
6515                                 LLVMValueRef out;
6516
6517                                 out = ac_to_float(&ctx->ac, vtx_out[i]);
6518                                 ret = LLVMBuildInsertValue(builder, ret, out,
6519                                                            gfx6_vtx_params[i], "");
6520                         }
6521                 }
6522         }
6523
6524         LLVMBuildRet(builder, ret);
6525 }
6526
6527 /**
6528  * Given a list of shader part functions, build a wrapper function that
6529  * runs them in sequence to form a monolithic shader.
6530  */
6531 static void si_build_wrapper_function(struct si_shader_context *ctx,
6532                                       LLVMValueRef *parts,
6533                                       unsigned num_parts,
6534                                       unsigned main_part,
6535                                       unsigned next_shader_first_part)
6536 {
6537         LLVMBuilderRef builder = ctx->ac.builder;
6538         /* PS epilog has one arg per color component; gfx9 merged shader
6539          * prologs need to forward 32 user SGPRs.
6540          */
6541         struct si_function_info fninfo;
6542         LLVMValueRef initial[64], out[64];
6543         LLVMTypeRef function_type;
6544         unsigned num_first_params;
6545         unsigned num_out, initial_num_out;
6546         MAYBE_UNUSED unsigned num_out_sgpr; /* used in debug checks */
6547         MAYBE_UNUSED unsigned initial_num_out_sgpr; /* used in debug checks */
6548         unsigned num_sgprs, num_vgprs;
6549         unsigned gprs;
6550         struct lp_build_if_state if_state;
6551
6552         si_init_function_info(&fninfo);
6553
6554         for (unsigned i = 0; i < num_parts; ++i) {
6555                 lp_add_function_attr(parts[i], -1, LP_FUNC_ATTR_ALWAYSINLINE);
6556                 LLVMSetLinkage(parts[i], LLVMPrivateLinkage);
6557         }
6558
6559         /* The parameters of the wrapper function correspond to those of the
6560          * first part in terms of SGPRs and VGPRs, but we use the types of the
6561          * main part to get the right types. This is relevant for the
6562          * dereferenceable attribute on descriptor table pointers.
6563          */
6564         num_sgprs = 0;
6565         num_vgprs = 0;
6566
6567         function_type = LLVMGetElementType(LLVMTypeOf(parts[0]));
6568         num_first_params = LLVMCountParamTypes(function_type);
6569
6570         for (unsigned i = 0; i < num_first_params; ++i) {
6571                 LLVMValueRef param = LLVMGetParam(parts[0], i);
6572
6573                 if (ac_is_sgpr_param(param)) {
6574                         assert(num_vgprs == 0);
6575                         num_sgprs += ac_get_type_size(LLVMTypeOf(param)) / 4;
6576                 } else {
6577                         num_vgprs += ac_get_type_size(LLVMTypeOf(param)) / 4;
6578                 }
6579         }
6580
6581         gprs = 0;
6582         while (gprs < num_sgprs + num_vgprs) {
6583                 LLVMValueRef param = LLVMGetParam(parts[main_part], fninfo.num_params);
6584                 LLVMTypeRef type = LLVMTypeOf(param);
6585                 unsigned size = ac_get_type_size(type) / 4;
6586
6587                 add_arg(&fninfo, gprs < num_sgprs ? ARG_SGPR : ARG_VGPR, type);
6588
6589                 assert(ac_is_sgpr_param(param) == (gprs < num_sgprs));
6590                 assert(gprs + size <= num_sgprs + num_vgprs &&
6591                        (gprs >= num_sgprs || gprs + size <= num_sgprs));
6592
6593                 gprs += size;
6594         }
6595
6596         si_create_function(ctx, "wrapper", NULL, 0, &fninfo,
6597                            si_get_max_workgroup_size(ctx->shader));
6598
6599         if (is_merged_shader(ctx->shader))
6600                 ac_init_exec_full_mask(&ctx->ac);
6601
6602         /* Record the arguments of the function as if they were an output of
6603          * a previous part.
6604          */
6605         num_out = 0;
6606         num_out_sgpr = 0;
6607
6608         for (unsigned i = 0; i < fninfo.num_params; ++i) {
6609                 LLVMValueRef param = LLVMGetParam(ctx->main_fn, i);
6610                 LLVMTypeRef param_type = LLVMTypeOf(param);
6611                 LLVMTypeRef out_type = i < fninfo.num_sgpr_params ? ctx->i32 : ctx->f32;
6612                 unsigned size = ac_get_type_size(param_type) / 4;
6613
6614                 if (size == 1) {
6615                         if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
6616                                 param = LLVMBuildPtrToInt(builder, param, ctx->i32, "");
6617                                 param_type = ctx->i32;
6618                         }
6619
6620                         if (param_type != out_type)
6621                                 param = LLVMBuildBitCast(builder, param, out_type, "");
6622                         out[num_out++] = param;
6623                 } else {
6624                         LLVMTypeRef vector_type = LLVMVectorType(out_type, size);
6625
6626                         if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
6627                                 param = LLVMBuildPtrToInt(builder, param, ctx->i64, "");
6628                                 param_type = ctx->i64;
6629                         }
6630
6631                         if (param_type != vector_type)
6632                                 param = LLVMBuildBitCast(builder, param, vector_type, "");
6633
6634                         for (unsigned j = 0; j < size; ++j)
6635                                 out[num_out++] = LLVMBuildExtractElement(
6636                                         builder, param, LLVMConstInt(ctx->i32, j, 0), "");
6637                 }
6638
6639                 if (i < fninfo.num_sgpr_params)
6640                         num_out_sgpr = num_out;
6641         }
6642
6643         memcpy(initial, out, sizeof(out));
6644         initial_num_out = num_out;
6645         initial_num_out_sgpr = num_out_sgpr;
6646
6647         /* Now chain the parts. */
6648         for (unsigned part = 0; part < num_parts; ++part) {
6649                 LLVMValueRef in[48];
6650                 LLVMValueRef ret;
6651                 LLVMTypeRef ret_type;
6652                 unsigned out_idx = 0;
6653                 unsigned num_params = LLVMCountParams(parts[part]);
6654
6655                 /* Merged shaders are executed conditionally depending
6656                  * on the number of enabled threads passed in the input SGPRs. */
6657                 if (is_merged_shader(ctx->shader) && part == 0) {
6658                         LLVMValueRef ena, count = initial[3];
6659
6660                         count = LLVMBuildAnd(builder, count,
6661                                              LLVMConstInt(ctx->i32, 0x7f, 0), "");
6662                         ena = LLVMBuildICmp(builder, LLVMIntULT,
6663                                             ac_get_thread_id(&ctx->ac), count, "");
6664                         lp_build_if(&if_state, &ctx->gallivm, ena);
6665                 }
6666
6667                 /* Derive arguments for the next part from outputs of the
6668                  * previous one.
6669                  */
6670                 for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) {
6671                         LLVMValueRef param;
6672                         LLVMTypeRef param_type;
6673                         bool is_sgpr;
6674                         unsigned param_size;
6675                         LLVMValueRef arg = NULL;
6676
6677                         param = LLVMGetParam(parts[part], param_idx);
6678                         param_type = LLVMTypeOf(param);
6679                         param_size = ac_get_type_size(param_type) / 4;
6680                         is_sgpr = ac_is_sgpr_param(param);
6681
6682                         if (is_sgpr)
6683                                 lp_add_function_attr(parts[part], param_idx + 1, LP_FUNC_ATTR_INREG);
6684                         else if (out_idx < num_out_sgpr) {
6685                                 /* Skip returned SGPRs the current part doesn't
6686                                  * declare on the input. */
6687                                 out_idx = num_out_sgpr;
6688                         }
6689
6690                         assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out));
6691
6692                         if (param_size == 1)
6693                                 arg = out[out_idx];
6694                         else
6695                                 arg = lp_build_gather_values(&ctx->gallivm, &out[out_idx], param_size);
6696
6697                         if (LLVMTypeOf(arg) != param_type) {
6698                                 if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
6699                                         if (LLVMGetPointerAddressSpace(param_type) ==
6700                                             AC_CONST_32BIT_ADDR_SPACE) {
6701                                                 arg = LLVMBuildBitCast(builder, arg, ctx->i32, "");
6702                                                 arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
6703                                         } else {
6704                                                 arg = LLVMBuildBitCast(builder, arg, ctx->i64, "");
6705                                                 arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
6706                                         }
6707                                 } else {
6708                                         arg = LLVMBuildBitCast(builder, arg, param_type, "");
6709                                 }
6710                         }
6711
6712                         in[param_idx] = arg;
6713                         out_idx += param_size;
6714                 }
6715
6716                 ret = LLVMBuildCall(builder, parts[part], in, num_params, "");
6717
6718                 if (is_merged_shader(ctx->shader) &&
6719                     part + 1 == next_shader_first_part) {
6720                         lp_build_endif(&if_state);
6721
6722                         /* The second half of the merged shader should use
6723                          * the inputs from the toplevel (wrapper) function,
6724                          * not the return value from the last call.
6725                          *
6726                          * That's because the last call was executed condi-
6727                          * tionally, so we can't consume it in the main
6728                          * block.
6729                          */
6730                         memcpy(out, initial, sizeof(initial));
6731                         num_out = initial_num_out;
6732                         num_out_sgpr = initial_num_out_sgpr;
6733                         continue;
6734                 }
6735
6736                 /* Extract the returned GPRs. */
6737                 ret_type = LLVMTypeOf(ret);
6738                 num_out = 0;
6739                 num_out_sgpr = 0;
6740
6741                 if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) {
6742                         assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind);
6743
6744                         unsigned ret_size = LLVMCountStructElementTypes(ret_type);
6745
6746                         for (unsigned i = 0; i < ret_size; ++i) {
6747                                 LLVMValueRef val =
6748                                         LLVMBuildExtractValue(builder, ret, i, "");
6749
6750                                 assert(num_out < ARRAY_SIZE(out));
6751                                 out[num_out++] = val;
6752
6753                                 if (LLVMTypeOf(val) == ctx->i32) {
6754                                         assert(num_out_sgpr + 1 == num_out);
6755                                         num_out_sgpr = num_out;
6756                                 }
6757                         }
6758                 }
6759         }
6760
6761         LLVMBuildRetVoid(builder);
6762 }
6763
6764 int si_compile_tgsi_shader(struct si_screen *sscreen,
6765                            struct si_compiler *compiler,
6766                            struct si_shader *shader,
6767                            bool is_monolithic,
6768                            struct pipe_debug_callback *debug)
6769 {
6770         struct si_shader_selector *sel = shader->selector;
6771         struct si_shader_context ctx;
6772         int r = -1;
6773
6774         /* Dump TGSI code before doing TGSI->LLVM conversion in case the
6775          * conversion fails. */
6776         if (si_can_dump_shader(sscreen, sel->info.processor) &&
6777             !(sscreen->debug_flags & DBG(NO_TGSI))) {
6778                 if (sel->tokens)
6779                         tgsi_dump(sel->tokens, 0);
6780                 else
6781                         nir_print_shader(sel->nir, stderr);
6782                 si_dump_streamout(&sel->so);
6783         }
6784
6785         si_init_shader_ctx(&ctx, sscreen, compiler);
6786         si_llvm_context_set_tgsi(&ctx, shader);
6787         ctx.separate_prolog = !is_monolithic;
6788
6789         memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
6790                sizeof(shader->info.vs_output_param_offset));
6791
6792         shader->info.uses_instanceid = sel->info.uses_instanceid;
6793
6794         if (!si_compile_tgsi_main(&ctx, is_monolithic)) {
6795                 si_llvm_dispose(&ctx);
6796                 return -1;
6797         }
6798
6799         if (is_monolithic && ctx.type == PIPE_SHADER_VERTEX) {
6800                 LLVMValueRef parts[2];
6801                 bool need_prolog = sel->vs_needs_prolog;
6802
6803                 parts[1] = ctx.main_fn;
6804
6805                 if (need_prolog) {
6806                         union si_shader_part_key prolog_key;
6807                         si_get_vs_prolog_key(&sel->info,
6808                                              shader->info.num_input_sgprs,
6809                                              &shader->key.part.vs.prolog,
6810                                              shader, &prolog_key);
6811                         si_build_vs_prolog_function(&ctx, &prolog_key);
6812                         parts[0] = ctx.main_fn;
6813                 }
6814
6815                 si_build_wrapper_function(&ctx, parts + !need_prolog,
6816                                           1 + need_prolog, need_prolog, 0);
6817         } else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
6818                 if (sscreen->info.chip_class >= GFX9) {
6819                         struct si_shader_selector *ls = shader->key.part.tcs.ls;
6820                         LLVMValueRef parts[4];
6821                         bool vs_needs_prolog =
6822                                 si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog);
6823
6824                         /* TCS main part */
6825                         parts[2] = ctx.main_fn;
6826
6827                         /* TCS epilog */
6828                         union si_shader_part_key tcs_epilog_key;
6829                         memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key));
6830                         tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
6831                         si_build_tcs_epilog_function(&ctx, &tcs_epilog_key);
6832                         parts[3] = ctx.main_fn;
6833
6834                         /* VS as LS main part */
6835                         struct si_shader shader_ls = {};
6836                         shader_ls.selector = ls;
6837                         shader_ls.key.as_ls = 1;
6838                         shader_ls.key.mono = shader->key.mono;
6839                         shader_ls.key.opt = shader->key.opt;
6840                         si_llvm_context_set_tgsi(&ctx, &shader_ls);
6841
6842                         if (!si_compile_tgsi_main(&ctx, true)) {
6843                                 si_llvm_dispose(&ctx);
6844                                 return -1;
6845                         }
6846                         shader->info.uses_instanceid |= ls->info.uses_instanceid;
6847                         parts[1] = ctx.main_fn;
6848
6849                         /* LS prolog */
6850                         if (vs_needs_prolog) {
6851                                 union si_shader_part_key vs_prolog_key;
6852                                 si_get_vs_prolog_key(&ls->info,
6853                                                      shader_ls.info.num_input_sgprs,
6854                                                      &shader->key.part.tcs.ls_prolog,
6855                                                      shader, &vs_prolog_key);
6856                                 vs_prolog_key.vs_prolog.is_monolithic = true;
6857                                 si_build_vs_prolog_function(&ctx, &vs_prolog_key);
6858                                 parts[0] = ctx.main_fn;
6859                         }
6860
6861                         /* Reset the shader context. */
6862                         ctx.shader = shader;
6863                         ctx.type = PIPE_SHADER_TESS_CTRL;
6864
6865                         si_build_wrapper_function(&ctx,
6866                                                   parts + !vs_needs_prolog,
6867                                                   4 - !vs_needs_prolog, vs_needs_prolog,
6868                                                   vs_needs_prolog ? 2 : 1);
6869                 } else {
6870                         LLVMValueRef parts[2];
6871                         union si_shader_part_key epilog_key;
6872
6873                         parts[0] = ctx.main_fn;
6874
6875                         memset(&epilog_key, 0, sizeof(epilog_key));
6876                         epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
6877                         si_build_tcs_epilog_function(&ctx, &epilog_key);
6878                         parts[1] = ctx.main_fn;
6879
6880                         si_build_wrapper_function(&ctx, parts, 2, 0, 0);
6881                 }
6882         } else if (is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) {
6883                 if (ctx.screen->info.chip_class >= GFX9) {
6884                         struct si_shader_selector *es = shader->key.part.gs.es;
6885                         LLVMValueRef es_prolog = NULL;
6886                         LLVMValueRef es_main = NULL;
6887                         LLVMValueRef gs_prolog = NULL;
6888                         LLVMValueRef gs_main = ctx.main_fn;
6889
6890                         /* GS prolog */
6891                         union si_shader_part_key gs_prolog_key;
6892                         memset(&gs_prolog_key, 0, sizeof(gs_prolog_key));
6893                         gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
6894                         gs_prolog_key.gs_prolog.is_monolithic = true;
6895                         si_build_gs_prolog_function(&ctx, &gs_prolog_key);
6896                         gs_prolog = ctx.main_fn;
6897
6898                         /* ES main part */
6899                         struct si_shader shader_es = {};
6900                         shader_es.selector = es;
6901                         shader_es.key.as_es = 1;
6902                         shader_es.key.mono = shader->key.mono;
6903                         shader_es.key.opt = shader->key.opt;
6904                         si_llvm_context_set_tgsi(&ctx, &shader_es);
6905
6906                         if (!si_compile_tgsi_main(&ctx, true)) {
6907                                 si_llvm_dispose(&ctx);
6908                                 return -1;
6909                         }
6910                         shader->info.uses_instanceid |= es->info.uses_instanceid;
6911                         es_main = ctx.main_fn;
6912
6913                         /* ES prolog */
6914                         if (es->vs_needs_prolog) {
6915                                 union si_shader_part_key vs_prolog_key;
6916                                 si_get_vs_prolog_key(&es->info,
6917                                                      shader_es.info.num_input_sgprs,
6918                                                      &shader->key.part.gs.vs_prolog,
6919                                                      shader, &vs_prolog_key);
6920                                 vs_prolog_key.vs_prolog.is_monolithic = true;
6921                                 si_build_vs_prolog_function(&ctx, &vs_prolog_key);
6922                                 es_prolog = ctx.main_fn;
6923                         }
6924
6925                         /* Reset the shader context. */
6926                         ctx.shader = shader;
6927                         ctx.type = PIPE_SHADER_GEOMETRY;
6928
6929                         /* Prepare the array of shader parts. */
6930                         LLVMValueRef parts[4];
6931                         unsigned num_parts = 0, main_part, next_first_part;
6932
6933                         if (es_prolog)
6934                                 parts[num_parts++] = es_prolog;
6935
6936                         parts[main_part = num_parts++] = es_main;
6937                         parts[next_first_part = num_parts++] = gs_prolog;
6938                         parts[num_parts++] = gs_main;
6939
6940                         si_build_wrapper_function(&ctx, parts, num_parts,
6941                                                   main_part, next_first_part);
6942                 } else {
6943                         LLVMValueRef parts[2];
6944                         union si_shader_part_key prolog_key;
6945
6946                         parts[1] = ctx.main_fn;
6947
6948                         memset(&prolog_key, 0, sizeof(prolog_key));
6949                         prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
6950                         si_build_gs_prolog_function(&ctx, &prolog_key);
6951                         parts[0] = ctx.main_fn;
6952
6953                         si_build_wrapper_function(&ctx, parts, 2, 1, 0);
6954                 }
6955         } else if (is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) {
6956                 LLVMValueRef parts[3];
6957                 union si_shader_part_key prolog_key;
6958                 union si_shader_part_key epilog_key;
6959                 bool need_prolog;
6960
6961                 si_get_ps_prolog_key(shader, &prolog_key, false);
6962                 need_prolog = si_need_ps_prolog(&prolog_key);
6963
6964                 parts[need_prolog ? 1 : 0] = ctx.main_fn;
6965
6966                 if (need_prolog) {
6967                         si_build_ps_prolog_function(&ctx, &prolog_key);
6968                         parts[0] = ctx.main_fn;
6969                 }
6970
6971                 si_get_ps_epilog_key(shader, &epilog_key);
6972                 si_build_ps_epilog_function(&ctx, &epilog_key);
6973                 parts[need_prolog ? 2 : 1] = ctx.main_fn;
6974
6975                 si_build_wrapper_function(&ctx, parts, need_prolog ? 3 : 2,
6976                                           need_prolog ? 1 : 0, 0);
6977         }
6978
6979         si_llvm_optimize_module(&ctx);
6980
6981         /* Post-optimization transformations and analysis. */
6982         si_optimize_vs_outputs(&ctx);
6983
6984         if ((debug && debug->debug_message) ||
6985             si_can_dump_shader(sscreen, ctx.type)) {
6986                 ctx.shader->config.private_mem_vgprs =
6987                         ac_count_scratch_private_memory(ctx.main_fn);
6988         }
6989
6990         /* Make sure the input is a pointer and not integer followed by inttoptr. */
6991         assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx.main_fn, 0))) ==
6992                LLVMPointerTypeKind);
6993
6994         /* Compile to bytecode. */
6995         r = si_compile_llvm(sscreen, &shader->binary, &shader->config, compiler,
6996                             ctx.gallivm.module, debug, ctx.type, "TGSI shader");
6997         si_llvm_dispose(&ctx);
6998         if (r) {
6999                 fprintf(stderr, "LLVM failed to compile shader\n");
7000                 return r;
7001         }
7002
7003         /* Validate SGPR and VGPR usage for compute to detect compiler bugs.
7004          * LLVM 3.9svn has this bug.
7005          */
7006         if (sel->type == PIPE_SHADER_COMPUTE) {
7007                 unsigned wave_size = 64;
7008                 unsigned max_vgprs = 256;
7009                 unsigned max_sgprs = sscreen->info.chip_class >= VI ? 800 : 512;
7010                 unsigned max_sgprs_per_wave = 128;
7011                 unsigned max_block_threads = si_get_max_workgroup_size(shader);
7012                 unsigned min_waves_per_cu = DIV_ROUND_UP(max_block_threads, wave_size);
7013                 unsigned min_waves_per_simd = DIV_ROUND_UP(min_waves_per_cu, 4);
7014
7015                 max_vgprs = max_vgprs / min_waves_per_simd;
7016                 max_sgprs = MIN2(max_sgprs / min_waves_per_simd, max_sgprs_per_wave);
7017
7018                 if (shader->config.num_sgprs > max_sgprs ||
7019                     shader->config.num_vgprs > max_vgprs) {
7020                         fprintf(stderr, "LLVM failed to compile a shader correctly: "
7021                                 "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
7022                                 shader->config.num_sgprs, shader->config.num_vgprs,
7023                                 max_sgprs, max_vgprs);
7024
7025                         /* Just terminate the process, because dependent
7026                          * shaders can hang due to bad input data, but use
7027                          * the env var to allow shader-db to work.
7028                          */
7029                         if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
7030                                 abort();
7031                 }
7032         }
7033
7034         /* Add the scratch offset to input SGPRs. */
7035         if (shader->config.scratch_bytes_per_wave && !is_merged_shader(shader))
7036                 shader->info.num_input_sgprs += 1; /* scratch byte offset */
7037
7038         /* Calculate the number of fragment input VGPRs. */
7039         if (ctx.type == PIPE_SHADER_FRAGMENT) {
7040                 shader->info.num_input_vgprs = 0;
7041                 shader->info.face_vgpr_index = -1;
7042                 shader->info.ancillary_vgpr_index = -1;
7043
7044                 if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr))
7045                         shader->info.num_input_vgprs += 2;
7046                 if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr))
7047                         shader->info.num_input_vgprs += 2;
7048                 if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr))
7049                         shader->info.num_input_vgprs += 2;
7050                 if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr))
7051                         shader->info.num_input_vgprs += 3;
7052                 if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr))
7053                         shader->info.num_input_vgprs += 2;
7054                 if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr))
7055                         shader->info.num_input_vgprs += 2;
7056                 if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr))
7057                         shader->info.num_input_vgprs += 2;
7058                 if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr))
7059                         shader->info.num_input_vgprs += 1;
7060                 if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr))
7061                         shader->info.num_input_vgprs += 1;
7062                 if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr))
7063                         shader->info.num_input_vgprs += 1;
7064                 if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr))
7065                         shader->info.num_input_vgprs += 1;
7066                 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr))
7067                         shader->info.num_input_vgprs += 1;
7068                 if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) {
7069                         shader->info.face_vgpr_index = shader->info.num_input_vgprs;
7070                         shader->info.num_input_vgprs += 1;
7071                 }
7072                 if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr)) {
7073                         shader->info.ancillary_vgpr_index = shader->info.num_input_vgprs;
7074                         shader->info.num_input_vgprs += 1;
7075                 }
7076                 if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr))
7077                         shader->info.num_input_vgprs += 1;
7078                 if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr))
7079                         shader->info.num_input_vgprs += 1;
7080         }
7081
7082         si_calculate_max_simd_waves(shader);
7083         si_shader_dump_stats_for_shader_db(shader, debug);
7084         return 0;
7085 }
7086
7087 /**
7088  * Create, compile and return a shader part (prolog or epilog).
7089  *
7090  * \param sscreen       screen
7091  * \param list          list of shader parts of the same category
7092  * \param type          shader type
7093  * \param key           shader part key
7094  * \param prolog        whether the part being requested is a prolog
7095  * \param tm            LLVM target machine
7096  * \param debug         debug callback
7097  * \param build         the callback responsible for building the main function
7098  * \return              non-NULL on success
7099  */
7100 static struct si_shader_part *
7101 si_get_shader_part(struct si_screen *sscreen,
7102                    struct si_shader_part **list,
7103                    enum pipe_shader_type type,
7104                    bool prolog,
7105                    union si_shader_part_key *key,
7106                    struct si_compiler *compiler,
7107                    struct pipe_debug_callback *debug,
7108                    void (*build)(struct si_shader_context *,
7109                                  union si_shader_part_key *),
7110                    const char *name)
7111 {
7112         struct si_shader_part *result;
7113
7114         mtx_lock(&sscreen->shader_parts_mutex);
7115
7116         /* Find existing. */
7117         for (result = *list; result; result = result->next) {
7118                 if (memcmp(&result->key, key, sizeof(*key)) == 0) {
7119                         mtx_unlock(&sscreen->shader_parts_mutex);
7120                         return result;
7121                 }
7122         }
7123
7124         /* Compile a new one. */
7125         result = CALLOC_STRUCT(si_shader_part);
7126         result->key = *key;
7127
7128         struct si_shader shader = {};
7129         struct si_shader_context ctx;
7130
7131         si_init_shader_ctx(&ctx, sscreen, compiler);
7132         ctx.shader = &shader;
7133         ctx.type = type;
7134
7135         switch (type) {
7136         case PIPE_SHADER_VERTEX:
7137                 shader.key.as_ls = key->vs_prolog.as_ls;
7138                 shader.key.as_es = key->vs_prolog.as_es;
7139                 break;
7140         case PIPE_SHADER_TESS_CTRL:
7141                 assert(!prolog);
7142                 shader.key.part.tcs.epilog = key->tcs_epilog.states;
7143                 break;
7144         case PIPE_SHADER_GEOMETRY:
7145                 assert(prolog);
7146                 break;
7147         case PIPE_SHADER_FRAGMENT:
7148                 if (prolog)
7149                         shader.key.part.ps.prolog = key->ps_prolog.states;
7150                 else
7151                         shader.key.part.ps.epilog = key->ps_epilog.states;
7152                 break;
7153         default:
7154                 unreachable("bad shader part");
7155         }
7156
7157         build(&ctx, key);
7158
7159         /* Compile. */
7160         si_llvm_optimize_module(&ctx);
7161
7162         if (si_compile_llvm(sscreen, &result->binary, &result->config, compiler,
7163                             ctx.ac.module, debug, ctx.type, name)) {
7164                 FREE(result);
7165                 result = NULL;
7166                 goto out;
7167         }
7168
7169         result->next = *list;
7170         *list = result;
7171
7172 out:
7173         si_llvm_dispose(&ctx);
7174         mtx_unlock(&sscreen->shader_parts_mutex);
7175         return result;
7176 }
7177
7178 static LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx)
7179 {
7180         LLVMValueRef ptr[2], list;
7181         bool is_merged_shader =
7182                 ctx->screen->info.chip_class >= GFX9 &&
7183                 (ctx->type == PIPE_SHADER_TESS_CTRL ||
7184                  ctx->type == PIPE_SHADER_GEOMETRY ||
7185                  ctx->shader->key.as_ls || ctx->shader->key.as_es);
7186
7187         if (HAVE_32BIT_POINTERS) {
7188                 ptr[0] = LLVMGetParam(ctx->main_fn, (is_merged_shader ? 8 : 0) + SI_SGPR_RW_BUFFERS);
7189                 list = LLVMBuildIntToPtr(ctx->ac.builder, ptr[0],
7190                                          ac_array_in_const32_addr_space(ctx->v4i32), "");
7191                 return list;
7192         }
7193
7194         /* Get the pointer to rw buffers. */
7195         ptr[0] = LLVMGetParam(ctx->main_fn, (is_merged_shader ? 8 : 0) + SI_SGPR_RW_BUFFERS);
7196         ptr[1] = LLVMGetParam(ctx->main_fn, (is_merged_shader ? 8 : 0) + SI_SGPR_RW_BUFFERS + 1);
7197         list = lp_build_gather_values(&ctx->gallivm, ptr, 2);
7198         list = LLVMBuildBitCast(ctx->ac.builder, list, ctx->i64, "");
7199         list = LLVMBuildIntToPtr(ctx->ac.builder, list,
7200                                  ac_array_in_const_addr_space(ctx->v4i32), "");
7201         return list;
7202 }
7203
7204 /**
7205  * Build the vertex shader prolog function.
7206  *
7207  * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
7208  * All inputs are returned unmodified. The vertex load indices are
7209  * stored after them, which will be used by the API VS for fetching inputs.
7210  *
7211  * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
7212  *   input_v0,
7213  *   input_v1,
7214  *   input_v2,
7215  *   input_v3,
7216  *   (VertexID + BaseVertex),
7217  *   (InstanceID + StartInstance),
7218  *   (InstanceID / 2 + StartInstance)
7219  */
7220 static void si_build_vs_prolog_function(struct si_shader_context *ctx,
7221                                         union si_shader_part_key *key)
7222 {
7223         struct si_function_info fninfo;
7224         LLVMTypeRef *returns;
7225         LLVMValueRef ret, func;
7226         int num_returns, i;
7227         unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs;
7228         unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4;
7229         LLVMValueRef input_vgprs[9];
7230         unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs +
7231                                       num_input_vgprs;
7232         unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;
7233
7234         si_init_function_info(&fninfo);
7235
7236         /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
7237         returns = alloca((num_all_input_regs + key->vs_prolog.last_input + 1) *
7238                          sizeof(LLVMTypeRef));
7239         num_returns = 0;
7240
7241         /* Declare input and output SGPRs. */
7242         for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
7243                 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7244                 returns[num_returns++] = ctx->i32;
7245         }
7246
7247         /* Preloaded VGPRs (outputs must be floats) */
7248         for (i = 0; i < num_input_vgprs; i++) {
7249                 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &input_vgprs[i]);
7250                 returns[num_returns++] = ctx->f32;
7251         }
7252
7253         /* Vertex load indices. */
7254         for (i = 0; i <= key->vs_prolog.last_input; i++)
7255                 returns[num_returns++] = ctx->f32;
7256
7257         /* Create the function. */
7258         si_create_function(ctx, "vs_prolog", returns, num_returns, &fninfo, 0);
7259         func = ctx->main_fn;
7260
7261         if (key->vs_prolog.num_merged_next_stage_vgprs) {
7262                 if (!key->vs_prolog.is_monolithic)
7263                         si_init_exec_from_input(ctx, 3, 0);
7264
7265                 if (key->vs_prolog.as_ls &&
7266                     ctx->screen->has_ls_vgpr_init_bug) {
7267                         /* If there are no HS threads, SPI loads the LS VGPRs
7268                          * starting at VGPR 0. Shift them back to where they
7269                          * belong.
7270                          */
7271                         LLVMValueRef has_hs_threads =
7272                                 LLVMBuildICmp(ctx->ac.builder, LLVMIntNE,
7273                                     si_unpack_param(ctx, 3, 8, 8),
7274                                     ctx->i32_0, "");
7275
7276                         for (i = 4; i > 0; --i) {
7277                                 input_vgprs[i + 1] =
7278                                         LLVMBuildSelect(ctx->ac.builder, has_hs_threads,
7279                                                         input_vgprs[i + 1],
7280                                                         input_vgprs[i - 1], "");
7281                         }
7282                 }
7283         }
7284
7285         ctx->abi.vertex_id = input_vgprs[first_vs_vgpr];
7286         ctx->abi.instance_id = input_vgprs[first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1)];
7287
7288         /* Copy inputs to outputs. This should be no-op, as the registers match,
7289          * but it will prevent the compiler from overwriting them unintentionally.
7290          */
7291         ret = ctx->return_value;
7292         for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
7293                 LLVMValueRef p = LLVMGetParam(func, i);
7294                 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, "");
7295         }
7296         for (i = 0; i < num_input_vgprs; i++) {
7297                 LLVMValueRef p = input_vgprs[i];
7298                 p = ac_to_float(&ctx->ac, p);
7299                 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p,
7300                                            key->vs_prolog.num_input_sgprs + i, "");
7301         }
7302
7303         /* Compute vertex load indices from instance divisors. */
7304         LLVMValueRef instance_divisor_constbuf = NULL;
7305
7306         if (key->vs_prolog.states.instance_divisor_is_fetched) {
7307                 LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
7308                 LLVMValueRef buf_index =
7309                         LLVMConstInt(ctx->i32, SI_VS_CONST_INSTANCE_DIVISORS, 0);
7310                 instance_divisor_constbuf =
7311                         ac_build_load_to_sgpr(&ctx->ac, list, buf_index);
7312         }
7313
7314         for (i = 0; i <= key->vs_prolog.last_input; i++) {
7315                 bool divisor_is_one =
7316                         key->vs_prolog.states.instance_divisor_is_one & (1u << i);
7317                 bool divisor_is_fetched =
7318                         key->vs_prolog.states.instance_divisor_is_fetched & (1u << i);
7319                 LLVMValueRef index;
7320
7321                 if (divisor_is_one || divisor_is_fetched) {
7322                         LLVMValueRef divisor = ctx->i32_1;
7323
7324                         if (divisor_is_fetched) {
7325                                 divisor = buffer_load_const(ctx, instance_divisor_constbuf,
7326                                                             LLVMConstInt(ctx->i32, i * 4, 0));
7327                                 divisor = ac_to_integer(&ctx->ac, divisor);
7328                         }
7329
7330                         /* InstanceID / Divisor + StartInstance */
7331                         index = get_instance_index_for_fetch(ctx,
7332                                                              user_sgpr_base +
7333                                                              SI_SGPR_START_INSTANCE,
7334                                                              divisor);
7335                 } else {
7336                         /* VertexID + BaseVertex */
7337                         index = LLVMBuildAdd(ctx->ac.builder,
7338                                              ctx->abi.vertex_id,
7339                                              LLVMGetParam(func, user_sgpr_base +
7340                                                                 SI_SGPR_BASE_VERTEX), "");
7341                 }
7342
7343                 index = ac_to_float(&ctx->ac, index);
7344                 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, index,
7345                                            fninfo.num_params + i, "");
7346         }
7347
7348         si_llvm_build_ret(ctx, ret);
7349 }
7350
7351 static bool si_get_vs_prolog(struct si_screen *sscreen,
7352                              struct si_compiler *compiler,
7353                              struct si_shader *shader,
7354                              struct pipe_debug_callback *debug,
7355                              struct si_shader *main_part,
7356                              const struct si_vs_prolog_bits *key)
7357 {
7358         struct si_shader_selector *vs = main_part->selector;
7359
7360         if (!si_vs_needs_prolog(vs, key))
7361                 return true;
7362
7363         /* Get the prolog. */
7364         union si_shader_part_key prolog_key;
7365         si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs,
7366                              key, shader, &prolog_key);
7367
7368         shader->prolog =
7369                 si_get_shader_part(sscreen, &sscreen->vs_prologs,
7370                                    PIPE_SHADER_VERTEX, true, &prolog_key, compiler,
7371                                    debug, si_build_vs_prolog_function,
7372                                    "Vertex Shader Prolog");
7373         return shader->prolog != NULL;
7374 }
7375
7376 /**
7377  * Select and compile (or reuse) vertex shader parts (prolog & epilog).
7378  */
7379 static bool si_shader_select_vs_parts(struct si_screen *sscreen,
7380                                       struct si_compiler *compiler,
7381                                       struct si_shader *shader,
7382                                       struct pipe_debug_callback *debug)
7383 {
7384         return si_get_vs_prolog(sscreen, compiler, shader, debug, shader,
7385                                 &shader->key.part.vs.prolog);
7386 }
7387
7388 /**
7389  * Compile the TCS epilog function. This writes tesselation factors to memory
7390  * based on the output primitive type of the tesselator (determined by TES).
7391  */
7392 static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
7393                                          union si_shader_part_key *key)
7394 {
7395         struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
7396         struct si_function_info fninfo;
7397         LLVMValueRef func;
7398
7399         si_init_function_info(&fninfo);
7400
7401         if (ctx->screen->info.chip_class >= GFX9) {
7402                 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7403                 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7404                 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
7405                 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* wave info */
7406                 ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
7407                 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7408                 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7409                 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7410                 add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
7411                 add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
7412                 add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
7413                 add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
7414                 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7415                 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7416                 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7417                 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7418                 if (!HAVE_32BIT_POINTERS)
7419                         add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
7420                 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
7421                 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7422                 ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
7423         } else {
7424                 add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
7425                 add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
7426                 add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
7427                 add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
7428                 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
7429                 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7430                 ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
7431                 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7432                 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
7433                 ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
7434         }
7435
7436         add_arg(&fninfo, ARG_VGPR, ctx->i32); /* VGPR gap */
7437         add_arg(&fninfo, ARG_VGPR, ctx->i32); /* VGPR gap */
7438         unsigned tess_factors_idx =
7439                 add_arg(&fninfo, ARG_VGPR, ctx->i32); /* patch index within the wave (REL_PATCH_ID) */
7440         add_arg(&fninfo, ARG_VGPR, ctx->i32); /* invocation ID within the patch */
7441         add_arg(&fninfo, ARG_VGPR, ctx->i32); /* LDS offset where tess factors should be loaded from */
7442
7443         for (unsigned i = 0; i < 6; i++)
7444                 add_arg(&fninfo, ARG_VGPR, ctx->i32); /* tess factors */
7445
7446         /* Create the function. */
7447         si_create_function(ctx, "tcs_epilog", NULL, 0, &fninfo,
7448                            ctx->screen->info.chip_class >= CIK ? 128 : 64);
7449         ac_declare_lds_as_pointer(&ctx->ac);
7450         func = ctx->main_fn;
7451
7452         LLVMValueRef invoc0_tess_factors[6];
7453         for (unsigned i = 0; i < 6; i++)
7454                 invoc0_tess_factors[i] = LLVMGetParam(func, tess_factors_idx + 3 + i);
7455
7456         si_write_tess_factors(bld_base,
7457                               LLVMGetParam(func, tess_factors_idx),
7458                               LLVMGetParam(func, tess_factors_idx + 1),
7459                               LLVMGetParam(func, tess_factors_idx + 2),
7460                               invoc0_tess_factors, invoc0_tess_factors + 4);
7461
7462         LLVMBuildRetVoid(ctx->ac.builder);
7463 }
7464
7465 /**
7466  * Select and compile (or reuse) TCS parts (epilog).
7467  */
7468 static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
7469                                        struct si_compiler *compiler,
7470                                        struct si_shader *shader,
7471                                        struct pipe_debug_callback *debug)
7472 {
7473         if (sscreen->info.chip_class >= GFX9) {
7474                 struct si_shader *ls_main_part =
7475                         shader->key.part.tcs.ls->main_shader_part_ls;
7476
7477                 if (!si_get_vs_prolog(sscreen, compiler, shader, debug, ls_main_part,
7478                                       &shader->key.part.tcs.ls_prolog))
7479                         return false;
7480
7481                 shader->previous_stage = ls_main_part;
7482         }
7483
7484         /* Get the epilog. */
7485         union si_shader_part_key epilog_key;
7486         memset(&epilog_key, 0, sizeof(epilog_key));
7487         epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
7488
7489         shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
7490                                             PIPE_SHADER_TESS_CTRL, false,
7491                                             &epilog_key, compiler, debug,
7492                                             si_build_tcs_epilog_function,
7493                                             "Tessellation Control Shader Epilog");
7494         return shader->epilog != NULL;
7495 }
7496
7497 /**
7498  * Select and compile (or reuse) GS parts (prolog).
7499  */
7500 static bool si_shader_select_gs_parts(struct si_screen *sscreen,
7501                                       struct si_compiler *compiler,
7502                                       struct si_shader *shader,
7503                                       struct pipe_debug_callback *debug)
7504 {
7505         if (sscreen->info.chip_class >= GFX9) {
7506                 struct si_shader *es_main_part =
7507                         shader->key.part.gs.es->main_shader_part_es;
7508
7509                 if (shader->key.part.gs.es->type == PIPE_SHADER_VERTEX &&
7510                     !si_get_vs_prolog(sscreen, compiler, shader, debug, es_main_part,
7511                                       &shader->key.part.gs.vs_prolog))
7512                         return false;
7513
7514                 shader->previous_stage = es_main_part;
7515         }
7516
7517         if (!shader->key.part.gs.prolog.tri_strip_adj_fix)
7518                 return true;
7519
7520         union si_shader_part_key prolog_key;
7521         memset(&prolog_key, 0, sizeof(prolog_key));
7522         prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
7523
7524         shader->prolog2 = si_get_shader_part(sscreen, &sscreen->gs_prologs,
7525                                             PIPE_SHADER_GEOMETRY, true,
7526                                             &prolog_key, compiler, debug,
7527                                             si_build_gs_prolog_function,
7528                                             "Geometry Shader Prolog");
7529         return shader->prolog2 != NULL;
7530 }
7531
7532 /**
7533  * Build the pixel shader prolog function. This handles:
7534  * - two-side color selection and interpolation
7535  * - overriding interpolation parameters for the API PS
7536  * - polygon stippling
7537  *
7538  * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
7539  * overriden by other states. (e.g. per-sample interpolation)
7540  * Interpolated colors are stored after the preloaded VGPRs.
7541  */
7542 static void si_build_ps_prolog_function(struct si_shader_context *ctx,
7543                                         union si_shader_part_key *key)
7544 {
7545         struct si_function_info fninfo;
7546         LLVMValueRef ret, func;
7547         int num_returns, i, num_color_channels;
7548
7549         assert(si_need_ps_prolog(key));
7550
7551         si_init_function_info(&fninfo);
7552
7553         /* Declare inputs. */
7554         for (i = 0; i < key->ps_prolog.num_input_sgprs; i++)
7555                 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7556
7557         for (i = 0; i < key->ps_prolog.num_input_vgprs; i++)
7558                 add_arg(&fninfo, ARG_VGPR, ctx->f32);
7559
7560         /* Declare outputs (same as inputs + add colors if needed) */
7561         num_returns = fninfo.num_params;
7562         num_color_channels = util_bitcount(key->ps_prolog.colors_read);
7563         for (i = 0; i < num_color_channels; i++)
7564                 fninfo.types[num_returns++] = ctx->f32;
7565
7566         /* Create the function. */
7567         si_create_function(ctx, "ps_prolog", fninfo.types, num_returns,
7568                            &fninfo, 0);
7569         func = ctx->main_fn;
7570
7571         /* Copy inputs to outputs. This should be no-op, as the registers match,
7572          * but it will prevent the compiler from overwriting them unintentionally.
7573          */
7574         ret = ctx->return_value;
7575         for (i = 0; i < fninfo.num_params; i++) {
7576                 LLVMValueRef p = LLVMGetParam(func, i);
7577                 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, "");
7578         }
7579
7580         /* Polygon stippling. */
7581         if (key->ps_prolog.states.poly_stipple) {
7582                 /* POS_FIXED_PT is always last. */
7583                 unsigned pos = key->ps_prolog.num_input_sgprs +
7584                                key->ps_prolog.num_input_vgprs - 1;
7585                 LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
7586
7587                 si_llvm_emit_polygon_stipple(ctx, list, pos);
7588         }
7589
7590         if (key->ps_prolog.states.bc_optimize_for_persp ||
7591             key->ps_prolog.states.bc_optimize_for_linear) {
7592                 unsigned i, base = key->ps_prolog.num_input_sgprs;
7593                 LLVMValueRef center[2], centroid[2], tmp, bc_optimize;
7594
7595                 /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
7596                  * The hw doesn't compute CENTROID if the whole wave only
7597                  * contains fully-covered quads.
7598                  *
7599                  * PRIM_MASK is after user SGPRs.
7600                  */
7601                 bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
7602                 bc_optimize = LLVMBuildLShr(ctx->ac.builder, bc_optimize,
7603                                             LLVMConstInt(ctx->i32, 31, 0), "");
7604                 bc_optimize = LLVMBuildTrunc(ctx->ac.builder, bc_optimize,
7605                                              ctx->i1, "");
7606
7607                 if (key->ps_prolog.states.bc_optimize_for_persp) {
7608                         /* Read PERSP_CENTER. */
7609                         for (i = 0; i < 2; i++)
7610                                 center[i] = LLVMGetParam(func, base + 2 + i);
7611                         /* Read PERSP_CENTROID. */
7612                         for (i = 0; i < 2; i++)
7613                                 centroid[i] = LLVMGetParam(func, base + 4 + i);
7614                         /* Select PERSP_CENTROID. */
7615                         for (i = 0; i < 2; i++) {
7616                                 tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize,
7617                                                       center[i], centroid[i], "");
7618                                 ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7619                                                            tmp, base + 4 + i, "");
7620                         }
7621                 }
7622                 if (key->ps_prolog.states.bc_optimize_for_linear) {
7623                         /* Read LINEAR_CENTER. */
7624                         for (i = 0; i < 2; i++)
7625                                 center[i] = LLVMGetParam(func, base + 8 + i);
7626                         /* Read LINEAR_CENTROID. */
7627                         for (i = 0; i < 2; i++)
7628                                 centroid[i] = LLVMGetParam(func, base + 10 + i);
7629                         /* Select LINEAR_CENTROID. */
7630                         for (i = 0; i < 2; i++) {
7631                                 tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize,
7632                                                       center[i], centroid[i], "");
7633                                 ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7634                                                            tmp, base + 10 + i, "");
7635                         }
7636                 }
7637         }
7638
7639         /* Force per-sample interpolation. */
7640         if (key->ps_prolog.states.force_persp_sample_interp) {
7641                 unsigned i, base = key->ps_prolog.num_input_sgprs;
7642                 LLVMValueRef persp_sample[2];
7643
7644                 /* Read PERSP_SAMPLE. */
7645                 for (i = 0; i < 2; i++)
7646                         persp_sample[i] = LLVMGetParam(func, base + i);
7647                 /* Overwrite PERSP_CENTER. */
7648                 for (i = 0; i < 2; i++)
7649                         ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7650                                                    persp_sample[i], base + 2 + i, "");
7651                 /* Overwrite PERSP_CENTROID. */
7652                 for (i = 0; i < 2; i++)
7653                         ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7654                                                    persp_sample[i], base + 4 + i, "");
7655         }
7656         if (key->ps_prolog.states.force_linear_sample_interp) {
7657                 unsigned i, base = key->ps_prolog.num_input_sgprs;
7658                 LLVMValueRef linear_sample[2];
7659
7660                 /* Read LINEAR_SAMPLE. */
7661                 for (i = 0; i < 2; i++)
7662                         linear_sample[i] = LLVMGetParam(func, base + 6 + i);
7663                 /* Overwrite LINEAR_CENTER. */
7664                 for (i = 0; i < 2; i++)
7665                         ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7666                                                    linear_sample[i], base + 8 + i, "");
7667                 /* Overwrite LINEAR_CENTROID. */
7668                 for (i = 0; i < 2; i++)
7669                         ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7670                                                    linear_sample[i], base + 10 + i, "");
7671         }
7672
7673         /* Force center interpolation. */
7674         if (key->ps_prolog.states.force_persp_center_interp) {
7675                 unsigned i, base = key->ps_prolog.num_input_sgprs;
7676                 LLVMValueRef persp_center[2];
7677
7678                 /* Read PERSP_CENTER. */
7679                 for (i = 0; i < 2; i++)
7680                         persp_center[i] = LLVMGetParam(func, base + 2 + i);
7681                 /* Overwrite PERSP_SAMPLE. */
7682                 for (i = 0; i < 2; i++)
7683                         ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7684                                                    persp_center[i], base + i, "");
7685                 /* Overwrite PERSP_CENTROID. */
7686                 for (i = 0; i < 2; i++)
7687                         ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7688                                                    persp_center[i], base + 4 + i, "");
7689         }
7690         if (key->ps_prolog.states.force_linear_center_interp) {
7691                 unsigned i, base = key->ps_prolog.num_input_sgprs;
7692                 LLVMValueRef linear_center[2];
7693
7694                 /* Read LINEAR_CENTER. */
7695                 for (i = 0; i < 2; i++)
7696                         linear_center[i] = LLVMGetParam(func, base + 8 + i);
7697                 /* Overwrite LINEAR_SAMPLE. */
7698                 for (i = 0; i < 2; i++)
7699                         ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7700                                                    linear_center[i], base + 6 + i, "");
7701                 /* Overwrite LINEAR_CENTROID. */
7702                 for (i = 0; i < 2; i++)
7703                         ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7704                                                    linear_center[i], base + 10 + i, "");
7705         }
7706
7707         /* Interpolate colors. */
7708         unsigned color_out_idx = 0;
7709         for (i = 0; i < 2; i++) {
7710                 unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
7711                 unsigned face_vgpr = key->ps_prolog.num_input_sgprs +
7712                                      key->ps_prolog.face_vgpr_index;
7713                 LLVMValueRef interp[2], color[4];
7714                 LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL;
7715
7716                 if (!writemask)
7717                         continue;
7718
7719                 /* If the interpolation qualifier is not CONSTANT (-1). */
7720                 if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
7721                         unsigned interp_vgpr = key->ps_prolog.num_input_sgprs +
7722                                                key->ps_prolog.color_interp_vgpr_index[i];
7723
7724                         /* Get the (i,j) updated by bc_optimize handling. */
7725                         interp[0] = LLVMBuildExtractValue(ctx->ac.builder, ret,
7726                                                           interp_vgpr, "");
7727                         interp[1] = LLVMBuildExtractValue(ctx->ac.builder, ret,
7728                                                           interp_vgpr + 1, "");
7729                         interp_ij = lp_build_gather_values(&ctx->gallivm, interp, 2);
7730                 }
7731
7732                 /* Use the absolute location of the input. */
7733                 prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
7734
7735                 if (key->ps_prolog.states.color_two_side) {
7736                         face = LLVMGetParam(func, face_vgpr);
7737                         face = ac_to_integer(&ctx->ac, face);
7738                 }
7739
7740                 interp_fs_input(ctx,
7741                                 key->ps_prolog.color_attr_index[i],
7742                                 TGSI_SEMANTIC_COLOR, i,
7743                                 key->ps_prolog.num_interp_inputs,
7744                                 key->ps_prolog.colors_read, interp_ij,
7745                                 prim_mask, face, color);
7746
7747                 while (writemask) {
7748                         unsigned chan = u_bit_scan(&writemask);
7749                         ret = LLVMBuildInsertValue(ctx->ac.builder, ret, color[chan],
7750                                                    fninfo.num_params + color_out_idx++, "");
7751                 }
7752         }
7753
7754         /* Section 15.2.2 (Shader Inputs) of the OpenGL 4.5 (Core Profile) spec
7755          * says:
7756          *
7757          *    "When per-sample shading is active due to the use of a fragment
7758          *     input qualified by sample or due to the use of the gl_SampleID
7759          *     or gl_SamplePosition variables, only the bit for the current
7760          *     sample is set in gl_SampleMaskIn. When state specifies multiple
7761          *     fragment shader invocations for a given fragment, the sample
7762          *     mask for any single fragment shader invocation may specify a
7763          *     subset of the covered samples for the fragment. In this case,
7764          *     the bit corresponding to each covered sample will be set in
7765          *     exactly one fragment shader invocation."
7766          *
7767          * The samplemask loaded by hardware is always the coverage of the
7768          * entire pixel/fragment, so mask bits out based on the sample ID.
7769          */
7770         if (key->ps_prolog.states.samplemask_log_ps_iter) {
7771                 /* The bit pattern matches that used by fixed function fragment
7772                  * processing. */
7773                 static const uint16_t ps_iter_masks[] = {
7774                         0xffff, /* not used */
7775                         0x5555,
7776                         0x1111,
7777                         0x0101,
7778                         0x0001,
7779                 };
7780                 assert(key->ps_prolog.states.samplemask_log_ps_iter < ARRAY_SIZE(ps_iter_masks));
7781
7782                 uint32_t ps_iter_mask = ps_iter_masks[key->ps_prolog.states.samplemask_log_ps_iter];
7783                 unsigned ancillary_vgpr = key->ps_prolog.num_input_sgprs +
7784                                           key->ps_prolog.ancillary_vgpr_index;
7785                 LLVMValueRef sampleid = si_unpack_param(ctx, ancillary_vgpr, 8, 4);
7786                 LLVMValueRef samplemask = LLVMGetParam(func, ancillary_vgpr + 1);
7787
7788                 samplemask = ac_to_integer(&ctx->ac, samplemask);
7789                 samplemask = LLVMBuildAnd(
7790                         ctx->ac.builder,
7791                         samplemask,
7792                         LLVMBuildShl(ctx->ac.builder,
7793                                      LLVMConstInt(ctx->i32, ps_iter_mask, false),
7794                                      sampleid, ""),
7795                         "");
7796                 samplemask = ac_to_float(&ctx->ac, samplemask);
7797
7798                 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, samplemask,
7799                                            ancillary_vgpr + 1, "");
7800         }
7801
7802         /* Tell LLVM to insert WQM instruction sequence when needed. */
7803         if (key->ps_prolog.wqm) {
7804                 LLVMAddTargetDependentFunctionAttr(func,
7805                                                    "amdgpu-ps-wqm-outputs", "");
7806         }
7807
7808         si_llvm_build_ret(ctx, ret);
7809 }
7810
7811 /**
7812  * Build the pixel shader epilog function. This handles everything that must be
7813  * emulated for pixel shader exports. (alpha-test, format conversions, etc)
7814  */
7815 static void si_build_ps_epilog_function(struct si_shader_context *ctx,
7816                                         union si_shader_part_key *key)
7817 {
7818         struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
7819         struct si_function_info fninfo;
7820         LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
7821         int i;
7822         struct si_ps_exports exp = {};
7823
7824         si_init_function_info(&fninfo);
7825
7826         /* Declare input SGPRs. */
7827         ctx->param_rw_buffers = add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
7828         ctx->param_bindless_samplers_and_images = add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
7829         ctx->param_const_and_shader_buffers = add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
7830         ctx->param_samplers_and_images = add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
7831         add_arg_checked(&fninfo, ARG_SGPR, ctx->f32, SI_PARAM_ALPHA_REF);
7832
7833         /* Declare input VGPRs. */
7834         unsigned required_num_params =
7835                      fninfo.num_sgpr_params +
7836                      util_bitcount(key->ps_epilog.colors_written) * 4 +
7837                      key->ps_epilog.writes_z +
7838                      key->ps_epilog.writes_stencil +
7839                      key->ps_epilog.writes_samplemask;
7840
7841         required_num_params = MAX2(required_num_params,
7842                                    fninfo.num_sgpr_params + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
7843
7844         while (fninfo.num_params < required_num_params)
7845                 add_arg(&fninfo, ARG_VGPR, ctx->f32);
7846
7847         /* Create the function. */
7848         si_create_function(ctx, "ps_epilog", NULL, 0, &fninfo, 0);
7849         /* Disable elimination of unused inputs. */
7850         ac_llvm_add_target_dep_function_attr(ctx->main_fn,
7851                                              "InitialPSInputAddr", 0xffffff);
7852
7853         /* Process colors. */
7854         unsigned vgpr = fninfo.num_sgpr_params;
7855         unsigned colors_written = key->ps_epilog.colors_written;
7856         int last_color_export = -1;
7857
7858         /* Find the last color export. */
7859         if (!key->ps_epilog.writes_z &&
7860             !key->ps_epilog.writes_stencil &&
7861             !key->ps_epilog.writes_samplemask) {
7862                 unsigned spi_format = key->ps_epilog.states.spi_shader_col_format;
7863
7864                 /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
7865                 if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
7866                         /* Just set this if any of the colorbuffers are enabled. */
7867                         if (spi_format &
7868                             ((1ull << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
7869                                 last_color_export = 0;
7870                 } else {
7871                         for (i = 0; i < 8; i++)
7872                                 if (colors_written & (1 << i) &&
7873                                     (spi_format >> (i * 4)) & 0xf)
7874                                         last_color_export = i;
7875                 }
7876         }
7877
7878         while (colors_written) {
7879                 LLVMValueRef color[4];
7880                 int mrt = u_bit_scan(&colors_written);
7881
7882                 for (i = 0; i < 4; i++)
7883                         color[i] = LLVMGetParam(ctx->main_fn, vgpr++);
7884
7885                 si_export_mrt_color(bld_base, color, mrt,
7886                                     fninfo.num_params - 1,
7887                                     mrt == last_color_export, &exp);
7888         }
7889
7890         /* Process depth, stencil, samplemask. */
7891         if (key->ps_epilog.writes_z)
7892                 depth = LLVMGetParam(ctx->main_fn, vgpr++);
7893         if (key->ps_epilog.writes_stencil)
7894                 stencil = LLVMGetParam(ctx->main_fn, vgpr++);
7895         if (key->ps_epilog.writes_samplemask)
7896                 samplemask = LLVMGetParam(ctx->main_fn, vgpr++);
7897
7898         if (depth || stencil || samplemask)
7899                 si_export_mrt_z(bld_base, depth, stencil, samplemask, &exp);
7900         else if (last_color_export == -1)
7901                 ac_build_export_null(&ctx->ac);
7902
7903         if (exp.num)
7904                 si_emit_ps_exports(ctx, &exp);
7905
7906         /* Compile. */
7907         LLVMBuildRetVoid(ctx->ac.builder);
7908 }
7909
7910 /**
7911  * Select and compile (or reuse) pixel shader parts (prolog & epilog).
7912  */
7913 static bool si_shader_select_ps_parts(struct si_screen *sscreen,
7914                                       struct si_compiler *compiler,
7915                                       struct si_shader *shader,
7916                                       struct pipe_debug_callback *debug)
7917 {
7918         union si_shader_part_key prolog_key;
7919         union si_shader_part_key epilog_key;
7920
7921         /* Get the prolog. */
7922         si_get_ps_prolog_key(shader, &prolog_key, true);
7923
7924         /* The prolog is a no-op if these aren't set. */
7925         if (si_need_ps_prolog(&prolog_key)) {
7926                 shader->prolog =
7927                         si_get_shader_part(sscreen, &sscreen->ps_prologs,
7928                                            PIPE_SHADER_FRAGMENT, true,
7929                                            &prolog_key, compiler, debug,
7930                                            si_build_ps_prolog_function,
7931                                            "Fragment Shader Prolog");
7932                 if (!shader->prolog)
7933                         return false;
7934         }
7935
7936         /* Get the epilog. */
7937         si_get_ps_epilog_key(shader, &epilog_key);
7938
7939         shader->epilog =
7940                 si_get_shader_part(sscreen, &sscreen->ps_epilogs,
7941                                    PIPE_SHADER_FRAGMENT, false,
7942                                    &epilog_key, compiler, debug,
7943                                    si_build_ps_epilog_function,
7944                                    "Fragment Shader Epilog");
7945         if (!shader->epilog)
7946                 return false;
7947
7948         /* Enable POS_FIXED_PT if polygon stippling is enabled. */
7949         if (shader->key.part.ps.prolog.poly_stipple) {
7950                 shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
7951                 assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
7952         }
7953
7954         /* Set up the enable bits for per-sample shading if needed. */
7955         if (shader->key.part.ps.prolog.force_persp_sample_interp &&
7956             (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
7957              G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7958                 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
7959                 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
7960                 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
7961         }
7962         if (shader->key.part.ps.prolog.force_linear_sample_interp &&
7963             (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
7964              G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7965                 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
7966                 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
7967                 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
7968         }
7969         if (shader->key.part.ps.prolog.force_persp_center_interp &&
7970             (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
7971              G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7972                 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
7973                 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
7974                 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
7975         }
7976         if (shader->key.part.ps.prolog.force_linear_center_interp &&
7977             (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
7978              G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7979                 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
7980                 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
7981                 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
7982         }
7983
7984         /* POW_W_FLOAT requires that one of the perspective weights is enabled. */
7985         if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
7986             !(shader->config.spi_ps_input_ena & 0xf)) {
7987                 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
7988                 assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
7989         }
7990
7991         /* At least one pair of interpolation weights must be enabled. */
7992         if (!(shader->config.spi_ps_input_ena & 0x7f)) {
7993                 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
7994                 assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
7995         }
7996
7997         /* Samplemask fixup requires the sample ID. */
7998         if (shader->key.part.ps.prolog.samplemask_log_ps_iter) {
7999                 shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1);
8000                 assert(G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr));
8001         }
8002
8003         /* The sample mask input is always enabled, because the API shader always
8004          * passes it through to the epilog. Disable it here if it's unused.
8005          */
8006         if (!shader->key.part.ps.epilog.poly_line_smoothing &&
8007             !shader->selector->info.reads_samplemask)
8008                 shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;
8009
8010         return true;
8011 }
8012
/**
 * Hook for the SPI barrier management bug workaround (currently disabled).
 *
 * The workaround itself would raise *lds_size to at least 8 on
 * CHIP_BONAIRE, CHIP_KABINI and CHIP_MULLINS, to make sure at least 4k of
 * LDS is in use for workgroup sizes of more than one wavefront.
 *
 * If tessellation is all offchip and on-chip GS isn't used, the workaround
 * is not needed, so this function deliberately does nothing: both
 * parameters are accepted for interface compatibility and *lds_size is
 * left unchanged. The previously unreachable code that followed an
 * unconditional return has been removed.
 */
void si_multiwave_lds_size_workaround(struct si_screen *sscreen,
                                      unsigned *lds_size)
{
        /* Intentionally a no-op; see the function comment. */
        (void)sscreen;
        (void)lds_size;
}
8030
8031 static void si_fix_resource_usage(struct si_screen *sscreen,
8032                                   struct si_shader *shader)
8033 {
8034         unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
8035
8036         shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
8037
8038         if (shader->selector->type == PIPE_SHADER_COMPUTE &&
8039             si_get_max_workgroup_size(shader) > 64) {
8040                 si_multiwave_lds_size_workaround(sscreen,
8041                                                  &shader->config.lds_size);
8042         }
8043 }
8044
8045 int si_shader_create(struct si_screen *sscreen, struct si_compiler *compiler,
8046                      struct si_shader *shader,
8047                      struct pipe_debug_callback *debug)
8048 {
8049         struct si_shader_selector *sel = shader->selector;
8050         struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key);
8051         int r;
8052
8053         /* LS, ES, VS are compiled on demand if the main part hasn't been
8054          * compiled for that stage.
8055          *
8056          * Vertex shaders are compiled on demand when a vertex fetch
8057          * workaround must be applied.
8058          */
8059         if (shader->is_monolithic) {
8060                 /* Monolithic shader (compiled as a whole, has many variants,
8061                  * may take a long time to compile).
8062                  */
8063                 r = si_compile_tgsi_shader(sscreen, compiler, shader, true, debug);
8064                 if (r)
8065                         return r;
8066         } else {
8067                 /* The shader consists of several parts:
8068                  *
8069                  * - the middle part is the user shader, it has 1 variant only
8070                  *   and it was compiled during the creation of the shader
8071                  *   selector
8072                  * - the prolog part is inserted at the beginning
8073                  * - the epilog part is inserted at the end
8074                  *
8075                  * The prolog and epilog have many (but simple) variants.
8076                  *
8077                  * Starting with gfx9, geometry and tessellation control
8078                  * shaders also contain the prolog and user shader parts of
8079                  * the previous shader stage.
8080                  */
8081
8082                 if (!mainp)
8083                         return -1;
8084
8085                 /* Copy the compiled TGSI shader data over. */
8086                 shader->is_binary_shared = true;
8087                 shader->binary = mainp->binary;
8088                 shader->config = mainp->config;
8089                 shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
8090                 shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
8091                 shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
8092                 shader->info.ancillary_vgpr_index = mainp->info.ancillary_vgpr_index;
8093                 memcpy(shader->info.vs_output_param_offset,
8094                        mainp->info.vs_output_param_offset,
8095                        sizeof(mainp->info.vs_output_param_offset));
8096                 shader->info.uses_instanceid = mainp->info.uses_instanceid;
8097                 shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
8098                 shader->info.nr_param_exports = mainp->info.nr_param_exports;
8099
8100                 /* Select prologs and/or epilogs. */
8101                 switch (sel->type) {
8102                 case PIPE_SHADER_VERTEX:
8103                         if (!si_shader_select_vs_parts(sscreen, compiler, shader, debug))
8104                                 return -1;
8105                         break;
8106                 case PIPE_SHADER_TESS_CTRL:
8107                         if (!si_shader_select_tcs_parts(sscreen, compiler, shader, debug))
8108                                 return -1;
8109                         break;
8110                 case PIPE_SHADER_TESS_EVAL:
8111                         break;
8112                 case PIPE_SHADER_GEOMETRY:
8113                         if (!si_shader_select_gs_parts(sscreen, compiler, shader, debug))
8114                                 return -1;
8115                         break;
8116                 case PIPE_SHADER_FRAGMENT:
8117                         if (!si_shader_select_ps_parts(sscreen, compiler, shader, debug))
8118                                 return -1;
8119
8120                         /* Make sure we have at least as many VGPRs as there
8121                          * are allocated inputs.
8122                          */
8123                         shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
8124                                                         shader->info.num_input_vgprs);
8125                         break;
8126                 }
8127
8128                 /* Update SGPR and VGPR counts. */
8129                 if (shader->prolog) {
8130                         shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
8131                                                         shader->prolog->config.num_sgprs);
8132                         shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
8133                                                         shader->prolog->config.num_vgprs);
8134                 }
8135                 if (shader->previous_stage) {
8136                         shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
8137                                                         shader->previous_stage->config.num_sgprs);
8138                         shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
8139                                                         shader->previous_stage->config.num_vgprs);
8140                         shader->config.spilled_sgprs =
8141                                 MAX2(shader->config.spilled_sgprs,
8142                                      shader->previous_stage->config.spilled_sgprs);
8143                         shader->config.spilled_vgprs =
8144                                 MAX2(shader->config.spilled_vgprs,
8145                                      shader->previous_stage->config.spilled_vgprs);
8146                         shader->config.private_mem_vgprs =
8147                                 MAX2(shader->config.private_mem_vgprs,
8148                                      shader->previous_stage->config.private_mem_vgprs);
8149                         shader->config.scratch_bytes_per_wave =
8150                                 MAX2(shader->config.scratch_bytes_per_wave,
8151                                      shader->previous_stage->config.scratch_bytes_per_wave);
8152                         shader->info.uses_instanceid |=
8153                                 shader->previous_stage->info.uses_instanceid;
8154                 }
8155                 if (shader->prolog2) {
8156                         shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
8157                                                         shader->prolog2->config.num_sgprs);
8158                         shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
8159                                                         shader->prolog2->config.num_vgprs);
8160                 }
8161                 if (shader->epilog) {
8162                         shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
8163                                                         shader->epilog->config.num_sgprs);
8164                         shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
8165                                                         shader->epilog->config.num_vgprs);
8166                 }
8167                 si_calculate_max_simd_waves(shader);
8168         }
8169
8170         si_fix_resource_usage(sscreen, shader);
8171         si_shader_dump(sscreen, shader, debug, sel->info.processor,
8172                        stderr, true);
8173
8174         /* Upload. */
8175         r = si_shader_binary_upload(sscreen, shader);
8176         if (r) {
8177                 fprintf(stderr, "LLVM failed to upload shader\n");
8178                 return r;
8179         }
8180
8181         return 0;
8182 }
8183
8184 void si_shader_destroy(struct si_shader *shader)
8185 {
8186         if (shader->scratch_bo)
8187                 r600_resource_reference(&shader->scratch_bo, NULL);
8188
8189         r600_resource_reference(&shader->bo, NULL);
8190
8191         if (!shader->is_binary_shared)
8192                 ac_shader_binary_clean(&shader->binary);
8193
8194         free(shader->shader_log);
8195 }