src/gallium/drivers/radeonsi/si_shader.c

   1 /*
   2  * Copyright 2012 Advanced Micro Devices, Inc.
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * on the rights to use, copy, modify, merge, publish, distribute, sub
   8  * license, and/or sell copies of the Software, and to permit persons to whom
   9  * the Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  18  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
  19  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  20  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  21  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  22  */
  23
  24 #include "gallivm/lp_bld_const.h"
  25 #include "gallivm/lp_bld_gather.h"
  26 #include "gallivm/lp_bld_intr.h"
  27 #include "gallivm/lp_bld_logic.h"
  28 #include "gallivm/lp_bld_arit.h"
  29 #include "gallivm/lp_bld_flow.h"
  30 #include "gallivm/lp_bld_misc.h"
  31 #include "util/u_memory.h"
  32 #include "util/u_string.h"
  33 #include "tgsi/tgsi_build.h"
  34 #include "tgsi/tgsi_util.h"
  35 #include "tgsi/tgsi_dump.h"
  36
  37 #include "ac_binary.h"
  38 #include "ac_llvm_util.h"
  39 #include "ac_exp_param.h"
  40 #include "si_shader_internal.h"
  41 #include "si_pipe.h"
  42 #include "sid.h"
  43
  44 #include "compiler/nir/nir.h"
  45
  46 static const char *scratch_rsrc_dword0_symbol =
  47         "SCRATCH_RSRC_DWORD0";
  48
  49 static const char *scratch_rsrc_dword1_symbol =
  50         "SCRATCH_RSRC_DWORD1";
  51
  52 struct si_shader_output_values
  53 {
  54         LLVMValueRef values[4];
  55         unsigned semantic_name;
  56         unsigned semantic_index;
  57         ubyte vertex_stream[4];
  58 };
  59
  60 /**
  61  * Used to collect types and other info about arguments of the LLVM function
  62  * before the function is created.
  63  */
  64 struct si_function_info {
  65         LLVMTypeRef types[100];
  66         LLVMValueRef *assign[100];
  67         unsigned num_sgpr_params;
  68         unsigned num_params;
  69 };
  70
  71 enum si_arg_regfile {
  72         ARG_SGPR,
  73         ARG_VGPR
  74 };
  75
  76 static void si_init_shader_ctx(struct si_shader_context *ctx,
  77                                struct si_screen *sscreen,
  78                                LLVMTargetMachineRef tm);
  79
  80 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
  81                                  struct lp_build_tgsi_context *bld_base,
  82                                  struct lp_build_emit_data *emit_data);
  83
  84 static void si_dump_shader_key(unsigned processor, const struct si_shader *shader,
  85                                FILE *f);
  86
  87 static void si_build_vs_prolog_function(struct si_shader_context *ctx,
  88                                         union si_shader_part_key *key);
  89 static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
  90                                          union si_shader_part_key *key);
  91 static void si_build_ps_prolog_function(struct si_shader_context *ctx,
  92                                         union si_shader_part_key *key);
  93 static void si_build_ps_epilog_function(struct si_shader_context *ctx,
  94                                         union si_shader_part_key *key);
  95
  96 /* Ideally pass the sample mask input to the PS epilog as v14, which
  97  * is its usual location, so that the shader doesn't have to add v_mov.
  98  */
  99 #define PS_EPILOG_SAMPLEMASK_MIN_LOC 14
 100
 101 enum {
 102         CONST_ADDR_SPACE = 2,
 103         LOCAL_ADDR_SPACE = 3,
 104 };
 105
 106 static bool llvm_type_is_64bit(struct si_shader_context *ctx,
 107                                LLVMTypeRef type)
 108 {
 109         if (type == ctx->ac.i64 || type == ctx->ac.f64)
 110                 return true;
 111
 112         return false;
 113 }
 114
 115 static bool is_merged_shader(struct si_shader *shader)
 116 {
 117         if (shader->selector->screen->info.chip_class <= VI)
 118                 return false;
 119
 120         return shader->key.as_ls ||
 121                shader->key.as_es ||
 122                shader->selector->type == PIPE_SHADER_TESS_CTRL ||
 123                shader->selector->type == PIPE_SHADER_GEOMETRY;
 124 }
 125
 126 static void si_init_function_info(struct si_function_info *fninfo)
 127 {
 128         fninfo->num_params = 0;
 129         fninfo->num_sgpr_params = 0;
 130 }
 131
 132 static unsigned add_arg_assign(struct si_function_info *fninfo,
 133                         enum si_arg_regfile regfile, LLVMTypeRef type,
 134                         LLVMValueRef *assign)
 135 {
 136         assert(regfile != ARG_SGPR || fninfo->num_sgpr_params == fninfo->num_params);
 137
 138         unsigned idx = fninfo->num_params++;
 139         assert(idx < ARRAY_SIZE(fninfo->types));
 140
 141         if (regfile == ARG_SGPR)
 142                 fninfo->num_sgpr_params = fninfo->num_params;
 143
 144         fninfo->types[idx] = type;
 145         fninfo->assign[idx] = assign;
 146         return idx;
 147 }
 148
 149 static unsigned add_arg(struct si_function_info *fninfo,
 150                         enum si_arg_regfile regfile, LLVMTypeRef type)
 151 {
 152         return add_arg_assign(fninfo, regfile, type, NULL);
 153 }
 154
 155 static void add_arg_assign_checked(struct si_function_info *fninfo,
 156                                    enum si_arg_regfile regfile, LLVMTypeRef type,
 157                                    LLVMValueRef *assign, unsigned idx)
 158 {
 159         MAYBE_UNUSED unsigned actual = add_arg_assign(fninfo, regfile, type, assign);
 160         assert(actual == idx);
 161 }
 162
 163 static void add_arg_checked(struct si_function_info *fninfo,
 164                             enum si_arg_regfile regfile, LLVMTypeRef type,
 165                             unsigned idx)
 166 {
 167         add_arg_assign_checked(fninfo, regfile, type, NULL, idx);
 168 }
 169
 170 /**
 171  * Returns a unique index for a per-patch semantic name and index. The index
 172  * must be less than 32, so that a 32-bit bitmask of used inputs or outputs
 173  * can be calculated.
 174  */
 175 unsigned si_shader_io_get_unique_index_patch(unsigned semantic_name, unsigned index)
 176 {
 177         switch (semantic_name) {
 178         case TGSI_SEMANTIC_TESSOUTER:
 179                 return 0;
 180         case TGSI_SEMANTIC_TESSINNER:
 181                 return 1;
 182         case TGSI_SEMANTIC_PATCH:
 183                 assert(index < 30);
 184                 return 2 + index;
 185
 186         default:
 187                 assert(!"invalid semantic name");
 188                 return 0;
 189         }
 190 }
 191
 192 /**
 193  * Returns a unique index for a semantic name and index. The index must be
 194  * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
 195  * calculated.
 196  */
 197 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
 198 {
 199         switch (semantic_name) {
 200         case TGSI_SEMANTIC_POSITION:
 201                 return 0;
 202         case TGSI_SEMANTIC_GENERIC:
 203                 /* Since some shader stages use the the highest used IO index
 204                  * to determine the size to allocate for inputs/outputs
 205                  * (in LDS, tess and GS rings). GENERIC should be placed right
 206                  * after POSITION to make that size as small as possible.
 207                  */
 208                 if (index < SI_MAX_IO_GENERIC)
 209                         return 1 + index;
 210
 211                 assert(!"invalid generic index");
 212                 return 0;
 213         case TGSI_SEMANTIC_PSIZE:
 214                 return SI_MAX_IO_GENERIC + 1;
 215         case TGSI_SEMANTIC_CLIPDIST:
 216                 assert(index <= 1);
 217                 return SI_MAX_IO_GENERIC + 2 + index;
 218         case TGSI_SEMANTIC_FOG:
 219                 return SI_MAX_IO_GENERIC + 4;
 220         case TGSI_SEMANTIC_LAYER:
 221                 return SI_MAX_IO_GENERIC + 5;
 222         case TGSI_SEMANTIC_VIEWPORT_INDEX:
 223                 return SI_MAX_IO_GENERIC + 6;
 224         case TGSI_SEMANTIC_PRIMID:
 225                 return SI_MAX_IO_GENERIC + 7;
 226         case TGSI_SEMANTIC_COLOR: /* these alias */
 227         case TGSI_SEMANTIC_BCOLOR:
 228                 assert(index < 2);
 229                 return SI_MAX_IO_GENERIC + 8 + index;
 230         case TGSI_SEMANTIC_TEXCOORD:
 231                 assert(index < 8);
 232                 assert(SI_MAX_IO_GENERIC + 10 + index < 64);
 233                 return SI_MAX_IO_GENERIC + 10 + index;
 234         default:
 235                 assert(!"invalid semantic name");
 236                 return 0;
 237         }
 238 }
 239
 240 /**
 241  * Get the value of a shader input parameter and extract a bitfield.
 242  */
 243 static LLVMValueRef unpack_param(struct si_shader_context *ctx,
 244                                  unsigned param, unsigned rshift,
 245                                  unsigned bitwidth)
 246 {
 247         LLVMValueRef value = LLVMGetParam(ctx->main_fn,
 248                                           param);
 249
 250         if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
 251                 value = ac_to_integer(&ctx->ac, value);
 252
 253         if (rshift)
 254                 value = LLVMBuildLShr(ctx->ac.builder, value,
 255                                       LLVMConstInt(ctx->i32, rshift, 0), "");
 256
 257         if (rshift + bitwidth < 32) {
 258                 unsigned mask = (1 << bitwidth) - 1;
 259                 value = LLVMBuildAnd(ctx->ac.builder, value,
 260                                      LLVMConstInt(ctx->i32, mask, 0), "");
 261         }
 262
 263         return value;
 264 }
 265
 266 static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx)
 267 {
 268         switch (ctx->type) {
 269         case PIPE_SHADER_TESS_CTRL:
 270                 return unpack_param(ctx, ctx->param_tcs_rel_ids, 0, 8);
 271
 272         case PIPE_SHADER_TESS_EVAL:
 273                 return LLVMGetParam(ctx->main_fn,
 274                                     ctx->param_tes_rel_patch_id);
 275
 276         default:
 277                 assert(0);
 278                 return NULL;
 279         }
 280 }
 281
 282 /* Tessellation shaders pass outputs to the next shader using LDS.
 283  *
 284  * LS outputs = TCS inputs
 285  * TCS outputs = TES inputs
 286  *
 287  * The LDS layout is:
 288  * - TCS inputs for patch 0
 289  * - TCS inputs for patch 1
 290  * - TCS inputs for patch 2             = get_tcs_in_current_patch_offset (if RelPatchID==2)
 291  * - ...
 292  * - TCS outputs for patch 0            = get_tcs_out_patch0_offset
 293  * - Per-patch TCS outputs for patch 0  = get_tcs_out_patch0_patch_data_offset
 294  * - TCS outputs for patch 1
 295  * - Per-patch TCS outputs for patch 1
 296  * - TCS outputs for patch 2            = get_tcs_out_current_patch_offset (if RelPatchID==2)
 297  * - Per-patch TCS outputs for patch 2  = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
 298  * - ...
 299  *
 300  * All three shaders VS(LS), TCS, TES share the same LDS space.
 301  */
 302
 303 static LLVMValueRef
 304 get_tcs_in_patch_stride(struct si_shader_context *ctx)
 305 {
 306         return unpack_param(ctx, ctx->param_vs_state_bits, 8, 13);
 307 }
 308
 309 static unsigned get_tcs_out_vertex_dw_stride_constant(struct si_shader_context *ctx)
 310 {
 311         assert(ctx->type == PIPE_SHADER_TESS_CTRL);
 312
 313         if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy)
 314                 return util_last_bit64(ctx->shader->key.mono.u.ff_tcs_inputs_to_copy) * 4;
 315
 316         return util_last_bit64(ctx->shader->selector->outputs_written) * 4;
 317 }
 318
 319 static LLVMValueRef get_tcs_out_vertex_dw_stride(struct si_shader_context *ctx)
 320 {
 321         unsigned stride = get_tcs_out_vertex_dw_stride_constant(ctx);
 322
 323         return LLVMConstInt(ctx->i32, stride, 0);
 324 }
 325
 326 static LLVMValueRef get_tcs_out_patch_stride(struct si_shader_context *ctx)
 327 {
 328         if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy)
 329                 return unpack_param(ctx, ctx->param_tcs_out_lds_layout, 0, 13);
 330
 331         const struct tgsi_shader_info *info = &ctx->shader->selector->info;
 332         unsigned tcs_out_vertices = info->properties[TGSI_PROPERTY_TCS_VERTICES_OUT];
 333         unsigned vertex_dw_stride = get_tcs_out_vertex_dw_stride_constant(ctx);
 334         unsigned num_patch_outputs = util_last_bit64(ctx->shader->selector->patch_outputs_written);
 335         unsigned patch_dw_stride = tcs_out_vertices * vertex_dw_stride +
 336                                    num_patch_outputs * 4;
 337         return LLVMConstInt(ctx->i32, patch_dw_stride, 0);
 338 }
 339
 340 static LLVMValueRef
 341 get_tcs_out_patch0_offset(struct si_shader_context *ctx)
 342 {
 343         return lp_build_mul_imm(&ctx->bld_base.uint_bld,
 344                                 unpack_param(ctx,
 345                                              ctx->param_tcs_out_lds_offsets,
 346                                              0, 16),
 347                                 4);
 348 }
 349
 350 static LLVMValueRef
 351 get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
 352 {
 353         return lp_build_mul_imm(&ctx->bld_base.uint_bld,
 354                                 unpack_param(ctx,
 355                                              ctx->param_tcs_out_lds_offsets,
 356                                              16, 16),
 357                                 4);
 358 }
 359
 360 static LLVMValueRef
 361 get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
 362 {
 363         LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
 364         LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
 365
 366         return LLVMBuildMul(ctx->ac.builder, patch_stride, rel_patch_id, "");
 367 }
 368
 369 static LLVMValueRef
 370 get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
 371 {
 372         LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
 373         LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
 374         LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
 375
 376         return LLVMBuildAdd(ctx->ac.builder, patch0_offset,
 377                             LLVMBuildMul(ctx->ac.builder, patch_stride,
 378                                          rel_patch_id, ""),
 379                             "");
 380 }
 381
 382 static LLVMValueRef
 383 get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
 384 {
 385         LLVMValueRef patch0_patch_data_offset =
 386                 get_tcs_out_patch0_patch_data_offset(ctx);
 387         LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
 388         LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
 389
 390         return LLVMBuildAdd(ctx->ac.builder, patch0_patch_data_offset,
 391                             LLVMBuildMul(ctx->ac.builder, patch_stride,
 392                                          rel_patch_id, ""),
 393                             "");
 394 }
 395
 396 static LLVMValueRef get_num_tcs_out_vertices(struct si_shader_context *ctx)
 397 {
 398         unsigned tcs_out_vertices =
 399                 ctx->shader->selector ?
 400                 ctx->shader->selector->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT] : 0;
 401
 402         /* If !tcs_out_vertices, it's either the fixed-func TCS or the TCS epilog. */
 403         if (ctx->type == PIPE_SHADER_TESS_CTRL && tcs_out_vertices)
 404                 return LLVMConstInt(ctx->i32, tcs_out_vertices, 0);
 405
 406         return unpack_param(ctx, ctx->param_tcs_offchip_layout, 6, 6);
 407 }
 408
 409 static LLVMValueRef get_tcs_in_vertex_dw_stride(struct si_shader_context *ctx)
 410 {
 411         unsigned stride;
 412
 413         switch (ctx->type) {
 414         case PIPE_SHADER_VERTEX:
 415                 stride = util_last_bit64(ctx->shader->selector->outputs_written);
 416                 return LLVMConstInt(ctx->i32, stride * 4, 0);
 417
 418         case PIPE_SHADER_TESS_CTRL:
 419                 if (ctx->screen->info.chip_class >= GFX9 &&
 420                     ctx->shader->is_monolithic) {
 421                         stride = util_last_bit64(ctx->shader->key.part.tcs.ls->outputs_written);
 422                         return LLVMConstInt(ctx->i32, stride * 4, 0);
 423                 }
 424                 return unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
 425
 426         default:
 427                 assert(0);
 428                 return NULL;
 429         }
 430 }
 431
 432 static LLVMValueRef get_instance_index_for_fetch(
 433         struct si_shader_context *ctx,
 434         unsigned param_start_instance, LLVMValueRef divisor)
 435 {
 436         LLVMValueRef result = ctx->abi.instance_id;
 437
 438         /* The division must be done before START_INSTANCE is added. */
 439         if (divisor != ctx->i32_1)
 440                 result = LLVMBuildUDiv(ctx->ac.builder, result, divisor, "");
 441
 442         return LLVMBuildAdd(ctx->ac.builder, result,
 443                             LLVMGetParam(ctx->main_fn, param_start_instance), "");
 444 }
 445
 446 /* Bitcast <4 x float> to <2 x double>, extract the component, and convert
 447  * to float. */
 448 static LLVMValueRef extract_double_to_float(struct si_shader_context *ctx,
 449                                             LLVMValueRef vec4,
 450                                             unsigned double_index)
 451 {
 452         LLVMBuilderRef builder = ctx->ac.builder;
 453         LLVMTypeRef f64 = LLVMDoubleTypeInContext(ctx->ac.context);
 454         LLVMValueRef dvec2 = LLVMBuildBitCast(builder, vec4,
 455                                               LLVMVectorType(f64, 2), "");
 456         LLVMValueRef index = LLVMConstInt(ctx->i32, double_index, 0);
 457         LLVMValueRef value = LLVMBuildExtractElement(builder, dvec2, index, "");
 458         return LLVMBuildFPTrunc(builder, value, ctx->f32, "");
 459 }
 460
 461 static LLVMValueRef unpack_sint16(struct si_shader_context *ctx,
 462                                  LLVMValueRef i32, unsigned index)
 463 {
 464         assert(index <= 1);
 465
 466         if (index == 1)
 467                 return LLVMBuildAShr(ctx->ac.builder, i32,
 468                                      LLVMConstInt(ctx->i32, 16, 0), "");
 469
 470         return LLVMBuildSExt(ctx->ac.builder,
 471                              LLVMBuildTrunc(ctx->ac.builder, i32,
 472                                             ctx->ac.i16, ""),
 473                              ctx->i32, "");
 474 }
 475
 476 void si_llvm_load_input_vs(
 477         struct si_shader_context *ctx,
 478         unsigned input_index,
 479         LLVMValueRef out[4])
 480 {
 481         unsigned vs_blit_property =
 482                 ctx->shader->selector->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS];
 483
 484         if (vs_blit_property) {
 485                 LLVMValueRef vertex_id = ctx->abi.vertex_id;
 486                 LLVMValueRef sel_x1 = LLVMBuildICmp(ctx->ac.builder,
 487                                                     LLVMIntULE, vertex_id,
 488                                                     ctx->i32_1, "");
 489                 /* Use LLVMIntNE, because we have 3 vertices and only
 490                  * the middle one should use y2.
 491                  */
 492                 LLVMValueRef sel_y1 = LLVMBuildICmp(ctx->ac.builder,
 493                                                     LLVMIntNE, vertex_id,
 494                                                     ctx->i32_1, "");
 495
 496                 if (input_index == 0) {
 497                         /* Position: */
 498                         LLVMValueRef x1y1 = LLVMGetParam(ctx->main_fn,
 499                                                          ctx->param_vs_blit_inputs);
 500                         LLVMValueRef x2y2 = LLVMGetParam(ctx->main_fn,
 501                                                          ctx->param_vs_blit_inputs + 1);
 502
 503                         LLVMValueRef x1 = unpack_sint16(ctx, x1y1, 0);
 504                         LLVMValueRef y1 = unpack_sint16(ctx, x1y1, 1);
 505                         LLVMValueRef x2 = unpack_sint16(ctx, x2y2, 0);
 506                         LLVMValueRef y2 = unpack_sint16(ctx, x2y2, 1);
 507
 508                         LLVMValueRef x = LLVMBuildSelect(ctx->ac.builder, sel_x1,
 509                                                          x1, x2, "");
 510                         LLVMValueRef y = LLVMBuildSelect(ctx->ac.builder, sel_y1,
 511                                                          y1, y2, "");
 512
 513                         out[0] = LLVMBuildSIToFP(ctx->ac.builder, x, ctx->f32, "");
 514                         out[1] = LLVMBuildSIToFP(ctx->ac.builder, y, ctx->f32, "");
 515                         out[2] = LLVMGetParam(ctx->main_fn,
 516                                               ctx->param_vs_blit_inputs + 2);
 517                         out[3] = ctx->ac.f32_1;
 518                         return;
 519                 }
 520
 521                 /* Color or texture coordinates: */
 522                 assert(input_index == 1);
 523
 524                 if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
 525                         for (int i = 0; i < 4; i++) {
 526                                 out[i] = LLVMGetParam(ctx->main_fn,
 527                                                       ctx->param_vs_blit_inputs + 3 + i);
 528                         }
 529                 } else {
 530                         assert(vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD);
 531                         LLVMValueRef x1 = LLVMGetParam(ctx->main_fn,
 532                                                        ctx->param_vs_blit_inputs + 3);
 533                         LLVMValueRef y1 = LLVMGetParam(ctx->main_fn,
 534                                                        ctx->param_vs_blit_inputs + 4);
 535                         LLVMValueRef x2 = LLVMGetParam(ctx->main_fn,
 536                                                        ctx->param_vs_blit_inputs + 5);
 537                         LLVMValueRef y2 = LLVMGetParam(ctx->main_fn,
 538                                                        ctx->param_vs_blit_inputs + 6);
 539
 540                         out[0] = LLVMBuildSelect(ctx->ac.builder, sel_x1,
 541                                                  x1, x2, "");
 542                         out[1] = LLVMBuildSelect(ctx->ac.builder, sel_y1,
 543                                                  y1, y2, "");
 544                         out[2] = LLVMGetParam(ctx->main_fn,
 545                                               ctx->param_vs_blit_inputs + 7);
 546                         out[3] = LLVMGetParam(ctx->main_fn,
 547                                               ctx->param_vs_blit_inputs + 8);
 548                 }
 549                 return;
 550         }
 551
 552         unsigned chan;
 553         unsigned fix_fetch;
 554         unsigned num_fetches;
 555         unsigned fetch_stride;
 556
 557         LLVMValueRef t_list_ptr;
 558         LLVMValueRef t_offset;
 559         LLVMValueRef t_list;
 560         LLVMValueRef vertex_index;
 561         LLVMValueRef input[3];
 562
 563         /* Load the T list */
 564         t_list_ptr = LLVMGetParam(ctx->main_fn, ctx->param_vertex_buffers);
 565
 566         t_offset = LLVMConstInt(ctx->i32, input_index, 0);
 567
 568         t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset);
 569
 570         vertex_index = LLVMGetParam(ctx->main_fn,
 571                                     ctx->param_vertex_index0 +
 572                                     input_index);
 573
 574         fix_fetch = ctx->shader->key.mono.vs_fix_fetch[input_index];
 575
 576         /* Do multiple loads for special formats. */
 577         switch (fix_fetch) {
 578         case SI_FIX_FETCH_RGB_64_FLOAT:
 579                 num_fetches = 3; /* 3 2-dword loads */
 580                 fetch_stride = 8;
 581                 break;
 582         case SI_FIX_FETCH_RGBA_64_FLOAT:
 583                 num_fetches = 2; /* 2 4-dword loads */
 584                 fetch_stride = 16;
 585                 break;
 586         case SI_FIX_FETCH_RGB_8:
 587         case SI_FIX_FETCH_RGB_8_INT:
 588                 num_fetches = 3;
 589                 fetch_stride = 1;
 590                 break;
 591         case SI_FIX_FETCH_RGB_16:
 592         case SI_FIX_FETCH_RGB_16_INT:
 593                 num_fetches = 3;
 594                 fetch_stride = 2;
 595                 break;
 596         default:
 597                 num_fetches = 1;
 598                 fetch_stride = 0;
 599         }
 600
 601         for (unsigned i = 0; i < num_fetches; i++) {
 602                 LLVMValueRef voffset = LLVMConstInt(ctx->i32, fetch_stride * i, 0);
 603
 604                 input[i] = ac_build_buffer_load_format(&ctx->ac, t_list,
 605                                                        vertex_index, voffset,
 606                                                        true);
 607         }
 608
 609         /* Break up the vec4 into individual components */
 610         for (chan = 0; chan < 4; chan++) {
 611                 LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);
 612                 out[chan] = LLVMBuildExtractElement(ctx->ac.builder,
 613                                                     input[0], llvm_chan, "");
 614         }
 615
 616         switch (fix_fetch) {
 617         case SI_FIX_FETCH_A2_SNORM:
 618         case SI_FIX_FETCH_A2_SSCALED:
 619         case SI_FIX_FETCH_A2_SINT: {
 620                 /* The hardware returns an unsigned value; convert it to a
 621                  * signed one.
 622                  */
 623                 LLVMValueRef tmp = out[3];
 624                 LLVMValueRef c30 = LLVMConstInt(ctx->i32, 30, 0);
 625
 626                 /* First, recover the sign-extended signed integer value. */
 627                 if (fix_fetch == SI_FIX_FETCH_A2_SSCALED)
 628                         tmp = LLVMBuildFPToUI(ctx->ac.builder, tmp, ctx->i32, "");
 629                 else
 630                         tmp = ac_to_integer(&ctx->ac, tmp);
 631
 632                 /* For the integer-like cases, do a natural sign extension.
 633                  *
 634                  * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
 635                  * and happen to contain 0, 1, 2, 3 as the two LSBs of the
 636                  * exponent.
 637                  */
 638                 tmp = LLVMBuildShl(ctx->ac.builder, tmp,
 639                                    fix_fetch == SI_FIX_FETCH_A2_SNORM ?
 640                                    LLVMConstInt(ctx->i32, 7, 0) : c30, "");
 641                 tmp = LLVMBuildAShr(ctx->ac.builder, tmp, c30, "");
 642
 643                 /* Convert back to the right type. */
 644                 if (fix_fetch == SI_FIX_FETCH_A2_SNORM) {
 645                         LLVMValueRef clamp;
 646                         LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
 647                         tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->f32, "");
 648                         clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, tmp, neg_one, "");
 649                         tmp = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, tmp, "");
 650                 } else if (fix_fetch == SI_FIX_FETCH_A2_SSCALED) {
 651                         tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->f32, "");
 652                 }
 653
 654                 out[3] = tmp;
 655                 break;
 656         }
 657         case SI_FIX_FETCH_RGBA_32_UNORM:
 658         case SI_FIX_FETCH_RGBX_32_UNORM:
 659                 for (chan = 0; chan < 4; chan++) {
 660                         out[chan] = ac_to_integer(&ctx->ac, out[chan]);
 661                         out[chan] = LLVMBuildUIToFP(ctx->ac.builder,
 662                                                     out[chan], ctx->f32, "");
 663                         out[chan] = LLVMBuildFMul(ctx->ac.builder, out[chan],
 664                                                   LLVMConstReal(ctx->f32, 1.0 / UINT_MAX), "");
 665                 }
 666                 /* RGBX UINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
 667                 if (fix_fetch == SI_FIX_FETCH_RGBX_32_UNORM)
 668                         out[3] = LLVMConstReal(ctx->f32, 1);
 669                 break;
 670         case SI_FIX_FETCH_RGBA_32_SNORM:
 671         case SI_FIX_FETCH_RGBX_32_SNORM:
 672         case SI_FIX_FETCH_RGBA_32_FIXED:
 673         case SI_FIX_FETCH_RGBX_32_FIXED: {
 674                 double scale;
 675                 if (fix_fetch >= SI_FIX_FETCH_RGBA_32_FIXED)
 676                         scale = 1.0 / 0x10000;
 677                 else
 678                         scale = 1.0 / INT_MAX;
 679
 680                 for (chan = 0; chan < 4; chan++) {
 681                         out[chan] = ac_to_integer(&ctx->ac, out[chan]);
 682                         out[chan] = LLVMBuildSIToFP(ctx->ac.builder,
 683                                                     out[chan], ctx->f32, "");
 684                         out[chan] = LLVMBuildFMul(ctx->ac.builder, out[chan],
 685                                                   LLVMConstReal(ctx->f32, scale), "");
 686                 }
 687                 /* RGBX SINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
 688                 if (fix_fetch == SI_FIX_FETCH_RGBX_32_SNORM ||
 689                     fix_fetch == SI_FIX_FETCH_RGBX_32_FIXED)
 690                         out[3] = LLVMConstReal(ctx->f32, 1);
 691                 break;
 692         }
 693         case SI_FIX_FETCH_RGBA_32_USCALED:
 694                 for (chan = 0; chan < 4; chan++) {
 695                         out[chan] = ac_to_integer(&ctx->ac, out[chan]);
 696                         out[chan] = LLVMBuildUIToFP(ctx->ac.builder,
 697                                                     out[chan], ctx->f32, "");
 698                 }
 699                 break;
 700         case SI_FIX_FETCH_RGBA_32_SSCALED:
 701                 for (chan = 0; chan < 4; chan++) {
 702                         out[chan] = ac_to_integer(&ctx->ac, out[chan]);
 703                         out[chan] = LLVMBuildSIToFP(ctx->ac.builder,
 704                                                     out[chan], ctx->f32, "");
 705                 }
 706                 break;
 707         case SI_FIX_FETCH_RG_64_FLOAT:
 708                 for (chan = 0; chan < 2; chan++)
 709                         out[chan] = extract_double_to_float(ctx, input[0], chan);
 710
 711                 out[2] = LLVMConstReal(ctx->f32, 0);
 712                 out[3] = LLVMConstReal(ctx->f32, 1);
 713                 break;
 714         case SI_FIX_FETCH_RGB_64_FLOAT:
 715                 for (chan = 0; chan < 3; chan++)
 716                         out[chan] = extract_double_to_float(ctx, input[chan], 0);
 717
 718                 out[3] = LLVMConstReal(ctx->f32, 1);
 719                 break;
 720         case SI_FIX_FETCH_RGBA_64_FLOAT:
 721                 for (chan = 0; chan < 4; chan++) {
 722                         out[chan] = extract_double_to_float(ctx, input[chan / 2],
 723                                                             chan % 2);
 724                 }
 725                 break;
 726         case SI_FIX_FETCH_RGB_8:
 727         case SI_FIX_FETCH_RGB_8_INT:
 728         case SI_FIX_FETCH_RGB_16:
 729         case SI_FIX_FETCH_RGB_16_INT:
 730                 for (chan = 0; chan < 3; chan++) {
 731                         out[chan] = LLVMBuildExtractElement(ctx->ac.builder,
 732                                                             input[chan],
 733                                                             ctx->i32_0, "");
 734                 }
 735                 if (fix_fetch == SI_FIX_FETCH_RGB_8 ||
 736                     fix_fetch == SI_FIX_FETCH_RGB_16) {
 737                         out[3] = LLVMConstReal(ctx->f32, 1);
 738                 } else {
 739                         out[3] = ac_to_float(&ctx->ac, ctx->i32_1);
 740                 }
 741                 break;
 742         }
 743 }
 744
 745 static void declare_input_vs(
 746         struct si_shader_context *ctx,
 747         unsigned input_index,
 748         const struct tgsi_full_declaration *decl,
 749         LLVMValueRef out[4])
 750 {
 751         si_llvm_load_input_vs(ctx, input_index, out);
 752 }
 753
 754 static LLVMValueRef get_primitive_id(struct si_shader_context *ctx,
 755                                      unsigned swizzle)
 756 {
 757         if (swizzle > 0)
 758                 return ctx->i32_0;
 759
 760         switch (ctx->type) {
 761         case PIPE_SHADER_VERTEX:
 762                 return LLVMGetParam(ctx->main_fn,
 763                                     ctx->param_vs_prim_id);
 764         case PIPE_SHADER_TESS_CTRL:
 765                 return LLVMGetParam(ctx->main_fn,
 766                                     ctx->param_tcs_patch_id);
 767         case PIPE_SHADER_TESS_EVAL:
 768                 return LLVMGetParam(ctx->main_fn,
 769                                     ctx->param_tes_patch_id);
 770         case PIPE_SHADER_GEOMETRY:
 771                 return ctx->abi.gs_prim_id;
 772         default:
 773                 assert(0);
 774                 return ctx->i32_0;
 775         }
 776 }
 777
 778 /**
 779  * Return the value of tgsi_ind_register for indexing.
 780  * This is the indirect index with the constant offset added to it.
 781  */
 782 LLVMValueRef si_get_indirect_index(struct si_shader_context *ctx,
 783                                    const struct tgsi_ind_register *ind,
 784                                    unsigned addr_mul,
 785                                    int rel_index)
 786 {
 787         LLVMValueRef result;
 788
 789         if (ind->File == TGSI_FILE_ADDRESS) {
 790                 result = ctx->addrs[ind->Index][ind->Swizzle];
 791                 result = LLVMBuildLoad(ctx->ac.builder, result, "");
 792         } else {
 793                 struct tgsi_full_src_register src = {};
 794
 795                 src.Register.File = ind->File;
 796                 src.Register.Index = ind->Index;
 797
 798                 /* Set the second index to 0 for constants. */
 799                 if (ind->File == TGSI_FILE_CONSTANT)
 800                         src.Register.Dimension = 1;
 801
 802                 result = ctx->bld_base.emit_fetch_funcs[ind->File](&ctx->bld_base, &src,
 803                                                                    TGSI_TYPE_SIGNED,
 804                                                                    ind->Swizzle);
 805                 result = ac_to_integer(&ctx->ac, result);
 806         }
 807
 808         if (addr_mul != 1)
 809                 result = LLVMBuildMul(ctx->ac.builder, result,
 810                                       LLVMConstInt(ctx->i32, addr_mul, 0), "");
 811         result = LLVMBuildAdd(ctx->ac.builder, result,
 812                               LLVMConstInt(ctx->i32, rel_index, 0), "");
 813         return result;
 814 }
 815
 816 /**
 817  * Like si_get_indirect_index, but restricts the return value to a (possibly
 818  * undefined) value inside [0..num).
 819  */
 820 LLVMValueRef si_get_bounded_indirect_index(struct si_shader_context *ctx,
 821                                            const struct tgsi_ind_register *ind,
 822                                            int rel_index, unsigned num)
 823 {
 824         LLVMValueRef result = si_get_indirect_index(ctx, ind, 1, rel_index);
 825
 826         return si_llvm_bound_index(ctx, result, num);
 827 }
 828
 829
 830 /**
 831  * Calculate a dword address given an input or output register and a stride.
 832  */
 833 static LLVMValueRef get_dw_address(struct si_shader_context *ctx,
 834                                    const struct tgsi_full_dst_register *dst,
 835                                    const struct tgsi_full_src_register *src,
 836                                    LLVMValueRef vertex_dw_stride,
 837                                    LLVMValueRef base_addr)
 838 {
 839         struct tgsi_shader_info *info = &ctx->shader->selector->info;
 840         ubyte *name, *index, *array_first;
 841         int first, param;
 842         struct tgsi_full_dst_register reg;
 843
 844         /* Set the register description. The address computation is the same
 845          * for sources and destinations. */
 846         if (src) {
 847                 reg.Register.File = src->Register.File;
 848                 reg.Register.Index = src->Register.Index;
 849                 reg.Register.Indirect = src->Register.Indirect;
 850                 reg.Register.Dimension = src->Register.Dimension;
 851                 reg.Indirect = src->Indirect;
 852                 reg.Dimension = src->Dimension;
 853                 reg.DimIndirect = src->DimIndirect;
 854         } else
 855                 reg = *dst;
 856
 857         /* If the register is 2-dimensional (e.g. an array of vertices
 858          * in a primitive), calculate the base address of the vertex. */
 859         if (reg.Register.Dimension) {
 860                 LLVMValueRef index;
 861
 862                 if (reg.Dimension.Indirect)
 863                         index = si_get_indirect_index(ctx, &reg.DimIndirect,
 864                                                       1, reg.Dimension.Index);
 865                 else
 866                         index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0);
 867
 868                 base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr,
 869                                          LLVMBuildMul(ctx->ac.builder, index,
 870                                                       vertex_dw_stride, ""), "");
 871         }
 872
 873         /* Get information about the register. */
 874         if (reg.Register.File == TGSI_FILE_INPUT) {
 875                 name = info->input_semantic_name;
 876                 index = info->input_semantic_index;
 877                 array_first = info->input_array_first;
 878         } else if (reg.Register.File == TGSI_FILE_OUTPUT) {
 879                 name = info->output_semantic_name;
 880                 index = info->output_semantic_index;
 881                 array_first = info->output_array_first;
 882         } else {
 883                 assert(0);
 884                 return NULL;
 885         }
 886
 887         if (reg.Register.Indirect) {
 888                 /* Add the relative address of the element. */
 889                 LLVMValueRef ind_index;
 890
 891                 if (reg.Indirect.ArrayID)
 892                         first = array_first[reg.Indirect.ArrayID];
 893                 else
 894                         first = reg.Register.Index;
 895
 896                 ind_index = si_get_indirect_index(ctx, &reg.Indirect,
 897                                                   1, reg.Register.Index - first);
 898
 899                 base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr,
 900                                     LLVMBuildMul(ctx->ac.builder, ind_index,
 901                                                  LLVMConstInt(ctx->i32, 4, 0), ""), "");
 902
 903                 param = reg.Register.Dimension ?
 904                         si_shader_io_get_unique_index(name[first], index[first]) :
 905                         si_shader_io_get_unique_index_patch(name[first], index[first]);
 906         } else {
 907                 param = reg.Register.Dimension ?
 908                         si_shader_io_get_unique_index(name[reg.Register.Index],
 909                                                       index[reg.Register.Index]) :
 910                         si_shader_io_get_unique_index_patch(name[reg.Register.Index],
 911                                                             index[reg.Register.Index]);
 912         }
 913
 914         /* Add the base address of the element. */
 915         return LLVMBuildAdd(ctx->ac.builder, base_addr,
 916                             LLVMConstInt(ctx->i32, param * 4, 0), "");
 917 }
 918
 919 /* The offchip buffer layout for TCS->TES is
 920  *
 921  * - attribute 0 of patch 0 vertex 0
 922  * - attribute 0 of patch 0 vertex 1
 923  * - attribute 0 of patch 0 vertex 2
 924  *   ...
 925  * - attribute 0 of patch 1 vertex 0
 926  * - attribute 0 of patch 1 vertex 1
 927  *   ...
 928  * - attribute 1 of patch 0 vertex 0
 929  * - attribute 1 of patch 0 vertex 1
 930  *   ...
 931  * - per patch attribute 0 of patch 0
 932  * - per patch attribute 0 of patch 1
 933  *   ...
 934  *
 935  * Note that every attribute has 4 components.
 936  */
 937 static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
 938                                                LLVMValueRef rel_patch_id,
 939                                                LLVMValueRef vertex_index,
 940                                                LLVMValueRef param_index)
 941 {
 942         LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
 943         LLVMValueRef param_stride, constant16;
 944
 945         vertices_per_patch = get_num_tcs_out_vertices(ctx);
 946         num_patches = unpack_param(ctx, ctx->param_tcs_offchip_layout, 0, 6);
 947         total_vertices = LLVMBuildMul(ctx->ac.builder, vertices_per_patch,
 948                                       num_patches, "");
 949
 950         constant16 = LLVMConstInt(ctx->i32, 16, 0);
 951         if (vertex_index) {
 952                 base_addr = LLVMBuildMul(ctx->ac.builder, rel_patch_id,
 953                                          vertices_per_patch, "");
 954
 955                 base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr,
 956                                          vertex_index, "");
 957
 958                 param_stride = total_vertices;
 959         } else {
 960                 base_addr = rel_patch_id;
 961                 param_stride = num_patches;
 962         }
 963
 964         base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr,
 965                                  LLVMBuildMul(ctx->ac.builder, param_index,
 966                                               param_stride, ""), "");
 967
 968         base_addr = LLVMBuildMul(ctx->ac.builder, base_addr, constant16, "");
 969
 970         if (!vertex_index) {
 971                 LLVMValueRef patch_data_offset =
 972                            unpack_param(ctx, ctx->param_tcs_offchip_layout, 12, 20);
 973
 974                 base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr,
 975                                          patch_data_offset, "");
 976         }
 977         return base_addr;
 978 }
 979
 980 static LLVMValueRef get_tcs_tes_buffer_address_from_reg(
 981                                        struct si_shader_context *ctx,
 982                                        const struct tgsi_full_dst_register *dst,
 983                                        const struct tgsi_full_src_register *src)
 984 {
 985         struct tgsi_shader_info *info = &ctx->shader->selector->info;
 986         ubyte *name, *index, *array_first;
 987         struct tgsi_full_src_register reg;
 988         LLVMValueRef vertex_index = NULL;
 989         LLVMValueRef param_index = NULL;
 990         unsigned param_index_base, param_base;
 991
 992         reg = src ? *src : tgsi_full_src_register_from_dst(dst);
 993
 994         if (reg.Register.Dimension) {
 995
 996                 if (reg.Dimension.Indirect)
 997                         vertex_index = si_get_indirect_index(ctx, &reg.DimIndirect,
 998                                                              1, reg.Dimension.Index);
 999                 else
1000                         vertex_index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0);
1001         }
1002
1003         /* Get information about the register. */
1004         if (reg.Register.File == TGSI_FILE_INPUT) {
1005                 name = info->input_semantic_name;
1006                 index = info->input_semantic_index;
1007                 array_first = info->input_array_first;
1008         } else if (reg.Register.File == TGSI_FILE_OUTPUT) {
1009                 name = info->output_semantic_name;
1010                 index = info->output_semantic_index;
1011                 array_first = info->output_array_first;
1012         } else {
1013                 assert(0);
1014                 return NULL;
1015         }
1016
1017         if (reg.Register.Indirect) {
1018                 if (reg.Indirect.ArrayID)
1019                         param_base = array_first[reg.Indirect.ArrayID];
1020                 else
1021                         param_base = reg.Register.Index;
1022
1023                 param_index = si_get_indirect_index(ctx, &reg.Indirect,
1024                                                     1, reg.Register.Index - param_base);
1025
1026         } else {
1027                 param_base = reg.Register.Index;
1028                 param_index = ctx->i32_0;
1029         }
1030
1031         param_index_base = reg.Register.Dimension ?
1032                 si_shader_io_get_unique_index(name[param_base], index[param_base]) :
1033                 si_shader_io_get_unique_index_patch(name[param_base], index[param_base]);
1034
1035         param_index = LLVMBuildAdd(ctx->ac.builder, param_index,
1036                                    LLVMConstInt(ctx->i32, param_index_base, 0),
1037                                    "");
1038
1039         return get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx),
1040                                           vertex_index, param_index);
1041 }
1042
1043 static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base,
1044                                 LLVMTypeRef type, unsigned swizzle,
1045                                 LLVMValueRef buffer, LLVMValueRef offset,
1046                                 LLVMValueRef base, bool can_speculate)
1047 {
1048         struct si_shader_context *ctx = si_shader_context(bld_base);
1049         LLVMValueRef value, value2;
1050         LLVMTypeRef vec_type = LLVMVectorType(type, 4);
1051
1052         if (swizzle == ~0) {
1053                 value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
1054                                              0, 1, 0, can_speculate, false);
1055
1056                 return LLVMBuildBitCast(ctx->ac.builder, value, vec_type, "");
1057         }
1058
1059         if (!llvm_type_is_64bit(ctx, type)) {
1060                 value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
1061                                              0, 1, 0, can_speculate, false);
1062
1063                 value = LLVMBuildBitCast(ctx->ac.builder, value, vec_type, "");
1064                 return LLVMBuildExtractElement(ctx->ac.builder, value,
1065                                     LLVMConstInt(ctx->i32, swizzle, 0), "");
1066         }
1067
1068         value = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
1069                                   swizzle * 4, 1, 0, can_speculate, false);
1070
1071         value2 = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
1072                                    swizzle * 4 + 4, 1, 0, can_speculate, false);
1073
1074         return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
1075 }
1076
1077 /**
1078  * Load from LDS.
1079  *
1080  * \param type          output value type
1081  * \param swizzle       offset (typically 0..3); it can be ~0, which loads a vec4
1082  * \param dw_addr       address in dwords
1083  */
1084 static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
1085                              LLVMTypeRef type, unsigned swizzle,
1086                              LLVMValueRef dw_addr)
1087 {
1088         struct si_shader_context *ctx = si_shader_context(bld_base);
1089         LLVMValueRef value;
1090
1091         if (swizzle == ~0) {
1092                 LLVMValueRef values[TGSI_NUM_CHANNELS];
1093
1094                 for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++)
1095                         values[chan] = lds_load(bld_base, type, chan, dw_addr);
1096
1097                 return lp_build_gather_values(&ctx->gallivm, values,
1098                                               TGSI_NUM_CHANNELS);
1099         }
1100
1101         /* Split 64-bit loads. */
1102         if (llvm_type_is_64bit(ctx, type)) {
1103                 LLVMValueRef lo, hi;
1104
1105                 lo = lds_load(bld_base, ctx->i32, swizzle, dw_addr);
1106                 hi = lds_load(bld_base, ctx->i32, swizzle + 1, dw_addr);
1107                 return si_llvm_emit_fetch_64bit(bld_base, type, lo, hi);
1108         }
1109
1110         dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
1111                             LLVMConstInt(ctx->i32, swizzle, 0));
1112
1113         value = ac_lds_load(&ctx->ac, dw_addr);
1114
1115         return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
1116 }
1117
1118 /**
1119  * Store to LDS.
1120  *
1121  * \param swizzle       offset (typically 0..3)
1122  * \param dw_addr       address in dwords
1123  * \param value         value to store
1124  */
1125 static void lds_store(struct si_shader_context *ctx,
1126                       unsigned dw_offset_imm, LLVMValueRef dw_addr,
1127                       LLVMValueRef value)
1128 {
1129         dw_addr = lp_build_add(&ctx->bld_base.uint_bld, dw_addr,
1130                             LLVMConstInt(ctx->i32, dw_offset_imm, 0));
1131
1132         ac_lds_store(&ctx->ac, dw_addr, value);
1133 }
1134
1135 static LLVMValueRef desc_from_addr_base64k(struct si_shader_context *ctx,
1136                                                   unsigned param)
1137 {
1138         LLVMBuilderRef builder = ctx->ac.builder;
1139
1140         LLVMValueRef addr = LLVMGetParam(ctx->main_fn, param);
1141         addr = LLVMBuildZExt(builder, addr, ctx->i64, "");
1142         addr = LLVMBuildShl(builder, addr, LLVMConstInt(ctx->i64, 16, 0), "");
1143
1144         uint64_t desc2 = 0xffffffff;
1145         uint64_t desc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
1146                          S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
1147                          S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
1148                          S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
1149                          S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
1150                          S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
1151         LLVMValueRef hi = LLVMConstInt(ctx->i64, desc2 | (desc3 << 32), 0);
1152
1153         LLVMValueRef desc = LLVMGetUndef(LLVMVectorType(ctx->i64, 2));
1154         desc = LLVMBuildInsertElement(builder, desc, addr, ctx->i32_0, "");
1155         desc = LLVMBuildInsertElement(builder, desc, hi, ctx->i32_1, "");
1156         return LLVMBuildBitCast(builder, desc, ctx->v4i32, "");
1157 }
1158
1159 static LLVMValueRef fetch_input_tcs(
1160         struct lp_build_tgsi_context *bld_base,
1161         const struct tgsi_full_src_register *reg,
1162         enum tgsi_opcode_type type, unsigned swizzle)
1163 {
1164         struct si_shader_context *ctx = si_shader_context(bld_base);
1165         LLVMValueRef dw_addr, stride;
1166
1167         stride = get_tcs_in_vertex_dw_stride(ctx);
1168         dw_addr = get_tcs_in_current_patch_offset(ctx);
1169         dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
1170
1171         return lds_load(bld_base, tgsi2llvmtype(bld_base, type), swizzle, dw_addr);
1172 }
1173
1174 static LLVMValueRef fetch_output_tcs(
1175                 struct lp_build_tgsi_context *bld_base,
1176                 const struct tgsi_full_src_register *reg,
1177                 enum tgsi_opcode_type type, unsigned swizzle)
1178 {
1179         struct si_shader_context *ctx = si_shader_context(bld_base);
1180         LLVMValueRef dw_addr, stride;
1181
1182         if (reg->Register.Dimension) {
1183                 stride = get_tcs_out_vertex_dw_stride(ctx);
1184                 dw_addr = get_tcs_out_current_patch_offset(ctx);
1185                 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
1186         } else {
1187                 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
1188                 dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr);
1189         }
1190
1191         return lds_load(bld_base, tgsi2llvmtype(bld_base, type), swizzle, dw_addr);
1192 }
1193
1194 static LLVMValueRef fetch_input_tes(
1195         struct lp_build_tgsi_context *bld_base,
1196         const struct tgsi_full_src_register *reg,
1197         enum tgsi_opcode_type type, unsigned swizzle)
1198 {
1199         struct si_shader_context *ctx = si_shader_context(bld_base);
1200         LLVMValueRef buffer, base, addr;
1201
1202         buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
1203
1204         base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
1205         addr = get_tcs_tes_buffer_address_from_reg(ctx, NULL, reg);
1206
1207         return buffer_load(bld_base, tgsi2llvmtype(bld_base, type), swizzle,
1208                            buffer, base, addr, true);
1209 }
1210
1211 static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
1212                              const struct tgsi_full_instruction *inst,
1213                              const struct tgsi_opcode_info *info,
1214                              unsigned index,
1215                              LLVMValueRef dst[4])
1216 {
1217         struct si_shader_context *ctx = si_shader_context(bld_base);
1218         const struct tgsi_full_dst_register *reg = &inst->Dst[index];
1219         const struct tgsi_shader_info *sh_info = &ctx->shader->selector->info;
1220         unsigned chan_index;
1221         LLVMValueRef dw_addr, stride;
1222         LLVMValueRef buffer, base, buf_addr;
1223         LLVMValueRef values[4];
1224         bool skip_lds_store;
1225         bool is_tess_factor = false, is_tess_inner = false;
1226
1227         /* Only handle per-patch and per-vertex outputs here.
1228          * Vectors will be lowered to scalars and this function will be called again.
1229          */
1230         if (reg->Register.File != TGSI_FILE_OUTPUT ||
1231             (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) {
1232                 si_llvm_emit_store(bld_base, inst, info, index, dst);
1233                 return;
1234         }
1235
1236         if (reg->Register.Dimension) {
1237                 stride = get_tcs_out_vertex_dw_stride(ctx);
1238                 dw_addr = get_tcs_out_current_patch_offset(ctx);
1239                 dw_addr = get_dw_address(ctx, reg, NULL, stride, dw_addr);
1240                 skip_lds_store = !sh_info->reads_pervertex_outputs;
1241         } else {
1242                 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
1243                 dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr);
1244                 skip_lds_store = !sh_info->reads_perpatch_outputs;
1245
1246                 if (!reg->Register.Indirect) {
1247                         int name = sh_info->output_semantic_name[reg->Register.Index];
1248
1249                         /* Always write tess factors into LDS for the TCS epilog. */
1250                         if (name == TGSI_SEMANTIC_TESSINNER ||
1251                             name == TGSI_SEMANTIC_TESSOUTER) {
1252                                 /* The epilog doesn't read LDS if invocation 0 defines tess factors. */
1253                                 skip_lds_store = !sh_info->reads_tessfactor_outputs &&
1254                                                  ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs;
1255                                 is_tess_factor = true;
1256                                 is_tess_inner = name == TGSI_SEMANTIC_TESSINNER;
1257                         }
1258                 }
1259         }
1260
1261         buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
1262
1263         base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
1264         buf_addr = get_tcs_tes_buffer_address_from_reg(ctx, reg, NULL);
1265
1266         uint32_t writemask = reg->Register.WriteMask;
1267         while (writemask) {
1268                 chan_index = u_bit_scan(&writemask);
1269                 LLVMValueRef value = dst[chan_index];
1270
1271                 if (inst->Instruction.Saturate)
1272                         value = ac_build_clamp(&ctx->ac, value);
1273
1274                 /* Skip LDS stores if there is no LDS read of this output. */
1275                 if (!skip_lds_store)
1276                         lds_store(ctx, chan_index, dw_addr, value);
1277
1278                 value = ac_to_integer(&ctx->ac, value);
1279                 values[chan_index] = value;
1280
1281                 if (reg->Register.WriteMask != 0xF && !is_tess_factor) {
1282                         ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1,
1283                                                     buf_addr, base,
1284                                                     4 * chan_index, 1, 0, true, false);
1285                 }
1286
1287                 /* Write tess factors into VGPRs for the epilog. */
1288                 if (is_tess_factor &&
1289                     ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs) {
1290                         if (!is_tess_inner) {
1291                                 LLVMBuildStore(ctx->ac.builder, value, /* outer */
1292                                                ctx->invoc0_tess_factors[chan_index]);
1293                         } else if (chan_index < 2) {
1294                                 LLVMBuildStore(ctx->ac.builder, value, /* inner */
1295                                                ctx->invoc0_tess_factors[4 + chan_index]);
1296                         }
1297                 }
1298         }
1299
1300         if (reg->Register.WriteMask == 0xF && !is_tess_factor) {
1301                 LLVMValueRef value = lp_build_gather_values(&ctx->gallivm,
1302                                                             values, 4);
1303                 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buf_addr,
1304                                             base, 0, 1, 0, true, false);
1305         }
1306 }
1307
1308 LLVMValueRef si_llvm_load_input_gs(struct ac_shader_abi *abi,
1309                                    unsigned input_index,
1310                                    unsigned vtx_offset_param,
1311                                    LLVMTypeRef type,
1312                                    unsigned swizzle)
1313 {
1314         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1315         struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
1316         struct si_shader *shader = ctx->shader;
1317         struct lp_build_context *uint = &ctx->bld_base.uint_bld;
1318         LLVMValueRef vtx_offset, soffset;
1319         struct tgsi_shader_info *info = &shader->selector->info;
1320         unsigned semantic_name = info->input_semantic_name[input_index];
1321         unsigned semantic_index = info->input_semantic_index[input_index];
1322         unsigned param;
1323         LLVMValueRef value;
1324
1325         param = si_shader_io_get_unique_index(semantic_name, semantic_index);
1326
1327         /* GFX9 has the ESGS ring in LDS. */
1328         if (ctx->screen->info.chip_class >= GFX9) {
1329                 unsigned index = vtx_offset_param;
1330
1331                 switch (index / 2) {
1332                 case 0:
1333                         vtx_offset = unpack_param(ctx, ctx->param_gs_vtx01_offset,
1334                                                   index % 2 ? 16 : 0, 16);
1335                         break;
1336                 case 1:
1337                         vtx_offset = unpack_param(ctx, ctx->param_gs_vtx23_offset,
1338                                                   index % 2 ? 16 : 0, 16);
1339                         break;
1340                 case 2:
1341                         vtx_offset = unpack_param(ctx, ctx->param_gs_vtx45_offset,
1342                                                   index % 2 ? 16 : 0, 16);
1343                         break;
1344                 default:
1345                         assert(0);
1346                         return NULL;
1347                 }
1348
1349                 vtx_offset = LLVMBuildAdd(ctx->ac.builder, vtx_offset,
1350                                           LLVMConstInt(ctx->i32, param * 4, 0), "");
1351                 return lds_load(bld_base, type, swizzle, vtx_offset);
1352         }
1353
1354         /* GFX6: input load from the ESGS ring in memory. */
1355         if (swizzle == ~0) {
1356                 LLVMValueRef values[TGSI_NUM_CHANNELS];
1357                 unsigned chan;
1358                 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1359                         values[chan] = si_llvm_load_input_gs(abi, input_index, vtx_offset_param,
1360                                                              type, chan);
1361                 }
1362                 return lp_build_gather_values(&ctx->gallivm, values,
1363                                               TGSI_NUM_CHANNELS);
1364         }
1365
1366         /* Get the vertex offset parameter on GFX6. */
1367         LLVMValueRef gs_vtx_offset = ctx->gs_vtx_offset[vtx_offset_param];
1368
1369         vtx_offset = lp_build_mul_imm(uint, gs_vtx_offset, 4);
1370
1371         soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle) * 256, 0);
1372
1373         value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->i32_0,
1374                                      vtx_offset, soffset, 0, 1, 0, true, false);
1375         if (llvm_type_is_64bit(ctx, type)) {
1376                 LLVMValueRef value2;
1377                 soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle + 1) * 256, 0);
1378
1379                 value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1,
1380                                               ctx->i32_0, vtx_offset, soffset,
1381                                               0, 1, 0, true, false);
1382                 return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
1383         }
1384         return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
1385 }
1386
1387 static LLVMValueRef fetch_input_gs(
1388         struct lp_build_tgsi_context *bld_base,
1389         const struct tgsi_full_src_register *reg,
1390         enum tgsi_opcode_type type,
1391         unsigned swizzle)
1392 {
1393         struct si_shader_context *ctx = si_shader_context(bld_base);
1394         struct tgsi_shader_info *info = &ctx->shader->selector->info;
1395
1396         unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
1397         if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
1398                 return get_primitive_id(ctx, swizzle);
1399
1400         if (!reg->Register.Dimension)
1401                 return NULL;
1402
1403         return si_llvm_load_input_gs(&ctx->abi, reg->Register.Index,
1404                                      reg->Dimension.Index,
1405                                      tgsi2llvmtype(bld_base, type),
1406                                      swizzle);
1407 }
1408
1409 static int lookup_interp_param_index(unsigned interpolate, unsigned location)
1410 {
1411         switch (interpolate) {
1412         case TGSI_INTERPOLATE_CONSTANT:
1413                 return 0;
1414
1415         case TGSI_INTERPOLATE_LINEAR:
1416                 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1417                         return SI_PARAM_LINEAR_SAMPLE;
1418                 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1419                         return SI_PARAM_LINEAR_CENTROID;
1420                 else
1421                         return SI_PARAM_LINEAR_CENTER;
1422                 break;
1423         case TGSI_INTERPOLATE_COLOR:
1424         case TGSI_INTERPOLATE_PERSPECTIVE:
1425                 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1426                         return SI_PARAM_PERSP_SAMPLE;
1427                 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1428                         return SI_PARAM_PERSP_CENTROID;
1429                 else
1430                         return SI_PARAM_PERSP_CENTER;
1431                 break;
1432         default:
1433                 fprintf(stderr, "Warning: Unhandled interpolation mode.\n");
1434                 return -1;
1435         }
1436 }
1437
1438 static LLVMValueRef si_build_fs_interp(struct si_shader_context *ctx,
1439                                        unsigned attr_index, unsigned chan,
1440                                        LLVMValueRef prim_mask,
1441                                        LLVMValueRef i, LLVMValueRef j)
1442 {
1443         if (i || j) {
1444                 return ac_build_fs_interp(&ctx->ac,
1445                                           LLVMConstInt(ctx->i32, chan, 0),
1446                                           LLVMConstInt(ctx->i32, attr_index, 0),
1447                                           prim_mask, i, j);
1448         }
1449         return ac_build_fs_interp_mov(&ctx->ac,
1450                                       LLVMConstInt(ctx->i32, 2, 0), /* P0 */
1451                                       LLVMConstInt(ctx->i32, chan, 0),
1452                                       LLVMConstInt(ctx->i32, attr_index, 0),
1453                                       prim_mask);
1454 }
1455
1456 /**
1457  * Interpolate a fragment shader input.
1458  *
1459  * @param ctx           context
1460  * @param input_index           index of the input in hardware
1461  * @param semantic_name         TGSI_SEMANTIC_*
1462  * @param semantic_index        semantic index
1463  * @param num_interp_inputs     number of all interpolated inputs (= BCOLOR offset)
1464  * @param colors_read_mask      color components read (4 bits for each color, 8 bits in total)
1465  * @param interp_param          interpolation weights (i,j)
1466  * @param prim_mask             SI_PARAM_PRIM_MASK
1467  * @param face                  SI_PARAM_FRONT_FACE
1468  * @param result                the return value (4 components)
1469  */
1470 static void interp_fs_input(struct si_shader_context *ctx,
1471                             unsigned input_index,
1472                             unsigned semantic_name,
1473                             unsigned semantic_index,
1474                             unsigned num_interp_inputs,
1475                             unsigned colors_read_mask,
1476                             LLVMValueRef interp_param,
1477                             LLVMValueRef prim_mask,
1478                             LLVMValueRef face,
1479                             LLVMValueRef result[4])
1480 {
1481         LLVMValueRef i = NULL, j = NULL;
1482         unsigned chan;
1483
1484         /* fs.constant returns the param from the middle vertex, so it's not
1485          * really useful for flat shading. It's meant to be used for custom
1486          * interpolation (but the intrinsic can't fetch from the other two
1487          * vertices).
1488          *
1489          * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
1490          * to do the right thing. The only reason we use fs.constant is that
1491          * fs.interp cannot be used on integers, because they can be equal
1492          * to NaN.
1493          *
1494          * When interp is false we will use fs.constant or for newer llvm,
1495          * amdgcn.interp.mov.
1496          */
1497         bool interp = interp_param != NULL;
1498
1499         if (interp) {
1500                 interp_param = LLVMBuildBitCast(ctx->ac.builder, interp_param,
1501                                                 LLVMVectorType(ctx->f32, 2), "");
1502
1503                 i = LLVMBuildExtractElement(ctx->ac.builder, interp_param,
1504                                                 ctx->i32_0, "");
1505                 j = LLVMBuildExtractElement(ctx->ac.builder, interp_param,
1506                                                 ctx->i32_1, "");
1507         }
1508
1509         if (semantic_name == TGSI_SEMANTIC_COLOR &&
1510             ctx->shader->key.part.ps.prolog.color_two_side) {
1511                 LLVMValueRef is_face_positive;
1512
1513                 /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
1514                  * otherwise it's at offset "num_inputs".
1515                  */
1516                 unsigned back_attr_offset = num_interp_inputs;
1517                 if (semantic_index == 1 && colors_read_mask & 0xf)
1518                         back_attr_offset += 1;
1519
1520                 is_face_positive = LLVMBuildICmp(ctx->ac.builder, LLVMIntNE,
1521                                                  face, ctx->i32_0, "");
1522
1523                 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1524                         LLVMValueRef front, back;
1525
1526                         front = si_build_fs_interp(ctx,
1527                                                    input_index, chan,
1528                                                    prim_mask, i, j);
1529                         back = si_build_fs_interp(ctx,
1530                                                   back_attr_offset, chan,
1531                                                   prim_mask, i, j);
1532
1533                         result[chan] = LLVMBuildSelect(ctx->ac.builder,
1534                                                 is_face_positive,
1535                                                 front,
1536                                                 back,
1537                                                 "");
1538                 }
1539         } else if (semantic_name == TGSI_SEMANTIC_FOG) {
1540                 result[0] = si_build_fs_interp(ctx, input_index,
1541                                                0, prim_mask, i, j);
1542                 result[1] =
1543                 result[2] = LLVMConstReal(ctx->f32, 0.0f);
1544                 result[3] = LLVMConstReal(ctx->f32, 1.0f);
1545         } else {
1546                 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1547                         result[chan] = si_build_fs_interp(ctx,
1548                                                           input_index, chan,
1549                                                           prim_mask, i, j);
1550                 }
1551         }
1552 }
1553
1554 void si_llvm_load_input_fs(
1555         struct si_shader_context *ctx,
1556         unsigned input_index,
1557         LLVMValueRef out[4])
1558 {
1559         struct lp_build_context *base = &ctx->bld_base.base;
1560         struct si_shader *shader = ctx->shader;
1561         struct tgsi_shader_info *info = &shader->selector->info;
1562         LLVMValueRef main_fn = ctx->main_fn;
1563         LLVMValueRef interp_param = NULL;
1564         int interp_param_idx;
1565         enum tgsi_semantic semantic_name = info->input_semantic_name[input_index];
1566         unsigned semantic_index = info->input_semantic_index[input_index];
1567         enum tgsi_interpolate_mode interp_mode = info->input_interpolate[input_index];
1568         enum tgsi_interpolate_loc interp_loc = info->input_interpolate_loc[input_index];
1569
1570         /* Get colors from input VGPRs (set by the prolog). */
1571         if (semantic_name == TGSI_SEMANTIC_COLOR) {
1572                 unsigned colors_read = shader->selector->info.colors_read;
1573                 unsigned mask = colors_read >> (semantic_index * 4);
1574                 unsigned offset = SI_PARAM_POS_FIXED_PT + 1 +
1575                                   (semantic_index ? util_bitcount(colors_read & 0xf) : 0);
1576
1577                 out[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef;
1578                 out[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef;
1579                 out[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef;
1580                 out[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef;
1581                 return;
1582         }
1583
1584         interp_param_idx = lookup_interp_param_index(interp_mode, interp_loc);
1585         if (interp_param_idx == -1)
1586                 return;
1587         else if (interp_param_idx) {
1588                 interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
1589         }
1590
1591         interp_fs_input(ctx, input_index, semantic_name,
1592                         semantic_index, 0, /* this param is unused */
1593                         shader->selector->info.colors_read, interp_param,
1594                         LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK),
1595                         LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE),
1596                         &out[0]);
1597 }
1598
1599 static void declare_input_fs(
1600         struct si_shader_context *ctx,
1601         unsigned input_index,
1602         const struct tgsi_full_declaration *decl,
1603         LLVMValueRef out[4])
1604 {
1605         si_llvm_load_input_fs(ctx, input_index, out);
1606 }
1607
1608 static LLVMValueRef get_sample_id(struct si_shader_context *ctx)
1609 {
1610         return unpack_param(ctx, SI_PARAM_ANCILLARY, 8, 4);
1611 }
1612
1613
1614 /**
1615  * Load a dword from a constant buffer.
1616  */
1617 static LLVMValueRef buffer_load_const(struct si_shader_context *ctx,
1618                                       LLVMValueRef resource,
1619                                       LLVMValueRef offset)
1620 {
1621         return ac_build_buffer_load(&ctx->ac, resource, 1, NULL, offset, NULL,
1622                                     0, 0, 0, true, true);
1623 }
1624
1625 static LLVMValueRef load_sample_position(struct si_shader_context *ctx, LLVMValueRef sample_id)
1626 {
1627         struct lp_build_context *uint_bld = &ctx->bld_base.uint_bld;
1628         LLVMValueRef desc = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
1629         LLVMValueRef buf_index = LLVMConstInt(ctx->i32, SI_PS_CONST_SAMPLE_POSITIONS, 0);
1630         LLVMValueRef resource = ac_build_load_to_sgpr(&ctx->ac, desc, buf_index);
1631
1632         /* offset = sample_id * 8  (8 = 2 floats containing samplepos.xy) */
1633         LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, sample_id, 8);
1634         LLVMValueRef offset1 = LLVMBuildAdd(ctx->ac.builder, offset0, LLVMConstInt(ctx->i32, 4, 0), "");
1635
1636         LLVMValueRef pos[4] = {
1637                 buffer_load_const(ctx, resource, offset0),
1638                 buffer_load_const(ctx, resource, offset1),
1639                 LLVMConstReal(ctx->f32, 0),
1640                 LLVMConstReal(ctx->f32, 0)
1641         };
1642
1643         return lp_build_gather_values(&ctx->gallivm, pos, 4);
1644 }
1645
1646 void si_load_system_value(struct si_shader_context *ctx,
1647                           unsigned index,
1648                           const struct tgsi_full_declaration *decl)
1649 {
1650         struct lp_build_context *bld = &ctx->bld_base.base;
1651         LLVMValueRef value = 0;
1652
1653         assert(index < RADEON_LLVM_MAX_SYSTEM_VALUES);
1654
1655         switch (decl->Semantic.Name) {
1656         case TGSI_SEMANTIC_INSTANCEID:
1657                 value = ctx->abi.instance_id;
1658                 break;
1659
1660         case TGSI_SEMANTIC_VERTEXID:
1661                 value = LLVMBuildAdd(ctx->ac.builder,
1662                                      ctx->abi.vertex_id,
1663                                      ctx->abi.base_vertex, "");
1664                 break;
1665
1666         case TGSI_SEMANTIC_VERTEXID_NOBASE:
1667                 /* Unused. Clarify the meaning in indexed vs. non-indexed
1668                  * draws if this is ever used again. */
1669                 assert(false);
1670                 break;
1671
1672         case TGSI_SEMANTIC_BASEVERTEX:
1673         {
1674                 /* For non-indexed draws, the base vertex set by the driver
1675                  * (for direct draws) or the CP (for indirect draws) is the
1676                  * first vertex ID, but GLSL expects 0 to be returned.
1677                  */
1678                 LLVMValueRef vs_state = LLVMGetParam(ctx->main_fn, ctx->param_vs_state_bits);
1679                 LLVMValueRef indexed;
1680
1681                 indexed = LLVMBuildLShr(ctx->ac.builder, vs_state, ctx->i32_1, "");
1682                 indexed = LLVMBuildTrunc(ctx->ac.builder, indexed, ctx->i1, "");
1683
1684                 value = LLVMBuildSelect(ctx->ac.builder, indexed,
1685                                         ctx->abi.base_vertex, ctx->i32_0, "");
1686                 break;
1687         }
1688
1689         case TGSI_SEMANTIC_BASEINSTANCE:
1690                 value = ctx->abi.start_instance;
1691                 break;
1692
1693         case TGSI_SEMANTIC_DRAWID:
1694                 value = ctx->abi.draw_id;
1695                 break;
1696
1697         case TGSI_SEMANTIC_INVOCATIONID:
1698                 if (ctx->type == PIPE_SHADER_TESS_CTRL)
1699                         value = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
1700                 else if (ctx->type == PIPE_SHADER_GEOMETRY)
1701                         value = ctx->abi.gs_invocation_id;
1702                 else
1703                         assert(!"INVOCATIONID not implemented");
1704                 break;
1705
1706         case TGSI_SEMANTIC_POSITION:
1707         {
1708                 LLVMValueRef pos[4] = {
1709                         LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT),
1710                         LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT),
1711                         LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Z_FLOAT),
1712                         lp_build_emit_llvm_unary(&ctx->bld_base, TGSI_OPCODE_RCP,
1713                                                  LLVMGetParam(ctx->main_fn,
1714                                                               SI_PARAM_POS_W_FLOAT)),
1715                 };
1716                 value = lp_build_gather_values(&ctx->gallivm, pos, 4);
1717                 break;
1718         }
1719
1720         case TGSI_SEMANTIC_FACE:
1721                 value = ctx->abi.front_face;
1722                 break;
1723
1724         case TGSI_SEMANTIC_SAMPLEID:
1725                 value = get_sample_id(ctx);
1726                 break;
1727
1728         case TGSI_SEMANTIC_SAMPLEPOS: {
1729                 LLVMValueRef pos[4] = {
1730                         LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT),
1731                         LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT),
1732                         LLVMConstReal(ctx->f32, 0),
1733                         LLVMConstReal(ctx->f32, 0)
1734                 };
1735                 pos[0] = lp_build_emit_llvm_unary(&ctx->bld_base,
1736                                                   TGSI_OPCODE_FRC, pos[0]);
1737                 pos[1] = lp_build_emit_llvm_unary(&ctx->bld_base,
1738                                                   TGSI_OPCODE_FRC, pos[1]);
1739                 value = lp_build_gather_values(&ctx->gallivm, pos, 4);
1740                 break;
1741         }
1742
1743         case TGSI_SEMANTIC_SAMPLEMASK:
1744                 /* This can only occur with the OpenGL Core profile, which
1745                  * doesn't support smoothing.
1746                  */
1747                 value = LLVMGetParam(ctx->main_fn, SI_PARAM_SAMPLE_COVERAGE);
1748                 break;
1749
1750         case TGSI_SEMANTIC_TESSCOORD:
1751         {
1752                 LLVMValueRef coord[4] = {
1753                         LLVMGetParam(ctx->main_fn, ctx->param_tes_u),
1754                         LLVMGetParam(ctx->main_fn, ctx->param_tes_v),
1755                         ctx->ac.f32_0,
1756                         ctx->ac.f32_0
1757                 };
1758
1759                 /* For triangles, the vector should be (u, v, 1-u-v). */
1760                 if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
1761                     PIPE_PRIM_TRIANGLES)
1762                         coord[2] = lp_build_sub(bld, ctx->ac.f32_1,
1763                                                 lp_build_add(bld, coord[0], coord[1]));
1764
1765                 value = lp_build_gather_values(&ctx->gallivm, coord, 4);
1766                 break;
1767         }
1768
1769         case TGSI_SEMANTIC_VERTICESIN:
1770                 if (ctx->type == PIPE_SHADER_TESS_CTRL)
1771                         value = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 26, 6);
1772                 else if (ctx->type == PIPE_SHADER_TESS_EVAL)
1773                         value = get_num_tcs_out_vertices(ctx);
1774                 else
1775                         assert(!"invalid shader stage for TGSI_SEMANTIC_VERTICESIN");
1776                 break;
1777
1778         case TGSI_SEMANTIC_TESSINNER:
1779         case TGSI_SEMANTIC_TESSOUTER:
1780         {
1781                 LLVMValueRef buffer, base, addr;
1782                 int param = si_shader_io_get_unique_index_patch(decl->Semantic.Name, 0);
1783
1784                 buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
1785
1786                 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
1787                 addr = get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), NULL,
1788                                           LLVMConstInt(ctx->i32, param, 0));
1789
1790                 value = buffer_load(&ctx->bld_base, ctx->f32,
1791                                     ~0, buffer, base, addr, true);
1792
1793                 break;
1794         }
1795
1796         case TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI:
1797         case TGSI_SEMANTIC_DEFAULT_TESSINNER_SI:
1798         {
1799                 LLVMValueRef buf, slot, val[4];
1800                 int i, offset;
1801
1802                 slot = LLVMConstInt(ctx->i32, SI_HS_CONST_DEFAULT_TESS_LEVELS, 0);
1803                 buf = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
1804                 buf = ac_build_load_to_sgpr(&ctx->ac, buf, slot);
1805                 offset = decl->Semantic.Name == TGSI_SEMANTIC_DEFAULT_TESSINNER_SI ? 4 : 0;
1806
1807                 for (i = 0; i < 4; i++)
1808                         val[i] = buffer_load_const(ctx, buf,
1809                                                    LLVMConstInt(ctx->i32, (offset + i) * 4, 0));
1810                 value = lp_build_gather_values(&ctx->gallivm, val, 4);
1811                 break;
1812         }
1813
1814         case TGSI_SEMANTIC_PRIMID:
1815                 value = get_primitive_id(ctx, 0);
1816                 break;
1817
1818         case TGSI_SEMANTIC_GRID_SIZE:
1819                 value = LLVMGetParam(ctx->main_fn, ctx->param_grid_size);
1820                 break;
1821
1822         case TGSI_SEMANTIC_BLOCK_SIZE:
1823         {
1824                 LLVMValueRef values[3];
1825                 unsigned i;
1826                 unsigned *properties = ctx->shader->selector->info.properties;
1827
1828                 if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) {
1829                         unsigned sizes[3] = {
1830                                 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
1831                                 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
1832                                 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
1833                         };
1834
1835                         for (i = 0; i < 3; ++i)
1836                                 values[i] = LLVMConstInt(ctx->i32, sizes[i], 0);
1837
1838                         value = lp_build_gather_values(&ctx->gallivm, values, 3);
1839                 } else {
1840                         value = LLVMGetParam(ctx->main_fn, ctx->param_block_size);
1841                 }
1842                 break;
1843         }
1844
1845         case TGSI_SEMANTIC_BLOCK_ID:
1846         {
1847                 LLVMValueRef values[3];
1848
1849                 for (int i = 0; i < 3; i++) {
1850                         values[i] = ctx->i32_0;
1851                         if (ctx->param_block_id[i] >= 0) {
1852                                 values[i] = LLVMGetParam(ctx->main_fn,
1853                                                          ctx->param_block_id[i]);
1854                         }
1855                 }
1856                 value = lp_build_gather_values(&ctx->gallivm, values, 3);
1857                 break;
1858         }
1859
1860         case TGSI_SEMANTIC_THREAD_ID:
1861                 value = LLVMGetParam(ctx->main_fn, ctx->param_thread_id);
1862                 break;
1863
1864         case TGSI_SEMANTIC_HELPER_INVOCATION:
1865                 value = lp_build_intrinsic(ctx->ac.builder,
1866                                            "llvm.amdgcn.ps.live",
1867                                            ctx->i1, NULL, 0,
1868                                            LP_FUNC_ATTR_READNONE);
1869                 value = LLVMBuildNot(ctx->ac.builder, value, "");
1870                 value = LLVMBuildSExt(ctx->ac.builder, value, ctx->i32, "");
1871                 break;
1872
1873         case TGSI_SEMANTIC_SUBGROUP_SIZE:
1874                 value = LLVMConstInt(ctx->i32, 64, 0);
1875                 break;
1876
1877         case TGSI_SEMANTIC_SUBGROUP_INVOCATION:
1878                 value = ac_get_thread_id(&ctx->ac);
1879                 break;
1880
1881         case TGSI_SEMANTIC_SUBGROUP_EQ_MASK:
1882         {
1883                 LLVMValueRef id = ac_get_thread_id(&ctx->ac);
1884                 id = LLVMBuildZExt(ctx->ac.builder, id, ctx->i64, "");
1885                 value = LLVMBuildShl(ctx->ac.builder, LLVMConstInt(ctx->i64, 1, 0), id, "");
1886                 value = LLVMBuildBitCast(ctx->ac.builder, value, ctx->v2i32, "");
1887                 break;
1888         }
1889
1890         case TGSI_SEMANTIC_SUBGROUP_GE_MASK:
1891         case TGSI_SEMANTIC_SUBGROUP_GT_MASK:
1892         case TGSI_SEMANTIC_SUBGROUP_LE_MASK:
1893         case TGSI_SEMANTIC_SUBGROUP_LT_MASK:
1894         {
1895                 LLVMValueRef id = ac_get_thread_id(&ctx->ac);
1896                 if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_GT_MASK ||
1897                     decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK) {
1898                         /* All bits set except LSB */
1899                         value = LLVMConstInt(ctx->i64, -2, 0);
1900                 } else {
1901                         /* All bits set */
1902                         value = LLVMConstInt(ctx->i64, -1, 0);
1903                 }
1904                 id = LLVMBuildZExt(ctx->ac.builder, id, ctx->i64, "");
1905                 value = LLVMBuildShl(ctx->ac.builder, value, id, "");
1906                 if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK ||
1907                     decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LT_MASK)
1908                         value = LLVMBuildNot(ctx->ac.builder, value, "");
1909                 value = LLVMBuildBitCast(ctx->ac.builder, value, ctx->v2i32, "");
1910                 break;
1911         }
1912
1913         default:
1914                 assert(!"unknown system value");
1915                 return;
1916         }
1917
1918         ctx->system_values[index] = value;
1919 }
1920
1921 void si_declare_compute_memory(struct si_shader_context *ctx,
1922                                const struct tgsi_full_declaration *decl)
1923 {
1924         struct si_shader_selector *sel = ctx->shader->selector;
1925
1926         LLVMTypeRef i8p = LLVMPointerType(ctx->i8, LOCAL_ADDR_SPACE);
1927         LLVMValueRef var;
1928
1929         assert(decl->Declaration.MemType == TGSI_MEMORY_TYPE_SHARED);
1930         assert(decl->Range.First == decl->Range.Last);
1931         assert(!ctx->ac.lds);
1932
1933         var = LLVMAddGlobalInAddressSpace(ctx->ac.module,
1934                                           LLVMArrayType(ctx->i8, sel->local_size),
1935                                           "compute_lds",
1936                                           LOCAL_ADDR_SPACE);
1937         LLVMSetAlignment(var, 4);
1938
1939         ctx->ac.lds = LLVMBuildBitCast(ctx->ac.builder, var, i8p, "");
1940 }
1941
1942 static LLVMValueRef load_const_buffer_desc(struct si_shader_context *ctx, int i)
1943 {
1944         LLVMValueRef list_ptr = LLVMGetParam(ctx->main_fn,
1945                                              ctx->param_const_and_shader_buffers);
1946
1947         return ac_build_load_to_sgpr(&ctx->ac, list_ptr,
1948                                      LLVMConstInt(ctx->i32, si_get_constbuf_slot(i), 0));
1949 }
1950
1951 static LLVMValueRef load_ubo(struct ac_shader_abi *abi, LLVMValueRef index)
1952 {
1953         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1954         LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers);
1955
1956         index = si_llvm_bound_index(ctx, index, ctx->num_const_buffers);
1957         index = LLVMBuildAdd(ctx->ac.builder, index,
1958                              LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS, 0), "");
1959
1960         return ac_build_load_to_sgpr(&ctx->ac, ptr, index);
1961 }
1962
1963 static LLVMValueRef
1964 load_ssbo(struct ac_shader_abi *abi, LLVMValueRef index, bool write)
1965 {
1966         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1967         LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn,
1968                                              ctx->param_const_and_shader_buffers);
1969
1970         index = si_llvm_bound_index(ctx, index, ctx->num_shader_buffers);
1971         index = LLVMBuildSub(ctx->ac.builder,
1972                              LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS - 1, 0),
1973                              index, "");
1974
1975         return ac_build_load_to_sgpr(&ctx->ac, rsrc_ptr, index);
1976 }
1977
1978 static LLVMValueRef fetch_constant(
1979         struct lp_build_tgsi_context *bld_base,
1980         const struct tgsi_full_src_register *reg,
1981         enum tgsi_opcode_type type,
1982         unsigned swizzle)
1983 {
1984         struct si_shader_context *ctx = si_shader_context(bld_base);
1985         struct si_shader_selector *sel = ctx->shader->selector;
1986         const struct tgsi_ind_register *ireg = &reg->Indirect;
1987         unsigned buf, idx;
1988
1989         LLVMValueRef addr, bufp;
1990
1991         if (swizzle == LP_CHAN_ALL) {
1992                 unsigned chan;
1993                 LLVMValueRef values[4];
1994                 for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
1995                         values[chan] = fetch_constant(bld_base, reg, type, chan);
1996
1997                 return lp_build_gather_values(&ctx->gallivm, values, 4);
1998         }
1999
2000         /* Split 64-bit loads. */
2001         if (tgsi_type_is_64bit(type)) {
2002                 LLVMValueRef lo, hi;
2003
2004                 lo = fetch_constant(bld_base, reg, TGSI_TYPE_UNSIGNED, swizzle);
2005                 hi = fetch_constant(bld_base, reg, TGSI_TYPE_UNSIGNED, swizzle + 1);
2006                 return si_llvm_emit_fetch_64bit(bld_base, tgsi2llvmtype(bld_base, type),
2007                                                 lo, hi);
2008         }
2009
2010         idx = reg->Register.Index * 4 + swizzle;
2011         if (reg->Register.Indirect) {
2012                 addr = si_get_indirect_index(ctx, ireg, 16, idx * 4);
2013         } else {
2014                 addr = LLVMConstInt(ctx->i32, idx * 4, 0);
2015         }
2016
2017         /* Fast path when user data SGPRs point to constant buffer 0 directly. */
2018         if (sel->info.const_buffers_declared == 1 &&
2019             sel->info.shader_buffers_declared == 0) {
2020                 LLVMValueRef ptr =
2021                         LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers);
2022
2023                 /* This enables use of s_load_dword and flat_load_dword for const buffer 0
2024                  * loads, and up to x4 load opcode merging. However, it leads to horrible
2025                  * code reducing SIMD wave occupancy from 8 to 2 in many cases.
2026                  *
2027                  * Using s_buffer_load_dword (x1) seems to be the best option right now.
2028                  *
2029                  * LLVM 5.0 on SI doesn't insert a required s_nop between SALU setting
2030                  * a descriptor and s_buffer_load_dword using it, so we can't expand
2031                  * the pointer into a full descriptor like below. We have to use
2032                  * s_load_dword instead. The only case when LLVM 5.0 would select
2033                  * s_buffer_load_dword (that we have to prevent) is when we use use
2034                  * a literal offset where we don't need bounds checking.
2035                  */
2036                 if (ctx->screen->info.chip_class == SI &&
2037                     HAVE_LLVM < 0x0600 &&
2038                     !reg->Register.Indirect) {
2039                         addr = LLVMBuildLShr(ctx->ac.builder, addr, LLVMConstInt(ctx->i32, 2, 0), "");
2040                         LLVMValueRef result = ac_build_load_invariant(&ctx->ac, ptr, addr);
2041                         return bitcast(bld_base, type, result);
2042                 }
2043
2044                 /* Do the bounds checking with a descriptor, because
2045                  * doing computation and manual bounds checking of 64-bit
2046                  * addresses generates horrible VALU code with very high
2047                  * VGPR usage and very low SIMD occupancy.
2048                  */
2049                 ptr = LLVMBuildPtrToInt(ctx->ac.builder, ptr, ctx->i64, "");
2050                 ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ctx->v2i32, "");
2051
2052                 LLVMValueRef desc_elems[] = {
2053                         LLVMBuildExtractElement(ctx->ac.builder, ptr, ctx->i32_0, ""),
2054                         LLVMBuildExtractElement(ctx->ac.builder, ptr, ctx->i32_1, ""),
2055                         LLVMConstInt(ctx->i32, (sel->info.const_file_max[0] + 1) * 16, 0),
2056                         LLVMConstInt(ctx->i32,
2057                                 S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
2058                                 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
2059                                 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
2060                                 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
2061                                 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
2062                                 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32), 0)
2063                 };
2064                 LLVMValueRef desc = ac_build_gather_values(&ctx->ac, desc_elems, 4);
2065                 LLVMValueRef result = buffer_load_const(ctx, desc, addr);
2066                 return bitcast(bld_base, type, result);
2067         }
2068
2069         assert(reg->Register.Dimension);
2070         buf = reg->Dimension.Index;
2071
2072         if (reg->Dimension.Indirect) {
2073                 LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers);
2074                 LLVMValueRef index;
2075                 index = si_get_bounded_indirect_index(ctx, &reg->DimIndirect,
2076                                                       reg->Dimension.Index,
2077                                                       ctx->num_const_buffers);
2078                 index = LLVMBuildAdd(ctx->ac.builder, index,
2079                                      LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS, 0), "");
2080                 bufp = ac_build_load_to_sgpr(&ctx->ac, ptr, index);
2081         } else
2082                 bufp = load_const_buffer_desc(ctx, buf);
2083
2084         return bitcast(bld_base, type, buffer_load_const(ctx, bufp, addr));
2085 }
2086
2087 /* Upper 16 bits must be zero. */
2088 static LLVMValueRef si_llvm_pack_two_int16(struct si_shader_context *ctx,
2089                                            LLVMValueRef val[2])
2090 {
2091         return LLVMBuildOr(ctx->ac.builder, val[0],
2092                            LLVMBuildShl(ctx->ac.builder, val[1],
2093                                         LLVMConstInt(ctx->i32, 16, 0),
2094                                         ""), "");
2095 }
2096
2097 /* Upper 16 bits are ignored and will be dropped. */
2098 static LLVMValueRef si_llvm_pack_two_int32_as_int16(struct si_shader_context *ctx,
2099                                                     LLVMValueRef val[2])
2100 {
2101         LLVMValueRef v[2] = {
2102                 LLVMBuildAnd(ctx->ac.builder, val[0],
2103                              LLVMConstInt(ctx->i32, 0xffff, 0), ""),
2104                 val[1],
2105         };
2106         return si_llvm_pack_two_int16(ctx, v);
2107 }
2108
2109 /* Initialize arguments for the shader export intrinsic */
2110 static void si_llvm_init_export_args(struct si_shader_context *ctx,
2111                                      LLVMValueRef *values,
2112                                      unsigned target,
2113                                      struct ac_export_args *args)
2114 {
2115         LLVMValueRef f32undef = LLVMGetUndef(ctx->ac.f32);
2116         LLVMBuilderRef builder = ctx->ac.builder;
2117         LLVMValueRef val[4];
2118         unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR;
2119         unsigned chan;
2120         bool is_int8, is_int10;
2121
2122         /* Default is 0xf. Adjusted below depending on the format. */
2123         args->enabled_channels = 0xf; /* writemask */
2124
2125         /* Specify whether the EXEC mask represents the valid mask */
2126         args->valid_mask = 0;
2127
2128         /* Specify whether this is the last export */
2129         args->done = 0;
2130
2131         /* Specify the target we are exporting */
2132         args->target = target;
2133
2134         if (ctx->type == PIPE_SHADER_FRAGMENT) {
2135                 const struct si_shader_key *key = &ctx->shader->key;
2136                 unsigned col_formats = key->part.ps.epilog.spi_shader_col_format;
2137                 int cbuf = target - V_008DFC_SQ_EXP_MRT;
2138
2139                 assert(cbuf >= 0 && cbuf < 8);
2140                 spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
2141                 is_int8 = (key->part.ps.epilog.color_is_int8 >> cbuf) & 0x1;
2142                 is_int10 = (key->part.ps.epilog.color_is_int10 >> cbuf) & 0x1;
2143         }
2144
2145         args->compr = false;
2146         args->out[0] = f32undef;
2147         args->out[1] = f32undef;
2148         args->out[2] = f32undef;
2149         args->out[3] = f32undef;
2150
2151         switch (spi_shader_col_format) {
2152         case V_028714_SPI_SHADER_ZERO:
2153                 args->enabled_channels = 0; /* writemask */
2154                 args->target = V_008DFC_SQ_EXP_NULL;
2155                 break;
2156
2157         case V_028714_SPI_SHADER_32_R:
2158                 args->enabled_channels = 1; /* writemask */
2159                 args->out[0] = values[0];
2160                 break;
2161
2162         case V_028714_SPI_SHADER_32_GR:
2163                 args->enabled_channels = 0x3; /* writemask */
2164                 args->out[0] = values[0];
2165                 args->out[1] = values[1];
2166                 break;
2167
2168         case V_028714_SPI_SHADER_32_AR:
2169                 args->enabled_channels = 0x9; /* writemask */
2170                 args->out[0] = values[0];
2171                 args->out[3] = values[3];
2172                 break;
2173
2174         case V_028714_SPI_SHADER_FP16_ABGR:
2175                 args->compr = 1; /* COMPR flag */
2176
2177                 for (chan = 0; chan < 2; chan++) {
2178                         LLVMValueRef pack_args[2] = {
2179                                 values[2 * chan],
2180                                 values[2 * chan + 1]
2181                         };
2182                         LLVMValueRef packed;
2183
2184                         packed = ac_build_cvt_pkrtz_f16(&ctx->ac, pack_args);
2185                         args->out[chan] = ac_to_float(&ctx->ac, packed);
2186                 }
2187                 break;
2188
2189         case V_028714_SPI_SHADER_UNORM16_ABGR:
2190                 for (chan = 0; chan < 4; chan++) {
2191                         val[chan] = ac_build_clamp(&ctx->ac, values[chan]);
2192                         val[chan] = LLVMBuildFMul(builder, val[chan],
2193                                                   LLVMConstReal(ctx->f32, 65535), "");
2194                         val[chan] = LLVMBuildFAdd(builder, val[chan],
2195                                                   LLVMConstReal(ctx->f32, 0.5), "");
2196                         val[chan] = LLVMBuildFPToUI(builder, val[chan],
2197                                                     ctx->i32, "");
2198                 }
2199
2200                 args->compr = 1; /* COMPR flag */
2201                 args->out[0] = ac_to_float(&ctx->ac, si_llvm_pack_two_int16(ctx, val));
2202                 args->out[1] = ac_to_float(&ctx->ac, si_llvm_pack_two_int16(ctx, val+2));
2203                 break;
2204
2205         case V_028714_SPI_SHADER_SNORM16_ABGR:
2206                 for (chan = 0; chan < 4; chan++) {
2207                         /* Clamp between [-1, 1]. */
2208                         val[chan] = lp_build_emit_llvm_binary(&ctx->bld_base, TGSI_OPCODE_MIN,
2209                                                               values[chan],
2210                                                               LLVMConstReal(ctx->f32, 1));
2211                         val[chan] = lp_build_emit_llvm_binary(&ctx->bld_base, TGSI_OPCODE_MAX,
2212                                                               val[chan],
2213                                                               LLVMConstReal(ctx->f32, -1));
2214                         /* Convert to a signed integer in [-32767, 32767]. */
2215                         val[chan] = LLVMBuildFMul(builder, val[chan],
2216                                                   LLVMConstReal(ctx->f32, 32767), "");
2217                         /* If positive, add 0.5, else add -0.5. */
2218                         val[chan] = LLVMBuildFAdd(builder, val[chan],
2219                                         LLVMBuildSelect(builder,
2220                                                 LLVMBuildFCmp(builder, LLVMRealOGE,
2221                                                               val[chan], ctx->ac.f32_0, ""),
2222                                                 LLVMConstReal(ctx->f32, 0.5),
2223                                                 LLVMConstReal(ctx->f32, -0.5), ""), "");
2224                         val[chan] = LLVMBuildFPToSI(builder, val[chan], ctx->i32, "");
2225                 }
2226
2227                 args->compr = 1; /* COMPR flag */
2228                 args->out[0] = ac_to_float(&ctx->ac, si_llvm_pack_two_int32_as_int16(ctx, val));
2229                 args->out[1] = ac_to_float(&ctx->ac, si_llvm_pack_two_int32_as_int16(ctx, val+2));
2230                 break;
2231
2232         case V_028714_SPI_SHADER_UINT16_ABGR: {
2233                 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
2234                         is_int8 ? 255 : is_int10 ? 1023 : 65535, 0);
2235                 LLVMValueRef max_alpha =
2236                         !is_int10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);
2237
2238                 /* Clamp. */
2239                 for (chan = 0; chan < 4; chan++) {
2240                         val[chan] = ac_to_integer(&ctx->ac, values[chan]);
2241                         val[chan] = lp_build_emit_llvm_binary(&ctx->bld_base, TGSI_OPCODE_UMIN,
2242                                         val[chan],
2243                                         chan == 3 ? max_alpha : max_rgb);
2244                 }
2245
2246                 args->compr = 1; /* COMPR flag */
2247                 args->out[0] = ac_to_float(&ctx->ac, si_llvm_pack_two_int16(ctx, val));
2248                 args->out[1] = ac_to_float(&ctx->ac, si_llvm_pack_two_int16(ctx, val+2));
2249                 break;
2250         }
2251
2252         case V_028714_SPI_SHADER_SINT16_ABGR: {
2253                 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
2254                         is_int8 ? 127 : is_int10 ? 511 : 32767, 0);
2255                 LLVMValueRef min_rgb = LLVMConstInt(ctx->i32,
2256                         is_int8 ? -128 : is_int10 ? -512 : -32768, 0);
2257                 LLVMValueRef max_alpha =
2258                         !is_int10 ? max_rgb : ctx->i32_1;
2259                 LLVMValueRef min_alpha =
2260                         !is_int10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);
2261
2262                 /* Clamp. */
2263                 for (chan = 0; chan < 4; chan++) {
2264                         val[chan] = ac_to_integer(&ctx->ac, values[chan]);
2265                         val[chan] = lp_build_emit_llvm_binary(&ctx->bld_base,
2266                                         TGSI_OPCODE_IMIN,
2267                                         val[chan], chan == 3 ? max_alpha : max_rgb);
2268                         val[chan] = lp_build_emit_llvm_binary(&ctx->bld_base,
2269                                         TGSI_OPCODE_IMAX,
2270                                         val[chan], chan == 3 ? min_alpha : min_rgb);
2271                 }
2272
2273                 args->compr = 1; /* COMPR flag */
2274                 args->out[0] = ac_to_float(&ctx->ac, si_llvm_pack_two_int32_as_int16(ctx, val));
2275                 args->out[1] = ac_to_float(&ctx->ac, si_llvm_pack_two_int32_as_int16(ctx, val+2));
2276                 break;
2277         }
2278
2279         case V_028714_SPI_SHADER_32_ABGR:
2280                 memcpy(&args->out[0], values, sizeof(values[0]) * 4);
2281                 break;
2282         }
2283 }
2284
2285 static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
2286                           LLVMValueRef alpha)
2287 {
2288         struct si_shader_context *ctx = si_shader_context(bld_base);
2289
2290         if (ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_NEVER) {
2291                 static LLVMRealPredicate cond_map[PIPE_FUNC_ALWAYS + 1] = {
2292                         [PIPE_FUNC_LESS] = LLVMRealOLT,
2293                         [PIPE_FUNC_EQUAL] = LLVMRealOEQ,
2294                         [PIPE_FUNC_LEQUAL] = LLVMRealOLE,
2295                         [PIPE_FUNC_GREATER] = LLVMRealOGT,
2296                         [PIPE_FUNC_NOTEQUAL] = LLVMRealONE,
2297                         [PIPE_FUNC_GEQUAL] = LLVMRealOGE,
2298                 };
2299                 LLVMRealPredicate cond = cond_map[ctx->shader->key.part.ps.epilog.alpha_func];
2300                 assert(cond);
2301
2302                 LLVMValueRef alpha_ref = LLVMGetParam(ctx->main_fn,
2303                                 SI_PARAM_ALPHA_REF);
2304                 LLVMValueRef alpha_pass =
2305                         LLVMBuildFCmp(ctx->ac.builder, cond, alpha, alpha_ref, "");
2306                 ac_build_kill_if_false(&ctx->ac, alpha_pass);
2307         } else {
2308                 ac_build_kill_if_false(&ctx->ac, LLVMConstInt(ctx->i1, 0, 0));
2309         }
2310 }
2311
2312 static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
2313                                                   LLVMValueRef alpha,
2314                                                   unsigned samplemask_param)
2315 {
2316         struct si_shader_context *ctx = si_shader_context(bld_base);
2317         LLVMValueRef coverage;
2318
2319         /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
2320         coverage = LLVMGetParam(ctx->main_fn,
2321                                 samplemask_param);
2322         coverage = ac_to_integer(&ctx->ac, coverage);
2323
2324         coverage = lp_build_intrinsic(ctx->ac.builder, "llvm.ctpop.i32",
2325                                    ctx->i32,
2326                                    &coverage, 1, LP_FUNC_ATTR_READNONE);
2327
2328         coverage = LLVMBuildUIToFP(ctx->ac.builder, coverage,
2329                                    ctx->f32, "");
2330
2331         coverage = LLVMBuildFMul(ctx->ac.builder, coverage,
2332                                  LLVMConstReal(ctx->f32,
2333                                         1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
2334
2335         return LLVMBuildFMul(ctx->ac.builder, alpha, coverage, "");
2336 }
2337
2338 static void si_llvm_emit_clipvertex(struct si_shader_context *ctx,
2339                                     struct ac_export_args *pos, LLVMValueRef *out_elts)
2340 {
2341         unsigned reg_index;
2342         unsigned chan;
2343         unsigned const_chan;
2344         LLVMValueRef base_elt;
2345         LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
2346         LLVMValueRef constbuf_index = LLVMConstInt(ctx->i32,
2347                                                    SI_VS_CONST_CLIP_PLANES, 0);
2348         LLVMValueRef const_resource = ac_build_load_to_sgpr(&ctx->ac, ptr, constbuf_index);
2349
2350         for (reg_index = 0; reg_index < 2; reg_index ++) {
2351                 struct ac_export_args *args = &pos[2 + reg_index];
2352
2353                 args->out[0] =
2354                 args->out[1] =
2355                 args->out[2] =
2356                 args->out[3] = LLVMConstReal(ctx->f32, 0.0f);
2357
2358                 /* Compute dot products of position and user clip plane vectors */
2359                 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2360                         for (const_chan = 0; const_chan < TGSI_NUM_CHANNELS; const_chan++) {
2361                                 LLVMValueRef addr =
2362                                         LLVMConstInt(ctx->i32, ((reg_index * 4 + chan) * 4 +
2363                                                                 const_chan) * 4, 0);
2364                                 base_elt = buffer_load_const(ctx, const_resource,
2365                                                              addr);
2366                                 args->out[chan] =
2367                                         lp_build_add(&ctx->bld_base.base, args->out[chan],
2368                                                      lp_build_mul(&ctx->bld_base.base, base_elt,
2369                                                                   out_elts[const_chan]));
2370                         }
2371                 }
2372
2373                 args->enabled_channels = 0xf;
2374                 args->valid_mask = 0;
2375                 args->done = 0;
2376                 args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index;
2377                 args->compr = 0;
2378         }
2379 }
2380
2381 static void si_dump_streamout(struct pipe_stream_output_info *so)
2382 {
2383         unsigned i;
2384
2385         if (so->num_outputs)
2386                 fprintf(stderr, "STREAMOUT\n");
2387
2388         for (i = 0; i < so->num_outputs; i++) {
2389                 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
2390                                 so->output[i].start_component;
2391                 fprintf(stderr, "  %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
2392                         i, so->output[i].output_buffer,
2393                         so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
2394                         so->output[i].register_index,
2395                         mask & 1 ? "x" : "",
2396                         mask & 2 ? "y" : "",
2397                         mask & 4 ? "z" : "",
2398                         mask & 8 ? "w" : "");
2399         }
2400 }
2401
2402 static void emit_streamout_output(struct si_shader_context *ctx,
2403                                   LLVMValueRef const *so_buffers,
2404                                   LLVMValueRef const *so_write_offsets,
2405                                   struct pipe_stream_output *stream_out,
2406                                   struct si_shader_output_values *shader_out)
2407 {
2408         unsigned buf_idx = stream_out->output_buffer;
2409         unsigned start = stream_out->start_component;
2410         unsigned num_comps = stream_out->num_components;
2411         LLVMValueRef out[4];
2412
2413         assert(num_comps && num_comps <= 4);
2414         if (!num_comps || num_comps > 4)
2415                 return;
2416
2417         /* Load the output as int. */
2418         for (int j = 0; j < num_comps; j++) {
2419                 assert(stream_out->stream == shader_out->vertex_stream[start + j]);
2420
2421                 out[j] = ac_to_integer(&ctx->ac, shader_out->values[start + j]);
2422         }
2423
2424         /* Pack the output. */
2425         LLVMValueRef vdata = NULL;
2426
2427         switch (num_comps) {
2428         case 1: /* as i32 */
2429                 vdata = out[0];
2430                 break;
2431         case 2: /* as v2i32 */
2432         case 3: /* as v4i32 (aligned to 4) */
2433         case 4: /* as v4i32 */
2434                 vdata = LLVMGetUndef(LLVMVectorType(ctx->i32, util_next_power_of_two(num_comps)));
2435                 for (int j = 0; j < num_comps; j++) {
2436                         vdata = LLVMBuildInsertElement(ctx->ac.builder, vdata, out[j],
2437                                                        LLVMConstInt(ctx->i32, j, 0), "");
2438                 }
2439                 break;
2440         }
2441
2442         ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx],
2443                                     vdata, num_comps,
2444                                     so_write_offsets[buf_idx],
2445                                     ctx->i32_0,
2446                                     stream_out->dst_offset * 4, 1, 1, true, false);
2447 }
2448
2449 /**
2450  * Write streamout data to buffers for vertex stream @p stream (different
2451  * vertex streams can occur for GS copy shaders).
2452  */
2453 static void si_llvm_emit_streamout(struct si_shader_context *ctx,
2454                                    struct si_shader_output_values *outputs,
2455                                    unsigned noutput, unsigned stream)
2456 {
2457         struct si_shader_selector *sel = ctx->shader->selector;
2458         struct pipe_stream_output_info *so = &sel->so;
2459         LLVMBuilderRef builder = ctx->ac.builder;
2460         int i;
2461         struct lp_build_if_state if_ctx;
2462
2463         /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
2464         LLVMValueRef so_vtx_count =
2465                 unpack_param(ctx, ctx->param_streamout_config, 16, 7);
2466
2467         LLVMValueRef tid = ac_get_thread_id(&ctx->ac);
2468
2469         /* can_emit = tid < so_vtx_count; */
2470         LLVMValueRef can_emit =
2471                 LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");
2472
2473         /* Emit the streamout code conditionally. This actually avoids
2474          * out-of-bounds buffer access. The hw tells us via the SGPR
2475          * (so_vtx_count) which threads are allowed to emit streamout data. */
2476         lp_build_if(&if_ctx, &ctx->gallivm, can_emit);
2477         {
2478                 /* The buffer offset is computed as follows:
2479                  *   ByteOffset = streamout_offset[buffer_id]*4 +
2480                  *                (streamout_write_index + thread_id)*stride[buffer_id] +
2481                  *                attrib_offset
2482                  */
2483
2484                 LLVMValueRef so_write_index =
2485                         LLVMGetParam(ctx->main_fn,
2486                                      ctx->param_streamout_write_index);
2487
2488                 /* Compute (streamout_write_index + thread_id). */
2489                 so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");
2490
2491                 /* Load the descriptor and compute the write offset for each
2492                  * enabled buffer. */
2493                 LLVMValueRef so_write_offset[4] = {};
2494                 LLVMValueRef so_buffers[4];
2495                 LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
2496                                                     ctx->param_rw_buffers);
2497
2498                 for (i = 0; i < 4; i++) {
2499                         if (!so->stride[i])
2500                                 continue;
2501
2502                         LLVMValueRef offset = LLVMConstInt(ctx->i32,
2503                                                            SI_VS_STREAMOUT_BUF0 + i, 0);
2504
2505                         so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
2506
2507                         LLVMValueRef so_offset = LLVMGetParam(ctx->main_fn,
2508                                                               ctx->param_streamout_offset[i]);
2509                         so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), "");
2510
2511                         so_write_offset[i] = LLVMBuildMul(builder, so_write_index,
2512                                                           LLVMConstInt(ctx->i32, so->stride[i]*4, 0), "");
2513                         so_write_offset[i] = LLVMBuildAdd(builder, so_write_offset[i], so_offset, "");
2514                 }
2515
2516                 /* Write streamout data. */
2517                 for (i = 0; i < so->num_outputs; i++) {
2518                         unsigned reg = so->output[i].register_index;
2519
2520                         if (reg >= noutput)
2521                                 continue;
2522
2523                         if (stream != so->output[i].stream)
2524                                 continue;
2525
2526                         emit_streamout_output(ctx, so_buffers, so_write_offset,
2527                                               &so->output[i], &outputs[reg]);
2528                 }
2529         }
2530         lp_build_endif(&if_ctx);
2531 }
2532
2533 static void si_export_param(struct si_shader_context *ctx, unsigned index,
2534                             LLVMValueRef *values)
2535 {
2536         struct ac_export_args args;
2537
2538         si_llvm_init_export_args(ctx, values,
2539                                  V_008DFC_SQ_EXP_PARAM + index, &args);
2540         ac_build_export(&ctx->ac, &args);
2541 }
2542
2543 static void si_build_param_exports(struct si_shader_context *ctx,
2544                                    struct si_shader_output_values *outputs,
2545                                    unsigned noutput)
2546 {
2547         struct si_shader *shader = ctx->shader;
2548         unsigned param_count = 0;
2549
2550         for (unsigned i = 0; i < noutput; i++) {
2551                 unsigned semantic_name = outputs[i].semantic_name;
2552                 unsigned semantic_index = outputs[i].semantic_index;
2553
2554                 if (outputs[i].vertex_stream[0] != 0 &&
2555                     outputs[i].vertex_stream[1] != 0 &&
2556                     outputs[i].vertex_stream[2] != 0 &&
2557                     outputs[i].vertex_stream[3] != 0)
2558                         continue;
2559
2560                 switch (semantic_name) {
2561                 case TGSI_SEMANTIC_LAYER:
2562                 case TGSI_SEMANTIC_VIEWPORT_INDEX:
2563                 case TGSI_SEMANTIC_CLIPDIST:
2564                 case TGSI_SEMANTIC_COLOR:
2565                 case TGSI_SEMANTIC_BCOLOR:
2566                 case TGSI_SEMANTIC_PRIMID:
2567                 case TGSI_SEMANTIC_FOG:
2568                 case TGSI_SEMANTIC_TEXCOORD:
2569                 case TGSI_SEMANTIC_GENERIC:
2570                         break;
2571                 default:
2572                         continue;
2573                 }
2574
2575                 if ((semantic_name != TGSI_SEMANTIC_GENERIC ||
2576                      semantic_index < SI_MAX_IO_GENERIC) &&
2577                     shader->key.opt.kill_outputs &
2578                     (1ull << si_shader_io_get_unique_index(semantic_name, semantic_index)))
2579                         continue;
2580
2581                 si_export_param(ctx, param_count, outputs[i].values);
2582
2583                 assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
2584                 shader->info.vs_output_param_offset[i] = param_count++;
2585         }
2586
2587         shader->info.nr_param_exports = param_count;
2588 }
2589
2590 /* Generate export instructions for hardware VS shader stage */
2591 static void si_llvm_export_vs(struct si_shader_context *ctx,
2592                               struct si_shader_output_values *outputs,
2593                               unsigned noutput)
2594 {
2595         struct si_shader *shader = ctx->shader;
2596         struct ac_export_args pos_args[4] = {};
2597         LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
2598         unsigned pos_idx;
2599         int i;
2600
2601         /* Build position exports. */
2602         for (i = 0; i < noutput; i++) {
2603                 switch (outputs[i].semantic_name) {
2604                 case TGSI_SEMANTIC_POSITION:
2605                         si_llvm_init_export_args(ctx, outputs[i].values,
2606                                                  V_008DFC_SQ_EXP_POS, &pos_args[0]);
2607                         break;
2608                 case TGSI_SEMANTIC_PSIZE:
2609                         psize_value = outputs[i].values[0];
2610                         break;
2611                 case TGSI_SEMANTIC_LAYER:
2612                         layer_value = outputs[i].values[0];
2613                         break;
2614                 case TGSI_SEMANTIC_VIEWPORT_INDEX:
2615                         viewport_index_value = outputs[i].values[0];
2616                         break;
2617                 case TGSI_SEMANTIC_EDGEFLAG:
2618                         edgeflag_value = outputs[i].values[0];
2619                         break;
2620                 case TGSI_SEMANTIC_CLIPDIST:
2621                         if (!shader->key.opt.clip_disable) {
2622                                 unsigned index = 2 + outputs[i].semantic_index;
2623                                 si_llvm_init_export_args(ctx, outputs[i].values,
2624                                                          V_008DFC_SQ_EXP_POS + index,
2625                                                          &pos_args[index]);
2626                         }
2627                         break;
2628                 case TGSI_SEMANTIC_CLIPVERTEX:
2629                         if (!shader->key.opt.clip_disable) {
2630                                 si_llvm_emit_clipvertex(ctx, pos_args,
2631                                                         outputs[i].values);
2632                         }
2633                         break;
2634                 }
2635         }
2636
2637         /* We need to add the position output manually if it's missing. */
2638         if (!pos_args[0].out[0]) {
2639                 pos_args[0].enabled_channels = 0xf; /* writemask */
2640                 pos_args[0].valid_mask = 0; /* EXEC mask */
2641                 pos_args[0].done = 0; /* last export? */
2642                 pos_args[0].target = V_008DFC_SQ_EXP_POS;
2643                 pos_args[0].compr = 0; /* COMPR flag */
2644                 pos_args[0].out[0] = ctx->ac.f32_0; /* X */
2645                 pos_args[0].out[1] = ctx->ac.f32_0; /* Y */
2646                 pos_args[0].out[2] = ctx->ac.f32_0; /* Z */
2647                 pos_args[0].out[3] = ctx->ac.f32_1;  /* W */
2648         }
2649
2650         /* Write the misc vector (point size, edgeflag, layer, viewport). */
2651         if (shader->selector->info.writes_psize ||
2652             shader->selector->info.writes_edgeflag ||
2653             shader->selector->info.writes_viewport_index ||
2654             shader->selector->info.writes_layer) {
2655                 pos_args[1].enabled_channels = shader->selector->info.writes_psize |
2656                                                (shader->selector->info.writes_edgeflag << 1) |
2657                                                (shader->selector->info.writes_layer << 2);
2658
2659                 pos_args[1].valid_mask = 0; /* EXEC mask */
2660                 pos_args[1].done = 0; /* last export? */
2661                 pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
2662                 pos_args[1].compr = 0; /* COMPR flag */
2663                 pos_args[1].out[0] = ctx->ac.f32_0; /* X */
2664                 pos_args[1].out[1] = ctx->ac.f32_0; /* Y */
2665                 pos_args[1].out[2] = ctx->ac.f32_0; /* Z */
2666                 pos_args[1].out[3] = ctx->ac.f32_0; /* W */
2667
2668                 if (shader->selector->info.writes_psize)
2669                         pos_args[1].out[0] = psize_value;
2670
2671                 if (shader->selector->info.writes_edgeflag) {
2672                         /* The output is a float, but the hw expects an integer
2673                          * with the first bit containing the edge flag. */
2674                         edgeflag_value = LLVMBuildFPToUI(ctx->ac.builder,
2675                                                          edgeflag_value,
2676                                                          ctx->i32, "");
2677                         edgeflag_value = ac_build_umin(&ctx->ac,
2678                                                       edgeflag_value,
2679                                                       ctx->i32_1);
2680
2681                         /* The LLVM intrinsic expects a float. */
2682                         pos_args[1].out[1] = ac_to_float(&ctx->ac, edgeflag_value);
2683                 }
2684
2685                 if (ctx->screen->info.chip_class >= GFX9) {
2686                         /* GFX9 has the layer in out.z[10:0] and the viewport
2687                          * index in out.z[19:16].
2688                          */
2689                         if (shader->selector->info.writes_layer)
2690                                 pos_args[1].out[2] = layer_value;
2691
2692                         if (shader->selector->info.writes_viewport_index) {
2693                                 LLVMValueRef v = viewport_index_value;
2694
2695                                 v = ac_to_integer(&ctx->ac, v);
2696                                 v = LLVMBuildShl(ctx->ac.builder, v,
2697                                                  LLVMConstInt(ctx->i32, 16, 0), "");
2698                                 v = LLVMBuildOr(ctx->ac.builder, v,
2699                                                 ac_to_integer(&ctx->ac,  pos_args[1].out[2]), "");
2700                                 pos_args[1].out[2] = ac_to_float(&ctx->ac, v);
2701                                 pos_args[1].enabled_channels |= 1 << 2;
2702                         }
2703                 } else {
2704                         if (shader->selector->info.writes_layer)
2705                                 pos_args[1].out[2] = layer_value;
2706
2707                         if (shader->selector->info.writes_viewport_index) {
2708                                 pos_args[1].out[3] = viewport_index_value;
2709                                 pos_args[1].enabled_channels |= 1 << 3;
2710                         }
2711                 }
2712         }
2713
2714         for (i = 0; i < 4; i++)
2715                 if (pos_args[i].out[0])
2716                         shader->info.nr_pos_exports++;
2717
2718         pos_idx = 0;
2719         for (i = 0; i < 4; i++) {
2720                 if (!pos_args[i].out[0])
2721                         continue;
2722
2723                 /* Specify the target we are exporting */
2724                 pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;
2725
2726                 if (pos_idx == shader->info.nr_pos_exports)
2727                         /* Specify that this is the last export */
2728                         pos_args[i].done = 1;
2729
2730                 ac_build_export(&ctx->ac, &pos_args[i]);
2731         }
2732
2733         /* Build parameter exports. */
2734         si_build_param_exports(ctx, outputs, noutput);
2735 }
2736
2737 /**
2738  * Forward all outputs from the vertex shader to the TES. This is only used
2739  * for the fixed function TCS.
2740  */
2741 static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
2742 {
2743         struct si_shader_context *ctx = si_shader_context(bld_base);
2744         LLVMValueRef invocation_id, buffer, buffer_offset;
2745         LLVMValueRef lds_vertex_stride, lds_vertex_offset, lds_base;
2746         uint64_t inputs;
2747
2748         invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
2749         buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
2750         buffer_offset = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
2751
2752         lds_vertex_stride = get_tcs_in_vertex_dw_stride(ctx);
2753         lds_vertex_offset = LLVMBuildMul(ctx->ac.builder, invocation_id,
2754                                          lds_vertex_stride, "");
2755         lds_base = get_tcs_in_current_patch_offset(ctx);
2756         lds_base = LLVMBuildAdd(ctx->ac.builder, lds_base, lds_vertex_offset, "");
2757
2758         inputs = ctx->shader->key.mono.u.ff_tcs_inputs_to_copy;
2759         while (inputs) {
2760                 unsigned i = u_bit_scan64(&inputs);
2761
2762                 LLVMValueRef lds_ptr = LLVMBuildAdd(ctx->ac.builder, lds_base,
2763                                             LLVMConstInt(ctx->i32, 4 * i, 0),
2764                                              "");
2765
2766                 LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx,
2767                                               get_rel_patch_id(ctx),
2768                                               invocation_id,
2769                                               LLVMConstInt(ctx->i32, i, 0));
2770
2771                 LLVMValueRef value = lds_load(bld_base, ctx->ac.i32, ~0,
2772                                               lds_ptr);
2773
2774                 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buffer_addr,
2775                                             buffer_offset, 0, 1, 0, true, false);
2776         }
2777 }
2778
2779 static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
2780                                   LLVMValueRef rel_patch_id,
2781                                   LLVMValueRef invocation_id,
2782                                   LLVMValueRef tcs_out_current_patch_data_offset,
2783                                   LLVMValueRef invoc0_tf_outer[4],
2784                                   LLVMValueRef invoc0_tf_inner[2])
2785 {
2786         struct si_shader_context *ctx = si_shader_context(bld_base);
2787         struct si_shader *shader = ctx->shader;
2788         unsigned tess_inner_index, tess_outer_index;
2789         LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
2790         LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4];
2791         unsigned stride, outer_comps, inner_comps, i, offset;
2792         struct lp_build_if_state if_ctx, inner_if_ctx;
2793
2794         /* Add a barrier before loading tess factors from LDS. */
2795         if (!shader->key.part.tcs.epilog.invoc0_tess_factors_are_def)
2796                 si_llvm_emit_barrier(NULL, bld_base, NULL);
2797
2798         /* Do this only for invocation 0, because the tess levels are per-patch,
2799          * not per-vertex.
2800          *
2801          * This can't jump, because invocation 0 executes this. It should
2802          * at least mask out the loads and stores for other invocations.
2803          */
2804         lp_build_if(&if_ctx, &ctx->gallivm,
2805                     LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ,
2806                                   invocation_id, ctx->i32_0, ""));
2807
2808         /* Determine the layout of one tess factor element in the buffer. */
2809         switch (shader->key.part.tcs.epilog.prim_mode) {
2810         case PIPE_PRIM_LINES:
2811                 stride = 2; /* 2 dwords, 1 vec2 store */
2812                 outer_comps = 2;
2813                 inner_comps = 0;
2814                 break;
2815         case PIPE_PRIM_TRIANGLES:
2816                 stride = 4; /* 4 dwords, 1 vec4 store */
2817                 outer_comps = 3;
2818                 inner_comps = 1;
2819                 break;
2820         case PIPE_PRIM_QUADS:
2821                 stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
2822                 outer_comps = 4;
2823                 inner_comps = 2;
2824                 break;
2825         default:
2826                 assert(0);
2827                 return;
2828         }
2829
2830         for (i = 0; i < 4; i++) {
2831                 inner[i] = LLVMGetUndef(ctx->i32);
2832                 outer[i] = LLVMGetUndef(ctx->i32);
2833         }
2834
2835         if (shader->key.part.tcs.epilog.invoc0_tess_factors_are_def) {
2836                 /* Tess factors are in VGPRs. */
2837                 for (i = 0; i < outer_comps; i++)
2838                         outer[i] = out[i] = invoc0_tf_outer[i];
2839                 for (i = 0; i < inner_comps; i++)
2840                         inner[i] = out[outer_comps+i] = invoc0_tf_inner[i];
2841         } else {
2842                 /* Load tess_inner and tess_outer from LDS.
2843                  * Any invocation can write them, so we can't get them from a temporary.
2844                  */
2845                 tess_inner_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0);
2846                 tess_outer_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0);
2847
2848                 lds_base = tcs_out_current_patch_data_offset;
2849                 lds_inner = LLVMBuildAdd(ctx->ac.builder, lds_base,
2850                                          LLVMConstInt(ctx->i32,
2851                                                       tess_inner_index * 4, 0), "");
2852                 lds_outer = LLVMBuildAdd(ctx->ac.builder, lds_base,
2853                                          LLVMConstInt(ctx->i32,
2854                                                       tess_outer_index * 4, 0), "");
2855
2856                 for (i = 0; i < outer_comps; i++) {
2857                         outer[i] = out[i] =
2858                                 lds_load(bld_base, ctx->ac.i32, i, lds_outer);
2859                 }
2860                 for (i = 0; i < inner_comps; i++) {
2861                         inner[i] = out[outer_comps+i] =
2862                                 lds_load(bld_base, ctx->ac.i32, i, lds_inner);
2863                 }
2864         }
2865
2866         if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) {
2867                 /* For isolines, the hardware expects tess factors in the
2868                  * reverse order from what GLSL / TGSI specify.
2869                  */
2870                 LLVMValueRef tmp = out[0];
2871                 out[0] = out[1];
2872                 out[1] = tmp;
2873         }
2874
2875         /* Convert the outputs to vectors for stores. */
2876         vec0 = lp_build_gather_values(&ctx->gallivm, out, MIN2(stride, 4));
2877         vec1 = NULL;
2878
2879         if (stride > 4)
2880                 vec1 = lp_build_gather_values(&ctx->gallivm, out+4, stride - 4);
2881
2882         /* Get the buffer. */
2883         buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_factor_addr_base64k);
2884
2885         /* Get the offset. */
2886         tf_base = LLVMGetParam(ctx->main_fn,
2887                                ctx->param_tcs_factor_offset);
2888         byteoffset = LLVMBuildMul(ctx->ac.builder, rel_patch_id,
2889                                   LLVMConstInt(ctx->i32, 4 * stride, 0), "");
2890
2891         lp_build_if(&inner_if_ctx, &ctx->gallivm,
2892                     LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ,
2893                                   rel_patch_id, ctx->i32_0, ""));
2894
2895         /* Store the dynamic HS control word. */
2896         offset = 0;
2897         if (ctx->screen->info.chip_class <= VI) {
2898                 ac_build_buffer_store_dword(&ctx->ac, buffer,
2899                                             LLVMConstInt(ctx->i32, 0x80000000, 0),
2900                                             1, ctx->i32_0, tf_base,
2901                                             offset, 1, 0, true, false);
2902                 offset += 4;
2903         }
2904
2905         lp_build_endif(&inner_if_ctx);
2906
2907         /* Store the tessellation factors. */
2908         ac_build_buffer_store_dword(&ctx->ac, buffer, vec0,
2909                                     MIN2(stride, 4), byteoffset, tf_base,
2910                                     offset, 1, 0, true, false);
2911         offset += 16;
2912         if (vec1)
2913                 ac_build_buffer_store_dword(&ctx->ac, buffer, vec1,
2914                                             stride - 4, byteoffset, tf_base,
2915                                             offset, 1, 0, true, false);
2916
2917         /* Store the tess factors into the offchip buffer if TES reads them. */
2918         if (shader->key.part.tcs.epilog.tes_reads_tess_factors) {
2919                 LLVMValueRef buf, base, inner_vec, outer_vec, tf_outer_offset;
2920                 LLVMValueRef tf_inner_offset;
2921                 unsigned param_outer, param_inner;
2922
2923                 buf = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
2924                 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
2925
2926                 param_outer = si_shader_io_get_unique_index_patch(
2927                                       TGSI_SEMANTIC_TESSOUTER, 0);
2928                 tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
2929                                         LLVMConstInt(ctx->i32, param_outer, 0));
2930
2931                 outer_vec = lp_build_gather_values(&ctx->gallivm, outer,
2932                                                    util_next_power_of_two(outer_comps));
2933
2934                 ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec,
2935                                             outer_comps, tf_outer_offset,
2936                                             base, 0, 1, 0, true, false);
2937                 if (inner_comps) {
2938                         param_inner = si_shader_io_get_unique_index_patch(
2939                                               TGSI_SEMANTIC_TESSINNER, 0);
2940                         tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
2941                                         LLVMConstInt(ctx->i32, param_inner, 0));
2942
2943                         inner_vec = inner_comps == 1 ? inner[0] :
2944                                     lp_build_gather_values(&ctx->gallivm, inner, inner_comps);
2945                         ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec,
2946                                                     inner_comps, tf_inner_offset,
2947                                                     base, 0, 1, 0, true, false);
2948                 }
2949         }
2950
2951         lp_build_endif(&if_ctx);
2952 }
2953
2954 static LLVMValueRef
2955 si_insert_input_ret(struct si_shader_context *ctx, LLVMValueRef ret,
2956                     unsigned param, unsigned return_index)
2957 {
2958         return LLVMBuildInsertValue(ctx->ac.builder, ret,
2959                                     LLVMGetParam(ctx->main_fn, param),
2960                                     return_index, "");
2961 }
2962
2963 static LLVMValueRef
2964 si_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueRef ret,
2965                           unsigned param, unsigned return_index)
2966 {
2967         LLVMBuilderRef builder = ctx->ac.builder;
2968         LLVMValueRef p = LLVMGetParam(ctx->main_fn, param);
2969
2970         return LLVMBuildInsertValue(builder, ret,
2971                                     ac_to_float(&ctx->ac, p),
2972                                     return_index, "");
2973 }
2974
2975 static LLVMValueRef
2976 si_insert_input_ptr_as_2xi32(struct si_shader_context *ctx, LLVMValueRef ret,
2977                              unsigned param, unsigned return_index)
2978 {
2979         LLVMBuilderRef builder = ctx->ac.builder;
2980         LLVMValueRef ptr, lo, hi;
2981
2982         ptr = LLVMGetParam(ctx->main_fn, param);
2983         ptr = LLVMBuildPtrToInt(builder, ptr, ctx->i64, "");
2984         ptr = LLVMBuildBitCast(builder, ptr, ctx->v2i32, "");
2985         lo = LLVMBuildExtractElement(builder, ptr, ctx->i32_0, "");
2986         hi = LLVMBuildExtractElement(builder, ptr, ctx->i32_1, "");
2987         ret = LLVMBuildInsertValue(builder, ret, lo, return_index, "");
2988         return LLVMBuildInsertValue(builder, ret, hi, return_index + 1, "");
2989 }
2990
2991 /* This only writes the tessellation factor levels. */
2992 static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
2993 {
2994         struct si_shader_context *ctx = si_shader_context(bld_base);
2995         LLVMBuilderRef builder = ctx->ac.builder;
2996         LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;
2997
2998         si_copy_tcs_inputs(bld_base);
2999
3000         rel_patch_id = get_rel_patch_id(ctx);
3001         invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
3002         tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);
3003
3004         if (ctx->screen->info.chip_class >= GFX9) {
3005                 LLVMBasicBlockRef blocks[2] = {
3006                         LLVMGetInsertBlock(builder),
3007                         ctx->merged_wrap_if_state.entry_block
3008                 };
3009                 LLVMValueRef values[2];
3010
3011                 lp_build_endif(&ctx->merged_wrap_if_state);
3012
3013                 values[0] = rel_patch_id;
3014                 values[1] = LLVMGetUndef(ctx->i32);
3015                 rel_patch_id = ac_build_phi(&ctx->ac, ctx->i32, 2, values, blocks);
3016
3017                 values[0] = tf_lds_offset;
3018                 values[1] = LLVMGetUndef(ctx->i32);
3019                 tf_lds_offset = ac_build_phi(&ctx->ac, ctx->i32, 2, values, blocks);
3020
3021                 values[0] = invocation_id;
3022                 values[1] = ctx->i32_1; /* cause the epilog to skip threads */
3023                 invocation_id = ac_build_phi(&ctx->ac, ctx->i32, 2, values, blocks);
3024         }
3025
3026         /* Return epilog parameters from this function. */
3027         LLVMValueRef ret = ctx->return_value;
3028         unsigned vgpr;
3029
3030         if (ctx->screen->info.chip_class >= GFX9) {
3031                 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
3032                                           8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
3033                 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k,
3034                                           8 + GFX9_SGPR_TCS_OFFCHIP_ADDR_BASE64K);
3035                 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k,
3036                                           8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K);
3037                 /* Tess offchip and tess factor offsets are at the beginning. */
3038                 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2);
3039                 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4);
3040                 vgpr = 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K + 1;
3041         } else {
3042                 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
3043                                           GFX6_SGPR_TCS_OFFCHIP_LAYOUT);
3044                 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k,
3045                                           GFX6_SGPR_TCS_OFFCHIP_ADDR_BASE64K);
3046                 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k,
3047                                           GFX6_SGPR_TCS_FACTOR_ADDR_BASE64K);
3048                 /* Tess offchip and tess factor offsets are after user SGPRs. */
3049                 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset,
3050                                           GFX6_TCS_NUM_USER_SGPR);
3051                 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset,
3052                                           GFX6_TCS_NUM_USER_SGPR + 1);
3053                 vgpr = GFX6_TCS_NUM_USER_SGPR + 2;
3054         }
3055
3056         /* VGPRs */
3057         rel_patch_id = ac_to_float(&ctx->ac, rel_patch_id);
3058         invocation_id = ac_to_float(&ctx->ac, invocation_id);
3059         tf_lds_offset = ac_to_float(&ctx->ac, tf_lds_offset);
3060
3061         /* Leave a hole corresponding to the two input VGPRs. This ensures that
3062          * the invocation_id output does not alias the param_tcs_rel_ids input,
3063          * which saves a V_MOV on gfx9.
3064          */
3065         vgpr += 2;
3066
3067         ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
3068         ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
3069
3070         if (ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs) {
3071                 vgpr++; /* skip the tess factor LDS offset */
3072                 for (unsigned i = 0; i < 6; i++) {
3073                         LLVMValueRef value =
3074                                 LLVMBuildLoad(builder, ctx->invoc0_tess_factors[i], "");
3075                         value = ac_to_float(&ctx->ac, value);
3076                         ret = LLVMBuildInsertValue(builder, ret, value, vgpr++, "");
3077                 }
3078         } else {
3079                 ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
3080         }
3081         ctx->return_value = ret;
3082 }
3083
3084 /* Pass TCS inputs from LS to TCS on GFX9. */
3085 static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx)
3086 {
3087         LLVMValueRef ret = ctx->return_value;
3088
3089         ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2);
3090         ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
3091         ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4);
3092         ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);
3093
3094         ret = si_insert_input_ptr_as_2xi32(ctx, ret, ctx->param_rw_buffers,
3095                                            8 + SI_SGPR_RW_BUFFERS);
3096         ret = si_insert_input_ptr_as_2xi32(ctx, ret,
3097                 ctx->param_bindless_samplers_and_images,
3098                 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
3099
3100         ret = si_insert_input_ret(ctx, ret, ctx->param_vs_state_bits,
3101                                   8 + SI_SGPR_VS_STATE_BITS);
3102         ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
3103                                   8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
3104         ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_offsets,
3105                                   8 + GFX9_SGPR_TCS_OUT_OFFSETS);
3106         ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_layout,
3107                                   8 + GFX9_SGPR_TCS_OUT_LAYOUT);
3108         ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k,
3109                                   8 + GFX9_SGPR_TCS_OFFCHIP_ADDR_BASE64K);
3110         ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k,
3111                                   8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K);
3112
3113         unsigned desc_param = ctx->param_tcs_factor_addr_base64k + 2;
3114         ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param,
3115                                            8 + GFX9_SGPR_TCS_CONST_AND_SHADER_BUFFERS);
3116         ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 1,
3117                                            8 + GFX9_SGPR_TCS_SAMPLERS_AND_IMAGES);
3118
3119         unsigned vgpr = 8 + GFX9_TCS_NUM_USER_SGPR;
3120         ret = si_insert_input_ret_float(ctx, ret,
3121                                         ctx->param_tcs_patch_id, vgpr++);
3122         ret = si_insert_input_ret_float(ctx, ret,
3123                                         ctx->param_tcs_rel_ids, vgpr++);
3124         ctx->return_value = ret;
3125 }
3126
3127 /* Pass GS inputs from ES to GS on GFX9. */
3128 static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
3129 {
3130         LLVMValueRef ret = ctx->return_value;
3131
3132         ret = si_insert_input_ret(ctx, ret, ctx->param_gs2vs_offset, 2);
3133         ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
3134         ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);
3135
3136         ret = si_insert_input_ptr_as_2xi32(ctx, ret, ctx->param_rw_buffers,
3137                                            8 + SI_SGPR_RW_BUFFERS);
3138         ret = si_insert_input_ptr_as_2xi32(ctx, ret,
3139                 ctx->param_bindless_samplers_and_images,
3140                 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
3141
3142         unsigned desc_param = ctx->param_vs_state_bits + 1;
3143         ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param,
3144                                            8 + GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS);
3145         ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 1,
3146                                            8 + GFX9_SGPR_GS_SAMPLERS_AND_IMAGES);
3147
3148         unsigned vgpr = 8 + GFX9_GS_NUM_USER_SGPR;
3149         for (unsigned i = 0; i < 5; i++) {
3150                 unsigned param = ctx->param_gs_vtx01_offset + i;
3151                 ret = si_insert_input_ret_float(ctx, ret, param, vgpr++);
3152         }
3153         ctx->return_value = ret;
3154 }
3155
3156 static void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi,
3157                                      unsigned max_outputs,
3158                                      LLVMValueRef *addrs)
3159 {
3160         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
3161         struct si_shader *shader = ctx->shader;
3162         struct tgsi_shader_info *info = &shader->selector->info;
3163         unsigned i, chan;
3164         LLVMValueRef vertex_id = LLVMGetParam(ctx->main_fn,
3165                                               ctx->param_rel_auto_id);
3166         LLVMValueRef vertex_dw_stride = get_tcs_in_vertex_dw_stride(ctx);
3167         LLVMValueRef base_dw_addr = LLVMBuildMul(ctx->ac.builder, vertex_id,
3168                                                  vertex_dw_stride, "");
3169
3170         /* Write outputs to LDS. The next shader (TCS aka HS) will read
3171          * its inputs from it. */
3172         for (i = 0; i < info->num_outputs; i++) {
3173                 unsigned name = info->output_semantic_name[i];
3174                 unsigned index = info->output_semantic_index[i];
3175
3176                 /* The ARB_shader_viewport_layer_array spec contains the
3177                  * following issue:
3178                  *
3179                  *    2) What happens if gl_ViewportIndex or gl_Layer is
3180                  *    written in the vertex shader and a geometry shader is
3181                  *    present?
3182                  *
3183                  *    RESOLVED: The value written by the last vertex processing
3184                  *    stage is used. If the last vertex processing stage
3185                  *    (vertex, tessellation evaluation or geometry) does not
3186                  *    statically assign to gl_ViewportIndex or gl_Layer, index
3187                  *    or layer zero is assumed.
3188                  *
3189                  * So writes to those outputs in VS-as-LS are simply ignored.
3190                  */
3191                 if (name == TGSI_SEMANTIC_LAYER ||
3192                     name == TGSI_SEMANTIC_VIEWPORT_INDEX)
3193                         continue;
3194
3195                 int param = si_shader_io_get_unique_index(name, index);
3196                 LLVMValueRef dw_addr = LLVMBuildAdd(ctx->ac.builder, base_dw_addr,
3197                                         LLVMConstInt(ctx->i32, param * 4, 0), "");
3198
3199                 for (chan = 0; chan < 4; chan++) {
3200                         if (!(info->output_usagemask[i] & (1 << chan)))
3201                                 continue;
3202
3203                         lds_store(ctx, chan, dw_addr,
3204                                   LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""));
3205                 }
3206         }
3207
3208         if (ctx->screen->info.chip_class >= GFX9)
3209                 si_set_ls_return_value_for_tcs(ctx);
3210 }
3211
3212 static void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi,
3213                                      unsigned max_outputs,
3214                                      LLVMValueRef *addrs)
3215 {
3216         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
3217         struct si_shader *es = ctx->shader;
3218         struct tgsi_shader_info *info = &es->selector->info;
3219         LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
3220                                             ctx->param_es2gs_offset);
3221         LLVMValueRef lds_base = NULL;
3222         unsigned chan;
3223         int i;
3224
3225         if (ctx->screen->info.chip_class >= GFX9 && info->num_outputs) {
3226                 unsigned itemsize_dw = es->selector->esgs_itemsize / 4;
3227                 LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac);
3228                 LLVMValueRef wave_idx = unpack_param(ctx, ctx->param_merged_wave_info, 24, 4);
3229                 vertex_idx = LLVMBuildOr(ctx->ac.builder, vertex_idx,
3230                                          LLVMBuildMul(ctx->ac.builder, wave_idx,
3231                                                       LLVMConstInt(ctx->i32, 64, false), ""), "");
3232                 lds_base = LLVMBuildMul(ctx->ac.builder, vertex_idx,
3233                                         LLVMConstInt(ctx->i32, itemsize_dw, 0), "");
3234         }
3235
3236         for (i = 0; i < info->num_outputs; i++) {
3237                 int param;
3238
3239                 if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
3240                     info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
3241                         continue;
3242
3243                 param = si_shader_io_get_unique_index(info->output_semantic_name[i],
3244                                                       info->output_semantic_index[i]);
3245
3246                 for (chan = 0; chan < 4; chan++) {
3247                         LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
3248                         out_val = ac_to_integer(&ctx->ac, out_val);
3249
3250                         /* GFX9 has the ESGS ring in LDS. */
3251                         if (ctx->screen->info.chip_class >= GFX9) {
3252                                 lds_store(ctx, param * 4 + chan, lds_base, out_val);
3253                                 continue;
3254                         }
3255
3256                         ac_build_buffer_store_dword(&ctx->ac,
3257                                                     ctx->esgs_ring,
3258                                                     out_val, 1, NULL, soffset,
3259                                                     (4 * param + chan) * 4,
3260                                                     1, 1, true, true);
3261                 }
3262         }
3263
3264         if (ctx->screen->info.chip_class >= GFX9)
3265                 si_set_es_return_value_for_gs(ctx);
3266 }
3267
3268 static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx)
3269 {
3270         if (ctx->screen->info.chip_class >= GFX9)
3271                 return unpack_param(ctx, ctx->param_merged_wave_info, 16, 8);
3272         else
3273                 return LLVMGetParam(ctx->main_fn, ctx->param_gs_wave_id);
3274 }
3275
3276 static void emit_gs_epilogue(struct si_shader_context *ctx)
3277 {
3278         ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE,
3279                          si_get_gs_wave_id(ctx));
3280
3281         if (ctx->screen->info.chip_class >= GFX9)
3282                 lp_build_endif(&ctx->merged_wrap_if_state);
3283 }
3284
3285 static void si_llvm_emit_gs_epilogue(struct ac_shader_abi *abi,
3286                                      unsigned max_outputs,
3287                                      LLVMValueRef *addrs)
3288 {
3289         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
3290         struct tgsi_shader_info UNUSED *info = &ctx->shader->selector->info;
3291
3292         assert(info->num_outputs <= max_outputs);
3293
3294         emit_gs_epilogue(ctx);
3295 }
3296
/* GS epilogue entry point taking a TGSI build context; forwards to the
 * shared GS epilogue.
 */
static void si_tgsi_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
{
        emit_gs_epilogue(si_shader_context(bld_base));
}
3302
3303 static void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi,
3304                                      unsigned max_outputs,
3305                                      LLVMValueRef *addrs)
3306 {
3307         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
3308         struct tgsi_shader_info *info = &ctx->shader->selector->info;
3309         struct si_shader_output_values *outputs = NULL;
3310         int i,j;
3311
3312         assert(!ctx->shader->is_gs_copy_shader);
3313         assert(info->num_outputs <= max_outputs);
3314
3315         outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));
3316
3317         /* Vertex color clamping.
3318          *
3319          * This uses a state constant loaded in a user data SGPR and
3320          * an IF statement is added that clamps all colors if the constant
3321          * is true.
3322          */
3323         if (ctx->type == PIPE_SHADER_VERTEX) {
3324                 struct lp_build_if_state if_ctx;
3325                 LLVMValueRef cond = NULL;
3326                 LLVMValueRef addr, val;
3327
3328                 for (i = 0; i < info->num_outputs; i++) {
3329                         if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR &&
3330                             info->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR)
3331                                 continue;
3332
3333                         /* We've found a color. */
3334                         if (!cond) {
3335                                 /* The state is in the first bit of the user SGPR. */
3336                                 cond = LLVMGetParam(ctx->main_fn,
3337                                                     ctx->param_vs_state_bits);
3338                                 cond = LLVMBuildTrunc(ctx->ac.builder, cond,
3339                                                       ctx->i1, "");
3340                                 lp_build_if(&if_ctx, &ctx->gallivm, cond);
3341                         }
3342
3343                         for (j = 0; j < 4; j++) {
3344                                 addr = addrs[4 * i + j];
3345                                 val = LLVMBuildLoad(ctx->ac.builder, addr, "");
3346                                 val = ac_build_clamp(&ctx->ac, val);
3347                                 LLVMBuildStore(ctx->ac.builder, val, addr);
3348                         }
3349                 }
3350
3351                 if (cond)
3352                         lp_build_endif(&if_ctx);
3353         }
3354
3355         for (i = 0; i < info->num_outputs; i++) {
3356                 outputs[i].semantic_name = info->output_semantic_name[i];
3357                 outputs[i].semantic_index = info->output_semantic_index[i];
3358
3359                 for (j = 0; j < 4; j++) {
3360                         outputs[i].values[j] =
3361                                 LLVMBuildLoad(ctx->ac.builder,
3362                                               addrs[4 * i + j],
3363                                               "");
3364                         outputs[i].vertex_stream[j] =
3365                                 (info->output_streams[i] >> (2 * j)) & 3;
3366                 }
3367         }
3368
3369         if (ctx->shader->selector->so.num_outputs)
3370                 si_llvm_emit_streamout(ctx, outputs, i, 0);
3371
3372         /* Export PrimitiveID. */
3373         if (ctx->shader->key.mono.u.vs_export_prim_id) {
3374                 outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID;
3375                 outputs[i].semantic_index = 0;
3376                 outputs[i].values[0] = ac_to_float(&ctx->ac, get_primitive_id(ctx, 0));
3377                 for (j = 1; j < 4; j++)
3378                         outputs[i].values[j] = LLVMConstReal(ctx->f32, 0);
3379
3380                 memset(outputs[i].vertex_stream, 0,
3381                        sizeof(outputs[i].vertex_stream));
3382                 i++;
3383         }
3384
3385         si_llvm_export_vs(ctx, outputs, i);
3386         FREE(outputs);
3387 }
3388
3389 static void si_tgsi_emit_epilogue(struct lp_build_tgsi_context *bld_base)
3390 {
3391         struct si_shader_context *ctx = si_shader_context(bld_base);
3392
3393         ctx->abi.emit_outputs(&ctx->abi, RADEON_LLVM_MAX_OUTPUTS,
3394                               &ctx->outputs[0][0]);
3395 }
3396
3397 struct si_ps_exports {
3398         unsigned num;
3399         struct ac_export_args args[10];
3400 };
3401
3402 unsigned si_get_spi_shader_z_format(bool writes_z, bool writes_stencil,
3403                                     bool writes_samplemask)
3404 {
3405         if (writes_z) {
3406                 /* Z needs 32 bits. */
3407                 if (writes_samplemask)
3408                         return V_028710_SPI_SHADER_32_ABGR;
3409                 else if (writes_stencil)
3410                         return V_028710_SPI_SHADER_32_GR;
3411                 else
3412                         return V_028710_SPI_SHADER_32_R;
3413         } else if (writes_stencil || writes_samplemask) {
3414                 /* Both stencil and sample mask need only 16 bits. */
3415                 return V_028710_SPI_SHADER_UINT16_ABGR;
3416         } else {
3417                 return V_028710_SPI_SHADER_ZERO;
3418         }
3419 }
3420
3421 static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
3422                             LLVMValueRef depth, LLVMValueRef stencil,
3423                             LLVMValueRef samplemask, struct si_ps_exports *exp)
3424 {
3425         struct si_shader_context *ctx = si_shader_context(bld_base);
3426         struct lp_build_context *base = &bld_base->base;
3427         struct ac_export_args args;
3428         unsigned mask = 0;
3429         unsigned format = si_get_spi_shader_z_format(depth != NULL,
3430                                                      stencil != NULL,
3431                                                      samplemask != NULL);
3432
3433         assert(depth || stencil || samplemask);
3434
3435         args.valid_mask = 1; /* whether the EXEC mask is valid */
3436         args.done = 1; /* DONE bit */
3437
3438         /* Specify the target we are exporting */
3439         args.target = V_008DFC_SQ_EXP_MRTZ;
3440
3441         args.compr = 0; /* COMP flag */
3442         args.out[0] = base->undef; /* R, depth */
3443         args.out[1] = base->undef; /* G, stencil test value[0:7], stencil op value[8:15] */
3444         args.out[2] = base->undef; /* B, sample mask */
3445         args.out[3] = base->undef; /* A, alpha to mask */
3446
3447         if (format == V_028710_SPI_SHADER_UINT16_ABGR) {
3448                 assert(!depth);
3449                 args.compr = 1; /* COMPR flag */
3450
3451                 if (stencil) {
3452                         /* Stencil should be in X[23:16]. */
3453                         stencil = ac_to_integer(&ctx->ac, stencil);
3454                         stencil = LLVMBuildShl(ctx->ac.builder, stencil,
3455                                                LLVMConstInt(ctx->i32, 16, 0), "");
3456                         args.out[0] = ac_to_float(&ctx->ac, stencil);
3457                         mask |= 0x3;
3458                 }
3459                 if (samplemask) {
3460                         /* SampleMask should be in Y[15:0]. */
3461                         args.out[1] = samplemask;
3462                         mask |= 0xc;
3463                 }
3464         } else {
3465                 if (depth) {
3466                         args.out[0] = depth;
3467                         mask |= 0x1;
3468                 }
3469                 if (stencil) {
3470                         args.out[1] = stencil;
3471                         mask |= 0x2;
3472                 }
3473                 if (samplemask) {
3474                         args.out[2] = samplemask;
3475                         mask |= 0x4;
3476                 }
3477         }
3478
3479         /* SI (except OLAND and HAINAN) has a bug that it only looks
3480          * at the X writemask component. */
3481         if (ctx->screen->info.chip_class == SI &&
3482             ctx->screen->info.family != CHIP_OLAND &&
3483             ctx->screen->info.family != CHIP_HAINAN)
3484                 mask |= 0x1;
3485
3486         /* Specify which components to enable */
3487         args.enabled_channels = mask;
3488
3489         memcpy(&exp->args[exp->num++], &args, sizeof(args));
3490 }
3491
3492 static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
3493                                 LLVMValueRef *color, unsigned index,
3494                                 unsigned samplemask_param,
3495                                 bool is_last, struct si_ps_exports *exp)
3496 {
3497         struct si_shader_context *ctx = si_shader_context(bld_base);
3498         int i;
3499
3500         /* Clamp color */
3501         if (ctx->shader->key.part.ps.epilog.clamp_color)
3502                 for (i = 0; i < 4; i++)
3503                         color[i] = ac_build_clamp(&ctx->ac, color[i]);
3504
3505         /* Alpha to one */
3506         if (ctx->shader->key.part.ps.epilog.alpha_to_one)
3507                 color[3] = ctx->ac.f32_1;
3508
3509         /* Alpha test */
3510         if (index == 0 &&
3511             ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
3512                 si_alpha_test(bld_base, color[3]);
3513
3514         /* Line & polygon smoothing */
3515         if (ctx->shader->key.part.ps.epilog.poly_line_smoothing)
3516                 color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3],
3517                                                          samplemask_param);
3518
3519         /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
3520         if (ctx->shader->key.part.ps.epilog.last_cbuf > 0) {
3521                 struct ac_export_args args[8];
3522                 int c, last = -1;
3523
3524                 /* Get the export arguments, also find out what the last one is. */
3525                 for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
3526                         si_llvm_init_export_args(ctx, color,
3527                                                  V_008DFC_SQ_EXP_MRT + c, &args[c]);
3528                         if (args[c].enabled_channels)
3529                                 last = c;
3530                 }
3531
3532                 /* Emit all exports. */
3533                 for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
3534                         if (is_last && last == c) {
3535                                 args[c].valid_mask = 1; /* whether the EXEC mask is valid */
3536                                 args[c].done = 1; /* DONE bit */
3537                         } else if (!args[c].enabled_channels)
3538                                 continue; /* unnecessary NULL export */
3539
3540                         memcpy(&exp->args[exp->num++], &args[c], sizeof(args[c]));
3541                 }
3542         } else {
3543                 struct ac_export_args args;
3544
3545                 /* Export */
3546                 si_llvm_init_export_args(ctx, color, V_008DFC_SQ_EXP_MRT + index,
3547                                          &args);
3548                 if (is_last) {
3549                         args.valid_mask = 1; /* whether the EXEC mask is valid */
3550                         args.done = 1; /* DONE bit */
3551                 } else if (!args.enabled_channels)
3552                         return; /* unnecessary NULL export */
3553
3554                 memcpy(&exp->args[exp->num++], &args, sizeof(args));
3555         }
3556 }
3557
3558 static void si_emit_ps_exports(struct si_shader_context *ctx,
3559                                struct si_ps_exports *exp)
3560 {
3561         for (unsigned i = 0; i < exp->num; i++)
3562                 ac_build_export(&ctx->ac, &exp->args[i]);
3563 }
3564
3565 static void si_export_null(struct lp_build_tgsi_context *bld_base)
3566 {
3567         struct si_shader_context *ctx = si_shader_context(bld_base);
3568         struct lp_build_context *base = &bld_base->base;
3569         struct ac_export_args args;
3570
3571         args.enabled_channels = 0x0; /* enabled channels */
3572         args.valid_mask = 1; /* whether the EXEC mask is valid */
3573         args.done = 1; /* DONE bit */
3574         args.target = V_008DFC_SQ_EXP_NULL;
3575         args.compr = 0; /* COMPR flag (0 = 32-bit export) */
3576         args.out[0] = base->undef; /* R */
3577         args.out[1] = base->undef; /* G */
3578         args.out[2] = base->undef; /* B */
3579         args.out[3] = base->undef; /* A */
3580
3581         ac_build_export(&ctx->ac, &args);
3582 }
3583
3584 /**
3585  * Return PS outputs in this order:
3586  *
3587  * v[0:3] = color0.xyzw
3588  * v[4:7] = color1.xyzw
3589  * ...
3590  * vN+0 = Depth
3591  * vN+1 = Stencil
3592  * vN+2 = SampleMask
3593  * vN+3 = SampleMaskIn (used for OpenGL smoothing)
3594  *
3595  * The alpha-ref SGPR is returned via its original location.
3596  */
3597 static void si_llvm_return_fs_outputs(struct ac_shader_abi *abi,
3598                                       unsigned max_outputs,
3599                                       LLVMValueRef *addrs)
3600 {
3601         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
3602         struct si_shader *shader = ctx->shader;
3603         struct tgsi_shader_info *info = &shader->selector->info;
3604         LLVMBuilderRef builder = ctx->ac.builder;
3605         unsigned i, j, first_vgpr, vgpr;
3606
3607         LLVMValueRef color[8][4] = {};
3608         LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
3609         LLVMValueRef ret;
3610
3611         if (ctx->postponed_kill)
3612                 ac_build_kill_if_false(&ctx->ac, LLVMBuildLoad(builder, ctx->postponed_kill, ""));
3613
3614         /* Read the output values. */
3615         for (i = 0; i < info->num_outputs; i++) {
3616                 unsigned semantic_name = info->output_semantic_name[i];
3617                 unsigned semantic_index = info->output_semantic_index[i];
3618
3619                 switch (semantic_name) {
3620                 case TGSI_SEMANTIC_COLOR:
3621                         assert(semantic_index < 8);
3622                         for (j = 0; j < 4; j++) {
3623                                 LLVMValueRef ptr = addrs[4 * i + j];
3624                                 LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
3625                                 color[semantic_index][j] = result;
3626                         }
3627                         break;
3628                 case TGSI_SEMANTIC_POSITION:
3629                         depth = LLVMBuildLoad(builder,
3630                                               addrs[4 * i + 2], "");
3631                         break;
3632                 case TGSI_SEMANTIC_STENCIL:
3633                         stencil = LLVMBuildLoad(builder,
3634                                                 addrs[4 * i + 1], "");
3635                         break;
3636                 case TGSI_SEMANTIC_SAMPLEMASK:
3637                         samplemask = LLVMBuildLoad(builder,
3638                                                    addrs[4 * i + 0], "");
3639                         break;
3640                 default:
3641                         fprintf(stderr, "Warning: SI unhandled fs output type:%d\n",
3642                                 semantic_name);
3643                 }
3644         }
3645
3646         /* Fill the return structure. */
3647         ret = ctx->return_value;
3648
3649         /* Set SGPRs. */
3650         ret = LLVMBuildInsertValue(builder, ret,
3651                                    ac_to_integer(&ctx->ac,
3652                                                  LLVMGetParam(ctx->main_fn,
3653                                                               SI_PARAM_ALPHA_REF)),
3654                                    SI_SGPR_ALPHA_REF, "");
3655
3656         /* Set VGPRs */
3657         first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
3658         for (i = 0; i < ARRAY_SIZE(color); i++) {
3659                 if (!color[i][0])
3660                         continue;
3661
3662                 for (j = 0; j < 4; j++)
3663                         ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
3664         }
3665         if (depth)
3666                 ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
3667         if (stencil)
3668                 ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
3669         if (samplemask)
3670                 ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");
3671
3672         /* Add the input sample mask for smoothing at the end. */
3673         if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
3674                 vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
3675         ret = LLVMBuildInsertValue(builder, ret,
3676                                    LLVMGetParam(ctx->main_fn,
3677                                                 SI_PARAM_SAMPLE_COVERAGE), vgpr++, "");
3678
3679         ctx->return_value = ret;
3680 }
3681
3682 void si_emit_waitcnt(struct si_shader_context *ctx, unsigned simm16)
3683 {
3684         LLVMValueRef args[1] = {
3685                 LLVMConstInt(ctx->i32, simm16, 0)
3686         };
3687         lp_build_intrinsic(ctx->ac.builder, "llvm.amdgcn.s.waitcnt",
3688                            ctx->voidt, args, 1, 0);
3689 }
3690
3691 static void membar_emit(
3692                 const struct lp_build_tgsi_action *action,
3693                 struct lp_build_tgsi_context *bld_base,
3694                 struct lp_build_emit_data *emit_data)
3695 {
3696         struct si_shader_context *ctx = si_shader_context(bld_base);
3697         LLVMValueRef src0 = lp_build_emit_fetch(bld_base, emit_data->inst, 0, 0);
3698         unsigned flags = LLVMConstIntGetZExtValue(src0);
3699         unsigned waitcnt = NOOP_WAITCNT;
3700
3701         if (flags & TGSI_MEMBAR_THREAD_GROUP)
3702                 waitcnt &= VM_CNT & LGKM_CNT;
3703
3704         if (flags & (TGSI_MEMBAR_ATOMIC_BUFFER |
3705                      TGSI_MEMBAR_SHADER_BUFFER |
3706                      TGSI_MEMBAR_SHADER_IMAGE))
3707                 waitcnt &= VM_CNT;
3708
3709         if (flags & TGSI_MEMBAR_SHARED)
3710                 waitcnt &= LGKM_CNT;
3711
3712         if (waitcnt != NOOP_WAITCNT)
3713                 si_emit_waitcnt(ctx, waitcnt);
3714 }
3715
3716 static void clock_emit(
3717                 const struct lp_build_tgsi_action *action,
3718                 struct lp_build_tgsi_context *bld_base,
3719                 struct lp_build_emit_data *emit_data)
3720 {
3721         struct si_shader_context *ctx = si_shader_context(bld_base);
3722         LLVMValueRef tmp;
3723
3724         tmp = lp_build_intrinsic(ctx->ac.builder, "llvm.readcyclecounter",
3725                                  ctx->i64, NULL, 0, 0);
3726         tmp = LLVMBuildBitCast(ctx->ac.builder, tmp, ctx->v2i32, "");
3727
3728         emit_data->output[0] =
3729                 LLVMBuildExtractElement(ctx->ac.builder, tmp, ctx->i32_0, "");
3730         emit_data->output[1] =
3731                 LLVMBuildExtractElement(ctx->ac.builder, tmp, ctx->i32_1, "");
3732 }
3733
3734 LLVMTypeRef si_const_array(LLVMTypeRef elem_type, int num_elements)
3735 {
3736         return LLVMPointerType(LLVMArrayType(elem_type, num_elements),
3737                                CONST_ADDR_SPACE);
3738 }
3739
3740 static void si_llvm_emit_ddxy(
3741         const struct lp_build_tgsi_action *action,
3742         struct lp_build_tgsi_context *bld_base,
3743         struct lp_build_emit_data *emit_data)
3744 {
3745         struct si_shader_context *ctx = si_shader_context(bld_base);
3746         unsigned opcode = emit_data->info->opcode;
3747         LLVMValueRef val;
3748         int idx;
3749         unsigned mask;
3750
3751         if (opcode == TGSI_OPCODE_DDX_FINE)
3752                 mask = AC_TID_MASK_LEFT;
3753         else if (opcode == TGSI_OPCODE_DDY_FINE)
3754                 mask = AC_TID_MASK_TOP;
3755         else
3756                 mask = AC_TID_MASK_TOP_LEFT;
3757
3758         /* for DDX we want to next X pixel, DDY next Y pixel. */
3759         idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;
3760
3761         val = ac_to_integer(&ctx->ac, emit_data->args[0]);
3762         val = ac_build_ddxy(&ctx->ac, mask, idx, val);
3763         emit_data->output[emit_data->chan] = val;
3764 }
3765
3766 /*
3767  * this takes an I,J coordinate pair,
3768  * and works out the X and Y derivatives.
3769  * it returns DDX(I), DDX(J), DDY(I), DDY(J).
3770  */
3771 static LLVMValueRef si_llvm_emit_ddxy_interp(
3772         struct lp_build_tgsi_context *bld_base,
3773         LLVMValueRef interp_ij)
3774 {
3775         struct si_shader_context *ctx = si_shader_context(bld_base);
3776         LLVMValueRef result[4], a;
3777         unsigned i;
3778
3779         for (i = 0; i < 2; i++) {
3780                 a = LLVMBuildExtractElement(ctx->ac.builder, interp_ij,
3781                                             LLVMConstInt(ctx->i32, i, 0), "");
3782                 result[i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDX, a);
3783                 result[2+i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDY, a);
3784         }
3785
3786         return lp_build_gather_values(&ctx->gallivm, result, 4);
3787 }
3788
3789 static void interp_fetch_args(
3790         struct lp_build_tgsi_context *bld_base,
3791         struct lp_build_emit_data *emit_data)
3792 {
3793         struct si_shader_context *ctx = si_shader_context(bld_base);
3794         const struct tgsi_full_instruction *inst = emit_data->inst;
3795
3796         if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
3797                 /* offset is in second src, first two channels */
3798                 emit_data->args[0] = lp_build_emit_fetch(bld_base,
3799                                                          emit_data->inst, 1,
3800                                                          TGSI_CHAN_X);
3801                 emit_data->args[1] = lp_build_emit_fetch(bld_base,
3802                                                          emit_data->inst, 1,
3803                                                          TGSI_CHAN_Y);
3804                 emit_data->arg_count = 2;
3805         } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
3806                 LLVMValueRef sample_position;
3807                 LLVMValueRef sample_id;
3808                 LLVMValueRef halfval = LLVMConstReal(ctx->f32, 0.5f);
3809
3810                 /* fetch sample ID, then fetch its sample position,
3811                  * and place into first two channels.
3812                  */
3813                 sample_id = lp_build_emit_fetch(bld_base,
3814                                                 emit_data->inst, 1, TGSI_CHAN_X);
3815                 sample_id = ac_to_integer(&ctx->ac, sample_id);
3816
3817                 /* Section 8.13.2 (Interpolation Functions) of the OpenGL Shading
3818                  * Language 4.50 spec says about interpolateAtSample:
3819                  *
3820                  *    "Returns the value of the input interpolant variable at
3821                  *     the location of sample number sample. If multisample
3822                  *     buffers are not available, the input variable will be
3823                  *     evaluated at the center of the pixel. If sample sample
3824                  *     does not exist, the position used to interpolate the
3825                  *     input variable is undefined."
3826                  *
3827                  * This means that sample_id values outside of the valid are
3828                  * in fact valid input, and the usual mechanism for loading the
3829                  * sample position doesn't work.
3830                  */
3831                 if (ctx->shader->key.mono.u.ps.interpolate_at_sample_force_center) {
3832                         LLVMValueRef center[4] = {
3833                                 LLVMConstReal(ctx->f32, 0.5),
3834                                 LLVMConstReal(ctx->f32, 0.5),
3835                                 ctx->ac.f32_0,
3836                                 ctx->ac.f32_0,
3837                         };
3838
3839                         sample_position = lp_build_gather_values(&ctx->gallivm, center, 4);
3840                 } else {
3841                         sample_position = load_sample_position(ctx, sample_id);
3842                 }
3843
3844                 emit_data->args[0] = LLVMBuildExtractElement(ctx->ac.builder,
3845                                                              sample_position,
3846                                                              ctx->i32_0, "");
3847
3848                 emit_data->args[0] = LLVMBuildFSub(ctx->ac.builder, emit_data->args[0], halfval, "");
3849                 emit_data->args[1] = LLVMBuildExtractElement(ctx->ac.builder,
3850                                                              sample_position,
3851                                                              ctx->i32_1, "");
3852                 emit_data->args[1] = LLVMBuildFSub(ctx->ac.builder, emit_data->args[1], halfval, "");
3853                 emit_data->arg_count = 2;
3854         }
3855 }
3856
3857 static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
3858                                 struct lp_build_tgsi_context *bld_base,
3859                                 struct lp_build_emit_data *emit_data)
3860 {
3861         struct si_shader_context *ctx = si_shader_context(bld_base);
3862         struct si_shader *shader = ctx->shader;
3863         const struct tgsi_shader_info *info = &shader->selector->info;
3864         LLVMValueRef interp_param;
3865         const struct tgsi_full_instruction *inst = emit_data->inst;
3866         const struct tgsi_full_src_register *input = &inst->Src[0];
3867         int input_base, input_array_size;
3868         int chan;
3869         int i;
3870         LLVMValueRef prim_mask = LLVMGetParam(ctx->main_fn, SI_PARAM_PRIM_MASK);
3871         LLVMValueRef array_idx;
3872         int interp_param_idx;
3873         unsigned interp;
3874         unsigned location;
3875
3876         assert(input->Register.File == TGSI_FILE_INPUT);
3877
3878         if (input->Register.Indirect) {
3879                 unsigned array_id = input->Indirect.ArrayID;
3880
3881                 if (array_id) {
3882                         input_base = info->input_array_first[array_id];
3883                         input_array_size = info->input_array_last[array_id] - input_base + 1;
3884                 } else {
3885                         input_base = inst->Src[0].Register.Index;
3886                         input_array_size = info->num_inputs - input_base;
3887                 }
3888
3889                 array_idx = si_get_indirect_index(ctx, &input->Indirect,
3890                                                   1, input->Register.Index - input_base);
3891         } else {
3892                 input_base = inst->Src[0].Register.Index;
3893                 input_array_size = 1;
3894                 array_idx = ctx->i32_0;
3895         }
3896
3897         interp = shader->selector->info.input_interpolate[input_base];
3898
3899         if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
3900             inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE)
3901                 location = TGSI_INTERPOLATE_LOC_CENTER;
3902         else
3903                 location = TGSI_INTERPOLATE_LOC_CENTROID;
3904
3905         interp_param_idx = lookup_interp_param_index(interp, location);
3906         if (interp_param_idx == -1)
3907                 return;
3908         else if (interp_param_idx)
3909                 interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
3910         else
3911                 interp_param = NULL;
3912
3913         if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
3914             inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
3915                 LLVMValueRef ij_out[2];
3916                 LLVMValueRef ddxy_out = si_llvm_emit_ddxy_interp(bld_base, interp_param);
3917
3918                 /*
3919                  * take the I then J parameters, and the DDX/Y for it, and
3920                  * calculate the IJ inputs for the interpolator.
3921                  * temp1 = ddx * offset/sample.x + I;
3922                  * interp_param.I = ddy * offset/sample.y + temp1;
3923                  * temp1 = ddx * offset/sample.x + J;
3924                  * interp_param.J = ddy * offset/sample.y + temp1;
3925                  */
3926                 for (i = 0; i < 2; i++) {
3927                         LLVMValueRef ix_ll = LLVMConstInt(ctx->i32, i, 0);
3928                         LLVMValueRef iy_ll = LLVMConstInt(ctx->i32, i + 2, 0);
3929                         LLVMValueRef ddx_el = LLVMBuildExtractElement(ctx->ac.builder,
3930                                                                       ddxy_out, ix_ll, "");
3931                         LLVMValueRef ddy_el = LLVMBuildExtractElement(ctx->ac.builder,
3932                                                                       ddxy_out, iy_ll, "");
3933                         LLVMValueRef interp_el = LLVMBuildExtractElement(ctx->ac.builder,
3934                                                                          interp_param, ix_ll, "");
3935                         LLVMValueRef temp1, temp2;
3936
3937                         interp_el = ac_to_float(&ctx->ac, interp_el);
3938
3939                         temp1 = LLVMBuildFMul(ctx->ac.builder, ddx_el, emit_data->args[0], "");
3940
3941                         temp1 = LLVMBuildFAdd(ctx->ac.builder, temp1, interp_el, "");
3942
3943                         temp2 = LLVMBuildFMul(ctx->ac.builder, ddy_el, emit_data->args[1], "");
3944
3945                         ij_out[i] = LLVMBuildFAdd(ctx->ac.builder, temp2, temp1, "");
3946                 }
3947                 interp_param = lp_build_gather_values(&ctx->gallivm, ij_out, 2);
3948         }
3949
3950         if (interp_param)
3951                 interp_param = ac_to_float(&ctx->ac, interp_param);
3952
3953         for (chan = 0; chan < 4; chan++) {
3954                 LLVMValueRef gather = LLVMGetUndef(LLVMVectorType(ctx->f32, input_array_size));
3955                 unsigned schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan);
3956
3957                 for (unsigned idx = 0; idx < input_array_size; ++idx) {
3958                         LLVMValueRef v, i = NULL, j = NULL;
3959
3960                         if (interp_param) {
3961                                 i = LLVMBuildExtractElement(
3962                                         ctx->ac.builder, interp_param, ctx->i32_0, "");
3963                                 j = LLVMBuildExtractElement(
3964                                         ctx->ac.builder, interp_param, ctx->i32_1, "");
3965                         }
3966                         v = si_build_fs_interp(ctx, input_base + idx, schan,
3967                                                prim_mask, i, j);
3968
3969                         gather = LLVMBuildInsertElement(ctx->ac.builder,
3970                                 gather, v, LLVMConstInt(ctx->i32, idx, false), "");
3971                 }
3972
3973                 emit_data->output[chan] = LLVMBuildExtractElement(
3974                         ctx->ac.builder, gather, array_idx, "");
3975         }
3976 }
3977
3978 static void vote_all_emit(
3979         const struct lp_build_tgsi_action *action,
3980         struct lp_build_tgsi_context *bld_base,
3981         struct lp_build_emit_data *emit_data)
3982 {
3983         struct si_shader_context *ctx = si_shader_context(bld_base);
3984
3985         LLVMValueRef tmp = ac_build_vote_all(&ctx->ac, emit_data->args[0]);
3986         emit_data->output[emit_data->chan] =
3987                 LLVMBuildSExt(ctx->ac.builder, tmp, ctx->i32, "");
3988 }
3989
3990 static void vote_any_emit(
3991         const struct lp_build_tgsi_action *action,
3992         struct lp_build_tgsi_context *bld_base,
3993         struct lp_build_emit_data *emit_data)
3994 {
3995         struct si_shader_context *ctx = si_shader_context(bld_base);
3996
3997         LLVMValueRef tmp = ac_build_vote_any(&ctx->ac, emit_data->args[0]);
3998         emit_data->output[emit_data->chan] =
3999                 LLVMBuildSExt(ctx->ac.builder, tmp, ctx->i32, "");
4000 }
4001
4002 static void vote_eq_emit(
4003         const struct lp_build_tgsi_action *action,
4004         struct lp_build_tgsi_context *bld_base,
4005         struct lp_build_emit_data *emit_data)
4006 {
4007         struct si_shader_context *ctx = si_shader_context(bld_base);
4008
4009         LLVMValueRef tmp = ac_build_vote_eq(&ctx->ac, emit_data->args[0]);
4010         emit_data->output[emit_data->chan] =
4011                 LLVMBuildSExt(ctx->ac.builder, tmp, ctx->i32, "");
4012 }
4013
4014 static void ballot_emit(
4015         const struct lp_build_tgsi_action *action,
4016         struct lp_build_tgsi_context *bld_base,
4017         struct lp_build_emit_data *emit_data)
4018 {
4019         struct si_shader_context *ctx = si_shader_context(bld_base);
4020         LLVMBuilderRef builder = ctx->ac.builder;
4021         LLVMValueRef tmp;
4022
4023         tmp = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
4024         tmp = ac_build_ballot(&ctx->ac, tmp);
4025         tmp = LLVMBuildBitCast(builder, tmp, ctx->v2i32, "");
4026
4027         emit_data->output[0] = LLVMBuildExtractElement(builder, tmp, ctx->i32_0, "");
4028         emit_data->output[1] = LLVMBuildExtractElement(builder, tmp, ctx->i32_1, "");
4029 }
4030
4031 static void read_invoc_fetch_args(
4032         struct lp_build_tgsi_context *bld_base,
4033         struct lp_build_emit_data *emit_data)
4034 {
4035         emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst,
4036                                                  0, emit_data->src_chan);
4037
4038         /* Always read the source invocation (= lane) from the X channel. */
4039         emit_data->args[1] = lp_build_emit_fetch(bld_base, emit_data->inst,
4040                                                  1, TGSI_CHAN_X);
4041         emit_data->arg_count = 2;
4042 }
4043
4044 static void read_lane_emit(
4045         const struct lp_build_tgsi_action *action,
4046         struct lp_build_tgsi_context *bld_base,
4047         struct lp_build_emit_data *emit_data)
4048 {
4049         struct si_shader_context *ctx = si_shader_context(bld_base);
4050
4051         /* We currently have no other way to prevent LLVM from lifting the icmp
4052          * calls to a dominating basic block.
4053          */
4054         ac_build_optimization_barrier(&ctx->ac, &emit_data->args[0]);
4055
4056         for (unsigned i = 0; i < emit_data->arg_count; ++i)
4057                 emit_data->args[i] = ac_to_integer(&ctx->ac, emit_data->args[i]);
4058
4059         emit_data->output[emit_data->chan] =
4060                 ac_build_intrinsic(&ctx->ac, action->intr_name,
4061                                    ctx->i32, emit_data->args, emit_data->arg_count,
4062                                    AC_FUNC_ATTR_READNONE |
4063                                    AC_FUNC_ATTR_CONVERGENT);
4064 }
4065
4066 static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base,
4067                                        struct lp_build_emit_data *emit_data)
4068 {
4069         struct si_shader_context *ctx = si_shader_context(bld_base);
4070         struct tgsi_src_register src0 = emit_data->inst->Src[0].Register;
4071         LLVMValueRef imm;
4072         unsigned stream;
4073
4074         assert(src0.File == TGSI_FILE_IMMEDIATE);
4075
4076         imm = ctx->imms[src0.Index * TGSI_NUM_CHANNELS + src0.SwizzleX];
4077         stream = LLVMConstIntGetZExtValue(imm) & 0x3;
4078         return stream;
4079 }
4080
4081 /* Emit one vertex from the geometry shader */
4082 static void si_llvm_emit_vertex(struct ac_shader_abi *abi,
4083                                 unsigned stream,
4084                                 LLVMValueRef *addrs)
4085 {
4086         struct si_shader_context *ctx = si_shader_context_from_abi(abi);
4087         struct tgsi_shader_info *info = &ctx->shader->selector->info;
4088         struct lp_build_context *uint = &ctx->bld_base.uint_bld;
4089         struct si_shader *shader = ctx->shader;
4090         struct lp_build_if_state if_state;
4091         LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
4092                                             ctx->param_gs2vs_offset);
4093         LLVMValueRef gs_next_vertex;
4094         LLVMValueRef can_emit;
4095         unsigned chan, offset;
4096         int i;
4097
4098         /* Write vertex attribute values to GSVS ring */
4099         gs_next_vertex = LLVMBuildLoad(ctx->ac.builder,
4100                                        ctx->gs_next_vertex[stream],
4101                                        "");
4102
4103         /* If this thread has already emitted the declared maximum number of
4104          * vertices, skip the write: excessive vertex emissions are not
4105          * supposed to have any effect.
4106          *
4107          * If the shader has no writes to memory, kill it instead. This skips
4108          * further memory loads and may allow LLVM to skip to the end
4109          * altogether.
4110          */
4111         can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex,
4112                                  LLVMConstInt(ctx->i32,
4113                                               shader->selector->gs_max_out_vertices, 0), "");
4114
4115         bool use_kill = !info->writes_memory;
4116         if (use_kill) {
4117                 ac_build_kill_if_false(&ctx->ac, can_emit);
4118         } else {
4119                 lp_build_if(&if_state, &ctx->gallivm, can_emit);
4120         }
4121
4122         offset = 0;
4123         for (i = 0; i < info->num_outputs; i++) {
4124                 for (chan = 0; chan < 4; chan++) {
4125                         if (!(info->output_usagemask[i] & (1 << chan)) ||
4126                             ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
4127                                 continue;
4128
4129                         LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
4130                         LLVMValueRef voffset =
4131                                 LLVMConstInt(ctx->i32, offset *
4132                                              shader->selector->gs_max_out_vertices, 0);
4133                         offset++;
4134
4135                         voffset = lp_build_add(uint, voffset, gs_next_vertex);
4136                         voffset = lp_build_mul_imm(uint, voffset, 4);
4137
4138                         out_val = ac_to_integer(&ctx->ac, out_val);
4139
4140                         ac_build_buffer_store_dword(&ctx->ac,
4141                                                     ctx->gsvs_ring[stream],
4142                                                     out_val, 1,
4143                                                     voffset, soffset, 0,
4144                                                     1, 1, true, true);
4145                 }
4146         }
4147
4148         gs_next_vertex = lp_build_add(uint, gs_next_vertex,
4149                                       ctx->i32_1);
4150
4151         LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex[stream]);
4152
4153         /* Signal vertex emission */
4154         ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
4155                          si_get_gs_wave_id(ctx));
4156         if (!use_kill)
4157                 lp_build_endif(&if_state);
4158 }
4159
4160 /* Emit one vertex from the geometry shader */
4161 static void si_tgsi_emit_vertex(
4162         const struct lp_build_tgsi_action *action,
4163         struct lp_build_tgsi_context *bld_base,
4164         struct lp_build_emit_data *emit_data)
4165 {
4166         struct si_shader_context *ctx = si_shader_context(bld_base);
4167         unsigned stream = si_llvm_get_stream(bld_base, emit_data);
4168
4169         si_llvm_emit_vertex(&ctx->abi, stream, ctx->outputs[0]);
4170 }
4171
4172 /* Cut one primitive from the geometry shader */
4173 static void si_llvm_emit_primitive(
4174         const struct lp_build_tgsi_action *action,
4175         struct lp_build_tgsi_context *bld_base,
4176         struct lp_build_emit_data *emit_data)
4177 {
4178         struct si_shader_context *ctx = si_shader_context(bld_base);
4179         unsigned stream;
4180
4181         /* Signal primitive cut */
4182         stream = si_llvm_get_stream(bld_base, emit_data);
4183         ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
4184                          si_get_gs_wave_id(ctx));
4185 }
4186
4187 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
4188                                  struct lp_build_tgsi_context *bld_base,
4189                                  struct lp_build_emit_data *emit_data)
4190 {
4191         struct si_shader_context *ctx = si_shader_context(bld_base);
4192
4193         /* SI only (thanks to a hw bug workaround):
4194          * The real barrier instruction isn’t needed, because an entire patch
4195          * always fits into a single wave.
4196          */
4197         if (ctx->screen->info.chip_class == SI &&
4198             ctx->type == PIPE_SHADER_TESS_CTRL) {
4199                 si_emit_waitcnt(ctx, LGKM_CNT & VM_CNT);
4200                 return;
4201         }
4202
4203         lp_build_intrinsic(ctx->ac.builder,
4204                            "llvm.amdgcn.s.barrier",
4205                            ctx->voidt, NULL, 0, LP_FUNC_ATTR_CONVERGENT);
4206 }
4207
4208 static const struct lp_build_tgsi_action interp_action = {
4209         .fetch_args = interp_fetch_args,
4210         .emit = build_interp_intrinsic,
4211 };
4212
4213 static void si_create_function(struct si_shader_context *ctx,
4214                                const char *name,
4215                                LLVMTypeRef *returns, unsigned num_returns,
4216                                struct si_function_info *fninfo,
4217                                unsigned max_workgroup_size)
4218 {
4219         int i;
4220
4221         si_llvm_create_func(ctx, name, returns, num_returns,
4222                             fninfo->types, fninfo->num_params);
4223         ctx->return_value = LLVMGetUndef(ctx->return_type);
4224
4225         for (i = 0; i < fninfo->num_sgpr_params; ++i) {
4226                 LLVMValueRef P = LLVMGetParam(ctx->main_fn, i);
4227
4228                 /* The combination of:
4229                  * - ByVal
4230                  * - dereferenceable
4231                  * - invariant.load
4232                  * allows the optimization passes to move loads and reduces
4233                  * SGPR spilling significantly.
4234                  */
4235                 if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
4236                         lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_BYVAL);
4237                         lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_NOALIAS);
4238                         ac_add_attr_dereferenceable(P, UINT64_MAX);
4239                 } else
4240                         lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_INREG);
4241         }
4242
4243         for (i = 0; i < fninfo->num_params; ++i) {
4244                 if (fninfo->assign[i])
4245                         *fninfo->assign[i] = LLVMGetParam(ctx->main_fn, i);
4246         }
4247
4248         if (max_workgroup_size) {
4249                 si_llvm_add_attribute(ctx->main_fn, "amdgpu-max-work-group-size",
4250                                       max_workgroup_size);
4251         }
4252         LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
4253                                            "no-signed-zeros-fp-math",
4254                                            "true");
4255
4256         if (ctx->screen->debug_flags & DBG(UNSAFE_MATH)) {
4257                 /* These were copied from some LLVM test. */
4258                 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
4259                                                    "less-precise-fpmad",
4260                                                    "true");
4261                 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
4262                                                    "no-infs-fp-math",
4263                                                    "true");
4264                 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
4265                                                    "no-nans-fp-math",
4266                                                    "true");
4267                 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
4268                                                    "unsafe-fp-math",
4269                                                    "true");
4270         }
4271 }
4272
4273 static void declare_streamout_params(struct si_shader_context *ctx,
4274                                      struct pipe_stream_output_info *so,
4275                                      struct si_function_info *fninfo)
4276 {
4277         int i;
4278
4279         /* Streamout SGPRs. */
4280         if (so->num_outputs) {
4281                 if (ctx->type != PIPE_SHADER_TESS_EVAL)
4282                         ctx->param_streamout_config = add_arg(fninfo, ARG_SGPR, ctx->ac.i32);
4283                 else
4284                         ctx->param_streamout_config = fninfo->num_params - 1;
4285
4286                 ctx->param_streamout_write_index = add_arg(fninfo, ARG_SGPR, ctx->ac.i32);
4287         }
4288         /* A streamout buffer offset is loaded if the stride is non-zero. */
4289         for (i = 0; i < 4; i++) {
4290                 if (!so->stride[i])
4291                         continue;
4292
4293                 ctx->param_streamout_offset[i] = add_arg(fninfo, ARG_SGPR, ctx->ac.i32);
4294         }
4295 }
4296
4297 static unsigned si_get_max_workgroup_size(const struct si_shader *shader)
4298 {
4299         switch (shader->selector->type) {
4300         case PIPE_SHADER_TESS_CTRL:
4301                 /* Return this so that LLVM doesn't remove s_barrier
4302                  * instructions on chips where we use s_barrier. */
4303                 return shader->selector->screen->info.chip_class >= CIK ? 128 : 64;
4304
4305         case PIPE_SHADER_GEOMETRY:
4306                 return shader->selector->screen->info.chip_class >= GFX9 ? 128 : 64;
4307
4308         case PIPE_SHADER_COMPUTE:
4309                 break; /* see below */
4310
4311         default:
4312                 return 0;
4313         }
4314
4315         const unsigned *properties = shader->selector->info.properties;
4316         unsigned max_work_group_size =
4317                        properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
4318                        properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
4319                        properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
4320
4321         if (!max_work_group_size) {
4322                 /* This is a variable group size compute shader,
4323                  * compile it for the maximum possible group size.
4324                  */
4325                 max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
4326         }
4327         return max_work_group_size;
4328 }
4329
4330 static void declare_per_stage_desc_pointers(struct si_shader_context *ctx,
4331                                             struct si_function_info *fninfo,
4332                                             bool assign_params)
4333 {
4334         LLVMTypeRef const_shader_buf_type;
4335
4336         if (ctx->shader->selector->info.const_buffers_declared == 1 &&
4337             ctx->shader->selector->info.shader_buffers_declared == 0)
4338                 const_shader_buf_type = ctx->f32;
4339         else
4340                 const_shader_buf_type = ctx->v4i32;
4341
4342         unsigned const_and_shader_buffers =
4343                 add_arg(fninfo, ARG_SGPR,
4344                         si_const_array(const_shader_buf_type, 0));
4345
4346         unsigned samplers_and_images =
4347                 add_arg(fninfo, ARG_SGPR,
4348                         si_const_array(ctx->v8i32,
4349                                        SI_NUM_IMAGES + SI_NUM_SAMPLERS * 2));
4350
4351         if (assign_params) {
4352                 ctx->param_const_and_shader_buffers = const_and_shader_buffers;
4353                 ctx->param_samplers_and_images = samplers_and_images;
4354         }
4355 }
4356
4357 static void declare_global_desc_pointers(struct si_shader_context *ctx,
4358                                          struct si_function_info *fninfo)
4359 {
4360         ctx->param_rw_buffers = add_arg(fninfo, ARG_SGPR,
4361                 si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS));
4362         ctx->param_bindless_samplers_and_images = add_arg(fninfo, ARG_SGPR,
4363                 si_const_array(ctx->v8i32, 0));
4364 }
4365
4366 static void declare_vs_specific_input_sgprs(struct si_shader_context *ctx,
4367                                             struct si_function_info *fninfo)
4368 {
4369         ctx->param_vertex_buffers = add_arg(fninfo, ARG_SGPR,
4370                 si_const_array(ctx->v4i32, SI_NUM_VERTEX_BUFFERS));
4371         add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.base_vertex);
4372         add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.start_instance);
4373         add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.draw_id);
4374         ctx->param_vs_state_bits = add_arg(fninfo, ARG_SGPR, ctx->i32);
4375 }
4376
4377 static void declare_vs_input_vgprs(struct si_shader_context *ctx,
4378                                    struct si_function_info *fninfo,
4379                                    unsigned *num_prolog_vgprs)
4380 {
4381         struct si_shader *shader = ctx->shader;
4382
4383         add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.vertex_id);
4384         if (shader->key.as_ls) {
4385                 ctx->param_rel_auto_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
4386                 add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.instance_id);
4387         } else {
4388                 add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.instance_id);
4389                 ctx->param_vs_prim_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
4390         }
4391         add_arg(fninfo, ARG_VGPR, ctx->i32); /* unused */
4392
4393         if (!shader->is_gs_copy_shader) {
4394                 /* Vertex load indices. */
4395                 ctx->param_vertex_index0 = fninfo->num_params;
4396                 for (unsigned i = 0; i < shader->selector->info.num_inputs; i++)
4397                         add_arg(fninfo, ARG_VGPR, ctx->i32);
4398                 *num_prolog_vgprs += shader->selector->info.num_inputs;
4399         }
4400 }
4401
4402 static void declare_tes_input_vgprs(struct si_shader_context *ctx,
4403                                     struct si_function_info *fninfo)
4404 {
4405         ctx->param_tes_u = add_arg(fninfo, ARG_VGPR, ctx->f32);
4406         ctx->param_tes_v = add_arg(fninfo, ARG_VGPR, ctx->f32);
4407         ctx->param_tes_rel_patch_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
4408         ctx->param_tes_patch_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
4409 }
4410
4411 enum {
4412         /* Convenient merged shader definitions. */
4413         SI_SHADER_MERGED_VERTEX_TESSCTRL = PIPE_SHADER_TYPES,
4414         SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY,
4415 };
4416
4417 static void create_function(struct si_shader_context *ctx)
4418 {
4419         struct si_shader *shader = ctx->shader;
4420         struct si_function_info fninfo;
4421         LLVMTypeRef returns[16+32*4];
4422         unsigned i, num_return_sgprs;
4423         unsigned num_returns = 0;
4424         unsigned num_prolog_vgprs = 0;
4425         unsigned type = ctx->type;
4426         unsigned vs_blit_property =
4427                 shader->selector->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS];
4428
4429         si_init_function_info(&fninfo);
4430
4431         /* Set MERGED shaders. */
4432         if (ctx->screen->info.chip_class >= GFX9) {
4433                 if (shader->key.as_ls || type == PIPE_SHADER_TESS_CTRL)
4434                         type = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
4435                 else if (shader->key.as_es || type == PIPE_SHADER_GEOMETRY)
4436                         type = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
4437         }
4438
4439         LLVMTypeRef v3i32 = LLVMVectorType(ctx->i32, 3);
4440
4441         switch (type) {
4442         case PIPE_SHADER_VERTEX:
4443                 declare_global_desc_pointers(ctx, &fninfo);
4444
4445                 if (vs_blit_property) {
4446                         ctx->param_vs_blit_inputs = fninfo.num_params;
4447                         add_arg(&fninfo, ARG_SGPR, ctx->i32); /* i16 x1, y1 */
4448                         add_arg(&fninfo, ARG_SGPR, ctx->i32); /* i16 x2, y2 */
4449                         add_arg(&fninfo, ARG_SGPR, ctx->f32); /* depth */
4450
4451                         if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
4452                                 add_arg(&fninfo, ARG_SGPR, ctx->f32); /* color0 */
4453                                 add_arg(&fninfo, ARG_SGPR, ctx->f32); /* color1 */
4454                                 add_arg(&fninfo, ARG_SGPR, ctx->f32); /* color2 */
4455                                 add_arg(&fninfo, ARG_SGPR, ctx->f32); /* color3 */
4456                         } else if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD) {
4457                                 add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.x1 */
4458                                 add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.y1 */
4459                                 add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.x2 */
4460                                 add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.y2 */
4461                                 add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.z */
4462                                 add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.w */
4463                         }
4464
4465                         /* VGPRs */
4466                         declare_vs_input_vgprs(ctx, &fninfo, &num_prolog_vgprs);
4467                         break;
4468                 }
4469
4470                 declare_per_stage_desc_pointers(ctx, &fninfo, true);
4471                 declare_vs_specific_input_sgprs(ctx, &fninfo);
4472
4473                 if (shader->key.as_es) {
4474                         ctx->param_es2gs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4475                 } else if (shader->key.as_ls) {
4476                         /* no extra parameters */
4477                 } else {
4478                         if (shader->is_gs_copy_shader) {
4479                                 fninfo.num_params = ctx->param_rw_buffers + 1;
4480                                 fninfo.num_sgpr_params = fninfo.num_params;
4481                         }
4482
4483                         /* The locations of the other parameters are assigned dynamically. */
4484                         declare_streamout_params(ctx, &shader->selector->so,
4485                                                  &fninfo);
4486                 }
4487
4488                 /* VGPRs */
4489                 declare_vs_input_vgprs(ctx, &fninfo, &num_prolog_vgprs);
4490                 break;
4491
4492         case PIPE_SHADER_TESS_CTRL: /* SI-CI-VI */
4493                 declare_global_desc_pointers(ctx, &fninfo);
4494                 declare_per_stage_desc_pointers(ctx, &fninfo, true);
4495                 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4496                 ctx->param_tcs_out_lds_offsets = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4497                 ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4498                 ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4499                 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4500                 ctx->param_tcs_factor_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4501                 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4502                 ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4503
4504                 /* VGPRs */
4505                 ctx->param_tcs_patch_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4506                 ctx->param_tcs_rel_ids = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4507
4508                 /* param_tcs_offchip_offset and param_tcs_factor_offset are
4509                  * placed after the user SGPRs.
4510                  */
4511                 for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
4512                         returns[num_returns++] = ctx->i32; /* SGPRs */
4513                 for (i = 0; i < 11; i++)
4514                         returns[num_returns++] = ctx->f32; /* VGPRs */
4515                 break;
4516
4517         case SI_SHADER_MERGED_VERTEX_TESSCTRL:
4518                 /* Merged stages have 8 system SGPRs at the beginning. */
4519                 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* SPI_SHADER_USER_DATA_ADDR_LO_HS */
4520                 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* SPI_SHADER_USER_DATA_ADDR_HI_HS */
4521                 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4522                 ctx->param_merged_wave_info = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4523                 ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4524                 ctx->param_merged_scratch_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4525                 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4526                 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4527
4528                 declare_global_desc_pointers(ctx, &fninfo);
4529                 declare_per_stage_desc_pointers(ctx, &fninfo,
4530                                                 ctx->type == PIPE_SHADER_VERTEX);
4531                 declare_vs_specific_input_sgprs(ctx, &fninfo);
4532
4533                 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4534                 ctx->param_tcs_out_lds_offsets = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4535                 ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4536                 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4537                 ctx->param_tcs_factor_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4538                 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4539
4540                 declare_per_stage_desc_pointers(ctx, &fninfo,
4541                                                 ctx->type == PIPE_SHADER_TESS_CTRL);
4542
4543                 /* VGPRs (first TCS, then VS) */
4544                 ctx->param_tcs_patch_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4545                 ctx->param_tcs_rel_ids = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4546
4547                 if (ctx->type == PIPE_SHADER_VERTEX) {
4548                         declare_vs_input_vgprs(ctx, &fninfo,
4549                                                &num_prolog_vgprs);
4550
4551                         /* LS return values are inputs to the TCS main shader part. */
4552                         for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
4553                                 returns[num_returns++] = ctx->i32; /* SGPRs */
4554                         for (i = 0; i < 2; i++)
4555                                 returns[num_returns++] = ctx->f32; /* VGPRs */
4556                 } else {
4557                         /* TCS return values are inputs to the TCS epilog.
4558                          *
4559                          * param_tcs_offchip_offset, param_tcs_factor_offset,
4560                          * param_tcs_offchip_layout, and param_rw_buffers
4561                          * should be passed to the epilog.
4562                          */
4563                         for (i = 0; i <= 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K; i++)
4564                                 returns[num_returns++] = ctx->i32; /* SGPRs */
4565                         for (i = 0; i < 11; i++)
4566                                 returns[num_returns++] = ctx->f32; /* VGPRs */
4567                 }
4568                 break;
4569
4570         case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
4571                 /* Merged stages have 8 system SGPRs at the beginning. */
4572                 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_USER_DATA_ADDR_LO_GS) */
4573                 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_USER_DATA_ADDR_HI_GS) */
4574                 ctx->param_gs2vs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4575                 ctx->param_merged_wave_info = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4576                 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4577                 ctx->param_merged_scratch_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4578                 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_PGM_LO/HI_GS << 8) */
4579                 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */
4580
4581                 declare_global_desc_pointers(ctx, &fninfo);
4582                 declare_per_stage_desc_pointers(ctx, &fninfo,
4583                                                 (ctx->type == PIPE_SHADER_VERTEX ||
4584                                                  ctx->type == PIPE_SHADER_TESS_EVAL));
4585                 if (ctx->type == PIPE_SHADER_VERTEX) {
4586                         declare_vs_specific_input_sgprs(ctx, &fninfo);
4587                 } else {
4588                         /* TESS_EVAL (and also GEOMETRY):
4589                          * Declare as many input SGPRs as the VS has. */
4590                         ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4591                         ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4592                         add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4593                         add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4594                         add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4595                         ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4596                 }
4597
4598                 declare_per_stage_desc_pointers(ctx, &fninfo,
4599                                                 ctx->type == PIPE_SHADER_GEOMETRY);
4600
4601                 /* VGPRs (first GS, then VS/TES) */
4602                 ctx->param_gs_vtx01_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4603                 ctx->param_gs_vtx23_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4604                 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.gs_prim_id);
4605                 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.gs_invocation_id);
4606                 ctx->param_gs_vtx45_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4607
4608                 if (ctx->type == PIPE_SHADER_VERTEX) {
4609                         declare_vs_input_vgprs(ctx, &fninfo,
4610                                                &num_prolog_vgprs);
4611                 } else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
4612                         declare_tes_input_vgprs(ctx, &fninfo);
4613                 }
4614
4615                 if (ctx->type == PIPE_SHADER_VERTEX ||
4616                     ctx->type == PIPE_SHADER_TESS_EVAL) {
4617                         /* ES return values are inputs to GS. */
4618                         for (i = 0; i < 8 + GFX9_GS_NUM_USER_SGPR; i++)
4619                                 returns[num_returns++] = ctx->i32; /* SGPRs */
4620                         for (i = 0; i < 5; i++)
4621                                 returns[num_returns++] = ctx->f32; /* VGPRs */
4622                 }
4623                 break;
4624
4625         case PIPE_SHADER_TESS_EVAL:
4626                 declare_global_desc_pointers(ctx, &fninfo);
4627                 declare_per_stage_desc_pointers(ctx, &fninfo, true);
4628                 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4629                 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4630
4631                 if (shader->key.as_es) {
4632                         ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4633                         add_arg(&fninfo, ARG_SGPR, ctx->i32);
4634                         ctx->param_es2gs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4635                 } else {
4636                         add_arg(&fninfo, ARG_SGPR, ctx->i32);
4637                         declare_streamout_params(ctx, &shader->selector->so,
4638                                                  &fninfo);
4639                         ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4640                 }
4641
4642                 /* VGPRs */
4643                 declare_tes_input_vgprs(ctx, &fninfo);
4644                 break;
4645
4646         case PIPE_SHADER_GEOMETRY:
4647                 declare_global_desc_pointers(ctx, &fninfo);
4648                 declare_per_stage_desc_pointers(ctx, &fninfo, true);
4649                 ctx->param_gs2vs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4650                 ctx->param_gs_wave_id = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4651
4652                 /* VGPRs */
4653                 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[0]);
4654                 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[1]);
4655                 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.gs_prim_id);
4656                 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[2]);
4657                 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[3]);
4658                 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[4]);
4659                 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[5]);
4660                 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.gs_invocation_id);
4661                 break;
4662
4663         case PIPE_SHADER_FRAGMENT:
4664                 declare_global_desc_pointers(ctx, &fninfo);
4665                 declare_per_stage_desc_pointers(ctx, &fninfo, true);
4666                 add_arg_checked(&fninfo, ARG_SGPR, ctx->f32, SI_PARAM_ALPHA_REF);
4667                 add_arg_checked(&fninfo, ARG_SGPR, ctx->i32, SI_PARAM_PRIM_MASK);
4668
4669                 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_SAMPLE);
4670                 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_CENTER);
4671                 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_CENTROID);
4672                 add_arg_checked(&fninfo, ARG_VGPR, v3i32, SI_PARAM_PERSP_PULL_MODEL);
4673                 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_SAMPLE);
4674                 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_CENTER);
4675                 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_CENTROID);
4676                 add_arg_checked(&fninfo, ARG_VGPR, ctx->f32, SI_PARAM_LINE_STIPPLE_TEX);
4677                 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4678                                        &ctx->abi.frag_pos[0], SI_PARAM_POS_X_FLOAT);
4679                 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4680                                        &ctx->abi.frag_pos[1], SI_PARAM_POS_Y_FLOAT);
4681                 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4682                                        &ctx->abi.frag_pos[2], SI_PARAM_POS_Z_FLOAT);
4683                 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4684                                        &ctx->abi.frag_pos[3], SI_PARAM_POS_W_FLOAT);
4685                 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->i32,
4686                                        &ctx->abi.front_face, SI_PARAM_FRONT_FACE);
4687                 shader->info.face_vgpr_index = 20;
4688                 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->i32,
4689                                        &ctx->abi.ancillary, SI_PARAM_ANCILLARY);
4690                 shader->info.ancillary_vgpr_index = 21;
4691                 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4692                                        &ctx->abi.sample_coverage, SI_PARAM_SAMPLE_COVERAGE);
4693                 add_arg_checked(&fninfo, ARG_VGPR, ctx->i32, SI_PARAM_POS_FIXED_PT);
4694
4695                 /* Color inputs from the prolog. */
4696                 if (shader->selector->info.colors_read) {
4697                         unsigned num_color_elements =
4698                                 util_bitcount(shader->selector->info.colors_read);
4699
4700                         assert(fninfo.num_params + num_color_elements <= ARRAY_SIZE(fninfo.types));
4701                         for (i = 0; i < num_color_elements; i++)
4702                                 add_arg(&fninfo, ARG_VGPR, ctx->f32);
4703
4704                         num_prolog_vgprs += num_color_elements;
4705                 }
4706
4707                 /* Outputs for the epilog. */
4708                 num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
4709                 num_returns =
4710                         num_return_sgprs +
4711                         util_bitcount(shader->selector->info.colors_written) * 4 +
4712                         shader->selector->info.writes_z +
4713                         shader->selector->info.writes_stencil +
4714                         shader->selector->info.writes_samplemask +
4715                         1 /* SampleMaskIn */;
4716
4717                 num_returns = MAX2(num_returns,
4718                                    num_return_sgprs +
4719                                    PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
4720
4721                 for (i = 0; i < num_return_sgprs; i++)
4722                         returns[i] = ctx->i32;
4723                 for (; i < num_returns; i++)
4724                         returns[i] = ctx->f32;
4725                 break;
4726
4727         case PIPE_SHADER_COMPUTE:
4728                 declare_global_desc_pointers(ctx, &fninfo);
4729                 declare_per_stage_desc_pointers(ctx, &fninfo, true);
4730                 if (shader->selector->info.uses_grid_size)
4731                         ctx->param_grid_size = add_arg(&fninfo, ARG_SGPR, v3i32);
4732                 if (shader->selector->info.uses_block_size)
4733                         ctx->param_block_size = add_arg(&fninfo, ARG_SGPR, v3i32);
4734
4735                 for (i = 0; i < 3; i++) {
4736                         ctx->param_block_id[i] = -1;
4737                         if (shader->selector->info.uses_block_id[i])
4738                                 ctx->param_block_id[i] = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4739                 }
4740
4741                 ctx->param_thread_id = add_arg(&fninfo, ARG_VGPR, v3i32);
4742                 break;
4743         default:
4744                 assert(0 && "unimplemented shader");
4745                 return;
4746         }
4747
4748         si_create_function(ctx, "main", returns, num_returns, &fninfo,
4749                            si_get_max_workgroup_size(shader));
4750
4751         /* Reserve register locations for VGPR inputs the PS prolog may need. */
4752         if (ctx->type == PIPE_SHADER_FRAGMENT &&
4753             ctx->separate_prolog) {
4754                 si_llvm_add_attribute(ctx->main_fn,
4755                                       "InitialPSInputAddr",
4756                                       S_0286D0_PERSP_SAMPLE_ENA(1) |
4757                                       S_0286D0_PERSP_CENTER_ENA(1) |
4758                                       S_0286D0_PERSP_CENTROID_ENA(1) |
4759                                       S_0286D0_LINEAR_SAMPLE_ENA(1) |
4760                                       S_0286D0_LINEAR_CENTER_ENA(1) |
4761                                       S_0286D0_LINEAR_CENTROID_ENA(1) |
4762                                       S_0286D0_FRONT_FACE_ENA(1) |
4763                                       S_0286D0_ANCILLARY_ENA(1) |
4764                                       S_0286D0_POS_FIXED_PT_ENA(1));
4765         }
4766
4767         shader->info.num_input_sgprs = 0;
4768         shader->info.num_input_vgprs = 0;
4769
4770         for (i = 0; i < fninfo.num_sgpr_params; ++i)
4771                 shader->info.num_input_sgprs += ac_get_type_size(fninfo.types[i]) / 4;
4772
4773         for (; i < fninfo.num_params; ++i)
4774                 shader->info.num_input_vgprs += ac_get_type_size(fninfo.types[i]) / 4;
4775
4776         assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
4777         shader->info.num_input_vgprs -= num_prolog_vgprs;
4778
4779         if (shader->key.as_ls ||
4780             ctx->type == PIPE_SHADER_TESS_CTRL ||
4781             /* GFX9 has the ESGS ring buffer in LDS. */
4782             type == SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY)
4783                 ac_declare_lds_as_pointer(&ctx->ac);
4784 }
4785
4786 /**
4787  * Load ESGS and GSVS ring buffer resource descriptors and save the variables
4788  * for later use.
4789  */
4790 static void preload_ring_buffers(struct si_shader_context *ctx)
4791 {
4792         LLVMBuilderRef builder = ctx->ac.builder;
4793
4794         LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
4795                                             ctx->param_rw_buffers);
4796
4797         if (ctx->screen->info.chip_class <= VI &&
4798             (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY)) {
4799                 unsigned ring =
4800                         ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS
4801                                                              : SI_ES_RING_ESGS;
4802                 LLVMValueRef offset = LLVMConstInt(ctx->i32, ring, 0);
4803
4804                 ctx->esgs_ring =
4805                         ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
4806         }
4807
4808         if (ctx->shader->is_gs_copy_shader) {
4809                 LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);
4810
4811                 ctx->gsvs_ring[0] =
4812                         ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
4813         } else if (ctx->type == PIPE_SHADER_GEOMETRY) {
4814                 const struct si_shader_selector *sel = ctx->shader->selector;
4815                 LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);
4816                 LLVMValueRef base_ring;
4817
4818                 base_ring = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
4819
4820                 /* The conceptual layout of the GSVS ring is
4821                  *   v0c0 .. vLv0 v0c1 .. vLc1 ..
4822                  * but the real memory layout is swizzled across
4823                  * threads:
4824                  *   t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
4825                  *   t16v0c0 ..
4826                  * Override the buffer descriptor accordingly.
4827                  */
4828                 LLVMTypeRef v2i64 = LLVMVectorType(ctx->i64, 2);
4829                 uint64_t stream_offset = 0;
4830
4831                 for (unsigned stream = 0; stream < 4; ++stream) {
4832                         unsigned num_components;
4833                         unsigned stride;
4834                         unsigned num_records;
4835                         LLVMValueRef ring, tmp;
4836
4837                         num_components = sel->info.num_stream_output_components[stream];
4838                         if (!num_components)
4839                                 continue;
4840
4841                         stride = 4 * num_components * sel->gs_max_out_vertices;
4842
4843                         /* Limit on the stride field for <= CIK. */
4844                         assert(stride < (1 << 14));
4845
4846                         num_records = 64;
4847
4848                         ring = LLVMBuildBitCast(builder, base_ring, v2i64, "");
4849                         tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_0, "");
4850                         tmp = LLVMBuildAdd(builder, tmp,
4851                                            LLVMConstInt(ctx->i64,
4852                                                         stream_offset, 0), "");
4853                         stream_offset += stride * 64;
4854
4855                         ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_0, "");
4856                         ring = LLVMBuildBitCast(builder, ring, ctx->v4i32, "");
4857                         tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_1, "");
4858                         tmp = LLVMBuildOr(builder, tmp,
4859                                 LLVMConstInt(ctx->i32,
4860                                              S_008F04_STRIDE(stride) |
4861                                              S_008F04_SWIZZLE_ENABLE(1), 0), "");
4862                         ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_1, "");
4863                         ring = LLVMBuildInsertElement(builder, ring,
4864                                         LLVMConstInt(ctx->i32, num_records, 0),
4865                                         LLVMConstInt(ctx->i32, 2, 0), "");
4866                         ring = LLVMBuildInsertElement(builder, ring,
4867                                 LLVMConstInt(ctx->i32,
4868                                              S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
4869                                              S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
4870                                              S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
4871                                              S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
4872                                              S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
4873                                              S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
4874                                              S_008F0C_ELEMENT_SIZE(1) | /* element_size = 4 (bytes) */
4875                                              S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
4876                                              S_008F0C_ADD_TID_ENABLE(1),
4877                                              0),
4878                                 LLVMConstInt(ctx->i32, 3, 0), "");
4879
4880                         ctx->gsvs_ring[stream] = ring;
4881                 }
4882         }
4883 }
4884
4885 static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
4886                                          LLVMValueRef param_rw_buffers,
4887                                          unsigned param_pos_fixed_pt)
4888 {
4889         LLVMBuilderRef builder = ctx->ac.builder;
4890         LLVMValueRef slot, desc, offset, row, bit, address[2];
4891
4892         /* Use the fixed-point gl_FragCoord input.
4893          * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
4894          * per coordinate to get the repeating effect.
4895          */
4896         address[0] = unpack_param(ctx, param_pos_fixed_pt, 0, 5);
4897         address[1] = unpack_param(ctx, param_pos_fixed_pt, 16, 5);
4898
4899         /* Load the buffer descriptor. */
4900         slot = LLVMConstInt(ctx->i32, SI_PS_CONST_POLY_STIPPLE, 0);
4901         desc = ac_build_load_to_sgpr(&ctx->ac, param_rw_buffers, slot);
4902
4903         /* The stipple pattern is 32x32, each row has 32 bits. */
4904         offset = LLVMBuildMul(builder, address[1],
4905                               LLVMConstInt(ctx->i32, 4, 0), "");
4906         row = buffer_load_const(ctx, desc, offset);
4907         row = ac_to_integer(&ctx->ac, row);
4908         bit = LLVMBuildLShr(builder, row, address[0], "");
4909         bit = LLVMBuildTrunc(builder, bit, ctx->i1, "");
4910         ac_build_kill_if_false(&ctx->ac, bit);
4911 }
4912
4913 void si_shader_binary_read_config(struct ac_shader_binary *binary,
4914                                   struct si_shader_config *conf,
4915                                   unsigned symbol_offset)
4916 {
4917         unsigned i;
4918         const unsigned char *config =
4919                 ac_shader_binary_config_start(binary, symbol_offset);
4920         bool really_needs_scratch = false;
4921
4922         /* LLVM adds SGPR spills to the scratch size.
4923          * Find out if we really need the scratch buffer.
4924          */
4925         for (i = 0; i < binary->reloc_count; i++) {
4926                 const struct ac_shader_reloc *reloc = &binary->relocs[i];
4927
4928                 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
4929                     !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
4930                         really_needs_scratch = true;
4931                         break;
4932                 }
4933         }
4934
4935         /* XXX: We may be able to emit some of these values directly rather than
4936          * extracting fields to be emitted later.
4937          */
4938
4939         for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
4940                 unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
4941                 unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
4942                 switch (reg) {
4943                 case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
4944                 case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
4945                 case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
4946                 case R_00B428_SPI_SHADER_PGM_RSRC1_HS:
4947                 case R_00B848_COMPUTE_PGM_RSRC1:
4948                         conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
4949                         conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
4950                         conf->float_mode =  G_00B028_FLOAT_MODE(value);
4951                         conf->rsrc1 = value;
4952                         break;
4953                 case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
4954                         conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
4955                         break;
4956                 case R_00B84C_COMPUTE_PGM_RSRC2:
4957                         conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
4958                         conf->rsrc2 = value;
4959                         break;
4960                 case R_0286CC_SPI_PS_INPUT_ENA:
4961                         conf->spi_ps_input_ena = value;
4962                         break;
4963                 case R_0286D0_SPI_PS_INPUT_ADDR:
4964                         conf->spi_ps_input_addr = value;
4965                         break;
4966                 case R_0286E8_SPI_TMPRING_SIZE:
4967                 case R_00B860_COMPUTE_TMPRING_SIZE:
4968                         /* WAVESIZE is in units of 256 dwords. */
4969                         if (really_needs_scratch)
4970                                 conf->scratch_bytes_per_wave =
4971                                         G_00B860_WAVESIZE(value) * 256 * 4;
4972                         break;
4973                 case 0x4: /* SPILLED_SGPRS */
4974                         conf->spilled_sgprs = value;
4975                         break;
4976                 case 0x8: /* SPILLED_VGPRS */
4977                         conf->spilled_vgprs = value;
4978                         break;
4979                 default:
4980                         {
4981                                 static bool printed;
4982
4983                                 if (!printed) {
4984                                         fprintf(stderr, "Warning: LLVM emitted unknown "
4985                                                 "config register: 0x%x\n", reg);
4986                                         printed = true;
4987                                 }
4988                         }
4989                         break;
4990                 }
4991         }
4992
4993         if (!conf->spi_ps_input_addr)
4994                 conf->spi_ps_input_addr = conf->spi_ps_input_ena;
4995 }
4996
4997 void si_shader_apply_scratch_relocs(struct si_shader *shader,
4998                                     uint64_t scratch_va)
4999 {
5000         unsigned i;
5001         uint32_t scratch_rsrc_dword0 = scratch_va;
5002         uint32_t scratch_rsrc_dword1 =
5003                 S_008F04_BASE_ADDRESS_HI(scratch_va >> 32);
5004
5005         /* Enable scratch coalescing. */
5006         scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1);
5007
5008         for (i = 0 ; i < shader->binary.reloc_count; i++) {
5009                 const struct ac_shader_reloc *reloc =
5010                                         &shader->binary.relocs[i];
5011                 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
5012                         util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
5013                         &scratch_rsrc_dword0, 4);
5014                 } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
5015                         util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
5016                         &scratch_rsrc_dword1, 4);
5017                 }
5018         }
5019 }
5020
5021 static unsigned si_get_shader_binary_size(const struct si_shader *shader)
5022 {
5023         unsigned size = shader->binary.code_size;
5024
5025         if (shader->prolog)
5026                 size += shader->prolog->binary.code_size;
5027         if (shader->previous_stage)
5028                 size += shader->previous_stage->binary.code_size;
5029         if (shader->prolog2)
5030                 size += shader->prolog2->binary.code_size;
5031         if (shader->epilog)
5032                 size += shader->epilog->binary.code_size;
5033         return size;
5034 }
5035
5036 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
5037 {
5038         const struct ac_shader_binary *prolog =
5039                 shader->prolog ? &shader->prolog->binary : NULL;
5040         const struct ac_shader_binary *previous_stage =
5041                 shader->previous_stage ? &shader->previous_stage->binary : NULL;
5042         const struct ac_shader_binary *prolog2 =
5043                 shader->prolog2 ? &shader->prolog2->binary : NULL;
5044         const struct ac_shader_binary *epilog =
5045                 shader->epilog ? &shader->epilog->binary : NULL;
5046         const struct ac_shader_binary *mainb = &shader->binary;
5047         unsigned bo_size = si_get_shader_binary_size(shader) +
5048                            (!epilog ? mainb->rodata_size : 0);
5049         unsigned char *ptr;
5050
5051         assert(!prolog || !prolog->rodata_size);
5052         assert(!previous_stage || !previous_stage->rodata_size);
5053         assert(!prolog2 || !prolog2->rodata_size);
5054         assert((!prolog && !previous_stage && !prolog2 && !epilog) ||
5055                !mainb->rodata_size);
5056         assert(!epilog || !epilog->rodata_size);
5057
5058         r600_resource_reference(&shader->bo, NULL);
5059         shader->bo = (struct r600_resource*)
5060                      si_aligned_buffer_create(&sscreen->b,
5061                                               sscreen->cpdma_prefetch_writes_memory ?
5062                                                 0 : R600_RESOURCE_FLAG_READ_ONLY,
5063                                               PIPE_USAGE_IMMUTABLE,
5064                                               align(bo_size, SI_CPDMA_ALIGNMENT),
5065                                               256);
5066         if (!shader->bo)
5067                 return -ENOMEM;
5068
5069         /* Upload. */
5070         ptr = sscreen->ws->buffer_map(shader->bo->buf, NULL,
5071                                         PIPE_TRANSFER_READ_WRITE |
5072                                         PIPE_TRANSFER_UNSYNCHRONIZED);
5073
5074         /* Don't use util_memcpy_cpu_to_le32. LLVM binaries are
5075          * endian-independent. */
5076         if (prolog) {
5077                 memcpy(ptr, prolog->code, prolog->code_size);
5078                 ptr += prolog->code_size;
5079         }
5080         if (previous_stage) {
5081                 memcpy(ptr, previous_stage->code, previous_stage->code_size);
5082                 ptr += previous_stage->code_size;
5083         }
5084         if (prolog2) {
5085                 memcpy(ptr, prolog2->code, prolog2->code_size);
5086                 ptr += prolog2->code_size;
5087         }
5088
5089         memcpy(ptr, mainb->code, mainb->code_size);
5090         ptr += mainb->code_size;
5091
5092         if (epilog)
5093                 memcpy(ptr, epilog->code, epilog->code_size);
5094         else if (mainb->rodata_size > 0)
5095                 memcpy(ptr, mainb->rodata, mainb->rodata_size);
5096
5097         sscreen->ws->buffer_unmap(shader->bo->buf);
5098         return 0;
5099 }
5100
5101 static void si_shader_dump_disassembly(const struct ac_shader_binary *binary,
5102                                        struct pipe_debug_callback *debug,
5103                                        const char *name, FILE *file)
5104 {
5105         char *line, *p;
5106         unsigned i, count;
5107
5108         if (binary->disasm_string) {
5109                 fprintf(file, "Shader %s disassembly:\n", name);
5110                 fprintf(file, "%s", binary->disasm_string);
5111
5112                 if (debug && debug->debug_message) {
5113                         /* Very long debug messages are cut off, so send the
5114                          * disassembly one line at a time. This causes more
5115                          * overhead, but on the plus side it simplifies
5116                          * parsing of resulting logs.
5117                          */
5118                         pipe_debug_message(debug, SHADER_INFO,
5119                                            "Shader Disassembly Begin");
5120
5121                         line = binary->disasm_string;
5122                         while (*line) {
5123                                 p = util_strchrnul(line, '\n');
5124                                 count = p - line;
5125
5126                                 if (count) {
5127                                         pipe_debug_message(debug, SHADER_INFO,
5128                                                            "%.*s", count, line);
5129                                 }
5130
5131                                 if (!*p)
5132                                         break;
5133                                 line = p + 1;
5134                         }
5135
5136                         pipe_debug_message(debug, SHADER_INFO,
5137                                            "Shader Disassembly End");
5138                 }
5139         } else {
5140                 fprintf(file, "Shader %s binary:\n", name);
5141                 for (i = 0; i < binary->code_size; i += 4) {
5142                         fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i,
5143                                 binary->code[i + 3], binary->code[i + 2],
5144                                 binary->code[i + 1], binary->code[i]);
5145                 }
5146         }
5147 }
5148
5149 static void si_shader_dump_stats(struct si_screen *sscreen,
5150                                  const struct si_shader *shader,
5151                                  struct pipe_debug_callback *debug,
5152                                  unsigned processor,
5153                                  FILE *file,
5154                                  bool check_debug_option)
5155 {
5156         const struct si_shader_config *conf = &shader->config;
5157         unsigned num_inputs = shader->selector ? shader->selector->info.num_inputs : 0;
5158         unsigned code_size = si_get_shader_binary_size(shader);
5159         unsigned lds_increment = sscreen->info.chip_class >= CIK ? 512 : 256;
5160         unsigned lds_per_wave = 0;
5161         unsigned max_simd_waves;
5162
5163         switch (sscreen->info.family) {
5164         /* These always have 8 waves: */
5165         case CHIP_POLARIS10:
5166         case CHIP_POLARIS11:
5167         case CHIP_POLARIS12:
5168                 max_simd_waves = 8;
5169                 break;
5170         default:
5171                 max_simd_waves = 10;
5172         }
5173
5174         /* Compute LDS usage for PS. */
5175         switch (processor) {
5176         case PIPE_SHADER_FRAGMENT:
5177                 /* The minimum usage per wave is (num_inputs * 48). The maximum
5178                  * usage is (num_inputs * 48 * 16).
5179                  * We can get anything in between and it varies between waves.
5180                  *
5181                  * The 48 bytes per input for a single primitive is equal to
5182                  * 4 bytes/component * 4 components/input * 3 points.
5183                  *
5184                  * Other stages don't know the size at compile time or don't
5185                  * allocate LDS per wave, but instead they do it per thread group.
5186                  */
5187                 lds_per_wave = conf->lds_size * lds_increment +
5188                                align(num_inputs * 48, lds_increment);
5189                 break;
5190         case PIPE_SHADER_COMPUTE:
5191                 if (shader->selector) {
5192                         unsigned max_workgroup_size =
5193                                 si_get_max_workgroup_size(shader);
5194                         lds_per_wave = (conf->lds_size * lds_increment) /
5195                                        DIV_ROUND_UP(max_workgroup_size, 64);
5196                 }
5197                 break;
5198         }
5199
5200         /* Compute the per-SIMD wave counts. */
5201         if (conf->num_sgprs) {
5202                 if (sscreen->info.chip_class >= VI)
5203                         max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs);
5204                 else
5205                         max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs);
5206         }
5207
5208         if (conf->num_vgprs)
5209                 max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs);
5210
5211         /* LDS is 64KB per CU (4 SIMDs), which is 16KB per SIMD (usage above
5212          * 16KB makes some SIMDs unoccupied). */
5213         if (lds_per_wave)
5214                 max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);
5215
5216         if (!check_debug_option ||
5217             si_can_dump_shader(sscreen, processor)) {
5218                 if (processor == PIPE_SHADER_FRAGMENT) {
5219                         fprintf(file, "*** SHADER CONFIG ***\n"
5220                                 "SPI_PS_INPUT_ADDR = 0x%04x\n"
5221                                 "SPI_PS_INPUT_ENA  = 0x%04x\n",
5222                                 conf->spi_ps_input_addr, conf->spi_ps_input_ena);
5223                 }
5224
5225                 fprintf(file, "*** SHADER STATS ***\n"
5226                         "SGPRS: %d\n"
5227                         "VGPRS: %d\n"
5228                         "Spilled SGPRs: %d\n"
5229                         "Spilled VGPRs: %d\n"
5230                         "Private memory VGPRs: %d\n"
5231                         "Code Size: %d bytes\n"
5232                         "LDS: %d blocks\n"
5233                         "Scratch: %d bytes per wave\n"
5234                         "Max Waves: %d\n"
5235                         "********************\n\n\n",
5236                         conf->num_sgprs, conf->num_vgprs,
5237                         conf->spilled_sgprs, conf->spilled_vgprs,
5238                         conf->private_mem_vgprs, code_size,
5239                         conf->lds_size, conf->scratch_bytes_per_wave,
5240                         max_simd_waves);
5241         }
5242
5243         pipe_debug_message(debug, SHADER_INFO,
5244                            "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
5245                            "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
5246                            "Spilled VGPRs: %d PrivMem VGPRs: %d",
5247                            conf->num_sgprs, conf->num_vgprs, code_size,
5248                            conf->lds_size, conf->scratch_bytes_per_wave,
5249                            max_simd_waves, conf->spilled_sgprs,
5250                            conf->spilled_vgprs, conf->private_mem_vgprs);
5251 }
5252
5253 const char *si_get_shader_name(const struct si_shader *shader, unsigned processor)
5254 {
5255         switch (processor) {
5256         case PIPE_SHADER_VERTEX:
5257                 if (shader->key.as_es)
5258                         return "Vertex Shader as ES";
5259                 else if (shader->key.as_ls)
5260                         return "Vertex Shader as LS";
5261                 else
5262                         return "Vertex Shader as VS";
5263         case PIPE_SHADER_TESS_CTRL:
5264                 return "Tessellation Control Shader";
5265         case PIPE_SHADER_TESS_EVAL:
5266                 if (shader->key.as_es)
5267                         return "Tessellation Evaluation Shader as ES";
5268                 else
5269                         return "Tessellation Evaluation Shader as VS";
5270         case PIPE_SHADER_GEOMETRY:
5271                 if (shader->is_gs_copy_shader)
5272                         return "GS Copy Shader as VS";
5273                 else
5274                         return "Geometry Shader";
5275         case PIPE_SHADER_FRAGMENT:
5276                 return "Pixel Shader";
5277         case PIPE_SHADER_COMPUTE:
5278                 return "Compute Shader";
5279         default:
5280                 return "Unknown Shader";
5281         }
5282 }
5283
5284 void si_shader_dump(struct si_screen *sscreen, const struct si_shader *shader,
5285                     struct pipe_debug_callback *debug, unsigned processor,
5286                     FILE *file, bool check_debug_option)
5287 {
5288         if (!check_debug_option ||
5289             si_can_dump_shader(sscreen, processor))
5290                 si_dump_shader_key(processor, shader, file);
5291
5292         if (!check_debug_option && shader->binary.llvm_ir_string) {
5293                 if (shader->previous_stage &&
5294                     shader->previous_stage->binary.llvm_ir_string) {
5295                         fprintf(file, "\n%s - previous stage - LLVM IR:\n\n",
5296                                 si_get_shader_name(shader, processor));
5297                         fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string);
5298                 }
5299
5300                 fprintf(file, "\n%s - main shader part - LLVM IR:\n\n",
5301                         si_get_shader_name(shader, processor));
5302                 fprintf(file, "%s\n", shader->binary.llvm_ir_string);
5303         }
5304
5305         if (!check_debug_option ||
5306             (si_can_dump_shader(sscreen, processor) &&
5307              !(sscreen->debug_flags & DBG(NO_ASM)))) {
5308                 fprintf(file, "\n%s:\n", si_get_shader_name(shader, processor));
5309
5310                 if (shader->prolog)
5311                         si_shader_dump_disassembly(&shader->prolog->binary,
5312                                                    debug, "prolog", file);
5313                 if (shader->previous_stage)
5314                         si_shader_dump_disassembly(&shader->previous_stage->binary,
5315                                                    debug, "previous stage", file);
5316                 if (shader->prolog2)
5317                         si_shader_dump_disassembly(&shader->prolog2->binary,
5318                                                    debug, "prolog2", file);
5319
5320                 si_shader_dump_disassembly(&shader->binary, debug, "main", file);
5321
5322                 if (shader->epilog)
5323                         si_shader_dump_disassembly(&shader->epilog->binary,
5324                                                    debug, "epilog", file);
5325                 fprintf(file, "\n");
5326         }
5327
5328         si_shader_dump_stats(sscreen, shader, debug, processor, file,
5329                              check_debug_option);
5330 }
5331
5332 static int si_compile_llvm(struct si_screen *sscreen,
5333                            struct ac_shader_binary *binary,
5334                            struct si_shader_config *conf,
5335                            LLVMTargetMachineRef tm,
5336                            LLVMModuleRef mod,
5337                            struct pipe_debug_callback *debug,
5338                            unsigned processor,
5339                            const char *name)
5340 {
5341         int r = 0;
5342         unsigned count = p_atomic_inc_return(&sscreen->num_compilations);
5343
5344         if (si_can_dump_shader(sscreen, processor)) {
5345                 fprintf(stderr, "radeonsi: Compiling shader %d\n", count);
5346
5347                 if (!(sscreen->debug_flags & (DBG(NO_IR) | DBG(PREOPT_IR)))) {
5348                         fprintf(stderr, "%s LLVM IR:\n\n", name);
5349                         ac_dump_module(mod);
5350                         fprintf(stderr, "\n");
5351                 }
5352         }
5353
5354         if (sscreen->record_llvm_ir) {
5355                 char *ir = LLVMPrintModuleToString(mod);
5356                 binary->llvm_ir_string = strdup(ir);
5357                 LLVMDisposeMessage(ir);
5358         }
5359
5360         if (!si_replace_shader(count, binary)) {
5361                 r = si_llvm_compile(mod, binary, tm, debug);
5362                 if (r)
5363                         return r;
5364         }
5365
5366         si_shader_binary_read_config(binary, conf, 0);
5367
5368         /* Enable 64-bit and 16-bit denormals, because there is no performance
5369          * cost.
5370          *
5371          * If denormals are enabled, all floating-point output modifiers are
5372          * ignored.
5373          *
5374          * Don't enable denormals for 32-bit floats, because:
5375          * - Floating-point output modifiers would be ignored by the hw.
5376          * - Some opcodes don't support denormals, such as v_mad_f32. We would
5377          *   have to stop using those.
5378          * - SI & CI would be very slow.
5379          */
5380         conf->float_mode |= V_00B028_FP_64_DENORMS;
5381
5382         FREE(binary->config);
5383         FREE(binary->global_symbol_offsets);
5384         binary->config = NULL;
5385         binary->global_symbol_offsets = NULL;
5386
5387         /* Some shaders can't have rodata because their binaries can be
5388          * concatenated.
5389          */
5390         if (binary->rodata_size &&
5391             (processor == PIPE_SHADER_VERTEX ||
5392              processor == PIPE_SHADER_TESS_CTRL ||
5393              processor == PIPE_SHADER_TESS_EVAL ||
5394              processor == PIPE_SHADER_FRAGMENT)) {
5395                 fprintf(stderr, "radeonsi: The shader can't have rodata.");
5396                 return -EINVAL;
5397         }
5398
5399         return r;
5400 }
5401
5402 static void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret)
5403 {
5404         if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
5405                 LLVMBuildRetVoid(ctx->ac.builder);
5406         else
5407                 LLVMBuildRet(ctx->ac.builder, ret);
5408 }
5409
5410 /* Generate code for the hardware VS shader stage to go with a geometry shader */
5411 struct si_shader *
5412 si_generate_gs_copy_shader(struct si_screen *sscreen,
5413                            LLVMTargetMachineRef tm,
5414                            struct si_shader_selector *gs_selector,
5415                            struct pipe_debug_callback *debug)
5416 {
5417         struct si_shader_context ctx;
5418         struct si_shader *shader;
5419         LLVMBuilderRef builder;
5420         struct lp_build_tgsi_context *bld_base = &ctx.bld_base;
5421         struct lp_build_context *uint = &bld_base->uint_bld;
5422         struct si_shader_output_values *outputs;
5423         struct tgsi_shader_info *gsinfo = &gs_selector->info;
5424         int i, r;
5425
5426         outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0]));
5427
5428         if (!outputs)
5429                 return NULL;
5430
5431         shader = CALLOC_STRUCT(si_shader);
5432         if (!shader) {
5433                 FREE(outputs);
5434                 return NULL;
5435         }
5436
5437         /* We can leave the fence as permanently signaled because the GS copy
5438          * shader only becomes visible globally after it has been compiled. */
5439         util_queue_fence_init(&shader->ready);
5440
5441         shader->selector = gs_selector;
5442         shader->is_gs_copy_shader = true;
5443
5444         si_init_shader_ctx(&ctx, sscreen, tm);
5445         ctx.shader = shader;
5446         ctx.type = PIPE_SHADER_VERTEX;
5447
5448         builder = ctx.ac.builder;
5449
5450         create_function(&ctx);
5451         preload_ring_buffers(&ctx);
5452
5453         LLVMValueRef voffset =
5454                 lp_build_mul_imm(uint, ctx.abi.vertex_id, 4);
5455
5456         /* Fetch the vertex stream ID.*/
5457         LLVMValueRef stream_id;
5458
5459         if (gs_selector->so.num_outputs)
5460                 stream_id = unpack_param(&ctx, ctx.param_streamout_config, 24, 2);
5461         else
5462                 stream_id = ctx.i32_0;
5463
5464         /* Fill in output information. */
5465         for (i = 0; i < gsinfo->num_outputs; ++i) {
5466                 outputs[i].semantic_name = gsinfo->output_semantic_name[i];
5467                 outputs[i].semantic_index = gsinfo->output_semantic_index[i];
5468
5469                 for (int chan = 0; chan < 4; chan++) {
5470                         outputs[i].vertex_stream[chan] =
5471                                 (gsinfo->output_streams[i] >> (2 * chan)) & 3;
5472                 }
5473         }
5474
5475         LLVMBasicBlockRef end_bb;
5476         LLVMValueRef switch_inst;
5477
5478         end_bb = LLVMAppendBasicBlockInContext(ctx.ac.context, ctx.main_fn, "end");
5479         switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4);
5480
5481         for (int stream = 0; stream < 4; stream++) {
5482                 LLVMBasicBlockRef bb;
5483                 unsigned offset;
5484
5485                 if (!gsinfo->num_stream_output_components[stream])
5486                         continue;
5487
5488                 if (stream > 0 && !gs_selector->so.num_outputs)
5489                         continue;
5490
5491                 bb = LLVMInsertBasicBlockInContext(ctx.ac.context, end_bb, "out");
5492                 LLVMAddCase(switch_inst, LLVMConstInt(ctx.i32, stream, 0), bb);
5493                 LLVMPositionBuilderAtEnd(builder, bb);
5494
5495                 /* Fetch vertex data from GSVS ring */
5496                 offset = 0;
5497                 for (i = 0; i < gsinfo->num_outputs; ++i) {
5498                         for (unsigned chan = 0; chan < 4; chan++) {
5499                                 if (!(gsinfo->output_usagemask[i] & (1 << chan)) ||
5500                                     outputs[i].vertex_stream[chan] != stream) {
5501                                         outputs[i].values[chan] = ctx.bld_base.base.undef;
5502                                         continue;
5503                                 }
5504
5505                                 LLVMValueRef soffset = LLVMConstInt(ctx.i32,
5506                                         offset * gs_selector->gs_max_out_vertices * 16 * 4, 0);
5507                                 offset++;
5508
5509                                 outputs[i].values[chan] =
5510                                         ac_build_buffer_load(&ctx.ac,
5511                                                              ctx.gsvs_ring[0], 1,
5512                                                              ctx.i32_0, voffset,
5513                                                              soffset, 0, 1, 1,
5514                                                              true, false);
5515                         }
5516                 }
5517
5518                 /* Streamout and exports. */
5519                 if (gs_selector->so.num_outputs) {
5520                         si_llvm_emit_streamout(&ctx, outputs,
5521                                                gsinfo->num_outputs,
5522                                                stream);
5523                 }
5524
5525                 if (stream == 0)
5526                         si_llvm_export_vs(&ctx, outputs, gsinfo->num_outputs);
5527
5528                 LLVMBuildBr(builder, end_bb);
5529         }
5530
5531         LLVMPositionBuilderAtEnd(builder, end_bb);
5532
5533         LLVMBuildRetVoid(ctx.ac.builder);
5534
5535         ctx.type = PIPE_SHADER_GEOMETRY; /* override for shader dumping */
5536         si_llvm_optimize_module(&ctx);
5537
5538         r = si_compile_llvm(sscreen, &ctx.shader->binary,
5539                             &ctx.shader->config, ctx.tm,
5540                             ctx.gallivm.module,
5541                             debug, PIPE_SHADER_GEOMETRY,
5542                             "GS Copy Shader");
5543         if (!r) {
5544                 if (si_can_dump_shader(sscreen, PIPE_SHADER_GEOMETRY))
5545                         fprintf(stderr, "GS Copy Shader:\n");
5546                 si_shader_dump(sscreen, ctx.shader, debug,
5547                                PIPE_SHADER_GEOMETRY, stderr, true);
5548                 r = si_shader_binary_upload(sscreen, ctx.shader);
5549         }
5550
5551         si_llvm_dispose(&ctx);
5552
5553         FREE(outputs);
5554
5555         if (r != 0) {
5556                 FREE(shader);
5557                 shader = NULL;
5558         }
5559         return shader;
5560 }
5561
5562 static void si_dump_shader_key_vs(const struct si_shader_key *key,
5563                                   const struct si_vs_prolog_bits *prolog,
5564                                   const char *prefix, FILE *f)
5565 {
5566         fprintf(f, "  %s.instance_divisor_is_one = %u\n",
5567                 prefix, prolog->instance_divisor_is_one);
5568         fprintf(f, "  %s.instance_divisor_is_fetched = %u\n",
5569                 prefix, prolog->instance_divisor_is_fetched);
5570         fprintf(f, "  %s.ls_vgpr_fix = %u\n",
5571                 prefix, prolog->ls_vgpr_fix);
5572
5573         fprintf(f, "  mono.vs.fix_fetch = {");
5574         for (int i = 0; i < SI_MAX_ATTRIBS; i++)
5575                 fprintf(f, !i ? "%u" : ", %u", key->mono.vs_fix_fetch[i]);
5576         fprintf(f, "}\n");
5577 }
5578
5579 static void si_dump_shader_key(unsigned processor, const struct si_shader *shader,
5580                                FILE *f)
5581 {
5582         const struct si_shader_key *key = &shader->key;
5583
5584         fprintf(f, "SHADER KEY\n");
5585
5586         switch (processor) {
5587         case PIPE_SHADER_VERTEX:
5588                 si_dump_shader_key_vs(key, &key->part.vs.prolog,
5589                                       "part.vs.prolog", f);
5590                 fprintf(f, "  as_es = %u\n", key->as_es);
5591                 fprintf(f, "  as_ls = %u\n", key->as_ls);
5592                 fprintf(f, "  mono.u.vs_export_prim_id = %u\n",
5593                         key->mono.u.vs_export_prim_id);
5594                 break;
5595
5596         case PIPE_SHADER_TESS_CTRL:
5597                 if (shader->selector->screen->info.chip_class >= GFX9) {
5598                         si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog,
5599                                               "part.tcs.ls_prolog", f);
5600                 }
5601                 fprintf(f, "  part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode);
5602                 fprintf(f, "  mono.u.ff_tcs_inputs_to_copy = 0x%"PRIx64"\n", key->mono.u.ff_tcs_inputs_to_copy);
5603                 break;
5604
5605         case PIPE_SHADER_TESS_EVAL:
5606                 fprintf(f, "  as_es = %u\n", key->as_es);
5607                 fprintf(f, "  mono.u.vs_export_prim_id = %u\n",
5608                         key->mono.u.vs_export_prim_id);
5609                 break;
5610
5611         case PIPE_SHADER_GEOMETRY:
5612                 if (shader->is_gs_copy_shader)
5613                         break;
5614
5615                 if (shader->selector->screen->info.chip_class >= GFX9 &&
5616                     key->part.gs.es->type == PIPE_SHADER_VERTEX) {
5617                         si_dump_shader_key_vs(key, &key->part.gs.vs_prolog,
5618                                               "part.gs.vs_prolog", f);
5619                 }
5620                 fprintf(f, "  part.gs.prolog.tri_strip_adj_fix = %u\n", key->part.gs.prolog.tri_strip_adj_fix);
5621                 break;
5622
5623         case PIPE_SHADER_COMPUTE:
5624                 break;
5625
5626         case PIPE_SHADER_FRAGMENT:
5627                 fprintf(f, "  part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side);
5628                 fprintf(f, "  part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors);
5629                 fprintf(f, "  part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple);
5630                 fprintf(f, "  part.ps.prolog.force_persp_sample_interp = %u\n", key->part.ps.prolog.force_persp_sample_interp);
5631                 fprintf(f, "  part.ps.prolog.force_linear_sample_interp = %u\n", key->part.ps.prolog.force_linear_sample_interp);
5632                 fprintf(f, "  part.ps.prolog.force_persp_center_interp = %u\n", key->part.ps.prolog.force_persp_center_interp);
5633                 fprintf(f, "  part.ps.prolog.force_linear_center_interp = %u\n", key->part.ps.prolog.force_linear_center_interp);
5634                 fprintf(f, "  part.ps.prolog.bc_optimize_for_persp = %u\n", key->part.ps.prolog.bc_optimize_for_persp);
5635                 fprintf(f, "  part.ps.prolog.bc_optimize_for_linear = %u\n", key->part.ps.prolog.bc_optimize_for_linear);
5636                 fprintf(f, "  part.ps.epilog.spi_shader_col_format = 0x%x\n", key->part.ps.epilog.spi_shader_col_format);
5637                 fprintf(f, "  part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8);
5638                 fprintf(f, "  part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10);
5639                 fprintf(f, "  part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf);
5640                 fprintf(f, "  part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func);
5641                 fprintf(f, "  part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one);
5642                 fprintf(f, "  part.ps.epilog.poly_line_smoothing = %u\n", key->part.ps.epilog.poly_line_smoothing);
5643                 fprintf(f, "  part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color);
5644                 break;
5645
5646         default:
5647                 assert(0);
5648         }
5649
5650         if ((processor == PIPE_SHADER_GEOMETRY ||
5651              processor == PIPE_SHADER_TESS_EVAL ||
5652              processor == PIPE_SHADER_VERTEX) &&
5653             !key->as_es && !key->as_ls) {
5654                 fprintf(f, "  opt.kill_outputs = 0x%"PRIx64"\n", key->opt.kill_outputs);
5655                 fprintf(f, "  opt.clip_disable = %u\n", key->opt.clip_disable);
5656         }
5657 }
5658
5659 static void si_init_shader_ctx(struct si_shader_context *ctx,
5660                                struct si_screen *sscreen,
5661                                LLVMTargetMachineRef tm)
5662 {
5663         struct lp_build_tgsi_context *bld_base;
5664
5665         si_llvm_context_init(ctx, sscreen, tm);
5666
5667         bld_base = &ctx->bld_base;
5668         bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;
5669
5670         bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action;
5671         bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE] = interp_action;
5672         bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET] = interp_action;
5673
5674         bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit;
5675
5676         bld_base->op_actions[TGSI_OPCODE_CLOCK].emit = clock_emit;
5677
5678         bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy;
5679         bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy;
5680         bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy;
5681         bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy;
5682
5683         bld_base->op_actions[TGSI_OPCODE_VOTE_ALL].emit = vote_all_emit;
5684         bld_base->op_actions[TGSI_OPCODE_VOTE_ANY].emit = vote_any_emit;
5685         bld_base->op_actions[TGSI_OPCODE_VOTE_EQ].emit = vote_eq_emit;
5686         bld_base->op_actions[TGSI_OPCODE_BALLOT].emit = ballot_emit;
5687         bld_base->op_actions[TGSI_OPCODE_READ_FIRST].intr_name = "llvm.amdgcn.readfirstlane";
5688         bld_base->op_actions[TGSI_OPCODE_READ_FIRST].emit = read_lane_emit;
5689         bld_base->op_actions[TGSI_OPCODE_READ_INVOC].intr_name = "llvm.amdgcn.readlane";
5690         bld_base->op_actions[TGSI_OPCODE_READ_INVOC].fetch_args = read_invoc_fetch_args;
5691         bld_base->op_actions[TGSI_OPCODE_READ_INVOC].emit = read_lane_emit;
5692
5693         bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_tgsi_emit_vertex;
5694         bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive;
5695         bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier;
5696 }
5697
5698 static void si_optimize_vs_outputs(struct si_shader_context *ctx)
5699 {
5700         struct si_shader *shader = ctx->shader;
5701         struct tgsi_shader_info *info = &shader->selector->info;
5702
5703         if ((ctx->type != PIPE_SHADER_VERTEX &&
5704              ctx->type != PIPE_SHADER_TESS_EVAL) ||
5705             shader->key.as_ls ||
5706             shader->key.as_es)
5707                 return;
5708
5709         ac_optimize_vs_outputs(&ctx->ac,
5710                                ctx->main_fn,
5711                                shader->info.vs_output_param_offset,
5712                                info->num_outputs,
5713                                &shader->info.nr_param_exports);
5714 }
5715
5716 static void si_count_scratch_private_memory(struct si_shader_context *ctx)
5717 {
5718         ctx->shader->config.private_mem_vgprs = 0;
5719
5720         /* Process all LLVM instructions. */
5721         LLVMBasicBlockRef bb = LLVMGetFirstBasicBlock(ctx->main_fn);
5722         while (bb) {
5723                 LLVMValueRef next = LLVMGetFirstInstruction(bb);
5724
5725                 while (next) {
5726                         LLVMValueRef inst = next;
5727                         next = LLVMGetNextInstruction(next);
5728
5729                         if (LLVMGetInstructionOpcode(inst) != LLVMAlloca)
5730                                 continue;
5731
5732                         LLVMTypeRef type = LLVMGetElementType(LLVMTypeOf(inst));
5733                         /* No idea why LLVM aligns allocas to 4 elements. */
5734                         unsigned alignment = LLVMGetAlignment(inst);
5735                         unsigned dw_size = align(ac_get_type_size(type) / 4, alignment);
5736                         ctx->shader->config.private_mem_vgprs += dw_size;
5737                 }
5738                 bb = LLVMGetNextBasicBlock(bb);
5739         }
5740 }
5741
5742 static void si_init_exec_full_mask(struct si_shader_context *ctx)
5743 {
5744         LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
5745         lp_build_intrinsic(ctx->ac.builder,
5746                            "llvm.amdgcn.init.exec", ctx->voidt,
5747                            &full_mask, 1, LP_FUNC_ATTR_CONVERGENT);
5748 }
5749
5750 static void si_init_exec_from_input(struct si_shader_context *ctx,
5751                                     unsigned param, unsigned bitoffset)
5752 {
5753         LLVMValueRef args[] = {
5754                 LLVMGetParam(ctx->main_fn, param),
5755                 LLVMConstInt(ctx->i32, bitoffset, 0),
5756         };
5757         lp_build_intrinsic(ctx->ac.builder,
5758                            "llvm.amdgcn.init.exec.from.input",
5759                            ctx->voidt, args, 2, LP_FUNC_ATTR_CONVERGENT);
5760 }
5761
5762 static bool si_vs_needs_prolog(const struct si_shader_selector *sel,
5763                                const struct si_vs_prolog_bits *key)
5764 {
5765         /* VGPR initialization fixup for Vega10 and Raven is always done in the
5766          * VS prolog. */
5767         return sel->vs_needs_prolog || key->ls_vgpr_fix;
5768 }
5769
5770 static bool si_compile_tgsi_main(struct si_shader_context *ctx,
5771                                  bool is_monolithic)
5772 {
5773         struct si_shader *shader = ctx->shader;
5774         struct si_shader_selector *sel = shader->selector;
5775         struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
5776
5777         // TODO clean all this up!
5778         switch (ctx->type) {
5779         case PIPE_SHADER_VERTEX:
5780                 ctx->load_input = declare_input_vs;
5781                 if (shader->key.as_ls)
5782                         ctx->abi.emit_outputs = si_llvm_emit_ls_epilogue;
5783                 else if (shader->key.as_es)
5784                         ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
5785                 else
5786                         ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
5787                 bld_base->emit_epilogue = si_tgsi_emit_epilogue;
5788                 break;
5789         case PIPE_SHADER_TESS_CTRL:
5790                 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs;
5791                 bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = fetch_output_tcs;
5792                 bld_base->emit_store = store_output_tcs;
5793                 bld_base->emit_epilogue = si_llvm_emit_tcs_epilogue;
5794                 break;
5795         case PIPE_SHADER_TESS_EVAL:
5796                 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes;
5797                 if (shader->key.as_es)
5798                         ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
5799                 else
5800                         ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
5801                 bld_base->emit_epilogue = si_tgsi_emit_epilogue;
5802                 break;
5803         case PIPE_SHADER_GEOMETRY:
5804                 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs;
5805                 ctx->abi.load_inputs = si_nir_load_input_gs;
5806                 ctx->abi.emit_vertex = si_llvm_emit_vertex;
5807                 ctx->abi.emit_outputs = si_llvm_emit_gs_epilogue;
5808                 bld_base->emit_epilogue = si_tgsi_emit_gs_epilogue;
5809                 break;
5810         case PIPE_SHADER_FRAGMENT:
5811                 ctx->load_input = declare_input_fs;
5812                 ctx->abi.emit_outputs = si_llvm_return_fs_outputs;
5813                 bld_base->emit_epilogue = si_tgsi_emit_epilogue;
5814                 break;
5815         case PIPE_SHADER_COMPUTE:
5816                 break;
5817         default:
5818                 assert(!"Unsupported shader type");
5819                 return false;
5820         }
5821
5822         ctx->abi.load_ubo = load_ubo;
5823         ctx->abi.load_ssbo = load_ssbo;
5824
5825         create_function(ctx);
5826         preload_ring_buffers(ctx);
5827
5828         /* For GFX9 merged shaders:
5829          * - Set EXEC for the first shader. If the prolog is present, set
5830          *   EXEC there instead.
5831          * - Add a barrier before the second shader.
5832          * - In the second shader, reset EXEC to ~0 and wrap the main part in
5833          *   an if-statement. This is required for correctness in geometry
5834          *   shaders, to ensure that empty GS waves do not send GS_EMIT and
5835          *   GS_CUT messages.
5836          *
5837          * For monolithic merged shaders, the first shader is wrapped in an
5838          * if-block together with its prolog in si_build_wrapper_function.
5839          */
5840         if (ctx->screen->info.chip_class >= GFX9) {
5841                 if (!is_monolithic &&
5842                     sel->info.num_instructions > 1 && /* not empty shader */
5843                     (shader->key.as_es || shader->key.as_ls) &&
5844                     (ctx->type == PIPE_SHADER_TESS_EVAL ||
5845                      (ctx->type == PIPE_SHADER_VERTEX &&
5846                       !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog)))) {
5847                         si_init_exec_from_input(ctx,
5848                                                 ctx->param_merged_wave_info, 0);
5849                 } else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
5850                            ctx->type == PIPE_SHADER_GEOMETRY) {
5851                         if (!is_monolithic)
5852                                 si_init_exec_full_mask(ctx);
5853
5854                         /* The barrier must execute for all shaders in a
5855                          * threadgroup.
5856                          */
5857                         si_llvm_emit_barrier(NULL, bld_base, NULL);
5858
5859                         LLVMValueRef num_threads = unpack_param(ctx, ctx->param_merged_wave_info, 8, 8);
5860                         LLVMValueRef ena =
5861                                 LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
5862                                             ac_get_thread_id(&ctx->ac), num_threads, "");
5863                         lp_build_if(&ctx->merged_wrap_if_state, &ctx->gallivm, ena);
5864                 }
5865         }
5866
5867         if (ctx->type == PIPE_SHADER_TESS_CTRL &&
5868             sel->tcs_info.tessfactors_are_def_in_all_invocs) {
5869                 for (unsigned i = 0; i < 6; i++) {
5870                         ctx->invoc0_tess_factors[i] =
5871                                 lp_build_alloca_undef(&ctx->gallivm, ctx->i32, "");
5872                 }
5873         }
5874
5875         if (ctx->type == PIPE_SHADER_GEOMETRY) {
5876                 int i;
5877                 for (i = 0; i < 4; i++) {
5878                         ctx->gs_next_vertex[i] =
5879                                 lp_build_alloca(&ctx->gallivm,
5880                                                 ctx->i32, "");
5881                 }
5882         }
5883
5884         if (sel->force_correct_derivs_after_kill) {
5885                 ctx->postponed_kill = lp_build_alloca_undef(&ctx->gallivm, ctx->i1, "");
5886                 /* true = don't kill. */
5887                 LLVMBuildStore(ctx->ac.builder, LLVMConstInt(ctx->i1, 1, 0),
5888                                ctx->postponed_kill);
5889         }
5890
5891         if (sel->tokens) {
5892                 if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) {
5893                         fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
5894                         return false;
5895                 }
5896         } else {
5897                 if (!si_nir_build_llvm(ctx, sel->nir)) {
5898                         fprintf(stderr, "Failed to translate shader from NIR to LLVM\n");
5899                         return false;
5900                 }
5901         }
5902
5903         si_llvm_build_ret(ctx, ctx->return_value);
5904         return true;
5905 }
5906
5907 /**
5908  * Compute the VS prolog key, which contains all the information needed to
5909  * build the VS prolog function, and set shader->info bits where needed.
5910  *
5911  * \param info             Shader info of the vertex shader.
5912  * \param num_input_sgprs  Number of input SGPRs for the vertex shader.
5913  * \param prolog_key       Key of the VS prolog
5914  * \param shader_out       The vertex shader, or the next shader if merging LS+HS or ES+GS.
5915  * \param key              Output shader part key.
5916  */
5917 static void si_get_vs_prolog_key(const struct tgsi_shader_info *info,
5918                                  unsigned num_input_sgprs,
5919                                  const struct si_vs_prolog_bits *prolog_key,
5920                                  struct si_shader *shader_out,
5921                                  union si_shader_part_key *key)
5922 {
5923         memset(key, 0, sizeof(*key));
5924         key->vs_prolog.states = *prolog_key;
5925         key->vs_prolog.num_input_sgprs = num_input_sgprs;
5926         key->vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
5927         key->vs_prolog.as_ls = shader_out->key.as_ls;
5928         key->vs_prolog.as_es = shader_out->key.as_es;
5929
5930         if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) {
5931                 key->vs_prolog.as_ls = 1;
5932                 key->vs_prolog.num_merged_next_stage_vgprs = 2;
5933         } else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY) {
5934                 key->vs_prolog.as_es = 1;
5935                 key->vs_prolog.num_merged_next_stage_vgprs = 5;
5936         }
5937
5938         /* Enable loading the InstanceID VGPR. */
5939         uint16_t input_mask = u_bit_consecutive(0, info->num_inputs);
5940
5941         if ((key->vs_prolog.states.instance_divisor_is_one |
5942              key->vs_prolog.states.instance_divisor_is_fetched) & input_mask)
5943                 shader_out->info.uses_instanceid = true;
5944 }
5945
5946 /**
5947  * Compute the PS prolog key, which contains all the information needed to
5948  * build the PS prolog function, and set related bits in shader->config.
5949  */
5950 static void si_get_ps_prolog_key(struct si_shader *shader,
5951                                  union si_shader_part_key *key,
5952                                  bool separate_prolog)
5953 {
5954         struct tgsi_shader_info *info = &shader->selector->info;
5955
5956         memset(key, 0, sizeof(*key));
5957         key->ps_prolog.states = shader->key.part.ps.prolog;
5958         key->ps_prolog.colors_read = info->colors_read;
5959         key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
5960         key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
5961         key->ps_prolog.wqm = info->uses_derivatives &&
5962                 (key->ps_prolog.colors_read ||
5963                  key->ps_prolog.states.force_persp_sample_interp ||
5964                  key->ps_prolog.states.force_linear_sample_interp ||
5965                  key->ps_prolog.states.force_persp_center_interp ||
5966                  key->ps_prolog.states.force_linear_center_interp ||
5967                  key->ps_prolog.states.bc_optimize_for_persp ||
5968                  key->ps_prolog.states.bc_optimize_for_linear);
5969         key->ps_prolog.ancillary_vgpr_index = shader->info.ancillary_vgpr_index;
5970
5971         if (info->colors_read) {
5972                 unsigned *color = shader->selector->color_attr_index;
5973
5974                 if (shader->key.part.ps.prolog.color_two_side) {
5975                         /* BCOLORs are stored after the last input. */
5976                         key->ps_prolog.num_interp_inputs = info->num_inputs;
5977                         key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
5978                         shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
5979                 }
5980
5981                 for (unsigned i = 0; i < 2; i++) {
5982                         unsigned interp = info->input_interpolate[color[i]];
5983                         unsigned location = info->input_interpolate_loc[color[i]];
5984
5985                         if (!(info->colors_read & (0xf << i*4)))
5986                                 continue;
5987
5988                         key->ps_prolog.color_attr_index[i] = color[i];
5989
5990                         if (shader->key.part.ps.prolog.flatshade_colors &&
5991                             interp == TGSI_INTERPOLATE_COLOR)
5992                                 interp = TGSI_INTERPOLATE_CONSTANT;
5993
5994                         switch (interp) {
5995                         case TGSI_INTERPOLATE_CONSTANT:
5996                                 key->ps_prolog.color_interp_vgpr_index[i] = -1;
5997                                 break;
5998                         case TGSI_INTERPOLATE_PERSPECTIVE:
5999                         case TGSI_INTERPOLATE_COLOR:
6000                                 /* Force the interpolation location for colors here. */
6001                                 if (shader->key.part.ps.prolog.force_persp_sample_interp)
6002                                         location = TGSI_INTERPOLATE_LOC_SAMPLE;
6003                                 if (shader->key.part.ps.prolog.force_persp_center_interp)
6004                                         location = TGSI_INTERPOLATE_LOC_CENTER;
6005
6006                                 switch (location) {
6007                                 case TGSI_INTERPOLATE_LOC_SAMPLE:
6008                                         key->ps_prolog.color_interp_vgpr_index[i] = 0;
6009                                         shader->config.spi_ps_input_ena |=
6010                                                 S_0286CC_PERSP_SAMPLE_ENA(1);
6011                                         break;
6012                                 case TGSI_INTERPOLATE_LOC_CENTER:
6013                                         key->ps_prolog.color_interp_vgpr_index[i] = 2;
6014                                         shader->config.spi_ps_input_ena |=
6015                                                 S_0286CC_PERSP_CENTER_ENA(1);
6016                                         break;
6017                                 case TGSI_INTERPOLATE_LOC_CENTROID:
6018                                         key->ps_prolog.color_interp_vgpr_index[i] = 4;
6019                                         shader->config.spi_ps_input_ena |=
6020                                                 S_0286CC_PERSP_CENTROID_ENA(1);
6021                                         break;
6022                                 default:
6023                                         assert(0);
6024                                 }
6025                                 break;
6026                         case TGSI_INTERPOLATE_LINEAR:
6027                                 /* Force the interpolation location for colors here. */
6028                                 if (shader->key.part.ps.prolog.force_linear_sample_interp)
6029                                         location = TGSI_INTERPOLATE_LOC_SAMPLE;
6030                                 if (shader->key.part.ps.prolog.force_linear_center_interp)
6031                                         location = TGSI_INTERPOLATE_LOC_CENTER;
6032
6033                                 /* The VGPR assignment for non-monolithic shaders
6034                                  * works because InitialPSInputAddr is set on the
6035                                  * main shader and PERSP_PULL_MODEL is never used.
6036                                  */
6037                                 switch (location) {
6038                                 case TGSI_INTERPOLATE_LOC_SAMPLE:
6039                                         key->ps_prolog.color_interp_vgpr_index[i] =
6040                                                 separate_prolog ? 6 : 9;
6041                                         shader->config.spi_ps_input_ena |=
6042                                                 S_0286CC_LINEAR_SAMPLE_ENA(1);
6043                                         break;
6044                                 case TGSI_INTERPOLATE_LOC_CENTER:
6045                                         key->ps_prolog.color_interp_vgpr_index[i] =
6046                                                 separate_prolog ? 8 : 11;
6047                                         shader->config.spi_ps_input_ena |=
6048                                                 S_0286CC_LINEAR_CENTER_ENA(1);
6049                                         break;
6050                                 case TGSI_INTERPOLATE_LOC_CENTROID:
6051                                         key->ps_prolog.color_interp_vgpr_index[i] =
6052                                                 separate_prolog ? 10 : 13;
6053                                         shader->config.spi_ps_input_ena |=
6054                                                 S_0286CC_LINEAR_CENTROID_ENA(1);
6055                                         break;
6056                                 default:
6057                                         assert(0);
6058                                 }
6059                                 break;
6060                         default:
6061                                 assert(0);
6062                         }
6063                 }
6064         }
6065 }
6066
6067 /**
6068  * Check whether a PS prolog is required based on the key.
6069  */
6070 static bool si_need_ps_prolog(const union si_shader_part_key *key)
6071 {
6072         return key->ps_prolog.colors_read ||
6073                key->ps_prolog.states.force_persp_sample_interp ||
6074                key->ps_prolog.states.force_linear_sample_interp ||
6075                key->ps_prolog.states.force_persp_center_interp ||
6076                key->ps_prolog.states.force_linear_center_interp ||
6077                key->ps_prolog.states.bc_optimize_for_persp ||
6078                key->ps_prolog.states.bc_optimize_for_linear ||
6079                key->ps_prolog.states.poly_stipple ||
6080                key->ps_prolog.states.samplemask_log_ps_iter;
6081 }
6082
6083 /**
6084  * Compute the PS epilog key, which contains all the information needed to
6085  * build the PS epilog function.
6086  */
6087 static void si_get_ps_epilog_key(struct si_shader *shader,
6088                                  union si_shader_part_key *key)
6089 {
6090         struct tgsi_shader_info *info = &shader->selector->info;
6091         memset(key, 0, sizeof(*key));
6092         key->ps_epilog.colors_written = info->colors_written;
6093         key->ps_epilog.writes_z = info->writes_z;
6094         key->ps_epilog.writes_stencil = info->writes_stencil;
6095         key->ps_epilog.writes_samplemask = info->writes_samplemask;
6096         key->ps_epilog.states = shader->key.part.ps.epilog;
6097 }
6098
6099 /**
6100  * Build the GS prolog function. Rotate the input vertices for triangle strips
6101  * with adjacency.
6102  */
6103 static void si_build_gs_prolog_function(struct si_shader_context *ctx,
6104                                         union si_shader_part_key *key)
6105 {
6106         unsigned num_sgprs, num_vgprs;
6107         struct si_function_info fninfo;
6108         LLVMBuilderRef builder = ctx->ac.builder;
6109         LLVMTypeRef returns[48];
6110         LLVMValueRef func, ret;
6111
6112         si_init_function_info(&fninfo);
6113
6114         if (ctx->screen->info.chip_class >= GFX9) {
6115                 num_sgprs = 8 + GFX9_GS_NUM_USER_SGPR;
6116                 num_vgprs = 5; /* ES inputs are not needed by GS */
6117         } else {
6118                 num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
6119                 num_vgprs = 8;
6120         }
6121
6122         for (unsigned i = 0; i < num_sgprs; ++i) {
6123                 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6124                 returns[i] = ctx->i32;
6125         }
6126
6127         for (unsigned i = 0; i < num_vgprs; ++i) {
6128                 add_arg(&fninfo, ARG_VGPR, ctx->i32);
6129                 returns[num_sgprs + i] = ctx->f32;
6130         }
6131
6132         /* Create the function. */
6133         si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs,
6134                            &fninfo, 0);
6135         func = ctx->main_fn;
6136
6137         /* Set the full EXEC mask for the prolog, because we are only fiddling
6138          * with registers here. The main shader part will set the correct EXEC
6139          * mask.
6140          */
6141         if (ctx->screen->info.chip_class >= GFX9 && !key->gs_prolog.is_monolithic)
6142                 si_init_exec_full_mask(ctx);
6143
6144         /* Copy inputs to outputs. This should be no-op, as the registers match,
6145          * but it will prevent the compiler from overwriting them unintentionally.
6146          */
6147         ret = ctx->return_value;
6148         for (unsigned i = 0; i < num_sgprs; i++) {
6149                 LLVMValueRef p = LLVMGetParam(func, i);
6150                 ret = LLVMBuildInsertValue(builder, ret, p, i, "");
6151         }
6152         for (unsigned i = 0; i < num_vgprs; i++) {
6153                 LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
6154                 p = ac_to_float(&ctx->ac, p);
6155                 ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, "");
6156         }
6157
6158         if (key->gs_prolog.states.tri_strip_adj_fix) {
6159                 /* Remap the input vertices for every other primitive. */
6160                 const unsigned gfx6_vtx_params[6] = {
6161                         num_sgprs,
6162                         num_sgprs + 1,
6163                         num_sgprs + 3,
6164                         num_sgprs + 4,
6165                         num_sgprs + 5,
6166                         num_sgprs + 6
6167                 };
6168                 const unsigned gfx9_vtx_params[3] = {
6169                         num_sgprs,
6170                         num_sgprs + 1,
6171                         num_sgprs + 4,
6172                 };
6173                 LLVMValueRef vtx_in[6], vtx_out[6];
6174                 LLVMValueRef prim_id, rotate;
6175
6176                 if (ctx->screen->info.chip_class >= GFX9) {
6177                         for (unsigned i = 0; i < 3; i++) {
6178                                 vtx_in[i*2] = unpack_param(ctx, gfx9_vtx_params[i], 0, 16);
6179                                 vtx_in[i*2+1] = unpack_param(ctx, gfx9_vtx_params[i], 16, 16);
6180                         }
6181                 } else {
6182                         for (unsigned i = 0; i < 6; i++)
6183                                 vtx_in[i] = LLVMGetParam(func, gfx6_vtx_params[i]);
6184                 }
6185
6186                 prim_id = LLVMGetParam(func, num_sgprs + 2);
6187                 rotate = LLVMBuildTrunc(builder, prim_id, ctx->i1, "");
6188
6189                 for (unsigned i = 0; i < 6; ++i) {
6190                         LLVMValueRef base, rotated;
6191                         base = vtx_in[i];
6192                         rotated = vtx_in[(i + 4) % 6];
6193                         vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, "");
6194                 }
6195
6196                 if (ctx->screen->info.chip_class >= GFX9) {
6197                         for (unsigned i = 0; i < 3; i++) {
6198                                 LLVMValueRef hi, out;
6199
6200                                 hi = LLVMBuildShl(builder, vtx_out[i*2+1],
6201                                                   LLVMConstInt(ctx->i32, 16, 0), "");
6202                                 out = LLVMBuildOr(builder, vtx_out[i*2], hi, "");
6203                                 out = ac_to_float(&ctx->ac, out);
6204                                 ret = LLVMBuildInsertValue(builder, ret, out,
6205                                                            gfx9_vtx_params[i], "");
6206                         }
6207                 } else {
6208                         for (unsigned i = 0; i < 6; i++) {
6209                                 LLVMValueRef out;
6210
6211                                 out = ac_to_float(&ctx->ac, vtx_out[i]);
6212                                 ret = LLVMBuildInsertValue(builder, ret, out,
6213                                                            gfx6_vtx_params[i], "");
6214                         }
6215                 }
6216         }
6217
6218         LLVMBuildRet(builder, ret);
6219 }
6220
6221 /**
6222  * Given a list of shader part functions, build a wrapper function that
6223  * runs them in sequence to form a monolithic shader.
6224  */
6225 static void si_build_wrapper_function(struct si_shader_context *ctx,
6226                                       LLVMValueRef *parts,
6227                                       unsigned num_parts,
6228                                       unsigned main_part,
6229                                       unsigned next_shader_first_part)
6230 {
6231         LLVMBuilderRef builder = ctx->ac.builder;
6232         /* PS epilog has one arg per color component; gfx9 merged shader
6233          * prologs need to forward 32 user SGPRs.
6234          */
6235         struct si_function_info fninfo;
6236         LLVMValueRef initial[64], out[64];
6237         LLVMTypeRef function_type;
6238         unsigned num_first_params;
6239         unsigned num_out, initial_num_out;
6240         MAYBE_UNUSED unsigned num_out_sgpr; /* used in debug checks */
6241         MAYBE_UNUSED unsigned initial_num_out_sgpr; /* used in debug checks */
6242         unsigned num_sgprs, num_vgprs;
6243         unsigned gprs;
6244         struct lp_build_if_state if_state;
6245
6246         si_init_function_info(&fninfo);
6247
6248         for (unsigned i = 0; i < num_parts; ++i) {
6249                 lp_add_function_attr(parts[i], -1, LP_FUNC_ATTR_ALWAYSINLINE);
6250                 LLVMSetLinkage(parts[i], LLVMPrivateLinkage);
6251         }
6252
6253         /* The parameters of the wrapper function correspond to those of the
6254          * first part in terms of SGPRs and VGPRs, but we use the types of the
6255          * main part to get the right types. This is relevant for the
6256          * dereferenceable attribute on descriptor table pointers.
6257          */
6258         num_sgprs = 0;
6259         num_vgprs = 0;
6260
6261         function_type = LLVMGetElementType(LLVMTypeOf(parts[0]));
6262         num_first_params = LLVMCountParamTypes(function_type);
6263
6264         for (unsigned i = 0; i < num_first_params; ++i) {
6265                 LLVMValueRef param = LLVMGetParam(parts[0], i);
6266
6267                 if (ac_is_sgpr_param(param)) {
6268                         assert(num_vgprs == 0);
6269                         num_sgprs += ac_get_type_size(LLVMTypeOf(param)) / 4;
6270                 } else {
6271                         num_vgprs += ac_get_type_size(LLVMTypeOf(param)) / 4;
6272                 }
6273         }
6274
6275         gprs = 0;
6276         while (gprs < num_sgprs + num_vgprs) {
6277                 LLVMValueRef param = LLVMGetParam(parts[main_part], fninfo.num_params);
6278                 LLVMTypeRef type = LLVMTypeOf(param);
6279                 unsigned size = ac_get_type_size(type) / 4;
6280
6281                 add_arg(&fninfo, gprs < num_sgprs ? ARG_SGPR : ARG_VGPR, type);
6282
6283                 assert(ac_is_sgpr_param(param) == (gprs < num_sgprs));
6284                 assert(gprs + size <= num_sgprs + num_vgprs &&
6285                        (gprs >= num_sgprs || gprs + size <= num_sgprs));
6286
6287                 gprs += size;
6288         }
6289
6290         si_create_function(ctx, "wrapper", NULL, 0, &fninfo,
6291                            si_get_max_workgroup_size(ctx->shader));
6292
6293         if (is_merged_shader(ctx->shader))
6294                 si_init_exec_full_mask(ctx);
6295
6296         /* Record the arguments of the function as if they were an output of
6297          * a previous part.
6298          */
6299         num_out = 0;
6300         num_out_sgpr = 0;
6301
6302         for (unsigned i = 0; i < fninfo.num_params; ++i) {
6303                 LLVMValueRef param = LLVMGetParam(ctx->main_fn, i);
6304                 LLVMTypeRef param_type = LLVMTypeOf(param);
6305                 LLVMTypeRef out_type = i < fninfo.num_sgpr_params ? ctx->i32 : ctx->f32;
6306                 unsigned size = ac_get_type_size(param_type) / 4;
6307
6308                 if (size == 1) {
6309                         if (param_type != out_type)
6310                                 param = LLVMBuildBitCast(builder, param, out_type, "");
6311                         out[num_out++] = param;
6312                 } else {
6313                         LLVMTypeRef vector_type = LLVMVectorType(out_type, size);
6314
6315                         if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
6316                                 param = LLVMBuildPtrToInt(builder, param, ctx->i64, "");
6317                                 param_type = ctx->i64;
6318                         }
6319
6320                         if (param_type != vector_type)
6321                                 param = LLVMBuildBitCast(builder, param, vector_type, "");
6322
6323                         for (unsigned j = 0; j < size; ++j)
6324                                 out[num_out++] = LLVMBuildExtractElement(
6325                                         builder, param, LLVMConstInt(ctx->i32, j, 0), "");
6326                 }
6327
6328                 if (i < fninfo.num_sgpr_params)
6329                         num_out_sgpr = num_out;
6330         }
6331
6332         memcpy(initial, out, sizeof(out));
6333         initial_num_out = num_out;
6334         initial_num_out_sgpr = num_out_sgpr;
6335
6336         /* Now chain the parts. */
6337         for (unsigned part = 0; part < num_parts; ++part) {
6338                 LLVMValueRef in[48];
6339                 LLVMValueRef ret;
6340                 LLVMTypeRef ret_type;
6341                 unsigned out_idx = 0;
6342                 unsigned num_params = LLVMCountParams(parts[part]);
6343
6344                 /* Merged shaders are executed conditionally depending
6345                  * on the number of enabled threads passed in the input SGPRs. */
6346                 if (is_merged_shader(ctx->shader) && part == 0) {
6347                         LLVMValueRef ena, count = initial[3];
6348
6349                         count = LLVMBuildAnd(builder, count,
6350                                              LLVMConstInt(ctx->i32, 0x7f, 0), "");
6351                         ena = LLVMBuildICmp(builder, LLVMIntULT,
6352                                             ac_get_thread_id(&ctx->ac), count, "");
6353                         lp_build_if(&if_state, &ctx->gallivm, ena);
6354                 }
6355
6356                 /* Derive arguments for the next part from outputs of the
6357                  * previous one.
6358                  */
6359                 for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) {
6360                         LLVMValueRef param;
6361                         LLVMTypeRef param_type;
6362                         bool is_sgpr;
6363                         unsigned param_size;
6364                         LLVMValueRef arg = NULL;
6365
6366                         param = LLVMGetParam(parts[part], param_idx);
6367                         param_type = LLVMTypeOf(param);
6368                         param_size = ac_get_type_size(param_type) / 4;
6369                         is_sgpr = ac_is_sgpr_param(param);
6370
6371                         if (is_sgpr) {
6372 #if HAVE_LLVM < 0x0400
6373                                 LLVMRemoveAttribute(param, LLVMByValAttribute);
6374 #else
6375                                 unsigned kind_id = LLVMGetEnumAttributeKindForName("byval", 5);
6376                                 LLVMRemoveEnumAttributeAtIndex(parts[part], param_idx + 1, kind_id);
6377 #endif
6378                                 lp_add_function_attr(parts[part], param_idx + 1, LP_FUNC_ATTR_INREG);
6379                         }
6380
6381                         assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out));
6382                         assert(is_sgpr || out_idx >= num_out_sgpr);
6383
6384                         if (param_size == 1)
6385                                 arg = out[out_idx];
6386                         else
6387                                 arg = lp_build_gather_values(&ctx->gallivm, &out[out_idx], param_size);
6388
6389                         if (LLVMTypeOf(arg) != param_type) {
6390                                 if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
6391                                         arg = LLVMBuildBitCast(builder, arg, ctx->i64, "");
6392                                         arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
6393                                 } else {
6394                                         arg = LLVMBuildBitCast(builder, arg, param_type, "");
6395                                 }
6396                         }
6397
6398                         in[param_idx] = arg;
6399                         out_idx += param_size;
6400                 }
6401
6402                 ret = LLVMBuildCall(builder, parts[part], in, num_params, "");
6403
6404                 if (is_merged_shader(ctx->shader) &&
6405                     part + 1 == next_shader_first_part) {
6406                         lp_build_endif(&if_state);
6407
6408                         /* The second half of the merged shader should use
6409                          * the inputs from the toplevel (wrapper) function,
6410                          * not the return value from the last call.
6411                          *
6412                          * That's because the last call was executed condi-
6413                          * tionally, so we can't consume it in the main
6414                          * block.
6415                          */
6416                         memcpy(out, initial, sizeof(initial));
6417                         num_out = initial_num_out;
6418                         num_out_sgpr = initial_num_out_sgpr;
6419                         continue;
6420                 }
6421
6422                 /* Extract the returned GPRs. */
6423                 ret_type = LLVMTypeOf(ret);
6424                 num_out = 0;
6425                 num_out_sgpr = 0;
6426
6427                 if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) {
6428                         assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind);
6429
6430                         unsigned ret_size = LLVMCountStructElementTypes(ret_type);
6431
6432                         for (unsigned i = 0; i < ret_size; ++i) {
6433                                 LLVMValueRef val =
6434                                         LLVMBuildExtractValue(builder, ret, i, "");
6435
6436                                 assert(num_out < ARRAY_SIZE(out));
6437                                 out[num_out++] = val;
6438
6439                                 if (LLVMTypeOf(val) == ctx->i32) {
6440                                         assert(num_out_sgpr + 1 == num_out);
6441                                         num_out_sgpr = num_out;
6442                                 }
6443                         }
6444                 }
6445         }
6446
6447         LLVMBuildRetVoid(builder);
6448 }
6449
6450 int si_compile_tgsi_shader(struct si_screen *sscreen,
6451                            LLVMTargetMachineRef tm,
6452                            struct si_shader *shader,
6453                            bool is_monolithic,
6454                            struct pipe_debug_callback *debug)
6455 {
6456         struct si_shader_selector *sel = shader->selector;
6457         struct si_shader_context ctx;
6458         int r = -1;
6459
6460         /* Dump TGSI code before doing TGSI->LLVM conversion in case the
6461          * conversion fails. */
6462         if (si_can_dump_shader(sscreen, sel->info.processor) &&
6463             !(sscreen->debug_flags & DBG(NO_TGSI))) {
6464                 if (sel->tokens)
6465                         tgsi_dump(sel->tokens, 0);
6466                 else
6467                         nir_print_shader(sel->nir, stderr);
6468                 si_dump_streamout(&sel->so);
6469         }
6470
6471         si_init_shader_ctx(&ctx, sscreen, tm);
6472         si_llvm_context_set_tgsi(&ctx, shader);
6473         ctx.separate_prolog = !is_monolithic;
6474
6475         memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
6476                sizeof(shader->info.vs_output_param_offset));
6477
6478         shader->info.uses_instanceid = sel->info.uses_instanceid;
6479
6480         if (!si_compile_tgsi_main(&ctx, is_monolithic)) {
6481                 si_llvm_dispose(&ctx);
6482                 return -1;
6483         }
6484
6485         if (is_monolithic && ctx.type == PIPE_SHADER_VERTEX) {
6486                 LLVMValueRef parts[2];
6487                 bool need_prolog = sel->vs_needs_prolog;
6488
6489                 parts[1] = ctx.main_fn;
6490
6491                 if (need_prolog) {
6492                         union si_shader_part_key prolog_key;
6493                         si_get_vs_prolog_key(&sel->info,
6494                                              shader->info.num_input_sgprs,
6495                                              &shader->key.part.vs.prolog,
6496                                              shader, &prolog_key);
6497                         si_build_vs_prolog_function(&ctx, &prolog_key);
6498                         parts[0] = ctx.main_fn;
6499                 }
6500
6501                 si_build_wrapper_function(&ctx, parts + !need_prolog,
6502                                           1 + need_prolog, need_prolog, 0);
6503         } else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
6504                 if (sscreen->info.chip_class >= GFX9) {
6505                         struct si_shader_selector *ls = shader->key.part.tcs.ls;
6506                         LLVMValueRef parts[4];
6507                         bool vs_needs_prolog =
6508                                 si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog);
6509
6510                         /* TCS main part */
6511                         parts[2] = ctx.main_fn;
6512
6513                         /* TCS epilog */
6514                         union si_shader_part_key tcs_epilog_key;
6515                         memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key));
6516                         tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
6517                         si_build_tcs_epilog_function(&ctx, &tcs_epilog_key);
6518                         parts[3] = ctx.main_fn;
6519
6520                         /* VS prolog */
6521                         if (vs_needs_prolog) {
6522                                 union si_shader_part_key vs_prolog_key;
6523                                 si_get_vs_prolog_key(&ls->info,
6524                                                      shader->info.num_input_sgprs,
6525                                                      &shader->key.part.tcs.ls_prolog,
6526                                                      shader, &vs_prolog_key);
6527                                 vs_prolog_key.vs_prolog.is_monolithic = true;
6528                                 si_build_vs_prolog_function(&ctx, &vs_prolog_key);
6529                                 parts[0] = ctx.main_fn;
6530                         }
6531
6532                         /* VS as LS main part */
6533                         struct si_shader shader_ls = {};
6534                         shader_ls.selector = ls;
6535                         shader_ls.key.as_ls = 1;
6536                         shader_ls.key.mono = shader->key.mono;
6537                         shader_ls.key.opt = shader->key.opt;
6538                         si_llvm_context_set_tgsi(&ctx, &shader_ls);
6539
6540                         if (!si_compile_tgsi_main(&ctx, true)) {
6541                                 si_llvm_dispose(&ctx);
6542                                 return -1;
6543                         }
6544                         shader->info.uses_instanceid |= ls->info.uses_instanceid;
6545                         parts[1] = ctx.main_fn;
6546
6547                         /* Reset the shader context. */
6548                         ctx.shader = shader;
6549                         ctx.type = PIPE_SHADER_TESS_CTRL;
6550
6551                         si_build_wrapper_function(&ctx,
6552                                                   parts + !vs_needs_prolog,
6553                                                   4 - !vs_needs_prolog, 0,
6554                                                   vs_needs_prolog ? 2 : 1);
6555                 } else {
6556                         LLVMValueRef parts[2];
6557                         union si_shader_part_key epilog_key;
6558
6559                         parts[0] = ctx.main_fn;
6560
6561                         memset(&epilog_key, 0, sizeof(epilog_key));
6562                         epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
6563                         si_build_tcs_epilog_function(&ctx, &epilog_key);
6564                         parts[1] = ctx.main_fn;
6565
6566                         si_build_wrapper_function(&ctx, parts, 2, 0, 0);
6567                 }
6568         } else if (is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) {
6569                 if (ctx.screen->info.chip_class >= GFX9) {
6570                         struct si_shader_selector *es = shader->key.part.gs.es;
6571                         LLVMValueRef es_prolog = NULL;
6572                         LLVMValueRef es_main = NULL;
6573                         LLVMValueRef gs_prolog = NULL;
6574                         LLVMValueRef gs_main = ctx.main_fn;
6575
6576                         /* GS prolog */
6577                         union si_shader_part_key gs_prolog_key;
6578                         memset(&gs_prolog_key, 0, sizeof(gs_prolog_key));
6579                         gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
6580                         gs_prolog_key.gs_prolog.is_monolithic = true;
6581                         si_build_gs_prolog_function(&ctx, &gs_prolog_key);
6582                         gs_prolog = ctx.main_fn;
6583
6584                         /* ES prolog */
6585                         if (es->vs_needs_prolog) {
6586                                 union si_shader_part_key vs_prolog_key;
6587                                 si_get_vs_prolog_key(&es->info,
6588                                                      shader->info.num_input_sgprs,
6589                                                      &shader->key.part.gs.vs_prolog,
6590                                                      shader, &vs_prolog_key);
6591                                 vs_prolog_key.vs_prolog.is_monolithic = true;
6592                                 si_build_vs_prolog_function(&ctx, &vs_prolog_key);
6593                                 es_prolog = ctx.main_fn;
6594                         }
6595
6596                         /* ES main part */
6597                         struct si_shader shader_es = {};
6598                         shader_es.selector = es;
6599                         shader_es.key.as_es = 1;
6600                         shader_es.key.mono = shader->key.mono;
6601                         shader_es.key.opt = shader->key.opt;
6602                         si_llvm_context_set_tgsi(&ctx, &shader_es);
6603
6604                         if (!si_compile_tgsi_main(&ctx, true)) {
6605                                 si_llvm_dispose(&ctx);
6606                                 return -1;
6607                         }
6608                         shader->info.uses_instanceid |= es->info.uses_instanceid;
6609                         es_main = ctx.main_fn;
6610
6611                         /* Reset the shader context. */
6612                         ctx.shader = shader;
6613                         ctx.type = PIPE_SHADER_GEOMETRY;
6614
6615                         /* Prepare the array of shader parts. */
6616                         LLVMValueRef parts[4];
6617                         unsigned num_parts = 0, main_part, next_first_part;
6618
6619                         if (es_prolog)
6620                                 parts[num_parts++] = es_prolog;
6621
6622                         parts[main_part = num_parts++] = es_main;
6623                         parts[next_first_part = num_parts++] = gs_prolog;
6624                         parts[num_parts++] = gs_main;
6625
6626                         si_build_wrapper_function(&ctx, parts, num_parts,
6627                                                   main_part, next_first_part);
6628                 } else {
6629                         LLVMValueRef parts[2];
6630                         union si_shader_part_key prolog_key;
6631
6632                         parts[1] = ctx.main_fn;
6633
6634                         memset(&prolog_key, 0, sizeof(prolog_key));
6635                         prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
6636                         si_build_gs_prolog_function(&ctx, &prolog_key);
6637                         parts[0] = ctx.main_fn;
6638
6639                         si_build_wrapper_function(&ctx, parts, 2, 1, 0);
6640                 }
6641         } else if (is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) {
6642                 LLVMValueRef parts[3];
6643                 union si_shader_part_key prolog_key;
6644                 union si_shader_part_key epilog_key;
6645                 bool need_prolog;
6646
6647                 si_get_ps_prolog_key(shader, &prolog_key, false);
6648                 need_prolog = si_need_ps_prolog(&prolog_key);
6649
6650                 parts[need_prolog ? 1 : 0] = ctx.main_fn;
6651
6652                 if (need_prolog) {
6653                         si_build_ps_prolog_function(&ctx, &prolog_key);
6654                         parts[0] = ctx.main_fn;
6655                 }
6656
6657                 si_get_ps_epilog_key(shader, &epilog_key);
6658                 si_build_ps_epilog_function(&ctx, &epilog_key);
6659                 parts[need_prolog ? 2 : 1] = ctx.main_fn;
6660
6661                 si_build_wrapper_function(&ctx, parts, need_prolog ? 3 : 2,
6662                                           need_prolog ? 1 : 0, 0);
6663         }
6664
6665         si_llvm_optimize_module(&ctx);
6666
6667         /* Post-optimization transformations and analysis. */
6668         si_optimize_vs_outputs(&ctx);
6669
6670         if ((debug && debug->debug_message) ||
6671             si_can_dump_shader(sscreen, ctx.type))
6672                 si_count_scratch_private_memory(&ctx);
6673
6674         /* Compile to bytecode. */
6675         r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm,
6676                             ctx.gallivm.module, debug, ctx.type, "TGSI shader");
6677         si_llvm_dispose(&ctx);
6678         if (r) {
6679                 fprintf(stderr, "LLVM failed to compile shader\n");
6680                 return r;
6681         }
6682
6683         /* Validate SGPR and VGPR usage for compute to detect compiler bugs.
6684          * LLVM 3.9svn has this bug.
6685          */
6686         if (sel->type == PIPE_SHADER_COMPUTE) {
6687                 unsigned wave_size = 64;
6688                 unsigned max_vgprs = 256;
6689                 unsigned max_sgprs = sscreen->info.chip_class >= VI ? 800 : 512;
6690                 unsigned max_sgprs_per_wave = 128;
6691                 unsigned max_block_threads = si_get_max_workgroup_size(shader);
6692                 unsigned min_waves_per_cu = DIV_ROUND_UP(max_block_threads, wave_size);
6693                 unsigned min_waves_per_simd = DIV_ROUND_UP(min_waves_per_cu, 4);
6694
6695                 max_vgprs = max_vgprs / min_waves_per_simd;
6696                 max_sgprs = MIN2(max_sgprs / min_waves_per_simd, max_sgprs_per_wave);
6697
6698                 if (shader->config.num_sgprs > max_sgprs ||
6699                     shader->config.num_vgprs > max_vgprs) {
6700                         fprintf(stderr, "LLVM failed to compile a shader correctly: "
6701                                 "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
6702                                 shader->config.num_sgprs, shader->config.num_vgprs,
6703                                 max_sgprs, max_vgprs);
6704
6705                         /* Just terminate the process, because dependent
6706                          * shaders can hang due to bad input data, but use
6707                          * the env var to allow shader-db to work.
6708                          */
6709                         if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
6710                                 abort();
6711                 }
6712         }
6713
6714         /* Add the scratch offset to input SGPRs. */
6715         if (shader->config.scratch_bytes_per_wave && !is_merged_shader(shader))
6716                 shader->info.num_input_sgprs += 1; /* scratch byte offset */
6717
6718         /* Calculate the number of fragment input VGPRs. */
6719         if (ctx.type == PIPE_SHADER_FRAGMENT) {
6720                 shader->info.num_input_vgprs = 0;
6721                 shader->info.face_vgpr_index = -1;
6722                 shader->info.ancillary_vgpr_index = -1;
6723
6724                 if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr))
6725                         shader->info.num_input_vgprs += 2;
6726                 if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr))
6727                         shader->info.num_input_vgprs += 2;
6728                 if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr))
6729                         shader->info.num_input_vgprs += 2;
6730                 if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr))
6731                         shader->info.num_input_vgprs += 3;
6732                 if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr))
6733                         shader->info.num_input_vgprs += 2;
6734                 if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr))
6735                         shader->info.num_input_vgprs += 2;
6736                 if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr))
6737                         shader->info.num_input_vgprs += 2;
6738                 if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr))
6739                         shader->info.num_input_vgprs += 1;
6740                 if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr))
6741                         shader->info.num_input_vgprs += 1;
6742                 if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr))
6743                         shader->info.num_input_vgprs += 1;
6744                 if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr))
6745                         shader->info.num_input_vgprs += 1;
6746                 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr))
6747                         shader->info.num_input_vgprs += 1;
6748                 if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) {
6749                         shader->info.face_vgpr_index = shader->info.num_input_vgprs;
6750                         shader->info.num_input_vgprs += 1;
6751                 }
6752                 if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr)) {
6753                         shader->info.ancillary_vgpr_index = shader->info.num_input_vgprs;
6754                         shader->info.num_input_vgprs += 1;
6755                 }
6756                 if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr))
6757                         shader->info.num_input_vgprs += 1;
6758                 if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr))
6759                         shader->info.num_input_vgprs += 1;
6760         }
6761
6762         return 0;
6763 }
6764
6765 /**
6766  * Create, compile and return a shader part (prolog or epilog).
6767  *
6768  * \param sscreen       screen
6769  * \param list          list of shader parts of the same category
6770  * \param type          shader type
6771  * \param key           shader part key
6772  * \param prolog        whether the part being requested is a prolog
6773  * \param tm            LLVM target machine
6774  * \param debug         debug callback
6775  * \param build         the callback responsible for building the main function
6776  * \return              non-NULL on success
6777  */
6778 static struct si_shader_part *
6779 si_get_shader_part(struct si_screen *sscreen,
6780                    struct si_shader_part **list,
6781                    enum pipe_shader_type type,
6782                    bool prolog,
6783                    union si_shader_part_key *key,
6784                    LLVMTargetMachineRef tm,
6785                    struct pipe_debug_callback *debug,
6786                    void (*build)(struct si_shader_context *,
6787                                  union si_shader_part_key *),
6788                    const char *name)
6789 {
6790         struct si_shader_part *result;
6791
6792         mtx_lock(&sscreen->shader_parts_mutex);
6793
6794         /* Find existing. */
6795         for (result = *list; result; result = result->next) {
6796                 if (memcmp(&result->key, key, sizeof(*key)) == 0) {
6797                         mtx_unlock(&sscreen->shader_parts_mutex);
6798                         return result;
6799                 }
6800         }
6801
6802         /* Compile a new one. */
6803         result = CALLOC_STRUCT(si_shader_part);
6804         result->key = *key;
6805
6806         struct si_shader shader = {};
6807         struct si_shader_context ctx;
6808
6809         si_init_shader_ctx(&ctx, sscreen, tm);
6810         ctx.shader = &shader;
6811         ctx.type = type;
6812
6813         switch (type) {
6814         case PIPE_SHADER_VERTEX:
6815                 shader.key.as_ls = key->vs_prolog.as_ls;
6816                 shader.key.as_es = key->vs_prolog.as_es;
6817                 break;
6818         case PIPE_SHADER_TESS_CTRL:
6819                 assert(!prolog);
6820                 shader.key.part.tcs.epilog = key->tcs_epilog.states;
6821                 break;
6822         case PIPE_SHADER_GEOMETRY:
6823                 assert(prolog);
6824                 break;
6825         case PIPE_SHADER_FRAGMENT:
6826                 if (prolog)
6827                         shader.key.part.ps.prolog = key->ps_prolog.states;
6828                 else
6829                         shader.key.part.ps.epilog = key->ps_epilog.states;
6830                 break;
6831         default:
6832                 unreachable("bad shader part");
6833         }
6834
6835         build(&ctx, key);
6836
6837         /* Compile. */
6838         si_llvm_optimize_module(&ctx);
6839
6840         if (si_compile_llvm(sscreen, &result->binary, &result->config, tm,
6841                             ctx.ac.module, debug, ctx.type, name)) {
6842                 FREE(result);
6843                 result = NULL;
6844                 goto out;
6845         }
6846
6847         result->next = *list;
6848         *list = result;
6849
6850 out:
6851         si_llvm_dispose(&ctx);
6852         mtx_unlock(&sscreen->shader_parts_mutex);
6853         return result;
6854 }
6855
6856 static LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx)
6857 {
6858         LLVMValueRef ptr[2], list;
6859         bool is_merged_shader =
6860                 ctx->screen->info.chip_class >= GFX9 &&
6861                 (ctx->type == PIPE_SHADER_TESS_CTRL ||
6862                  ctx->type == PIPE_SHADER_GEOMETRY ||
6863                  ctx->shader->key.as_ls || ctx->shader->key.as_es);
6864
6865         /* Get the pointer to rw buffers. */
6866         ptr[0] = LLVMGetParam(ctx->main_fn, (is_merged_shader ? 8 : 0) + SI_SGPR_RW_BUFFERS);
6867         ptr[1] = LLVMGetParam(ctx->main_fn, (is_merged_shader ? 8 : 0) + SI_SGPR_RW_BUFFERS_HI);
6868         list = lp_build_gather_values(&ctx->gallivm, ptr, 2);
6869         list = LLVMBuildBitCast(ctx->ac.builder, list, ctx->i64, "");
6870         list = LLVMBuildIntToPtr(ctx->ac.builder, list,
6871                                  si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS), "");
6872         return list;
6873 }
6874
6875 /**
6876  * Build the vertex shader prolog function.
6877  *
6878  * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
6879  * All inputs are returned unmodified. The vertex load indices are
6880  * stored after them, which will be used by the API VS for fetching inputs.
6881  *
6882  * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
6883  *   input_v0,
6884  *   input_v1,
6885  *   input_v2,
6886  *   input_v3,
6887  *   (VertexID + BaseVertex),
6888  *   (InstanceID + StartInstance),
6889  *   (InstanceID / 2 + StartInstance)
6890  */
6891 static void si_build_vs_prolog_function(struct si_shader_context *ctx,
6892                                         union si_shader_part_key *key)
6893 {
6894         struct si_function_info fninfo;
6895         LLVMTypeRef *returns;
6896         LLVMValueRef ret, func;
6897         int num_returns, i;
6898         unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs;
6899         unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4;
6900         LLVMValueRef input_vgprs[9];
6901         unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs +
6902                                       num_input_vgprs;
6903         unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;
6904
6905         si_init_function_info(&fninfo);
6906
6907         /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
6908         returns = alloca((num_all_input_regs + key->vs_prolog.last_input + 1) *
6909                          sizeof(LLVMTypeRef));
6910         num_returns = 0;
6911
6912         /* Declare input and output SGPRs. */
6913         for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
6914                 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6915                 returns[num_returns++] = ctx->i32;
6916         }
6917
6918         /* Preloaded VGPRs (outputs must be floats) */
6919         for (i = 0; i < num_input_vgprs; i++) {
6920                 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &input_vgprs[i]);
6921                 returns[num_returns++] = ctx->f32;
6922         }
6923
6924         /* Vertex load indices. */
6925         for (i = 0; i <= key->vs_prolog.last_input; i++)
6926                 returns[num_returns++] = ctx->f32;
6927
6928         /* Create the function. */
6929         si_create_function(ctx, "vs_prolog", returns, num_returns, &fninfo, 0);
6930         func = ctx->main_fn;
6931
6932         if (key->vs_prolog.num_merged_next_stage_vgprs) {
6933                 if (!key->vs_prolog.is_monolithic)
6934                         si_init_exec_from_input(ctx, 3, 0);
6935
6936                 if (key->vs_prolog.as_ls &&
6937                     ctx->screen->has_ls_vgpr_init_bug) {
6938                         /* If there are no HS threads, SPI loads the LS VGPRs
6939                          * starting at VGPR 0. Shift them back to where they
6940                          * belong.
6941                          */
6942                         LLVMValueRef has_hs_threads =
6943                                 LLVMBuildICmp(ctx->ac.builder, LLVMIntNE,
6944                                     unpack_param(ctx, 3, 8, 8),
6945                                     ctx->i32_0, "");
6946
6947                         for (i = 4; i > 0; --i) {
6948                                 input_vgprs[i + 1] =
6949                                         LLVMBuildSelect(ctx->ac.builder, has_hs_threads,
6950                                                         input_vgprs[i + 1],
6951                                                         input_vgprs[i - 1], "");
6952                         }
6953                 }
6954         }
6955
6956         ctx->abi.vertex_id = input_vgprs[first_vs_vgpr];
6957         ctx->abi.instance_id = input_vgprs[first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1)];
6958
6959         /* Copy inputs to outputs. This should be no-op, as the registers match,
6960          * but it will prevent the compiler from overwriting them unintentionally.
6961          */
6962         ret = ctx->return_value;
6963         for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
6964                 LLVMValueRef p = LLVMGetParam(func, i);
6965                 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, "");
6966         }
6967         for (i = 0; i < num_input_vgprs; i++) {
6968                 LLVMValueRef p = input_vgprs[i];
6969                 p = ac_to_float(&ctx->ac, p);
6970                 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p,
6971                                            key->vs_prolog.num_input_sgprs + i, "");
6972         }
6973
6974         /* Compute vertex load indices from instance divisors. */
6975         LLVMValueRef instance_divisor_constbuf = NULL;
6976
6977         if (key->vs_prolog.states.instance_divisor_is_fetched) {
6978                 LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
6979                 LLVMValueRef buf_index =
6980                         LLVMConstInt(ctx->i32, SI_VS_CONST_INSTANCE_DIVISORS, 0);
6981                 instance_divisor_constbuf =
6982                         ac_build_load_to_sgpr(&ctx->ac, list, buf_index);
6983         }
6984
6985         for (i = 0; i <= key->vs_prolog.last_input; i++) {
6986                 bool divisor_is_one =
6987                         key->vs_prolog.states.instance_divisor_is_one & (1u << i);
6988                 bool divisor_is_fetched =
6989                         key->vs_prolog.states.instance_divisor_is_fetched & (1u << i);
6990                 LLVMValueRef index;
6991
6992                 if (divisor_is_one || divisor_is_fetched) {
6993                         LLVMValueRef divisor = ctx->i32_1;
6994
6995                         if (divisor_is_fetched) {
6996                                 divisor = buffer_load_const(ctx, instance_divisor_constbuf,
6997                                                             LLVMConstInt(ctx->i32, i * 4, 0));
6998                                 divisor = ac_to_integer(&ctx->ac, divisor);
6999                         }
7000
7001                         /* InstanceID / Divisor + StartInstance */
7002                         index = get_instance_index_for_fetch(ctx,
7003                                                              user_sgpr_base +
7004                                                              SI_SGPR_START_INSTANCE,
7005                                                              divisor);
7006                 } else {
7007                         /* VertexID + BaseVertex */
7008                         index = LLVMBuildAdd(ctx->ac.builder,
7009                                              ctx->abi.vertex_id,
7010                                              LLVMGetParam(func, user_sgpr_base +
7011                                                                 SI_SGPR_BASE_VERTEX), "");
7012                 }
7013
7014                 index = ac_to_float(&ctx->ac, index);
7015                 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, index,
7016                                            fninfo.num_params + i, "");
7017         }
7018
7019         si_llvm_build_ret(ctx, ret);
7020 }
7021
7022 static bool si_get_vs_prolog(struct si_screen *sscreen,
7023                              LLVMTargetMachineRef tm,
7024                              struct si_shader *shader,
7025                              struct pipe_debug_callback *debug,
7026                              struct si_shader *main_part,
7027                              const struct si_vs_prolog_bits *key)
7028 {
7029         struct si_shader_selector *vs = main_part->selector;
7030
7031         if (!si_vs_needs_prolog(vs, key))
7032                 return true;
7033
7034         /* Get the prolog. */
7035         union si_shader_part_key prolog_key;
7036         si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs,
7037                              key, shader, &prolog_key);
7038
7039         shader->prolog =
7040                 si_get_shader_part(sscreen, &sscreen->vs_prologs,
7041                                    PIPE_SHADER_VERTEX, true, &prolog_key, tm,
7042                                    debug, si_build_vs_prolog_function,
7043                                    "Vertex Shader Prolog");
7044         return shader->prolog != NULL;
7045 }
7046
7047 /**
7048  * Select and compile (or reuse) vertex shader parts (prolog & epilog).
7049  */
7050 static bool si_shader_select_vs_parts(struct si_screen *sscreen,
7051                                       LLVMTargetMachineRef tm,
7052                                       struct si_shader *shader,
7053                                       struct pipe_debug_callback *debug)
7054 {
7055         return si_get_vs_prolog(sscreen, tm, shader, debug, shader,
7056                                 &shader->key.part.vs.prolog);
7057 }
7058
7059 /**
7060  * Compile the TCS epilog function. This writes tesselation factors to memory
7061  * based on the output primitive type of the tesselator (determined by TES).
7062  */
7063 static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
7064                                          union si_shader_part_key *key)
7065 {
7066         struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
7067         struct si_function_info fninfo;
7068         LLVMValueRef func;
7069
7070         si_init_function_info(&fninfo);
7071
7072         if (ctx->screen->info.chip_class >= GFX9) {
7073                 add_arg(&fninfo, ARG_SGPR, ctx->i64);
7074                 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
7075                 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* wave info */
7076                 ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
7077                 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7078                 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7079                 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7080                 add_arg(&fninfo, ARG_SGPR, ctx->i64);
7081                 add_arg(&fninfo, ARG_SGPR, ctx->i64);
7082                 add_arg(&fninfo, ARG_SGPR, ctx->i64);
7083                 add_arg(&fninfo, ARG_SGPR, ctx->i64);
7084                 add_arg(&fninfo, ARG_SGPR, ctx->i64);
7085                 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7086                 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7087                 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7088                 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7089                 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
7090                 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7091                 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7092                 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
7093                 ctx->param_tcs_factor_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
7094         } else {
7095                 add_arg(&fninfo, ARG_SGPR, ctx->i64);
7096                 add_arg(&fninfo, ARG_SGPR, ctx->i64);
7097                 add_arg(&fninfo, ARG_SGPR, ctx->i64);
7098                 add_arg(&fninfo, ARG_SGPR, ctx->i64);
7099                 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
7100                 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7101                 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7102                 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7103                 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
7104                 ctx->param_tcs_factor_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
7105                 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
7106                 ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
7107         }
7108
7109         add_arg(&fninfo, ARG_VGPR, ctx->i32); /* VGPR gap */
7110         add_arg(&fninfo, ARG_VGPR, ctx->i32); /* VGPR gap */
7111         unsigned tess_factors_idx =
7112                 add_arg(&fninfo, ARG_VGPR, ctx->i32); /* patch index within the wave (REL_PATCH_ID) */
7113         add_arg(&fninfo, ARG_VGPR, ctx->i32); /* invocation ID within the patch */
7114         add_arg(&fninfo, ARG_VGPR, ctx->i32); /* LDS offset where tess factors should be loaded from */
7115
7116         for (unsigned i = 0; i < 6; i++)
7117                 add_arg(&fninfo, ARG_VGPR, ctx->i32); /* tess factors */
7118
7119         /* Create the function. */
7120         si_create_function(ctx, "tcs_epilog", NULL, 0, &fninfo,
7121                            ctx->screen->info.chip_class >= CIK ? 128 : 64);
7122         ac_declare_lds_as_pointer(&ctx->ac);
7123         func = ctx->main_fn;
7124
7125         LLVMValueRef invoc0_tess_factors[6];
7126         for (unsigned i = 0; i < 6; i++)
7127                 invoc0_tess_factors[i] = LLVMGetParam(func, tess_factors_idx + 3 + i);
7128
7129         si_write_tess_factors(bld_base,
7130                               LLVMGetParam(func, tess_factors_idx),
7131                               LLVMGetParam(func, tess_factors_idx + 1),
7132                               LLVMGetParam(func, tess_factors_idx + 2),
7133                               invoc0_tess_factors, invoc0_tess_factors + 4);
7134
7135         LLVMBuildRetVoid(ctx->ac.builder);
7136 }
7137
7138 /**
7139  * Select and compile (or reuse) TCS parts (epilog).
7140  */
7141 static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
7142                                        LLVMTargetMachineRef tm,
7143                                        struct si_shader *shader,
7144                                        struct pipe_debug_callback *debug)
7145 {
7146         if (sscreen->info.chip_class >= GFX9) {
7147                 struct si_shader *ls_main_part =
7148                         shader->key.part.tcs.ls->main_shader_part_ls;
7149
7150                 if (!si_get_vs_prolog(sscreen, tm, shader, debug, ls_main_part,
7151                                       &shader->key.part.tcs.ls_prolog))
7152                         return false;
7153
7154                 shader->previous_stage = ls_main_part;
7155         }
7156
7157         /* Get the epilog. */
7158         union si_shader_part_key epilog_key;
7159         memset(&epilog_key, 0, sizeof(epilog_key));
7160         epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
7161
7162         shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
7163                                             PIPE_SHADER_TESS_CTRL, false,
7164                                             &epilog_key, tm, debug,
7165                                             si_build_tcs_epilog_function,
7166                                             "Tessellation Control Shader Epilog");
7167         return shader->epilog != NULL;
7168 }
7169
7170 /**
7171  * Select and compile (or reuse) GS parts (prolog).
7172  */
7173 static bool si_shader_select_gs_parts(struct si_screen *sscreen,
7174                                       LLVMTargetMachineRef tm,
7175                                       struct si_shader *shader,
7176                                       struct pipe_debug_callback *debug)
7177 {
7178         if (sscreen->info.chip_class >= GFX9) {
7179                 struct si_shader *es_main_part =
7180                         shader->key.part.gs.es->main_shader_part_es;
7181
7182                 if (shader->key.part.gs.es->type == PIPE_SHADER_VERTEX &&
7183                     !si_get_vs_prolog(sscreen, tm, shader, debug, es_main_part,
7184                                       &shader->key.part.gs.vs_prolog))
7185                         return false;
7186
7187                 shader->previous_stage = es_main_part;
7188         }
7189
7190         if (!shader->key.part.gs.prolog.tri_strip_adj_fix)
7191                 return true;
7192
7193         union si_shader_part_key prolog_key;
7194         memset(&prolog_key, 0, sizeof(prolog_key));
7195         prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
7196
7197         shader->prolog2 = si_get_shader_part(sscreen, &sscreen->gs_prologs,
7198                                             PIPE_SHADER_GEOMETRY, true,
7199                                             &prolog_key, tm, debug,
7200                                             si_build_gs_prolog_function,
7201                                             "Geometry Shader Prolog");
7202         return shader->prolog2 != NULL;
7203 }
7204
7205 /**
7206  * Build the pixel shader prolog function. This handles:
7207  * - two-side color selection and interpolation
7208  * - overriding interpolation parameters for the API PS
7209  * - polygon stippling
7210  *
7211  * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
7212  * overriden by other states. (e.g. per-sample interpolation)
7213  * Interpolated colors are stored after the preloaded VGPRs.
7214  */
7215 static void si_build_ps_prolog_function(struct si_shader_context *ctx,
7216                                         union si_shader_part_key *key)
7217 {
7218         struct si_function_info fninfo;
7219         LLVMValueRef ret, func;
7220         int num_returns, i, num_color_channels;
7221
7222         assert(si_need_ps_prolog(key));
7223
7224         si_init_function_info(&fninfo);
7225
7226         /* Declare inputs. */
7227         for (i = 0; i < key->ps_prolog.num_input_sgprs; i++)
7228                 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7229
7230         for (i = 0; i < key->ps_prolog.num_input_vgprs; i++)
7231                 add_arg(&fninfo, ARG_VGPR, ctx->f32);
7232
7233         /* Declare outputs (same as inputs + add colors if needed) */
7234         num_returns = fninfo.num_params;
7235         num_color_channels = util_bitcount(key->ps_prolog.colors_read);
7236         for (i = 0; i < num_color_channels; i++)
7237                 fninfo.types[num_returns++] = ctx->f32;
7238
7239         /* Create the function. */
7240         si_create_function(ctx, "ps_prolog", fninfo.types, num_returns,
7241                            &fninfo, 0);
7242         func = ctx->main_fn;
7243
7244         /* Copy inputs to outputs. This should be no-op, as the registers match,
7245          * but it will prevent the compiler from overwriting them unintentionally.
7246          */
7247         ret = ctx->return_value;
7248         for (i = 0; i < fninfo.num_params; i++) {
7249                 LLVMValueRef p = LLVMGetParam(func, i);
7250                 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, "");
7251         }
7252
7253         /* Polygon stippling. */
7254         if (key->ps_prolog.states.poly_stipple) {
7255                 /* POS_FIXED_PT is always last. */
7256                 unsigned pos = key->ps_prolog.num_input_sgprs +
7257                                key->ps_prolog.num_input_vgprs - 1;
7258                 LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
7259
7260                 si_llvm_emit_polygon_stipple(ctx, list, pos);
7261         }
7262
7263         if (key->ps_prolog.states.bc_optimize_for_persp ||
7264             key->ps_prolog.states.bc_optimize_for_linear) {
7265                 unsigned i, base = key->ps_prolog.num_input_sgprs;
7266                 LLVMValueRef center[2], centroid[2], tmp, bc_optimize;
7267
7268                 /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
7269                  * The hw doesn't compute CENTROID if the whole wave only
7270                  * contains fully-covered quads.
7271                  *
7272                  * PRIM_MASK is after user SGPRs.
7273                  */
7274                 bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
7275                 bc_optimize = LLVMBuildLShr(ctx->ac.builder, bc_optimize,
7276                                             LLVMConstInt(ctx->i32, 31, 0), "");
7277                 bc_optimize = LLVMBuildTrunc(ctx->ac.builder, bc_optimize,
7278                                              ctx->i1, "");
7279
7280                 if (key->ps_prolog.states.bc_optimize_for_persp) {
7281                         /* Read PERSP_CENTER. */
7282                         for (i = 0; i < 2; i++)
7283                                 center[i] = LLVMGetParam(func, base + 2 + i);
7284                         /* Read PERSP_CENTROID. */
7285                         for (i = 0; i < 2; i++)
7286                                 centroid[i] = LLVMGetParam(func, base + 4 + i);
7287                         /* Select PERSP_CENTROID. */
7288                         for (i = 0; i < 2; i++) {
7289                                 tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize,
7290                                                       center[i], centroid[i], "");
7291                                 ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7292                                                            tmp, base + 4 + i, "");
7293                         }
7294                 }
7295                 if (key->ps_prolog.states.bc_optimize_for_linear) {
7296                         /* Read LINEAR_CENTER. */
7297                         for (i = 0; i < 2; i++)
7298                                 center[i] = LLVMGetParam(func, base + 8 + i);
7299                         /* Read LINEAR_CENTROID. */
7300                         for (i = 0; i < 2; i++)
7301                                 centroid[i] = LLVMGetParam(func, base + 10 + i);
7302                         /* Select LINEAR_CENTROID. */
7303                         for (i = 0; i < 2; i++) {
7304                                 tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize,
7305                                                       center[i], centroid[i], "");
7306                                 ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7307                                                            tmp, base + 10 + i, "");
7308                         }
7309                 }
7310         }
7311
7312         /* Force per-sample interpolation. */
7313         if (key->ps_prolog.states.force_persp_sample_interp) {
7314                 unsigned i, base = key->ps_prolog.num_input_sgprs;
7315                 LLVMValueRef persp_sample[2];
7316
7317                 /* Read PERSP_SAMPLE. */
7318                 for (i = 0; i < 2; i++)
7319                         persp_sample[i] = LLVMGetParam(func, base + i);
7320                 /* Overwrite PERSP_CENTER. */
7321                 for (i = 0; i < 2; i++)
7322                         ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7323                                                    persp_sample[i], base + 2 + i, "");
7324                 /* Overwrite PERSP_CENTROID. */
7325                 for (i = 0; i < 2; i++)
7326                         ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7327                                                    persp_sample[i], base + 4 + i, "");
7328         }
7329         if (key->ps_prolog.states.force_linear_sample_interp) {
7330                 unsigned i, base = key->ps_prolog.num_input_sgprs;
7331                 LLVMValueRef linear_sample[2];
7332
7333                 /* Read LINEAR_SAMPLE. */
7334                 for (i = 0; i < 2; i++)
7335                         linear_sample[i] = LLVMGetParam(func, base + 6 + i);
7336                 /* Overwrite LINEAR_CENTER. */
7337                 for (i = 0; i < 2; i++)
7338                         ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7339                                                    linear_sample[i], base + 8 + i, "");
7340                 /* Overwrite LINEAR_CENTROID. */
7341                 for (i = 0; i < 2; i++)
7342                         ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7343                                                    linear_sample[i], base + 10 + i, "");
7344         }
7345
7346         /* Force center interpolation. */
7347         if (key->ps_prolog.states.force_persp_center_interp) {
7348                 unsigned i, base = key->ps_prolog.num_input_sgprs;
7349                 LLVMValueRef persp_center[2];
7350
7351                 /* Read PERSP_CENTER. */
7352                 for (i = 0; i < 2; i++)
7353                         persp_center[i] = LLVMGetParam(func, base + 2 + i);
7354                 /* Overwrite PERSP_SAMPLE. */
7355                 for (i = 0; i < 2; i++)
7356                         ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7357                                                    persp_center[i], base + i, "");
7358                 /* Overwrite PERSP_CENTROID. */
7359                 for (i = 0; i < 2; i++)
7360                         ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7361                                                    persp_center[i], base + 4 + i, "");
7362         }
7363         if (key->ps_prolog.states.force_linear_center_interp) {
7364                 unsigned i, base = key->ps_prolog.num_input_sgprs;
7365                 LLVMValueRef linear_center[2];
7366
7367                 /* Read LINEAR_CENTER. */
7368                 for (i = 0; i < 2; i++)
7369                         linear_center[i] = LLVMGetParam(func, base + 8 + i);
7370                 /* Overwrite LINEAR_SAMPLE. */
7371                 for (i = 0; i < 2; i++)
7372                         ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7373                                                    linear_center[i], base + 6 + i, "");
7374                 /* Overwrite LINEAR_CENTROID. */
7375                 for (i = 0; i < 2; i++)
7376                         ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7377                                                    linear_center[i], base + 10 + i, "");
7378         }
7379
7380         /* Interpolate colors. */
7381         unsigned color_out_idx = 0;
7382         for (i = 0; i < 2; i++) {
7383                 unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
7384                 unsigned face_vgpr = key->ps_prolog.num_input_sgprs +
7385                                      key->ps_prolog.face_vgpr_index;
7386                 LLVMValueRef interp[2], color[4];
7387                 LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL;
7388
7389                 if (!writemask)
7390                         continue;
7391
7392                 /* If the interpolation qualifier is not CONSTANT (-1). */
7393                 if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
7394                         unsigned interp_vgpr = key->ps_prolog.num_input_sgprs +
7395                                                key->ps_prolog.color_interp_vgpr_index[i];
7396
7397                         /* Get the (i,j) updated by bc_optimize handling. */
7398                         interp[0] = LLVMBuildExtractValue(ctx->ac.builder, ret,
7399                                                           interp_vgpr, "");
7400                         interp[1] = LLVMBuildExtractValue(ctx->ac.builder, ret,
7401                                                           interp_vgpr + 1, "");
7402                         interp_ij = lp_build_gather_values(&ctx->gallivm, interp, 2);
7403                 }
7404
7405                 /* Use the absolute location of the input. */
7406                 prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
7407
7408                 if (key->ps_prolog.states.color_two_side) {
7409                         face = LLVMGetParam(func, face_vgpr);
7410                         face = ac_to_integer(&ctx->ac, face);
7411                 }
7412
7413                 interp_fs_input(ctx,
7414                                 key->ps_prolog.color_attr_index[i],
7415                                 TGSI_SEMANTIC_COLOR, i,
7416                                 key->ps_prolog.num_interp_inputs,
7417                                 key->ps_prolog.colors_read, interp_ij,
7418                                 prim_mask, face, color);
7419
7420                 while (writemask) {
7421                         unsigned chan = u_bit_scan(&writemask);
7422                         ret = LLVMBuildInsertValue(ctx->ac.builder, ret, color[chan],
7423                                                    fninfo.num_params + color_out_idx++, "");
7424                 }
7425         }
7426
7427         /* Section 15.2.2 (Shader Inputs) of the OpenGL 4.5 (Core Profile) spec
7428          * says:
7429          *
7430          *    "When per-sample shading is active due to the use of a fragment
7431          *     input qualified by sample or due to the use of the gl_SampleID
7432          *     or gl_SamplePosition variables, only the bit for the current
7433          *     sample is set in gl_SampleMaskIn. When state specifies multiple
7434          *     fragment shader invocations for a given fragment, the sample
7435          *     mask for any single fragment shader invocation may specify a
7436          *     subset of the covered samples for the fragment. In this case,
7437          *     the bit corresponding to each covered sample will be set in
7438          *     exactly one fragment shader invocation."
7439          *
7440          * The samplemask loaded by hardware is always the coverage of the
7441          * entire pixel/fragment, so mask bits out based on the sample ID.
7442          */
7443         if (key->ps_prolog.states.samplemask_log_ps_iter) {
7444                 /* The bit pattern matches that used by fixed function fragment
7445                  * processing. */
7446                 static const uint16_t ps_iter_masks[] = {
7447                         0xffff, /* not used */
7448                         0x5555,
7449                         0x1111,
7450                         0x0101,
7451                         0x0001,
7452                 };
7453                 assert(key->ps_prolog.states.samplemask_log_ps_iter < ARRAY_SIZE(ps_iter_masks));
7454
7455                 uint32_t ps_iter_mask = ps_iter_masks[key->ps_prolog.states.samplemask_log_ps_iter];
7456                 unsigned ancillary_vgpr = key->ps_prolog.num_input_sgprs +
7457                                           key->ps_prolog.ancillary_vgpr_index;
7458                 LLVMValueRef sampleid = unpack_param(ctx, ancillary_vgpr, 8, 4);
7459                 LLVMValueRef samplemask = LLVMGetParam(func, ancillary_vgpr + 1);
7460
7461                 samplemask = ac_to_integer(&ctx->ac, samplemask);
7462                 samplemask = LLVMBuildAnd(
7463                         ctx->ac.builder,
7464                         samplemask,
7465                         LLVMBuildShl(ctx->ac.builder,
7466                                      LLVMConstInt(ctx->i32, ps_iter_mask, false),
7467                                      sampleid, ""),
7468                         "");
7469                 samplemask = ac_to_float(&ctx->ac, samplemask);
7470
7471                 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, samplemask,
7472                                            ancillary_vgpr + 1, "");
7473         }
7474
7475         /* Tell LLVM to insert WQM instruction sequence when needed. */
7476         if (key->ps_prolog.wqm) {
7477                 LLVMAddTargetDependentFunctionAttr(func,
7478                                                    "amdgpu-ps-wqm-outputs", "");
7479         }
7480
7481         si_llvm_build_ret(ctx, ret);
7482 }
7483
7484 /**
7485  * Build the pixel shader epilog function. This handles everything that must be
7486  * emulated for pixel shader exports. (alpha-test, format conversions, etc)
7487  */
7488 static void si_build_ps_epilog_function(struct si_shader_context *ctx,
7489                                         union si_shader_part_key *key)
7490 {
7491         struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
7492         struct si_function_info fninfo;
7493         LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
7494         int i;
7495         struct si_ps_exports exp = {};
7496
7497         si_init_function_info(&fninfo);
7498
7499         /* Declare input SGPRs. */
7500         ctx->param_rw_buffers = add_arg(&fninfo, ARG_SGPR, ctx->i64);
7501         ctx->param_bindless_samplers_and_images = add_arg(&fninfo, ARG_SGPR, ctx->i64);
7502         ctx->param_const_and_shader_buffers = add_arg(&fninfo, ARG_SGPR, ctx->i64);
7503         ctx->param_samplers_and_images = add_arg(&fninfo, ARG_SGPR, ctx->i64);
7504         add_arg_checked(&fninfo, ARG_SGPR, ctx->f32, SI_PARAM_ALPHA_REF);
7505
7506         /* Declare input VGPRs. */
7507         unsigned required_num_params =
7508                      fninfo.num_sgpr_params +
7509                      util_bitcount(key->ps_epilog.colors_written) * 4 +
7510                      key->ps_epilog.writes_z +
7511                      key->ps_epilog.writes_stencil +
7512                      key->ps_epilog.writes_samplemask;
7513
7514         required_num_params = MAX2(required_num_params,
7515                                    fninfo.num_sgpr_params + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
7516
7517         while (fninfo.num_params < required_num_params)
7518                 add_arg(&fninfo, ARG_VGPR, ctx->f32);
7519
7520         /* Create the function. */
7521         si_create_function(ctx, "ps_epilog", NULL, 0, &fninfo, 0);
7522         /* Disable elimination of unused inputs. */
7523         si_llvm_add_attribute(ctx->main_fn,
7524                                   "InitialPSInputAddr", 0xffffff);
7525
7526         /* Process colors. */
7527         unsigned vgpr = fninfo.num_sgpr_params;
7528         unsigned colors_written = key->ps_epilog.colors_written;
7529         int last_color_export = -1;
7530
7531         /* Find the last color export. */
7532         if (!key->ps_epilog.writes_z &&
7533             !key->ps_epilog.writes_stencil &&
7534             !key->ps_epilog.writes_samplemask) {
7535                 unsigned spi_format = key->ps_epilog.states.spi_shader_col_format;
7536
7537                 /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
7538                 if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
7539                         /* Just set this if any of the colorbuffers are enabled. */
7540                         if (spi_format &
7541                             ((1ull << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
7542                                 last_color_export = 0;
7543                 } else {
7544                         for (i = 0; i < 8; i++)
7545                                 if (colors_written & (1 << i) &&
7546                                     (spi_format >> (i * 4)) & 0xf)
7547                                         last_color_export = i;
7548                 }
7549         }
7550
7551         while (colors_written) {
7552                 LLVMValueRef color[4];
7553                 int mrt = u_bit_scan(&colors_written);
7554
7555                 for (i = 0; i < 4; i++)
7556                         color[i] = LLVMGetParam(ctx->main_fn, vgpr++);
7557
7558                 si_export_mrt_color(bld_base, color, mrt,
7559                                     fninfo.num_params - 1,
7560                                     mrt == last_color_export, &exp);
7561         }
7562
7563         /* Process depth, stencil, samplemask. */
7564         if (key->ps_epilog.writes_z)
7565                 depth = LLVMGetParam(ctx->main_fn, vgpr++);
7566         if (key->ps_epilog.writes_stencil)
7567                 stencil = LLVMGetParam(ctx->main_fn, vgpr++);
7568         if (key->ps_epilog.writes_samplemask)
7569                 samplemask = LLVMGetParam(ctx->main_fn, vgpr++);
7570
7571         if (depth || stencil || samplemask)
7572                 si_export_mrt_z(bld_base, depth, stencil, samplemask, &exp);
7573         else if (last_color_export == -1)
7574                 si_export_null(bld_base);
7575
7576         if (exp.num)
7577                 si_emit_ps_exports(ctx, &exp);
7578
7579         /* Compile. */
7580         LLVMBuildRetVoid(ctx->ac.builder);
7581 }
7582
7583 /**
7584  * Select and compile (or reuse) pixel shader parts (prolog & epilog).
7585  */
7586 static bool si_shader_select_ps_parts(struct si_screen *sscreen,
7587                                       LLVMTargetMachineRef tm,
7588                                       struct si_shader *shader,
7589                                       struct pipe_debug_callback *debug)
7590 {
7591         union si_shader_part_key prolog_key;
7592         union si_shader_part_key epilog_key;
7593
7594         /* Get the prolog. */
7595         si_get_ps_prolog_key(shader, &prolog_key, true);
7596
7597         /* The prolog is a no-op if these aren't set. */
7598         if (si_need_ps_prolog(&prolog_key)) {
7599                 shader->prolog =
7600                         si_get_shader_part(sscreen, &sscreen->ps_prologs,
7601                                            PIPE_SHADER_FRAGMENT, true,
7602                                            &prolog_key, tm, debug,
7603                                            si_build_ps_prolog_function,
7604                                            "Fragment Shader Prolog");
7605                 if (!shader->prolog)
7606                         return false;
7607         }
7608
7609         /* Get the epilog. */
7610         si_get_ps_epilog_key(shader, &epilog_key);
7611
7612         shader->epilog =
7613                 si_get_shader_part(sscreen, &sscreen->ps_epilogs,
7614                                    PIPE_SHADER_FRAGMENT, false,
7615                                    &epilog_key, tm, debug,
7616                                    si_build_ps_epilog_function,
7617                                    "Fragment Shader Epilog");
7618         if (!shader->epilog)
7619                 return false;
7620
7621         /* Enable POS_FIXED_PT if polygon stippling is enabled. */
7622         if (shader->key.part.ps.prolog.poly_stipple) {
7623                 shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
7624                 assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
7625         }
7626
7627         /* Set up the enable bits for per-sample shading if needed. */
7628         if (shader->key.part.ps.prolog.force_persp_sample_interp &&
7629             (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
7630              G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7631                 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
7632                 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
7633                 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
7634         }
7635         if (shader->key.part.ps.prolog.force_linear_sample_interp &&
7636             (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
7637              G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7638                 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
7639                 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
7640                 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
7641         }
7642         if (shader->key.part.ps.prolog.force_persp_center_interp &&
7643             (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
7644              G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7645                 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
7646                 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
7647                 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
7648         }
7649         if (shader->key.part.ps.prolog.force_linear_center_interp &&
7650             (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
7651              G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7652                 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
7653                 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
7654                 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
7655         }
7656
7657         /* POW_W_FLOAT requires that one of the perspective weights is enabled. */
7658         if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
7659             !(shader->config.spi_ps_input_ena & 0xf)) {
7660                 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
7661                 assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
7662         }
7663
7664         /* At least one pair of interpolation weights must be enabled. */
7665         if (!(shader->config.spi_ps_input_ena & 0x7f)) {
7666                 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
7667                 assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
7668         }
7669
7670         /* Samplemask fixup requires the sample ID. */
7671         if (shader->key.part.ps.prolog.samplemask_log_ps_iter) {
7672                 shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1);
7673                 assert(G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr));
7674         }
7675
7676         /* The sample mask input is always enabled, because the API shader always
7677          * passes it through to the epilog. Disable it here if it's unused.
7678          */
7679         if (!shader->key.part.ps.epilog.poly_line_smoothing &&
7680             !shader->selector->info.reads_samplemask)
7681                 shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;
7682
7683         return true;
7684 }
7685
7686 void si_multiwave_lds_size_workaround(struct si_screen *sscreen,
7687                                       unsigned *lds_size)
7688 {
7689         /* SPI barrier management bug:
7690          *   Make sure we have at least 4k of LDS in use to avoid the bug.
7691          *   It applies to workgroup sizes of more than one wavefront.
7692          */
7693         if (sscreen->info.family == CHIP_BONAIRE ||
7694             sscreen->info.family == CHIP_KABINI ||
7695             sscreen->info.family == CHIP_MULLINS)
7696                 *lds_size = MAX2(*lds_size, 8);
7697 }
7698
7699 static void si_fix_resource_usage(struct si_screen *sscreen,
7700                                   struct si_shader *shader)
7701 {
7702         unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
7703
7704         shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
7705
7706         if (shader->selector->type == PIPE_SHADER_COMPUTE &&
7707             si_get_max_workgroup_size(shader) > 64) {
7708                 si_multiwave_lds_size_workaround(sscreen,
7709                                                  &shader->config.lds_size);
7710         }
7711 }
7712
7713 int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
7714                      struct si_shader *shader,
7715                      struct pipe_debug_callback *debug)
7716 {
7717         struct si_shader_selector *sel = shader->selector;
7718         struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key);
7719         int r;
7720
7721         /* LS, ES, VS are compiled on demand if the main part hasn't been
7722          * compiled for that stage.
7723          *
7724          * Vertex shaders are compiled on demand when a vertex fetch
7725          * workaround must be applied.
7726          */
7727         if (shader->is_monolithic) {
7728                 /* Monolithic shader (compiled as a whole, has many variants,
7729                  * may take a long time to compile).
7730                  */
7731                 r = si_compile_tgsi_shader(sscreen, tm, shader, true, debug);
7732                 if (r)
7733                         return r;
7734         } else {
7735                 /* The shader consists of several parts:
7736                  *
7737                  * - the middle part is the user shader, it has 1 variant only
7738                  *   and it was compiled during the creation of the shader
7739                  *   selector
7740                  * - the prolog part is inserted at the beginning
7741                  * - the epilog part is inserted at the end
7742                  *
7743                  * The prolog and epilog have many (but simple) variants.
7744                  *
7745                  * Starting with gfx9, geometry and tessellation control
7746                  * shaders also contain the prolog and user shader parts of
7747                  * the previous shader stage.
7748                  */
7749
7750                 if (!mainp)
7751                         return -1;
7752
7753                 /* Copy the compiled TGSI shader data over. */
7754                 shader->is_binary_shared = true;
7755                 shader->binary = mainp->binary;
7756                 shader->config = mainp->config;
7757                 shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
7758                 shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
7759                 shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
7760                 shader->info.ancillary_vgpr_index = mainp->info.ancillary_vgpr_index;
7761                 memcpy(shader->info.vs_output_param_offset,
7762                        mainp->info.vs_output_param_offset,
7763                        sizeof(mainp->info.vs_output_param_offset));
7764                 shader->info.uses_instanceid = mainp->info.uses_instanceid;
7765                 shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
7766                 shader->info.nr_param_exports = mainp->info.nr_param_exports;
7767
7768                 /* Select prologs and/or epilogs. */
7769                 switch (sel->type) {
7770                 case PIPE_SHADER_VERTEX:
7771                         if (!si_shader_select_vs_parts(sscreen, tm, shader, debug))
7772                                 return -1;
7773                         break;
7774                 case PIPE_SHADER_TESS_CTRL:
7775                         if (!si_shader_select_tcs_parts(sscreen, tm, shader, debug))
7776                                 return -1;
7777                         break;
7778                 case PIPE_SHADER_TESS_EVAL:
7779                         break;
7780                 case PIPE_SHADER_GEOMETRY:
7781                         if (!si_shader_select_gs_parts(sscreen, tm, shader, debug))
7782                                 return -1;
7783                         break;
7784                 case PIPE_SHADER_FRAGMENT:
7785                         if (!si_shader_select_ps_parts(sscreen, tm, shader, debug))
7786                                 return -1;
7787
7788                         /* Make sure we have at least as many VGPRs as there
7789                          * are allocated inputs.
7790                          */
7791                         shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
7792                                                         shader->info.num_input_vgprs);
7793                         break;
7794                 }
7795
7796                 /* Update SGPR and VGPR counts. */
7797                 if (shader->prolog) {
7798                         shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
7799                                                         shader->prolog->config.num_sgprs);
7800                         shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
7801                                                         shader->prolog->config.num_vgprs);
7802                 }
7803                 if (shader->previous_stage) {
7804                         shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
7805                                                         shader->previous_stage->config.num_sgprs);
7806                         shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
7807                                                         shader->previous_stage->config.num_vgprs);
7808                         shader->config.spilled_sgprs =
7809                                 MAX2(shader->config.spilled_sgprs,
7810                                      shader->previous_stage->config.spilled_sgprs);
7811                         shader->config.spilled_vgprs =
7812                                 MAX2(shader->config.spilled_vgprs,
7813                                      shader->previous_stage->config.spilled_vgprs);
7814                         shader->config.private_mem_vgprs =
7815                                 MAX2(shader->config.private_mem_vgprs,
7816                                      shader->previous_stage->config.private_mem_vgprs);
7817                         shader->config.scratch_bytes_per_wave =
7818                                 MAX2(shader->config.scratch_bytes_per_wave,
7819                                      shader->previous_stage->config.scratch_bytes_per_wave);
7820                         shader->info.uses_instanceid |=
7821                                 shader->previous_stage->info.uses_instanceid;
7822                 }
7823                 if (shader->prolog2) {
7824                         shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
7825                                                         shader->prolog2->config.num_sgprs);
7826                         shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
7827                                                         shader->prolog2->config.num_vgprs);
7828                 }
7829                 if (shader->epilog) {
7830                         shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
7831                                                         shader->epilog->config.num_sgprs);
7832                         shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
7833                                                         shader->epilog->config.num_vgprs);
7834                 }
7835         }
7836
7837         si_fix_resource_usage(sscreen, shader);
7838         si_shader_dump(sscreen, shader, debug, sel->info.processor,
7839                        stderr, true);
7840
7841         /* Upload. */
7842         r = si_shader_binary_upload(sscreen, shader);
7843         if (r) {
7844                 fprintf(stderr, "LLVM failed to upload shader\n");
7845                 return r;
7846         }
7847
7848         return 0;
7849 }
7850
7851 void si_shader_destroy(struct si_shader *shader)
7852 {
7853         if (shader->scratch_bo)
7854                 r600_resource_reference(&shader->scratch_bo, NULL);
7855
7856         r600_resource_reference(&shader->bo, NULL);
7857
7858         if (!shader->is_binary_shared)
7859                 ac_shader_binary_clean(&shader->binary);
7860
7861         free(shader->shader_log);
7862 }