628e6f80d3f856268d8923c9298d969ecc856689
[mesa.git] / src / gallium / drivers / radeonsi / si_shader.c
1 /*
2 * Copyright 2012 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Tom Stellard <thomas.stellard@amd.com>
25 * Michel Dänzer <michel.daenzer@amd.com>
26 * Christian König <christian.koenig@amd.com>
27 */
28
29 #include "gallivm/lp_bld_const.h"
30 #include "gallivm/lp_bld_gather.h"
31 #include "gallivm/lp_bld_intr.h"
32 #include "gallivm/lp_bld_logic.h"
33 #include "gallivm/lp_bld_arit.h"
34 #include "gallivm/lp_bld_flow.h"
35 #include "gallivm/lp_bld_misc.h"
36 #include "util/u_memory.h"
37 #include "util/u_string.h"
38 #include "tgsi/tgsi_build.h"
39 #include "tgsi/tgsi_util.h"
40 #include "tgsi/tgsi_dump.h"
41
42 #include "ac_binary.h"
43 #include "ac_llvm_util.h"
44 #include "ac_exp_param.h"
45 #include "si_shader_internal.h"
46 #include "si_pipe.h"
47 #include "sid.h"
48
49 #include "compiler/nir/nir.h"
50
/* Names of the symbols that hold the scratch buffer resource descriptor
 * dwords. NOTE(review): presumably resolved/patched when the shader binary
 * is uploaded — confirm against the binary-upload code. */
static const char *scratch_rsrc_dword0_symbol =
	"SCRATCH_RSRC_DWORD0";

static const char *scratch_rsrc_dword1_symbol =
	"SCRATCH_RSRC_DWORD1";
56
/* One shader output: the LLVM values of its (up to) 4 channels together
 * with its TGSI semantic name/index. */
struct si_shader_output_values
{
	LLVMValueRef values[4];
	unsigned semantic_name;
	unsigned semantic_index;
	/* Per-channel stream index — presumably the GS vertex stream each
	 * component is written to; confirm against the GS output code. */
	ubyte vertex_stream[4];
};
64
65 /**
66 * Used to collect types and other info about arguments of the LLVM function
67 * before the function is created.
68 */
69 struct si_function_info {
70 LLVMTypeRef types[100];
71 LLVMValueRef *assign[100];
72 unsigned num_sgpr_params;
73 unsigned num_params;
74 };
75
/* Register file a function argument is passed in: scalar (SGPR) or
 * vector (VGPR). */
enum si_arg_regfile {
	ARG_SGPR,
	ARG_VGPR
};
80
/* Forward declarations for helpers and shader-part builders defined later
 * in this file. */
static void si_init_shader_ctx(struct si_shader_context *ctx,
			       struct si_screen *sscreen,
			       LLVMTargetMachineRef tm);

static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
				 struct lp_build_tgsi_context *bld_base,
				 struct lp_build_emit_data *emit_data);

static void si_dump_shader_key(unsigned processor, const struct si_shader *shader,
			       FILE *f);

static unsigned llvm_get_type_size(LLVMTypeRef type);

static void si_build_vs_prolog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key);
static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
					 union si_shader_part_key *key);
static void si_build_ps_prolog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key);
static void si_build_ps_epilog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key);

/* Ideally pass the sample mask input to the PS epilog as v13, which
 * is its usual location, so that the shader doesn't have to add v_mov.
 */
#define PS_EPILOG_SAMPLEMASK_MIN_LOC 13

/* LLVM amdgcn address spaces. NOTE(review): these numeric values match the
 * older (pre-LLVM-7) amdgcn address-space mapping — confirm against the
 * LLVM version this tree targets. */
enum {
	CONST_ADDR_SPACE = 2,
	LOCAL_ADDR_SPACE = 3,
};
112
113 static bool is_merged_shader(struct si_shader *shader)
114 {
115 if (shader->selector->screen->b.chip_class <= VI)
116 return false;
117
118 return shader->key.as_ls ||
119 shader->key.as_es ||
120 shader->selector->type == PIPE_SHADER_TESS_CTRL ||
121 shader->selector->type == PIPE_SHADER_GEOMETRY;
122 }
123
124 static void si_init_function_info(struct si_function_info *fninfo)
125 {
126 fninfo->num_params = 0;
127 fninfo->num_sgpr_params = 0;
128 }
129
130 static unsigned add_arg_assign(struct si_function_info *fninfo,
131 enum si_arg_regfile regfile, LLVMTypeRef type,
132 LLVMValueRef *assign)
133 {
134 assert(regfile != ARG_SGPR || fninfo->num_sgpr_params == fninfo->num_params);
135
136 unsigned idx = fninfo->num_params++;
137 assert(idx < ARRAY_SIZE(fninfo->types));
138
139 if (regfile == ARG_SGPR)
140 fninfo->num_sgpr_params = fninfo->num_params;
141
142 fninfo->types[idx] = type;
143 fninfo->assign[idx] = assign;
144 return idx;
145 }
146
/* Append one argument without requesting its value to be stored anywhere.
 * Returns the index of the new argument. */
static unsigned add_arg(struct si_function_info *fninfo,
			enum si_arg_regfile regfile, LLVMTypeRef type)
{
	return add_arg_assign(fninfo, regfile, type, NULL);
}
152
/* Like add_arg_assign, but assert that the argument lands at the expected
 * index \p idx (used where the hardware ABI fixes argument positions). */
static void add_arg_assign_checked(struct si_function_info *fninfo,
				   enum si_arg_regfile regfile, LLVMTypeRef type,
				   LLVMValueRef *assign, unsigned idx)
{
	MAYBE_UNUSED unsigned actual = add_arg_assign(fninfo, regfile, type, assign);
	assert(actual == idx);
}
160
/* Like add_arg, but assert that the argument lands at the expected index. */
static void add_arg_checked(struct si_function_info *fninfo,
			    enum si_arg_regfile regfile, LLVMTypeRef type,
			    unsigned idx)
{
	add_arg_assign_checked(fninfo, regfile, type, NULL, idx);
}
167
168 /**
169 * Returns a unique index for a per-patch semantic name and index. The index
170 * must be less than 32, so that a 32-bit bitmask of used inputs or outputs
171 * can be calculated.
172 */
173 unsigned si_shader_io_get_unique_index_patch(unsigned semantic_name, unsigned index)
174 {
175 switch (semantic_name) {
176 case TGSI_SEMANTIC_TESSOUTER:
177 return 0;
178 case TGSI_SEMANTIC_TESSINNER:
179 return 1;
180 case TGSI_SEMANTIC_PATCH:
181 assert(index < 30);
182 return 2 + index;
183
184 default:
185 assert(!"invalid semantic name");
186 return 0;
187 }
188 }
189
/**
 * Returns a unique index for a semantic name and index. The index must be
 * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
 * calculated.
 */
unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
{
	switch (semantic_name) {
	case TGSI_SEMANTIC_POSITION:
		return 0;
	case TGSI_SEMANTIC_GENERIC:
		/* Since some shader stages use the highest used IO index
		 * to determine the size to allocate for inputs/outputs
		 * (in LDS, tess and GS rings). GENERIC should be placed right
		 * after POSITION to make that size as small as possible.
		 */
		if (index < SI_MAX_IO_GENERIC)
			return 1 + index;

		assert(!"invalid generic index");
		return 0;
	case TGSI_SEMANTIC_PSIZE:
		return SI_MAX_IO_GENERIC + 1;
	case TGSI_SEMANTIC_CLIPDIST:
		/* Two vec4 slots of clip/cull distances. */
		assert(index <= 1);
		return SI_MAX_IO_GENERIC + 2 + index;
	case TGSI_SEMANTIC_FOG:
		return SI_MAX_IO_GENERIC + 4;
	case TGSI_SEMANTIC_LAYER:
		return SI_MAX_IO_GENERIC + 5;
	case TGSI_SEMANTIC_VIEWPORT_INDEX:
		return SI_MAX_IO_GENERIC + 6;
	case TGSI_SEMANTIC_PRIMID:
		return SI_MAX_IO_GENERIC + 7;
	case TGSI_SEMANTIC_COLOR: /* these alias */
	case TGSI_SEMANTIC_BCOLOR:
		assert(index < 2);
		return SI_MAX_IO_GENERIC + 8 + index;
	case TGSI_SEMANTIC_TEXCOORD:
		assert(index < 8);
		/* Make sure the whole map stays within the 64-slot budget. */
		assert(SI_MAX_IO_GENERIC + 10 + index < 64);
		return SI_MAX_IO_GENERIC + 10 + index;
	default:
		assert(!"invalid semantic name");
		return 0;
	}
}
237
238 /**
239 * Helper function that builds an LLVM IR PHI node and immediately adds
240 * incoming edges.
241 */
242 static LLVMValueRef
243 build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type,
244 unsigned count_incoming, LLVMValueRef *values,
245 LLVMBasicBlockRef *blocks)
246 {
247 LLVMValueRef phi = LLVMBuildPhi(ctx->builder, type, "");
248 LLVMAddIncoming(phi, values, blocks, count_incoming);
249 return phi;
250 }
251
252 /**
253 * Get the value of a shader input parameter and extract a bitfield.
254 */
255 static LLVMValueRef unpack_param(struct si_shader_context *ctx,
256 unsigned param, unsigned rshift,
257 unsigned bitwidth)
258 {
259 struct gallivm_state *gallivm = &ctx->gallivm;
260 LLVMValueRef value = LLVMGetParam(ctx->main_fn,
261 param);
262
263 if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
264 value = bitcast(&ctx->bld_base,
265 TGSI_TYPE_UNSIGNED, value);
266
267 if (rshift)
268 value = LLVMBuildLShr(gallivm->builder, value,
269 LLVMConstInt(ctx->i32, rshift, 0), "");
270
271 if (rshift + bitwidth < 32) {
272 unsigned mask = (1 << bitwidth) - 1;
273 value = LLVMBuildAnd(gallivm->builder, value,
274 LLVMConstInt(ctx->i32, mask, 0), "");
275 }
276
277 return value;
278 }
279
280 static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx)
281 {
282 switch (ctx->type) {
283 case PIPE_SHADER_TESS_CTRL:
284 return unpack_param(ctx, ctx->param_tcs_rel_ids, 0, 8);
285
286 case PIPE_SHADER_TESS_EVAL:
287 return LLVMGetParam(ctx->main_fn,
288 ctx->param_tes_rel_patch_id);
289
290 default:
291 assert(0);
292 return NULL;
293 }
294 }
295
296 /* Tessellation shaders pass outputs to the next shader using LDS.
297 *
298 * LS outputs = TCS inputs
299 * TCS outputs = TES inputs
300 *
301 * The LDS layout is:
302 * - TCS inputs for patch 0
303 * - TCS inputs for patch 1
304 * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2)
305 * - ...
306 * - TCS outputs for patch 0 = get_tcs_out_patch0_offset
307 * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset
308 * - TCS outputs for patch 1
309 * - Per-patch TCS outputs for patch 1
310 * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2)
311 * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
312 * - ...
313 *
314 * All three shaders VS(LS), TCS, TES share the same LDS space.
315 */
316
/* Stride of one patch's TCS inputs in LDS, in dwords (13-bit field in
 * the VS state bits). */
static LLVMValueRef
get_tcs_in_patch_stride(struct si_shader_context *ctx)
{
	return unpack_param(ctx, ctx->param_vs_state_bits, 8, 13);
}
322
/* Stride of one patch's TCS outputs in LDS, in dwords (13-bit field in
 * the TCS output LDS layout word). */
static LLVMValueRef
get_tcs_out_patch_stride(struct si_shader_context *ctx)
{
	return unpack_param(ctx, ctx->param_tcs_out_lds_layout, 0, 13);
}
328
/* LDS dword offset where TCS outputs for patch 0 start. The packed field
 * is stored in units of 4 dwords, hence the multiply by 4. */
static LLVMValueRef
get_tcs_out_patch0_offset(struct si_shader_context *ctx)
{
	return lp_build_mul_imm(&ctx->bld_base.uint_bld,
				unpack_param(ctx,
					     ctx->param_tcs_out_lds_offsets,
					     0, 16),
				4);
}
338
/* LDS dword offset where the per-patch TCS outputs for patch 0 start
 * (upper 16 bits of the same packed word, also in units of 4 dwords). */
static LLVMValueRef
get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
{
	return lp_build_mul_imm(&ctx->bld_base.uint_bld,
				unpack_param(ctx,
					     ctx->param_tcs_out_lds_offsets,
					     16, 16),
				4);
}
348
349 static LLVMValueRef
350 get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
351 {
352 struct gallivm_state *gallivm = &ctx->gallivm;
353 LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
354 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
355
356 return LLVMBuildMul(gallivm->builder, patch_stride, rel_patch_id, "");
357 }
358
359 static LLVMValueRef
360 get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
361 {
362 struct gallivm_state *gallivm = &ctx->gallivm;
363 LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
364 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
365 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
366
367 return LLVMBuildAdd(gallivm->builder, patch0_offset,
368 LLVMBuildMul(gallivm->builder, patch_stride,
369 rel_patch_id, ""),
370 "");
371 }
372
373 static LLVMValueRef
374 get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
375 {
376 struct gallivm_state *gallivm = &ctx->gallivm;
377 LLVMValueRef patch0_patch_data_offset =
378 get_tcs_out_patch0_patch_data_offset(ctx);
379 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
380 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
381
382 return LLVMBuildAdd(gallivm->builder, patch0_patch_data_offset,
383 LLVMBuildMul(gallivm->builder, patch_stride,
384 rel_patch_id, ""),
385 "");
386 }
387
388 static LLVMValueRef get_instance_index_for_fetch(
389 struct si_shader_context *ctx,
390 unsigned param_start_instance, LLVMValueRef divisor)
391 {
392 struct gallivm_state *gallivm = &ctx->gallivm;
393
394 LLVMValueRef result = ctx->abi.instance_id;
395
396 /* The division must be done before START_INSTANCE is added. */
397 if (divisor != ctx->i32_1)
398 result = LLVMBuildUDiv(gallivm->builder, result, divisor, "");
399
400 return LLVMBuildAdd(gallivm->builder, result,
401 LLVMGetParam(ctx->main_fn, param_start_instance), "");
402 }
403
404 /* Bitcast <4 x float> to <2 x double>, extract the component, and convert
405 * to float. */
406 static LLVMValueRef extract_double_to_float(struct si_shader_context *ctx,
407 LLVMValueRef vec4,
408 unsigned double_index)
409 {
410 LLVMBuilderRef builder = ctx->gallivm.builder;
411 LLVMTypeRef f64 = LLVMDoubleTypeInContext(ctx->gallivm.context);
412 LLVMValueRef dvec2 = LLVMBuildBitCast(builder, vec4,
413 LLVMVectorType(f64, 2), "");
414 LLVMValueRef index = LLVMConstInt(ctx->i32, double_index, 0);
415 LLVMValueRef value = LLVMBuildExtractElement(builder, dvec2, index, "");
416 return LLVMBuildFPTrunc(builder, value, ctx->f32, "");
417 }
418
/**
 * Load one vertex attribute for the VS and fix up formats the hardware
 * cannot fetch natively.
 *
 * \param input_index  index of the vertex attribute
 * \param out          receives the 4 channel values (as f32-typed values)
 */
void si_llvm_load_input_vs(
	struct si_shader_context *ctx,
	unsigned input_index,
	LLVMValueRef out[4])
{
	struct gallivm_state *gallivm = &ctx->gallivm;

	unsigned chan;
	unsigned fix_fetch;
	unsigned num_fetches;
	unsigned fetch_stride;

	LLVMValueRef t_list_ptr;
	LLVMValueRef t_offset;
	LLVMValueRef t_list;
	LLVMValueRef vertex_index;
	LLVMValueRef input[3];

	/* Load the T list */
	t_list_ptr = LLVMGetParam(ctx->main_fn, ctx->param_vertex_buffers);

	t_offset = LLVMConstInt(ctx->i32, input_index, 0);

	t_list = ac_build_indexed_load_const(&ctx->ac, t_list_ptr, t_offset);

	vertex_index = LLVMGetParam(ctx->main_fn,
				    ctx->param_vertex_index0 +
				    input_index);

	/* The shader key records which attributes need a software fixup. */
	fix_fetch = ctx->shader->key.mono.vs_fix_fetch[input_index];

	/* Do multiple loads for special formats. */
	switch (fix_fetch) {
	case SI_FIX_FETCH_RGB_64_FLOAT:
		num_fetches = 3; /* 3 2-dword loads */
		fetch_stride = 8;
		break;
	case SI_FIX_FETCH_RGBA_64_FLOAT:
		num_fetches = 2; /* 2 4-dword loads */
		fetch_stride = 16;
		break;
	case SI_FIX_FETCH_RGB_8:
	case SI_FIX_FETCH_RGB_8_INT:
		num_fetches = 3;
		fetch_stride = 1;
		break;
	case SI_FIX_FETCH_RGB_16:
	case SI_FIX_FETCH_RGB_16_INT:
		num_fetches = 3;
		fetch_stride = 2;
		break;
	default:
		num_fetches = 1;
		fetch_stride = 0;
	}

	for (unsigned i = 0; i < num_fetches; i++) {
		LLVMValueRef voffset = LLVMConstInt(ctx->i32, fetch_stride * i, 0);

		input[i] = ac_build_buffer_load_format(&ctx->ac, t_list,
						       vertex_index, voffset,
						       true);
	}

	/* Break up the vec4 into individual components */
	for (chan = 0; chan < 4; chan++) {
		LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);
		out[chan] = LLVMBuildExtractElement(gallivm->builder,
						    input[0], llvm_chan, "");
	}

	/* Apply the per-format fixup on top of the raw fetch result. */
	switch (fix_fetch) {
	case SI_FIX_FETCH_A2_SNORM:
	case SI_FIX_FETCH_A2_SSCALED:
	case SI_FIX_FETCH_A2_SINT: {
		/* The hardware returns an unsigned value; convert it to a
		 * signed one.
		 */
		LLVMValueRef tmp = out[3];
		LLVMValueRef c30 = LLVMConstInt(ctx->i32, 30, 0);

		/* First, recover the sign-extended signed integer value. */
		if (fix_fetch == SI_FIX_FETCH_A2_SSCALED)
			tmp = LLVMBuildFPToUI(gallivm->builder, tmp, ctx->i32, "");
		else
			tmp = LLVMBuildBitCast(gallivm->builder, tmp, ctx->i32, "");

		/* For the integer-like cases, do a natural sign extension.
		 *
		 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
		 * and happen to contain 0, 1, 2, 3 as the two LSBs of the
		 * exponent.
		 */
		tmp = LLVMBuildShl(gallivm->builder, tmp,
				   fix_fetch == SI_FIX_FETCH_A2_SNORM ?
				   LLVMConstInt(ctx->i32, 7, 0) : c30, "");
		tmp = LLVMBuildAShr(gallivm->builder, tmp, c30, "");

		/* Convert back to the right type. */
		if (fix_fetch == SI_FIX_FETCH_A2_SNORM) {
			LLVMValueRef clamp;
			LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
			tmp = LLVMBuildSIToFP(gallivm->builder, tmp, ctx->f32, "");
			/* Clamp to [-1, 1]; only the low end can be exceeded. */
			clamp = LLVMBuildFCmp(gallivm->builder, LLVMRealULT, tmp, neg_one, "");
			tmp = LLVMBuildSelect(gallivm->builder, clamp, neg_one, tmp, "");
		} else if (fix_fetch == SI_FIX_FETCH_A2_SSCALED) {
			tmp = LLVMBuildSIToFP(gallivm->builder, tmp, ctx->f32, "");
		}

		out[3] = tmp;
		break;
	}
	case SI_FIX_FETCH_RGBA_32_UNORM:
	case SI_FIX_FETCH_RGBX_32_UNORM:
		/* Normalize a raw 32-bit unsigned fetch to [0, 1]. */
		for (chan = 0; chan < 4; chan++) {
			out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
						     ctx->i32, "");
			out[chan] = LLVMBuildUIToFP(gallivm->builder,
						    out[chan], ctx->f32, "");
			out[chan] = LLVMBuildFMul(gallivm->builder, out[chan],
						  LLVMConstReal(ctx->f32, 1.0 / UINT_MAX), "");
		}
		/* RGBX UINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
		if (fix_fetch == SI_FIX_FETCH_RGBX_32_UNORM)
			out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	case SI_FIX_FETCH_RGBA_32_SNORM:
	case SI_FIX_FETCH_RGBX_32_SNORM:
	case SI_FIX_FETCH_RGBA_32_FIXED:
	case SI_FIX_FETCH_RGBX_32_FIXED: {
		/* FIXED is 16.16 fixed point; SNORM normalizes to [-1, 1]. */
		double scale;
		if (fix_fetch >= SI_FIX_FETCH_RGBA_32_FIXED)
			scale = 1.0 / 0x10000;
		else
			scale = 1.0 / INT_MAX;

		for (chan = 0; chan < 4; chan++) {
			out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
						     ctx->i32, "");
			out[chan] = LLVMBuildSIToFP(gallivm->builder,
						    out[chan], ctx->f32, "");
			out[chan] = LLVMBuildFMul(gallivm->builder, out[chan],
						  LLVMConstReal(ctx->f32, scale), "");
		}
		/* RGBX SINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
		if (fix_fetch == SI_FIX_FETCH_RGBX_32_SNORM ||
		    fix_fetch == SI_FIX_FETCH_RGBX_32_FIXED)
			out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	}
	case SI_FIX_FETCH_RGBA_32_USCALED:
		/* Unsigned int fetched raw, converted to float without scaling. */
		for (chan = 0; chan < 4; chan++) {
			out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
						     ctx->i32, "");
			out[chan] = LLVMBuildUIToFP(gallivm->builder,
						    out[chan], ctx->f32, "");
		}
		break;
	case SI_FIX_FETCH_RGBA_32_SSCALED:
		/* Signed int fetched raw, converted to float without scaling. */
		for (chan = 0; chan < 4; chan++) {
			out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
						     ctx->i32, "");
			out[chan] = LLVMBuildSIToFP(gallivm->builder,
						    out[chan], ctx->f32, "");
		}
		break;
	case SI_FIX_FETCH_RG_64_FLOAT:
		/* One fetch holds both doubles; pad B and A with 0 and 1. */
		for (chan = 0; chan < 2; chan++)
			out[chan] = extract_double_to_float(ctx, input[0], chan);

		out[2] = LLVMConstReal(ctx->f32, 0);
		out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	case SI_FIX_FETCH_RGB_64_FLOAT:
		/* Three 2-dword fetches, one double each. */
		for (chan = 0; chan < 3; chan++)
			out[chan] = extract_double_to_float(ctx, input[chan], 0);

		out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	case SI_FIX_FETCH_RGBA_64_FLOAT:
		/* Two 4-dword fetches, two doubles each. */
		for (chan = 0; chan < 4; chan++) {
			out[chan] = extract_double_to_float(ctx, input[chan / 2],
							    chan % 2);
		}
		break;
	case SI_FIX_FETCH_RGB_8:
	case SI_FIX_FETCH_RGB_8_INT:
	case SI_FIX_FETCH_RGB_16:
	case SI_FIX_FETCH_RGB_16_INT:
		/* Three single-component fetches; take .x of each. */
		for (chan = 0; chan < 3; chan++) {
			out[chan] = LLVMBuildExtractElement(gallivm->builder,
							    input[chan],
							    ctx->i32_0, "");
		}
		/* Alpha defaults to 1 — as float for the norm formats and
		 * as an integer bit pattern for the _INT formats. */
		if (fix_fetch == SI_FIX_FETCH_RGB_8 ||
		    fix_fetch == SI_FIX_FETCH_RGB_16) {
			out[3] = LLVMConstReal(ctx->f32, 1);
		} else {
			out[3] = LLVMBuildBitCast(gallivm->builder, ctx->i32_1,
						  ctx->f32, "");
		}
		break;
	}
}
623
/* TGSI input-declaration hook for the VS. The declaration itself is unused;
 * everything needed is derived from the input index and the shader key. */
static void declare_input_vs(
	struct si_shader_context *ctx,
	unsigned input_index,
	const struct tgsi_full_declaration *decl,
	LLVMValueRef out[4])
{
	si_llvm_load_input_vs(ctx, input_index, out);
}
632
633 static LLVMValueRef get_primitive_id(struct si_shader_context *ctx,
634 unsigned swizzle)
635 {
636 if (swizzle > 0)
637 return ctx->i32_0;
638
639 switch (ctx->type) {
640 case PIPE_SHADER_VERTEX:
641 return LLVMGetParam(ctx->main_fn,
642 ctx->param_vs_prim_id);
643 case PIPE_SHADER_TESS_CTRL:
644 return LLVMGetParam(ctx->main_fn,
645 ctx->param_tcs_patch_id);
646 case PIPE_SHADER_TESS_EVAL:
647 return LLVMGetParam(ctx->main_fn,
648 ctx->param_tes_patch_id);
649 case PIPE_SHADER_GEOMETRY:
650 return LLVMGetParam(ctx->main_fn,
651 ctx->param_gs_prim_id);
652 default:
653 assert(0);
654 return ctx->i32_0;
655 }
656 }
657
658 /**
659 * Return the value of tgsi_ind_register for indexing.
660 * This is the indirect index with the constant offset added to it.
661 */
662 LLVMValueRef si_get_indirect_index(struct si_shader_context *ctx,
663 const struct tgsi_ind_register *ind,
664 int rel_index)
665 {
666 struct gallivm_state *gallivm = &ctx->gallivm;
667 LLVMValueRef result;
668
669 result = ctx->addrs[ind->Index][ind->Swizzle];
670 result = LLVMBuildLoad(gallivm->builder, result, "");
671 result = LLVMBuildAdd(gallivm->builder, result,
672 LLVMConstInt(ctx->i32, rel_index, 0), "");
673 return result;
674 }
675
676 /**
677 * Like si_get_indirect_index, but restricts the return value to a (possibly
678 * undefined) value inside [0..num).
679 */
680 LLVMValueRef si_get_bounded_indirect_index(struct si_shader_context *ctx,
681 const struct tgsi_ind_register *ind,
682 int rel_index, unsigned num)
683 {
684 LLVMValueRef result = si_get_indirect_index(ctx, ind, rel_index);
685
686 return si_llvm_bound_index(ctx, result, num);
687 }
688
689
/**
 * Calculate a dword address given an input or output register and a stride.
 *
 * \param dst               destination register, or NULL if \p src is given
 * \param src               source register, or NULL to use \p dst
 * \param vertex_dw_stride  stride between vertices in dwords (for
 *                          2-dimensional registers)
 * \param base_addr         starting dword address to add onto
 */
static LLVMValueRef get_dw_address(struct si_shader_context *ctx,
				   const struct tgsi_full_dst_register *dst,
				   const struct tgsi_full_src_register *src,
				   LLVMValueRef vertex_dw_stride,
				   LLVMValueRef base_addr)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	ubyte *name, *index, *array_first;
	int first, param;
	struct tgsi_full_dst_register reg;

	/* Set the register description. The address computation is the same
	 * for sources and destinations. */
	if (src) {
		reg.Register.File = src->Register.File;
		reg.Register.Index = src->Register.Index;
		reg.Register.Indirect = src->Register.Indirect;
		reg.Register.Dimension = src->Register.Dimension;
		reg.Indirect = src->Indirect;
		reg.Dimension = src->Dimension;
		reg.DimIndirect = src->DimIndirect;
	} else
		reg = *dst;

	/* If the register is 2-dimensional (e.g. an array of vertices
	 * in a primitive), calculate the base address of the vertex. */
	if (reg.Register.Dimension) {
		LLVMValueRef index;

		if (reg.Dimension.Indirect)
			index = si_get_indirect_index(ctx, &reg.DimIndirect,
						      reg.Dimension.Index);
		else
			index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0);

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 LLVMBuildMul(gallivm->builder, index,
						      vertex_dw_stride, ""), "");
	}

	/* Get information about the register. */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return NULL;
	}

	if (reg.Register.Indirect) {
		/* Add the relative address of the element. */
		LLVMValueRef ind_index;

		/* For declared arrays, the semantic lookup below must use the
		 * first register of the array. */
		if (reg.Indirect.ArrayID)
			first = array_first[reg.Indirect.ArrayID];
		else
			first = reg.Register.Index;

		ind_index = si_get_indirect_index(ctx, &reg.Indirect,
						  reg.Register.Index - first);

		/* Each IO slot is 4 dwords (one vec4). */
		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 LLVMBuildMul(gallivm->builder, ind_index,
						      LLVMConstInt(ctx->i32, 4, 0), ""), "");

		param = reg.Register.Dimension ?
			si_shader_io_get_unique_index(name[first], index[first]) :
			si_shader_io_get_unique_index_patch(name[first], index[first]);
	} else {
		param = reg.Register.Dimension ?
			si_shader_io_get_unique_index(name[reg.Register.Index],
						      index[reg.Register.Index]) :
			si_shader_io_get_unique_index_patch(name[reg.Register.Index],
							    index[reg.Register.Index]);
	}

	/* Add the base address of the element. */
	return LLVMBuildAdd(gallivm->builder, base_addr,
			    LLVMConstInt(ctx->i32, param * 4, 0), "");
}
779
780 /* The offchip buffer layout for TCS->TES is
781 *
782 * - attribute 0 of patch 0 vertex 0
783 * - attribute 0 of patch 0 vertex 1
784 * - attribute 0 of patch 0 vertex 2
785 * ...
786 * - attribute 0 of patch 1 vertex 0
787 * - attribute 0 of patch 1 vertex 1
788 * ...
789 * - attribute 1 of patch 0 vertex 0
790 * - attribute 1 of patch 0 vertex 1
791 * ...
792 * - per patch attribute 0 of patch 0
793 * - per patch attribute 0 of patch 1
794 * ...
795 *
796 * Note that every attribute has 4 components.
797 */
/**
 * Compute the byte offset of an attribute in the TCS->TES off-chip buffer
 * (layout described in the comment above). Each attribute slot is 16 bytes
 * (4 components x 4 bytes).
 *
 * \param rel_patch_id  relative patch index
 * \param vertex_index  vertex index within the patch, or NULL for
 *                      per-patch attributes
 * \param param_index   attribute slot index
 */
static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
					       LLVMValueRef rel_patch_id,
					       LLVMValueRef vertex_index,
					       LLVMValueRef param_index)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
	LLVMValueRef param_stride, constant16;

	vertices_per_patch = unpack_param(ctx, ctx->param_tcs_offchip_layout, 6, 6);
	num_patches = unpack_param(ctx, ctx->param_tcs_offchip_layout, 0, 6);
	total_vertices = LLVMBuildMul(gallivm->builder, vertices_per_patch,
				      num_patches, "");

	constant16 = LLVMConstInt(ctx->i32, 16, 0);
	if (vertex_index) {
		/* Per-vertex attributes: consecutive vertices of a patch are
		 * adjacent; one attribute spans all vertices of all patches. */
		base_addr = LLVMBuildMul(gallivm->builder, rel_patch_id,
					 vertices_per_patch, "");

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 vertex_index, "");

		param_stride = total_vertices;
	} else {
		/* Per-patch attributes: one slot per patch per attribute. */
		base_addr = rel_patch_id;
		param_stride = num_patches;
	}

	base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
				 LLVMBuildMul(gallivm->builder, param_index,
					      param_stride, ""), "");

	base_addr = LLVMBuildMul(gallivm->builder, base_addr, constant16, "");

	if (!vertex_index) {
		/* Per-patch data lives after all per-vertex attributes. */
		LLVMValueRef patch_data_offset =
			unpack_param(ctx, ctx->param_tcs_offchip_layout, 12, 20);

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 patch_data_offset, "");
	}
	return base_addr;
}
841
/* Resolve a TGSI input/output register (either \p dst or \p src, one of
 * which must be non-NULL) to its address in the TCS->TES off-chip buffer. */
static LLVMValueRef get_tcs_tes_buffer_address_from_reg(
				struct si_shader_context *ctx,
				const struct tgsi_full_dst_register *dst,
				const struct tgsi_full_src_register *src)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	ubyte *name, *index, *array_first;
	struct tgsi_full_src_register reg;
	LLVMValueRef vertex_index = NULL;
	LLVMValueRef param_index = NULL;
	unsigned param_index_base, param_base;

	reg = src ? *src : tgsi_full_src_register_from_dst(dst);

	/* 2-dimensional registers address a specific vertex of the patch. */
	if (reg.Register.Dimension) {

		if (reg.Dimension.Indirect)
			vertex_index = si_get_indirect_index(ctx, &reg.DimIndirect,
							     reg.Dimension.Index);
		else
			vertex_index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0);
	}

	/* Get information about the register. */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return NULL;
	}

	if (reg.Register.Indirect) {
		/* For declared arrays, the semantic lookup must use the
		 * first register of the array. */
		if (reg.Indirect.ArrayID)
			param_base = array_first[reg.Indirect.ArrayID];
		else
			param_base = reg.Register.Index;

		param_index = si_get_indirect_index(ctx, &reg.Indirect,
						    reg.Register.Index - param_base);

	} else {
		param_base = reg.Register.Index;
		param_index = ctx->i32_0;
	}

	/* Per-vertex vs per-patch attributes use different slot maps. */
	param_index_base = reg.Register.Dimension ?
		si_shader_io_get_unique_index(name[param_base], index[param_base]) :
		si_shader_io_get_unique_index_patch(name[param_base], index[param_base]);

	param_index = LLVMBuildAdd(gallivm->builder, param_index,
				   LLVMConstInt(ctx->i32, param_index_base, 0),
				   "");

	return get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx),
					  vertex_index, param_index);
}
905
/**
 * Load from a memory buffer via its resource descriptor.
 *
 * \param type           destination TGSI type
 * \param swizzle        component to load (0..3), or ~0 to load a full vec4
 * \param buffer         buffer resource descriptor
 * \param offset         offset added to \p base
 * \param base           base offset
 * \param can_speculate  whether the load may be speculatively executed
 */
static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base,
				enum tgsi_opcode_type type, unsigned swizzle,
				LLVMValueRef buffer, LLVMValueRef offset,
				LLVMValueRef base, bool can_speculate)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef value, value2;
	LLVMTypeRef llvm_type = tgsi2llvmtype(bld_base, type);
	LLVMTypeRef vec_type = LLVMVectorType(llvm_type, 4);

	if (swizzle == ~0) {
		value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
					     0, 1, 0, can_speculate, false);

		return LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
	}

	if (!tgsi_type_is_64bit(type)) {
		value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
					     0, 1, 0, can_speculate, false);

		value = LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
		return LLVMBuildExtractElement(gallivm->builder, value,
					       LLVMConstInt(ctx->i32, swizzle, 0), "");
	}

	/* 64-bit types: load the two 32-bit halves separately and merge. */
	value = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
				     swizzle * 4, 1, 0, can_speculate, false);

	value2 = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
				      swizzle * 4 + 4, 1, 0, can_speculate, false);

	return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
}
941
/**
 * Load from LDS.
 *
 * \param type		output value type
 * \param swizzle	offset (typically 0..3); it can be ~0, which loads a vec4
 * \param dw_addr	address in dwords
 */
static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
			     enum tgsi_opcode_type type, unsigned swizzle,
			     LLVMValueRef dw_addr)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef value;

	/* vec4 load: recurse per channel and gather the results. */
	if (swizzle == ~0) {
		LLVMValueRef values[TGSI_NUM_CHANNELS];

		for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++)
			values[chan] = lds_load(bld_base, type, chan, dw_addr);

		return lp_build_gather_values(gallivm, values,
					      TGSI_NUM_CHANNELS);
	}

	dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
			       LLVMConstInt(ctx->i32, swizzle, 0));

	value = ac_build_indexed_load(&ctx->ac, ctx->lds, dw_addr, false);
	/* 64-bit types: load the second dword and merge the two halves. */
	if (tgsi_type_is_64bit(type)) {
		LLVMValueRef value2;
		dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
				       ctx->i32_1);
		value2 = ac_build_indexed_load(&ctx->ac, ctx->lds, dw_addr, false);
		return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
	}

	return LLVMBuildBitCast(gallivm->builder, value,
				tgsi2llvmtype(bld_base, type), "");
}
982
/**
 * Store to LDS.
 *
 * \param dw_offset_imm	constant dword offset added to \p dw_addr
 *			(typically the component, 0..3)
 * \param dw_addr	address in dwords
 * \param value		value to store (bitcast to i32)
 */
static void lds_store(struct lp_build_tgsi_context *bld_base,
		      unsigned dw_offset_imm, LLVMValueRef dw_addr,
		      LLVMValueRef value)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;

	dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
			       LLVMConstInt(ctx->i32, dw_offset_imm, 0));

	value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
	ac_build_indexed_store(&ctx->ac, ctx->lds,
			       dw_addr, value);
}
1004
1005 static LLVMValueRef desc_from_addr_base64k(struct si_shader_context *ctx,
1006 unsigned param)
1007 {
1008 LLVMBuilderRef builder = ctx->gallivm.builder;
1009
1010 LLVMValueRef addr = LLVMGetParam(ctx->main_fn, param);
1011 addr = LLVMBuildZExt(builder, addr, ctx->i64, "");
1012 addr = LLVMBuildShl(builder, addr, LLVMConstInt(ctx->i64, 16, 0), "");
1013
1014 uint64_t desc2 = 0xffffffff;
1015 uint64_t desc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
1016 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
1017 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
1018 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
1019 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
1020 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
1021 LLVMValueRef hi = LLVMConstInt(ctx->i64, desc2 | (desc3 << 32), 0);
1022
1023 LLVMValueRef desc = LLVMGetUndef(LLVMVectorType(ctx->i64, 2));
1024 desc = LLVMBuildInsertElement(builder, desc, addr, ctx->i32_0, "");
1025 desc = LLVMBuildInsertElement(builder, desc, hi, ctx->i32_1, "");
1026 return LLVMBuildBitCast(builder, desc, ctx->v4i32, "");
1027 }
1028
1029 static LLVMValueRef fetch_input_tcs(
1030 struct lp_build_tgsi_context *bld_base,
1031 const struct tgsi_full_src_register *reg,
1032 enum tgsi_opcode_type type, unsigned swizzle)
1033 {
1034 struct si_shader_context *ctx = si_shader_context(bld_base);
1035 LLVMValueRef dw_addr, stride;
1036
1037 stride = unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
1038 dw_addr = get_tcs_in_current_patch_offset(ctx);
1039 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
1040
1041 return lds_load(bld_base, type, swizzle, dw_addr);
1042 }
1043
1044 static LLVMValueRef fetch_output_tcs(
1045 struct lp_build_tgsi_context *bld_base,
1046 const struct tgsi_full_src_register *reg,
1047 enum tgsi_opcode_type type, unsigned swizzle)
1048 {
1049 struct si_shader_context *ctx = si_shader_context(bld_base);
1050 LLVMValueRef dw_addr, stride;
1051
1052 if (reg->Register.Dimension) {
1053 stride = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 13, 8);
1054 dw_addr = get_tcs_out_current_patch_offset(ctx);
1055 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
1056 } else {
1057 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
1058 dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr);
1059 }
1060
1061 return lds_load(bld_base, type, swizzle, dw_addr);
1062 }
1063
1064 static LLVMValueRef fetch_input_tes(
1065 struct lp_build_tgsi_context *bld_base,
1066 const struct tgsi_full_src_register *reg,
1067 enum tgsi_opcode_type type, unsigned swizzle)
1068 {
1069 struct si_shader_context *ctx = si_shader_context(bld_base);
1070 LLVMValueRef buffer, base, addr;
1071
1072 buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
1073
1074 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
1075 addr = get_tcs_tes_buffer_address_from_reg(ctx, NULL, reg);
1076
1077 return buffer_load(bld_base, type, swizzle, buffer, base, addr, true);
1078 }
1079
/* Store a TCS output both to LDS (for later TCS reads / the TCS epilog)
 * and to the off-chip buffer in memory (for TES reads).
 *
 * LDS stores are skipped when no shader reads the output back, except for
 * tess factors, which the TCS epilog always reads from LDS.
 */
static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
			     const struct tgsi_full_instruction *inst,
			     const struct tgsi_opcode_info *info,
			     LLVMValueRef dst[4])
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	const struct tgsi_full_dst_register *reg = &inst->Dst[0];
	const struct tgsi_shader_info *sh_info = &ctx->shader->selector->info;
	unsigned chan_index;
	LLVMValueRef dw_addr, stride;
	LLVMValueRef buffer, base, buf_addr;
	LLVMValueRef values[4];
	bool skip_lds_store;
	bool is_tess_factor = false;

	/* Only handle per-patch and per-vertex outputs here.
	 * Vectors will be lowered to scalars and this function will be called again.
	 */
	if (reg->Register.File != TGSI_FILE_OUTPUT ||
	    (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) {
		si_llvm_emit_store(bld_base, inst, info, dst);
		return;
	}

	if (reg->Register.Dimension) {
		/* Per-vertex output. */
		stride = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 13, 8);
		dw_addr = get_tcs_out_current_patch_offset(ctx);
		dw_addr = get_dw_address(ctx, reg, NULL, stride, dw_addr);
		skip_lds_store = !sh_info->reads_pervertex_outputs;
	} else {
		/* Per-patch output. */
		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
		dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr);
		skip_lds_store = !sh_info->reads_perpatch_outputs;

		if (!reg->Register.Indirect) {
			int name = sh_info->output_semantic_name[reg->Register.Index];

			/* Always write tess factors into LDS for the TCS epilog. */
			if (name == TGSI_SEMANTIC_TESSINNER ||
			    name == TGSI_SEMANTIC_TESSOUTER) {
				skip_lds_store = false;
				is_tess_factor = true;
			}
		}
	}

	buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);

	base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
	buf_addr = get_tcs_tes_buffer_address_from_reg(ctx, reg, NULL);


	TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan_index) {
		LLVMValueRef value = dst[chan_index];

		if (inst->Instruction.Saturate)
			value = ac_build_clamp(&ctx->ac, value);

		/* Skip LDS stores if there is no LDS read of this output. */
		if (!skip_lds_store)
			lds_store(bld_base, chan_index, dw_addr, value);

		value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
		values[chan_index] = value;

		/* Partial writemask: store each enabled channel separately.
		 * Tess factors are stored by the epilog, not here.
		 */
		if (inst->Dst[0].Register.WriteMask != 0xF && !is_tess_factor) {
			ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1,
						    buf_addr, base,
						    4 * chan_index, 1, 0, true, false);
		}
	}

	/* Full writemask: store all four channels with one vec4 store. */
	if (inst->Dst[0].Register.WriteMask == 0xF && !is_tess_factor) {
		LLVMValueRef value = lp_build_gather_values(gallivm,
							    values, 4);
		ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buf_addr,
					    base, 0, 1, 0, true, false);
	}
}
1160
/* Fetch a GS input (an ES output). On GFX9 the ESGS ring lives in LDS;
 * on older chips it is a memory ring buffer accessed via ctx->esgs_ring.
 */
static LLVMValueRef fetch_input_gs(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type,
	unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *uint = &ctx->bld_base.uint_bld;
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef vtx_offset, soffset;
	struct tgsi_shader_info *info = &shader->selector->info;
	unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
	unsigned semantic_index = info->input_semantic_index[reg->Register.Index];
	unsigned param;
	LLVMValueRef value;

	/* The primitive ID is a system value, not read from the ring. */
	if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
		return get_primitive_id(ctx, swizzle);

	if (!reg->Register.Dimension)
		return NULL;

	param = si_shader_io_get_unique_index(semantic_name, semantic_index);

	/* GFX9 has the ESGS ring in LDS. */
	if (ctx->screen->b.chip_class >= GFX9) {
		unsigned index = reg->Dimension.Index;

		/* Vertex offsets are packed two per SGPR (16 bits each). */
		switch (index / 2) {
		case 0:
			vtx_offset = unpack_param(ctx, ctx->param_gs_vtx01_offset,
						  index % 2 ? 16 : 0, 16);
			break;
		case 1:
			vtx_offset = unpack_param(ctx, ctx->param_gs_vtx23_offset,
						  index % 2 ? 16 : 0, 16);
			break;
		case 2:
			vtx_offset = unpack_param(ctx, ctx->param_gs_vtx45_offset,
						  index % 2 ? 16 : 0, 16);
			break;
		default:
			assert(0);
			return NULL;
		}

		vtx_offset = LLVMBuildAdd(gallivm->builder, vtx_offset,
					  LLVMConstInt(ctx->i32, param * 4, 0), "");
		return lds_load(bld_base, type, swizzle, vtx_offset);
	}

	/* GFX6: input load from the ESGS ring in memory. */
	if (swizzle == ~0) {
		/* Load each channel separately and gather into a vec4. */
		LLVMValueRef values[TGSI_NUM_CHANNELS];
		unsigned chan;
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			values[chan] = fetch_input_gs(bld_base, reg, type, chan);
		}
		return lp_build_gather_values(gallivm, values,
					      TGSI_NUM_CHANNELS);
	}

	/* Get the vertex offset parameter on GFX6. */
	unsigned vtx_offset_param = reg->Dimension.Index;
	if (vtx_offset_param < 2) {
		vtx_offset_param += ctx->param_gs_vtx0_offset;
	} else {
		assert(vtx_offset_param < 6);
		vtx_offset_param += ctx->param_gs_vtx2_offset - 2;
	}
	vtx_offset = lp_build_mul_imm(uint,
				      LLVMGetParam(ctx->main_fn,
						   vtx_offset_param),
				      4);

	soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle) * 256, 0);

	value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->i32_0,
				     vtx_offset, soffset, 0, 1, 0, true, false);
	if (tgsi_type_is_64bit(type)) {
		/* 64-bit values span two consecutive ring slots. */
		LLVMValueRef value2;
		soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle + 1) * 256, 0);

		value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1,
					      ctx->i32_0, vtx_offset, soffset,
					      0, 1, 0, true, false);
		return si_llvm_emit_fetch_64bit(bld_base, type,
						value, value2);
	}
	return LLVMBuildBitCast(gallivm->builder,
				value,
				tgsi2llvmtype(bld_base, type), "");
}
1255
1256 static int lookup_interp_param_index(unsigned interpolate, unsigned location)
1257 {
1258 switch (interpolate) {
1259 case TGSI_INTERPOLATE_CONSTANT:
1260 return 0;
1261
1262 case TGSI_INTERPOLATE_LINEAR:
1263 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1264 return SI_PARAM_LINEAR_SAMPLE;
1265 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1266 return SI_PARAM_LINEAR_CENTROID;
1267 else
1268 return SI_PARAM_LINEAR_CENTER;
1269 break;
1270 case TGSI_INTERPOLATE_COLOR:
1271 case TGSI_INTERPOLATE_PERSPECTIVE:
1272 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1273 return SI_PARAM_PERSP_SAMPLE;
1274 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1275 return SI_PARAM_PERSP_CENTROID;
1276 else
1277 return SI_PARAM_PERSP_CENTER;
1278 break;
1279 default:
1280 fprintf(stderr, "Warning: Unhandled interpolation mode.\n");
1281 return -1;
1282 }
1283 }
1284
1285 static LLVMValueRef si_build_fs_interp(struct si_shader_context *ctx,
1286 unsigned attr_index, unsigned chan,
1287 LLVMValueRef prim_mask,
1288 LLVMValueRef i, LLVMValueRef j)
1289 {
1290 if (i || j) {
1291 return ac_build_fs_interp(&ctx->ac,
1292 LLVMConstInt(ctx->i32, chan, 0),
1293 LLVMConstInt(ctx->i32, attr_index, 0),
1294 prim_mask, i, j);
1295 }
1296 return ac_build_fs_interp_mov(&ctx->ac,
1297 LLVMConstInt(ctx->i32, 2, 0), /* P0 */
1298 LLVMConstInt(ctx->i32, chan, 0),
1299 LLVMConstInt(ctx->i32, attr_index, 0),
1300 prim_mask);
1301 }
1302
1303 /**
1304 * Interpolate a fragment shader input.
1305 *
1306 * @param ctx context
1307 * @param input_index index of the input in hardware
1308 * @param semantic_name TGSI_SEMANTIC_*
1309 * @param semantic_index semantic index
1310 * @param num_interp_inputs number of all interpolated inputs (= BCOLOR offset)
1311 * @param colors_read_mask color components read (4 bits for each color, 8 bits in total)
1312 * @param interp_param interpolation weights (i,j)
1313 * @param prim_mask SI_PARAM_PRIM_MASK
1314 * @param face SI_PARAM_FRONT_FACE
1315 * @param result the return value (4 components)
1316 */
static void interp_fs_input(struct si_shader_context *ctx,
			    unsigned input_index,
			    unsigned semantic_name,
			    unsigned semantic_index,
			    unsigned num_interp_inputs,
			    unsigned colors_read_mask,
			    LLVMValueRef interp_param,
			    LLVMValueRef prim_mask,
			    LLVMValueRef face,
			    LLVMValueRef result[4])
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef i = NULL, j = NULL;
	unsigned chan;

	/* fs.constant returns the param from the middle vertex, so it's not
	 * really useful for flat shading. It's meant to be used for custom
	 * interpolation (but the intrinsic can't fetch from the other two
	 * vertices).
	 *
	 * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
	 * to do the right thing. The only reason we use fs.constant is that
	 * fs.interp cannot be used on integers, because they can be equal
	 * to NaN.
	 *
	 * When interp is false we will use fs.constant or for newer llvm,
	 * amdgcn.interp.mov.
	 */
	bool interp = interp_param != NULL;

	if (interp) {
		/* Split the (i,j) weight vector into its two components. */
		interp_param = LLVMBuildBitCast(gallivm->builder, interp_param,
						LLVMVectorType(ctx->f32, 2), "");

		i = LLVMBuildExtractElement(gallivm->builder, interp_param,
					    ctx->i32_0, "");
		j = LLVMBuildExtractElement(gallivm->builder, interp_param,
					    ctx->i32_1, "");
	}

	if (semantic_name == TGSI_SEMANTIC_COLOR &&
	    ctx->shader->key.part.ps.prolog.color_two_side) {
		/* Two-sided lighting: interpolate both the front and the back
		 * color and select per-fragment based on the facedness.
		 */
		LLVMValueRef is_face_positive;

		/* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
		 * otherwise it's at offset "num_inputs".
		 */
		unsigned back_attr_offset = num_interp_inputs;
		if (semantic_index == 1 && colors_read_mask & 0xf)
			back_attr_offset += 1;

		is_face_positive = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
						 face, ctx->i32_0, "");

		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			LLVMValueRef front, back;

			front = si_build_fs_interp(ctx,
						   input_index, chan,
						   prim_mask, i, j);
			back = si_build_fs_interp(ctx,
						  back_attr_offset, chan,
						  prim_mask, i, j);

			result[chan] = LLVMBuildSelect(gallivm->builder,
						       is_face_positive,
						       front,
						       back,
						       "");
		}
	} else if (semantic_name == TGSI_SEMANTIC_FOG) {
		/* Fog: only the X channel is interpolated; YZW = (0, 0, 1). */
		result[0] = si_build_fs_interp(ctx, input_index,
					       0, prim_mask, i, j);
		result[1] =
		result[2] = LLVMConstReal(ctx->f32, 0.0f);
		result[3] = LLVMConstReal(ctx->f32, 1.0f);
	} else {
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			result[chan] = si_build_fs_interp(ctx,
							  input_index, chan,
							  prim_mask, i, j);
		}
	}
}
1401
/* Load one PS input into out[0..3]. Colors come pre-interpolated from the
 * prolog as input VGPRs; everything else is interpolated here.
 */
void si_llvm_load_input_fs(
	struct si_shader_context *ctx,
	unsigned input_index,
	LLVMValueRef out[4])
{
	struct lp_build_context *base = &ctx->bld_base.base;
	struct si_shader *shader = ctx->shader;
	struct tgsi_shader_info *info = &shader->selector->info;
	LLVMValueRef main_fn = ctx->main_fn;
	LLVMValueRef interp_param = NULL;
	int interp_param_idx;
	enum tgsi_semantic semantic_name = info->input_semantic_name[input_index];
	unsigned semantic_index = info->input_semantic_index[input_index];
	enum tgsi_interpolate_mode interp_mode = info->input_interpolate[input_index];
	enum tgsi_interpolate_loc interp_loc = info->input_interpolate_loc[input_index];

	/* Get colors from input VGPRs (set by the prolog). */
	if (semantic_name == TGSI_SEMANTIC_COLOR) {
		unsigned colors_read = shader->selector->info.colors_read;
		unsigned mask = colors_read >> (semantic_index * 4);
		/* COLOR1's VGPRs come after however many COLOR0 components
		 * were read; unread channels are undef.
		 */
		unsigned offset = SI_PARAM_POS_FIXED_PT + 1 +
				  (semantic_index ? util_bitcount(colors_read & 0xf) : 0);

		out[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef;
		out[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef;
		out[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef;
		out[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef;
		return;
	}

	/* -1 = unhandled mode, 0 = flat (no interp param needed). */
	interp_param_idx = lookup_interp_param_index(interp_mode, interp_loc);
	if (interp_param_idx == -1)
		return;
	else if (interp_param_idx) {
		interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
	}

	interp_fs_input(ctx, input_index, semantic_name,
			semantic_index, 0, /* this param is unused */
			shader->selector->info.colors_read, interp_param,
			LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK),
			LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE),
			&out[0]);
}
1446
/* TGSI declaration hook for PS inputs; the declaration itself is unused
 * because everything needed is in the shader info.
 */
static void declare_input_fs(
	struct si_shader_context *ctx,
	unsigned input_index,
	const struct tgsi_full_declaration *decl,
	LLVMValueRef out[4])
{
	si_llvm_load_input_fs(ctx, input_index, out);
}
1455
/* The sample ID is stored in bits [8..11] of the ANCILLARY PS input. */
static LLVMValueRef get_sample_id(struct si_shader_context *ctx)
{
	return unpack_param(ctx, SI_PARAM_ANCILLARY, 8, 4);
}
1460
1461
1462 /**
1463 * Load a dword from a constant buffer.
1464 */
1465 static LLVMValueRef buffer_load_const(struct si_shader_context *ctx,
1466 LLVMValueRef resource,
1467 LLVMValueRef offset)
1468 {
1469 return ac_build_buffer_load(&ctx->ac, resource, 1, NULL, offset, NULL,
1470 0, 0, 0, true, true);
1471 }
1472
1473 static LLVMValueRef load_sample_position(struct si_shader_context *ctx, LLVMValueRef sample_id)
1474 {
1475 struct lp_build_context *uint_bld = &ctx->bld_base.uint_bld;
1476 struct gallivm_state *gallivm = &ctx->gallivm;
1477 LLVMBuilderRef builder = gallivm->builder;
1478 LLVMValueRef desc = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
1479 LLVMValueRef buf_index = LLVMConstInt(ctx->i32, SI_PS_CONST_SAMPLE_POSITIONS, 0);
1480 LLVMValueRef resource = ac_build_indexed_load_const(&ctx->ac, desc, buf_index);
1481
1482 /* offset = sample_id * 8 (8 = 2 floats containing samplepos.xy) */
1483 LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, sample_id, 8);
1484 LLVMValueRef offset1 = LLVMBuildAdd(builder, offset0, LLVMConstInt(ctx->i32, 4, 0), "");
1485
1486 LLVMValueRef pos[4] = {
1487 buffer_load_const(ctx, resource, offset0),
1488 buffer_load_const(ctx, resource, offset1),
1489 LLVMConstReal(ctx->f32, 0),
1490 LLVMConstReal(ctx->f32, 0)
1491 };
1492
1493 return lp_build_gather_values(gallivm, pos, 4);
1494 }
1495
/* Emit the LLVM value of one TGSI system value declaration and cache it in
 * ctx->system_values[index] for later fetches.
 */
static void declare_system_value(struct si_shader_context *ctx,
				 unsigned index,
				 const struct tgsi_full_declaration *decl)
{
	struct lp_build_context *bld = &ctx->bld_base.base;
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef value = 0;

	assert(index < RADEON_LLVM_MAX_SYSTEM_VALUES);

	switch (decl->Semantic.Name) {
	case TGSI_SEMANTIC_INSTANCEID:
		value = ctx->abi.instance_id;
		break;

	case TGSI_SEMANTIC_VERTEXID:
		/* The hardware vertex ID is relative; add the base vertex. */
		value = LLVMBuildAdd(gallivm->builder,
				     ctx->abi.vertex_id,
				     ctx->abi.base_vertex, "");
		break;

	case TGSI_SEMANTIC_VERTEXID_NOBASE:
		/* Unused. Clarify the meaning in indexed vs. non-indexed
		 * draws if this is ever used again. */
		assert(false);
		break;

	case TGSI_SEMANTIC_BASEVERTEX:
	{
		/* For non-indexed draws, the base vertex set by the driver
		 * (for direct draws) or the CP (for indirect draws) is the
		 * first vertex ID, but GLSL expects 0 to be returned.
		 */
		LLVMValueRef vs_state = LLVMGetParam(ctx->main_fn, ctx->param_vs_state_bits);
		LLVMValueRef indexed;

		/* Bit 1 of the VS state bits says whether the draw is indexed. */
		indexed = LLVMBuildLShr(gallivm->builder, vs_state, ctx->i32_1, "");
		indexed = LLVMBuildTrunc(gallivm->builder, indexed, ctx->i1, "");

		value = LLVMBuildSelect(gallivm->builder, indexed,
					ctx->abi.base_vertex, ctx->i32_0, "");
		break;
	}

	case TGSI_SEMANTIC_BASEINSTANCE:
		value = ctx->abi.start_instance;
		break;

	case TGSI_SEMANTIC_DRAWID:
		value = ctx->abi.draw_id;
		break;

	case TGSI_SEMANTIC_INVOCATIONID:
		if (ctx->type == PIPE_SHADER_TESS_CTRL)
			value = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
		else if (ctx->type == PIPE_SHADER_GEOMETRY)
			value = LLVMGetParam(ctx->main_fn,
					     ctx->param_gs_instance_id);
		else
			assert(!"INVOCATIONID not implemented");
		break;

	case TGSI_SEMANTIC_POSITION:
	{
		/* gl_FragCoord: hardware provides 1/W pre-reciprocal in W. */
		LLVMValueRef pos[4] = {
			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT),
			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT),
			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Z_FLOAT),
			lp_build_emit_llvm_unary(&ctx->bld_base, TGSI_OPCODE_RCP,
						 LLVMGetParam(ctx->main_fn,
							      SI_PARAM_POS_W_FLOAT)),
		};
		value = lp_build_gather_values(gallivm, pos, 4);
		break;
	}

	case TGSI_SEMANTIC_FACE:
		value = ctx->abi.front_face;
		break;

	case TGSI_SEMANTIC_SAMPLEID:
		value = get_sample_id(ctx);
		break;

	case TGSI_SEMANTIC_SAMPLEPOS: {
		/* The fractional part of the pixel position is the sample
		 * position within the pixel.
		 */
		LLVMValueRef pos[4] = {
			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT),
			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT),
			LLVMConstReal(ctx->f32, 0),
			LLVMConstReal(ctx->f32, 0)
		};
		pos[0] = lp_build_emit_llvm_unary(&ctx->bld_base,
						  TGSI_OPCODE_FRC, pos[0]);
		pos[1] = lp_build_emit_llvm_unary(&ctx->bld_base,
						  TGSI_OPCODE_FRC, pos[1]);
		value = lp_build_gather_values(gallivm, pos, 4);
		break;
	}

	case TGSI_SEMANTIC_SAMPLEMASK:
		/* This can only occur with the OpenGL Core profile, which
		 * doesn't support smoothing.
		 */
		value = LLVMGetParam(ctx->main_fn, SI_PARAM_SAMPLE_COVERAGE);
		break;

	case TGSI_SEMANTIC_TESSCOORD:
	{
		LLVMValueRef coord[4] = {
			LLVMGetParam(ctx->main_fn, ctx->param_tes_u),
			LLVMGetParam(ctx->main_fn, ctx->param_tes_v),
			bld->zero,
			bld->zero
		};

		/* For triangles, the vector should be (u, v, 1-u-v). */
		if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
		    PIPE_PRIM_TRIANGLES)
			coord[2] = lp_build_sub(bld, bld->one,
						lp_build_add(bld, coord[0], coord[1]));

		value = lp_build_gather_values(gallivm, coord, 4);
		break;
	}

	case TGSI_SEMANTIC_VERTICESIN:
		if (ctx->type == PIPE_SHADER_TESS_CTRL)
			value = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 26, 6);
		else if (ctx->type == PIPE_SHADER_TESS_EVAL)
			value = unpack_param(ctx, ctx->param_tcs_offchip_layout, 6, 6);
		else
			assert(!"invalid shader stage for TGSI_SEMANTIC_VERTICESIN");
		break;

	case TGSI_SEMANTIC_TESSINNER:
	case TGSI_SEMANTIC_TESSOUTER:
	{
		/* Tess levels are read back from the off-chip TCS buffer. */
		LLVMValueRef buffer, base, addr;
		int param = si_shader_io_get_unique_index_patch(decl->Semantic.Name, 0);

		buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);

		base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
		addr = get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), NULL,
						  LLVMConstInt(ctx->i32, param, 0));

		value = buffer_load(&ctx->bld_base, TGSI_TYPE_FLOAT,
				    ~0, buffer, base, addr, true);

		break;
	}

	case TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI:
	case TGSI_SEMANTIC_DEFAULT_TESSINNER_SI:
	{
		/* Driver-set default tess levels live in an internal constant
		 * buffer: outer levels at dwords 0-3, inner levels at 4-7.
		 */
		LLVMValueRef buf, slot, val[4];
		int i, offset;

		slot = LLVMConstInt(ctx->i32, SI_HS_CONST_DEFAULT_TESS_LEVELS, 0);
		buf = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
		buf = ac_build_indexed_load_const(&ctx->ac, buf, slot);
		offset = decl->Semantic.Name == TGSI_SEMANTIC_DEFAULT_TESSINNER_SI ? 4 : 0;

		for (i = 0; i < 4; i++)
			val[i] = buffer_load_const(ctx, buf,
						   LLVMConstInt(ctx->i32, (offset + i) * 4, 0));
		value = lp_build_gather_values(gallivm, val, 4);
		break;
	}

	case TGSI_SEMANTIC_PRIMID:
		value = get_primitive_id(ctx, 0);
		break;

	case TGSI_SEMANTIC_GRID_SIZE:
		value = LLVMGetParam(ctx->main_fn, ctx->param_grid_size);
		break;

	case TGSI_SEMANTIC_BLOCK_SIZE:
	{
		/* Fixed block sizes become constants; otherwise the size is
		 * passed in as a shader argument.
		 */
		LLVMValueRef values[3];
		unsigned i;
		unsigned *properties = ctx->shader->selector->info.properties;

		if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) {
			unsigned sizes[3] = {
				properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
				properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
				properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
			};

			for (i = 0; i < 3; ++i)
				values[i] = LLVMConstInt(ctx->i32, sizes[i], 0);

			value = lp_build_gather_values(gallivm, values, 3);
		} else {
			value = LLVMGetParam(ctx->main_fn, ctx->param_block_size);
		}
		break;
	}

	case TGSI_SEMANTIC_BLOCK_ID:
	{
		/* Unused block-ID components have no parameter and read 0. */
		LLVMValueRef values[3];

		for (int i = 0; i < 3; i++) {
			values[i] = ctx->i32_0;
			if (ctx->param_block_id[i] >= 0) {
				values[i] = LLVMGetParam(ctx->main_fn,
							 ctx->param_block_id[i]);
			}
		}
		value = lp_build_gather_values(gallivm, values, 3);
		break;
	}

	case TGSI_SEMANTIC_THREAD_ID:
		value = LLVMGetParam(ctx->main_fn, ctx->param_thread_id);
		break;

	case TGSI_SEMANTIC_HELPER_INVOCATION:
		/* A helper invocation is a lane that is not "live". */
		value = lp_build_intrinsic(gallivm->builder,
					   "llvm.amdgcn.ps.live",
					   ctx->i1, NULL, 0,
					   LP_FUNC_ATTR_READNONE);
		value = LLVMBuildNot(gallivm->builder, value, "");
		value = LLVMBuildSExt(gallivm->builder, value, ctx->i32, "");
		break;

	case TGSI_SEMANTIC_SUBGROUP_SIZE:
		/* Wavefront size is always 64 on these chips. */
		value = LLVMConstInt(ctx->i32, 64, 0);
		break;

	case TGSI_SEMANTIC_SUBGROUP_INVOCATION:
		value = ac_get_thread_id(&ctx->ac);
		break;

	case TGSI_SEMANTIC_SUBGROUP_EQ_MASK:
	{
		/* 64-bit mask with only this lane's bit set, as v2i32. */
		LLVMValueRef id = ac_get_thread_id(&ctx->ac);
		id = LLVMBuildZExt(gallivm->builder, id, ctx->i64, "");
		value = LLVMBuildShl(gallivm->builder, LLVMConstInt(ctx->i64, 1, 0), id, "");
		value = LLVMBuildBitCast(gallivm->builder, value, ctx->v2i32, "");
		break;
	}

	case TGSI_SEMANTIC_SUBGROUP_GE_MASK:
	case TGSI_SEMANTIC_SUBGROUP_GT_MASK:
	case TGSI_SEMANTIC_SUBGROUP_LE_MASK:
	case TGSI_SEMANTIC_SUBGROUP_LT_MASK:
	{
		/* GE/GT masks are built by shifting; LE/LT are their
		 * complements.
		 */
		LLVMValueRef id = ac_get_thread_id(&ctx->ac);
		if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_GT_MASK ||
		    decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK) {
			/* All bits set except LSB */
			value = LLVMConstInt(ctx->i64, -2, 0);
		} else {
			/* All bits set */
			value = LLVMConstInt(ctx->i64, -1, 0);
		}
		id = LLVMBuildZExt(gallivm->builder, id, ctx->i64, "");
		value = LLVMBuildShl(gallivm->builder, value, id, "");
		if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK ||
		    decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LT_MASK)
			value = LLVMBuildNot(gallivm->builder, value, "");
		value = LLVMBuildBitCast(gallivm->builder, value, ctx->v2i32, "");
		break;
	}

	default:
		assert(!"unknown system value");
		return;
	}

	ctx->system_values[index] = value;
}
1772
1773 static void declare_compute_memory(struct si_shader_context *ctx,
1774 const struct tgsi_full_declaration *decl)
1775 {
1776 struct si_shader_selector *sel = ctx->shader->selector;
1777 struct gallivm_state *gallivm = &ctx->gallivm;
1778
1779 LLVMTypeRef i8p = LLVMPointerType(ctx->i8, LOCAL_ADDR_SPACE);
1780 LLVMValueRef var;
1781
1782 assert(decl->Declaration.MemType == TGSI_MEMORY_TYPE_SHARED);
1783 assert(decl->Range.First == decl->Range.Last);
1784 assert(!ctx->shared_memory);
1785
1786 var = LLVMAddGlobalInAddressSpace(gallivm->module,
1787 LLVMArrayType(ctx->i8, sel->local_size),
1788 "compute_lds",
1789 LOCAL_ADDR_SPACE);
1790 LLVMSetAlignment(var, 4);
1791
1792 ctx->shared_memory = LLVMBuildBitCast(gallivm->builder, var, i8p, "");
1793 }
1794
1795 static LLVMValueRef load_const_buffer_desc(struct si_shader_context *ctx, int i)
1796 {
1797 LLVMValueRef list_ptr = LLVMGetParam(ctx->main_fn,
1798 ctx->param_const_and_shader_buffers);
1799
1800 return ac_build_indexed_load_const(&ctx->ac, list_ptr,
1801 LLVMConstInt(ctx->i32, si_get_constbuf_slot(i), 0));
1802 }
1803
1804 static LLVMValueRef load_ubo(struct ac_shader_abi *abi, LLVMValueRef index)
1805 {
1806 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1807 LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers);
1808
1809 index = si_llvm_bound_index(ctx, index, ctx->num_const_buffers);
1810 index = LLVMBuildAdd(ctx->gallivm.builder, index,
1811 LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS, 0), "");
1812
1813 return ac_build_indexed_load_const(&ctx->ac, ptr, index);
1814 }
1815
1816 static LLVMValueRef
1817 load_ssbo(struct ac_shader_abi *abi, LLVMValueRef index, bool write)
1818 {
1819 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1820 LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn,
1821 ctx->param_const_and_shader_buffers);
1822
1823 index = si_llvm_bound_index(ctx, index, ctx->num_shader_buffers);
1824 index = LLVMBuildSub(ctx->gallivm.builder,
1825 LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS - 1, 0),
1826 index, "");
1827
1828 return ac_build_indexed_load_const(&ctx->ac, rsrc_ptr, index);
1829 }
1830
/* Fetch a constant-buffer value for a TGSI CONST source register, handling
 * indirect buffer indices, indirect element addressing and 64-bit types.
 */
static LLVMValueRef fetch_constant(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type,
	unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	const struct tgsi_ind_register *ireg = &reg->Indirect;
	unsigned buf, idx;

	LLVMValueRef addr, bufp;
	LLVMValueRef result;

	/* Whole vector: fetch each channel separately and gather. */
	if (swizzle == LP_CHAN_ALL) {
		unsigned chan;
		LLVMValueRef values[4];
		for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
			values[chan] = fetch_constant(bld_base, reg, type, chan);

		return lp_build_gather_values(&ctx->gallivm, values, 4);
	}

	buf = reg->Register.Dimension ? reg->Dimension.Index : 0;
	idx = reg->Register.Index * 4 + swizzle;

	if (reg->Register.Dimension && reg->Dimension.Indirect) {
		/* Indirectly indexed constant buffer: compute the descriptor
		 * slot at runtime (const buffers follow the shader buffers).
		 */
		LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers);
		LLVMValueRef index;
		index = si_get_bounded_indirect_index(ctx, &reg->DimIndirect,
						      reg->Dimension.Index,
						      ctx->num_const_buffers);
		index = LLVMBuildAdd(ctx->gallivm.builder, index,
				     LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS, 0), "");
		bufp = ac_build_indexed_load_const(&ctx->ac, ptr, index);
	} else
		bufp = load_const_buffer_desc(ctx, buf);

	if (reg->Register.Indirect) {
		/* Indirect element: byte offset = addr_reg * 16 + idx * 4. */
		addr = ctx->addrs[ireg->Index][ireg->Swizzle];
		addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg");
		addr = lp_build_mul_imm(&bld_base->uint_bld, addr, 16);
		addr = lp_build_add(&bld_base->uint_bld, addr,
				    LLVMConstInt(ctx->i32, idx * 4, 0));
	} else {
		addr = LLVMConstInt(ctx->i32, idx * 4, 0);
	}

	result = buffer_load_const(ctx, bufp, addr);

	if (!tgsi_type_is_64bit(type))
		result = bitcast(bld_base, type, result);
	else {
		/* 64-bit types: load the adjacent dword and combine. */
		LLVMValueRef addr2, result2;

		addr2 = lp_build_add(&bld_base->uint_bld, addr,
				     LLVMConstInt(ctx->i32, 4, 0));
		result2 = buffer_load_const(ctx, bufp, addr2);

		result = si_llvm_emit_fetch_64bit(bld_base, type,
						  result, result2);
	}
	return result;
}
1895
1896 /* Upper 16 bits must be zero. */
1897 static LLVMValueRef si_llvm_pack_two_int16(struct si_shader_context *ctx,
1898 LLVMValueRef val[2])
1899 {
1900 return LLVMBuildOr(ctx->gallivm.builder, val[0],
1901 LLVMBuildShl(ctx->gallivm.builder, val[1],
1902 LLVMConstInt(ctx->i32, 16, 0),
1903 ""), "");
1904 }
1905
1906 /* Upper 16 bits are ignored and will be dropped. */
1907 static LLVMValueRef si_llvm_pack_two_int32_as_int16(struct si_shader_context *ctx,
1908 LLVMValueRef val[2])
1909 {
1910 LLVMValueRef v[2] = {
1911 LLVMBuildAnd(ctx->gallivm.builder, val[0],
1912 LLVMConstInt(ctx->i32, 0xffff, 0), ""),
1913 val[1],
1914 };
1915 return si_llvm_pack_two_int16(ctx, v);
1916 }
1917
1918 /* Initialize arguments for the shader export intrinsic */
1919 static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
1920 LLVMValueRef *values,
1921 unsigned target,
1922 struct ac_export_args *args)
1923 {
1924 struct si_shader_context *ctx = si_shader_context(bld_base);
1925 struct lp_build_context *base = &bld_base->base;
1926 LLVMBuilderRef builder = ctx->gallivm.builder;
1927 LLVMValueRef val[4];
1928 unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR;
1929 unsigned chan;
1930 bool is_int8, is_int10;
1931
1932 /* Default is 0xf. Adjusted below depending on the format. */
1933 args->enabled_channels = 0xf; /* writemask */
1934
1935 /* Specify whether the EXEC mask represents the valid mask */
1936 args->valid_mask = 0;
1937
1938 /* Specify whether this is the last export */
1939 args->done = 0;
1940
1941 /* Specify the target we are exporting */
1942 args->target = target;
1943
1944 if (ctx->type == PIPE_SHADER_FRAGMENT) {
1945 const struct si_shader_key *key = &ctx->shader->key;
1946 unsigned col_formats = key->part.ps.epilog.spi_shader_col_format;
1947 int cbuf = target - V_008DFC_SQ_EXP_MRT;
1948
1949 assert(cbuf >= 0 && cbuf < 8);
1950 spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
1951 is_int8 = (key->part.ps.epilog.color_is_int8 >> cbuf) & 0x1;
1952 is_int10 = (key->part.ps.epilog.color_is_int10 >> cbuf) & 0x1;
1953 }
1954
1955 args->compr = false;
1956 args->out[0] = base->undef;
1957 args->out[1] = base->undef;
1958 args->out[2] = base->undef;
1959 args->out[3] = base->undef;
1960
1961 switch (spi_shader_col_format) {
1962 case V_028714_SPI_SHADER_ZERO:
1963 args->enabled_channels = 0; /* writemask */
1964 args->target = V_008DFC_SQ_EXP_NULL;
1965 break;
1966
1967 case V_028714_SPI_SHADER_32_R:
1968 args->enabled_channels = 1; /* writemask */
1969 args->out[0] = values[0];
1970 break;
1971
1972 case V_028714_SPI_SHADER_32_GR:
1973 args->enabled_channels = 0x3; /* writemask */
1974 args->out[0] = values[0];
1975 args->out[1] = values[1];
1976 break;
1977
1978 case V_028714_SPI_SHADER_32_AR:
1979 args->enabled_channels = 0x9; /* writemask */
1980 args->out[0] = values[0];
1981 args->out[3] = values[3];
1982 break;
1983
1984 case V_028714_SPI_SHADER_FP16_ABGR:
1985 args->compr = 1; /* COMPR flag */
1986
1987 for (chan = 0; chan < 2; chan++) {
1988 LLVMValueRef pack_args[2] = {
1989 values[2 * chan],
1990 values[2 * chan + 1]
1991 };
1992 LLVMValueRef packed;
1993
1994 packed = ac_build_cvt_pkrtz_f16(&ctx->ac, pack_args);
1995 args->out[chan] =
1996 LLVMBuildBitCast(ctx->gallivm.builder,
1997 packed, ctx->f32, "");
1998 }
1999 break;
2000
2001 case V_028714_SPI_SHADER_UNORM16_ABGR:
2002 for (chan = 0; chan < 4; chan++) {
2003 val[chan] = ac_build_clamp(&ctx->ac, values[chan]);
2004 val[chan] = LLVMBuildFMul(builder, val[chan],
2005 LLVMConstReal(ctx->f32, 65535), "");
2006 val[chan] = LLVMBuildFAdd(builder, val[chan],
2007 LLVMConstReal(ctx->f32, 0.5), "");
2008 val[chan] = LLVMBuildFPToUI(builder, val[chan],
2009 ctx->i32, "");
2010 }
2011
2012 args->compr = 1; /* COMPR flag */
2013 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2014 si_llvm_pack_two_int16(ctx, val));
2015 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2016 si_llvm_pack_two_int16(ctx, val+2));
2017 break;
2018
2019 case V_028714_SPI_SHADER_SNORM16_ABGR:
2020 for (chan = 0; chan < 4; chan++) {
2021 /* Clamp between [-1, 1]. */
2022 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MIN,
2023 values[chan],
2024 LLVMConstReal(ctx->f32, 1));
2025 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MAX,
2026 val[chan],
2027 LLVMConstReal(ctx->f32, -1));
2028 /* Convert to a signed integer in [-32767, 32767]. */
2029 val[chan] = LLVMBuildFMul(builder, val[chan],
2030 LLVMConstReal(ctx->f32, 32767), "");
2031 /* If positive, add 0.5, else add -0.5. */
2032 val[chan] = LLVMBuildFAdd(builder, val[chan],
2033 LLVMBuildSelect(builder,
2034 LLVMBuildFCmp(builder, LLVMRealOGE,
2035 val[chan], base->zero, ""),
2036 LLVMConstReal(ctx->f32, 0.5),
2037 LLVMConstReal(ctx->f32, -0.5), ""), "");
2038 val[chan] = LLVMBuildFPToSI(builder, val[chan], ctx->i32, "");
2039 }
2040
2041 args->compr = 1; /* COMPR flag */
2042 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2043 si_llvm_pack_two_int32_as_int16(ctx, val));
2044 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2045 si_llvm_pack_two_int32_as_int16(ctx, val+2));
2046 break;
2047
2048 case V_028714_SPI_SHADER_UINT16_ABGR: {
2049 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
2050 is_int8 ? 255 : is_int10 ? 1023 : 65535, 0);
2051 LLVMValueRef max_alpha =
2052 !is_int10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);
2053
2054 /* Clamp. */
2055 for (chan = 0; chan < 4; chan++) {
2056 val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
2057 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_UMIN,
2058 val[chan],
2059 chan == 3 ? max_alpha : max_rgb);
2060 }
2061
2062 args->compr = 1; /* COMPR flag */
2063 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2064 si_llvm_pack_two_int16(ctx, val));
2065 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2066 si_llvm_pack_two_int16(ctx, val+2));
2067 break;
2068 }
2069
2070 case V_028714_SPI_SHADER_SINT16_ABGR: {
2071 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
2072 is_int8 ? 127 : is_int10 ? 511 : 32767, 0);
2073 LLVMValueRef min_rgb = LLVMConstInt(ctx->i32,
2074 is_int8 ? -128 : is_int10 ? -512 : -32768, 0);
2075 LLVMValueRef max_alpha =
2076 !is_int10 ? max_rgb : ctx->i32_1;
2077 LLVMValueRef min_alpha =
2078 !is_int10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);
2079
2080 /* Clamp. */
2081 for (chan = 0; chan < 4; chan++) {
2082 val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
2083 val[chan] = lp_build_emit_llvm_binary(bld_base,
2084 TGSI_OPCODE_IMIN,
2085 val[chan], chan == 3 ? max_alpha : max_rgb);
2086 val[chan] = lp_build_emit_llvm_binary(bld_base,
2087 TGSI_OPCODE_IMAX,
2088 val[chan], chan == 3 ? min_alpha : min_rgb);
2089 }
2090
2091 args->compr = 1; /* COMPR flag */
2092 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2093 si_llvm_pack_two_int32_as_int16(ctx, val));
2094 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2095 si_llvm_pack_two_int32_as_int16(ctx, val+2));
2096 break;
2097 }
2098
2099 case V_028714_SPI_SHADER_32_ABGR:
2100 memcpy(&args->out[0], values, sizeof(values[0]) * 4);
2101 break;
2102 }
2103 }
2104
2105 static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
2106 LLVMValueRef alpha)
2107 {
2108 struct si_shader_context *ctx = si_shader_context(bld_base);
2109
2110 if (ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_NEVER) {
2111 LLVMValueRef alpha_ref = LLVMGetParam(ctx->main_fn,
2112 SI_PARAM_ALPHA_REF);
2113
2114 LLVMValueRef alpha_pass =
2115 lp_build_cmp(&bld_base->base,
2116 ctx->shader->key.part.ps.epilog.alpha_func,
2117 alpha, alpha_ref);
2118 LLVMValueRef arg =
2119 lp_build_select(&bld_base->base,
2120 alpha_pass,
2121 LLVMConstReal(ctx->f32, 1.0f),
2122 LLVMConstReal(ctx->f32, -1.0f));
2123
2124 ac_build_kill(&ctx->ac, arg);
2125 } else {
2126 ac_build_kill(&ctx->ac, NULL);
2127 }
2128 }
2129
2130 static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
2131 LLVMValueRef alpha,
2132 unsigned samplemask_param)
2133 {
2134 struct si_shader_context *ctx = si_shader_context(bld_base);
2135 struct gallivm_state *gallivm = &ctx->gallivm;
2136 LLVMValueRef coverage;
2137
2138 /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
2139 coverage = LLVMGetParam(ctx->main_fn,
2140 samplemask_param);
2141 coverage = bitcast(bld_base, TGSI_TYPE_SIGNED, coverage);
2142
2143 coverage = lp_build_intrinsic(gallivm->builder, "llvm.ctpop.i32",
2144 ctx->i32,
2145 &coverage, 1, LP_FUNC_ATTR_READNONE);
2146
2147 coverage = LLVMBuildUIToFP(gallivm->builder, coverage,
2148 ctx->f32, "");
2149
2150 coverage = LLVMBuildFMul(gallivm->builder, coverage,
2151 LLVMConstReal(ctx->f32,
2152 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
2153
2154 return LLVMBuildFMul(gallivm->builder, alpha, coverage, "");
2155 }
2156
/* Compute the user clip distance exports (POS+2 and POS+3) from the clip
 * vertex: each clip distance is the dot product of the clip vertex with a
 * user clip plane loaded from the SI_VS_CONST_CLIP_PLANES constant buffer.
 *
 * \param pos       position export args; entries [2] and [3] are filled in
 * \param out_elts  the 4 f32 components of the clip vertex
 */
static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context *bld_base,
				    struct ac_export_args *pos, LLVMValueRef *out_elts)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	unsigned reg_index;
	unsigned chan;
	unsigned const_chan;
	LLVMValueRef base_elt;
	LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
	LLVMValueRef constbuf_index = LLVMConstInt(ctx->i32,
						   SI_VS_CONST_CLIP_PLANES, 0);
	/* Descriptor of the constant buffer holding the clip plane vectors. */
	LLVMValueRef const_resource = ac_build_indexed_load_const(&ctx->ac, ptr, constbuf_index);

	/* Two exports of 4 clip distances each cover the 8 clip planes. */
	for (reg_index = 0; reg_index < 2; reg_index ++) {
		struct ac_export_args *args = &pos[2 + reg_index];

		/* Start each distance at 0 and accumulate the dot product. */
		args->out[0] =
		args->out[1] =
		args->out[2] =
		args->out[3] = LLVMConstReal(ctx->f32, 0.0f);

		/* Compute dot products of position and user clip plane vectors */
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			for (const_chan = 0; const_chan < TGSI_NUM_CHANNELS; const_chan++) {
				/* Byte offset of component const_chan of plane
				 * (reg_index * 4 + chan); each float is 4 bytes. */
				LLVMValueRef addr =
					LLVMConstInt(ctx->i32, ((reg_index * 4 + chan) * 4 +
								const_chan) * 4, 0);
				base_elt = buffer_load_const(ctx, const_resource,
							     addr);
				args->out[chan] =
					lp_build_add(base, args->out[chan],
						     lp_build_mul(base, base_elt,
								  out_elts[const_chan]));
			}
		}

		args->enabled_channels = 0xf;
		args->valid_mask = 0;
		args->done = 0;
		args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index;
		args->compr = 0;
	}
}
2201
2202 static void si_dump_streamout(struct pipe_stream_output_info *so)
2203 {
2204 unsigned i;
2205
2206 if (so->num_outputs)
2207 fprintf(stderr, "STREAMOUT\n");
2208
2209 for (i = 0; i < so->num_outputs; i++) {
2210 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
2211 so->output[i].start_component;
2212 fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
2213 i, so->output[i].output_buffer,
2214 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
2215 so->output[i].register_index,
2216 mask & 1 ? "x" : "",
2217 mask & 2 ? "y" : "",
2218 mask & 4 ? "z" : "",
2219 mask & 8 ? "w" : "");
2220 }
2221 }
2222
/* Store one shader output to its streamout buffer.
 *
 * \param so_buffers        descriptors of the 4 streamout buffers
 * \param so_write_offsets  per-buffer write offsets (already includes the
 *                          per-thread vertex offset)
 * \param stream_out        which components of which output go where
 * \param shader_out        the output's LLVM values and per-channel stream IDs
 */
static void emit_streamout_output(struct si_shader_context *ctx,
				  LLVMValueRef const *so_buffers,
				  LLVMValueRef const *so_write_offsets,
				  struct pipe_stream_output *stream_out,
				  struct si_shader_output_values *shader_out)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	unsigned buf_idx = stream_out->output_buffer;
	unsigned start = stream_out->start_component;
	unsigned num_comps = stream_out->num_components;
	LLVMValueRef out[4];

	/* Defensive check: 1-4 components are expected; bail out in release
	 * builds rather than index out of bounds. */
	assert(num_comps && num_comps <= 4);
	if (!num_comps || num_comps > 4)
		return;

	/* Load the output as int. */
	for (int j = 0; j < num_comps; j++) {
		assert(stream_out->stream == shader_out->vertex_stream[start + j]);

		out[j] = LLVMBuildBitCast(builder,
					  shader_out->values[start + j],
					  ctx->i32, "");
	}

	/* Pack the output. */
	LLVMValueRef vdata = NULL;

	switch (num_comps) {
	case 1: /* as i32 */
		vdata = out[0];
		break;
	case 2: /* as v2i32 */
	case 3: /* as v4i32 (aligned to 4) */
	case 4: /* as v4i32 */
		vdata = LLVMGetUndef(LLVMVectorType(ctx->i32, util_next_power_of_two(num_comps)));
		for (int j = 0; j < num_comps; j++) {
			vdata = LLVMBuildInsertElement(builder, vdata, out[j],
						       LLVMConstInt(ctx->i32, j, 0), "");
		}
		break;
	}

	/* Store at so_write_offsets[buf_idx] + dst_offset (in dwords). */
	ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx],
				    vdata, num_comps,
				    so_write_offsets[buf_idx],
				    ctx->i32_0,
				    stream_out->dst_offset * 4, 1, 1, true, false);
}
2273
/**
 * Write streamout data to buffers for vertex stream @p stream (different
 * vertex streams can occur for GS copy shaders).
 *
 * \param outputs  shader output values, indexed by output register
 * \param noutput  number of entries in \p outputs
 */
static void si_llvm_emit_streamout(struct si_shader_context *ctx,
				   struct si_shader_output_values *outputs,
				   unsigned noutput, unsigned stream)
{
	struct si_shader_selector *sel = ctx->shader->selector;
	struct pipe_stream_output_info *so = &sel->so;
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	int i;
	struct lp_build_if_state if_ctx;

	/* Get bits [22:16], i.e. (so_param >> 16) & 127; */
	LLVMValueRef so_vtx_count =
		unpack_param(ctx, ctx->param_streamout_config, 16, 7);

	LLVMValueRef tid = ac_get_thread_id(&ctx->ac);

	/* can_emit = tid < so_vtx_count; */
	LLVMValueRef can_emit =
		LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");

	/* Emit the streamout code conditionally. This actually avoids
	 * out-of-bounds buffer access. The hw tells us via the SGPR
	 * (so_vtx_count) which threads are allowed to emit streamout data. */
	lp_build_if(&if_ctx, gallivm, can_emit);
	{
		/* The buffer offset is computed as follows:
		 *   ByteOffset = streamout_offset[buffer_id]*4 +
		 *                (streamout_write_index + thread_id)*stride[buffer_id] +
		 *                attrib_offset
		 */

		LLVMValueRef so_write_index =
			LLVMGetParam(ctx->main_fn,
				     ctx->param_streamout_write_index);

		/* Compute (streamout_write_index + thread_id). */
		so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");

		/* Load the descriptor and compute the write offset for each
		 * enabled buffer. */
		LLVMValueRef so_write_offset[4] = {};
		LLVMValueRef so_buffers[4];
		LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
						    ctx->param_rw_buffers);

		for (i = 0; i < 4; i++) {
			/* A zero stride means the buffer is unused. */
			if (!so->stride[i])
				continue;

			LLVMValueRef offset = LLVMConstInt(ctx->i32,
							   SI_VS_STREAMOUT_BUF0 + i, 0);

			so_buffers[i] = ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);

			/* streamout_offset is in dwords; convert to bytes. */
			LLVMValueRef so_offset = LLVMGetParam(ctx->main_fn,
							      ctx->param_streamout_offset[i]);
			so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), "");

			/* Per-vertex byte offset: write_index * stride (stride
			 * is in dwords, hence * 4). */
			so_write_offset[i] = LLVMBuildMul(builder, so_write_index,
							  LLVMConstInt(ctx->i32, so->stride[i]*4, 0), "");
			so_write_offset[i] = LLVMBuildAdd(builder, so_write_offset[i], so_offset, "");
		}

		/* Write streamout data. */
		for (i = 0; i < so->num_outputs; i++) {
			unsigned reg = so->output[i].register_index;

			/* Skip declarations that reference outputs the shader
			 * doesn't actually have. */
			if (reg >= noutput)
				continue;

			/* Only emit outputs belonging to the requested stream. */
			if (stream != so->output[i].stream)
				continue;

			emit_streamout_output(ctx, so_buffers, so_write_offset,
					      &so->output[i], &outputs[reg]);
		}
	}
	lp_build_endif(&if_ctx);
}
2358
2359 static void si_export_param(struct si_shader_context *ctx, unsigned index,
2360 LLVMValueRef *values)
2361 {
2362 struct ac_export_args args;
2363
2364 si_llvm_init_export_args(&ctx->bld_base, values,
2365 V_008DFC_SQ_EXP_PARAM + index, &args);
2366 ac_build_export(&ctx->ac, &args);
2367 }
2368
/* Emit PARAM exports for the vertex shader's general outputs and record
 * the export slot assigned to each output in vs_output_param_offset.
 */
static void si_build_param_exports(struct si_shader_context *ctx,
				   struct si_shader_output_values *outputs,
				   unsigned noutput)
{
	struct si_shader *shader = ctx->shader;
	unsigned param_count = 0;

	for (unsigned i = 0; i < noutput; i++) {
		unsigned semantic_name = outputs[i].semantic_name;
		unsigned semantic_index = outputs[i].semantic_index;

		/* Skip outputs where no channel belongs to vertex stream 0;
		 * only stream-0 outputs are exported here. */
		if (outputs[i].vertex_stream[0] != 0 &&
		    outputs[i].vertex_stream[1] != 0 &&
		    outputs[i].vertex_stream[2] != 0 &&
		    outputs[i].vertex_stream[3] != 0)
			continue;

		/* Only these semantics are exported as PARAMs; everything
		 * else (e.g. POSITION, PSIZE) is handled by the POS exports. */
		switch (semantic_name) {
		case TGSI_SEMANTIC_LAYER:
		case TGSI_SEMANTIC_VIEWPORT_INDEX:
		case TGSI_SEMANTIC_CLIPDIST:
		case TGSI_SEMANTIC_COLOR:
		case TGSI_SEMANTIC_BCOLOR:
		case TGSI_SEMANTIC_PRIMID:
		case TGSI_SEMANTIC_FOG:
		case TGSI_SEMANTIC_TEXCOORD:
		case TGSI_SEMANTIC_GENERIC:
			break;
		default:
			continue;
		}

		/* Skip outputs the shader key marks as unused by the next
		 * stage (kill_outputs optimization). */
		if ((semantic_name != TGSI_SEMANTIC_GENERIC ||
		     semantic_index < SI_MAX_IO_GENERIC) &&
		    shader->key.opt.kill_outputs &
		    (1ull << si_shader_io_get_unique_index(semantic_name, semantic_index)))
			continue;

		si_export_param(ctx, param_count, outputs[i].values);

		assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
		shader->info.vs_output_param_offset[i] = param_count++;
	}

	shader->info.nr_param_exports = param_count;
}
2415
/* Generate export instructions for hardware VS shader stage.
 *
 * Emits the position exports (POS0..POS3: position, misc vector, and up to
 * 8 clip distances), then the parameter exports for everything the next
 * stage interpolates.
 */
static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base,
			      struct si_shader_output_values *outputs,
			      unsigned noutput)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *base = &bld_base->base;
	struct ac_export_args pos_args[4] = {};
	/* Collected while scanning the outputs; written into the misc
	 * vector (pos_args[1]) afterwards. */
	LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
	unsigned pos_idx;
	int i;

	/* Build position exports. */
	for (i = 0; i < noutput; i++) {
		switch (outputs[i].semantic_name) {
		case TGSI_SEMANTIC_POSITION:
			si_llvm_init_export_args(bld_base, outputs[i].values,
						 V_008DFC_SQ_EXP_POS, &pos_args[0]);
			break;
		case TGSI_SEMANTIC_PSIZE:
			psize_value = outputs[i].values[0];
			break;
		case TGSI_SEMANTIC_LAYER:
			layer_value = outputs[i].values[0];
			break;
		case TGSI_SEMANTIC_VIEWPORT_INDEX:
			viewport_index_value = outputs[i].values[0];
			break;
		case TGSI_SEMANTIC_EDGEFLAG:
			edgeflag_value = outputs[i].values[0];
			break;
		case TGSI_SEMANTIC_CLIPDIST:
			if (!shader->key.opt.clip_disable) {
				/* CLIPDIST index 0 -> POS+2, index 1 -> POS+3. */
				unsigned index = 2 + outputs[i].semantic_index;
				si_llvm_init_export_args(bld_base, outputs[i].values,
							 V_008DFC_SQ_EXP_POS + index,
							 &pos_args[index]);
			}
			break;
		case TGSI_SEMANTIC_CLIPVERTEX:
			if (!shader->key.opt.clip_disable) {
				si_llvm_emit_clipvertex(bld_base, pos_args,
							outputs[i].values);
			}
			break;
		}
	}

	/* We need to add the position output manually if it's missing. */
	if (!pos_args[0].out[0]) {
		pos_args[0].enabled_channels = 0xf; /* writemask */
		pos_args[0].valid_mask = 0; /* EXEC mask */
		pos_args[0].done = 0; /* last export? */
		pos_args[0].target = V_008DFC_SQ_EXP_POS;
		pos_args[0].compr = 0; /* COMPR flag */
		pos_args[0].out[0] = base->zero; /* X */
		pos_args[0].out[1] = base->zero; /* Y */
		pos_args[0].out[2] = base->zero; /* Z */
		pos_args[0].out[3] = base->one;  /* W */
	}

	/* Write the misc vector (point size, edgeflag, layer, viewport). */
	if (shader->selector->info.writes_psize ||
	    shader->selector->info.writes_edgeflag ||
	    shader->selector->info.writes_viewport_index ||
	    shader->selector->info.writes_layer) {
		pos_args[1].enabled_channels = shader->selector->info.writes_psize |
					       (shader->selector->info.writes_edgeflag << 1) |
					       (shader->selector->info.writes_layer << 2);

		pos_args[1].valid_mask = 0; /* EXEC mask */
		pos_args[1].done = 0; /* last export? */
		pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
		pos_args[1].compr = 0; /* COMPR flag */
		pos_args[1].out[0] = base->zero; /* X */
		pos_args[1].out[1] = base->zero; /* Y */
		pos_args[1].out[2] = base->zero; /* Z */
		pos_args[1].out[3] = base->zero; /* W */

		if (shader->selector->info.writes_psize)
			pos_args[1].out[0] = psize_value;

		if (shader->selector->info.writes_edgeflag) {
			/* The output is a float, but the hw expects an integer
			 * with the first bit containing the edge flag. */
			edgeflag_value = LLVMBuildFPToUI(ctx->gallivm.builder,
							 edgeflag_value,
							 ctx->i32, "");
			edgeflag_value = ac_build_umin(&ctx->ac,
						       edgeflag_value,
						       ctx->i32_1);

			/* The LLVM intrinsic expects a float. */
			pos_args[1].out[1] = LLVMBuildBitCast(ctx->gallivm.builder,
							      edgeflag_value,
							      ctx->f32, "");
		}

		if (ctx->screen->b.chip_class >= GFX9) {
			/* GFX9 has the layer in out.z[10:0] and the viewport
			 * index in out.z[19:16].
			 */
			if (shader->selector->info.writes_layer)
				pos_args[1].out[2] = layer_value;

			if (shader->selector->info.writes_viewport_index) {
				LLVMValueRef v = viewport_index_value;

				/* OR the viewport index into bits [19:16] of
				 * the layer value in the Z channel. */
				v = bitcast(bld_base, TGSI_TYPE_UNSIGNED, v);
				v = LLVMBuildShl(ctx->gallivm.builder, v,
						 LLVMConstInt(ctx->i32, 16, 0), "");
				v = LLVMBuildOr(ctx->gallivm.builder, v,
						bitcast(bld_base, TGSI_TYPE_UNSIGNED,
							pos_args[1].out[2]), "");
				pos_args[1].out[2] = bitcast(bld_base, TGSI_TYPE_FLOAT, v);
				pos_args[1].enabled_channels |= 1 << 2;
			}
		} else {
			/* Pre-GFX9: layer in Z, viewport index in W. */
			if (shader->selector->info.writes_layer)
				pos_args[1].out[2] = layer_value;

			if (shader->selector->info.writes_viewport_index) {
				pos_args[1].out[3] = viewport_index_value;
				pos_args[1].enabled_channels |= 1 << 3;
			}
		}
	}

	/* Count the position exports that were actually filled in. */
	for (i = 0; i < 4; i++)
		if (pos_args[i].out[0])
			shader->info.nr_pos_exports++;

	pos_idx = 0;
	for (i = 0; i < 4; i++) {
		if (!pos_args[i].out[0])
			continue;

		/* Specify the target we are exporting */
		pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;

		if (pos_idx == shader->info.nr_pos_exports)
			/* Specify that this is the last export */
			pos_args[i].done = 1;

		ac_build_export(&ctx->ac, &pos_args[i]);
	}

	/* Build parameter exports. */
	si_build_param_exports(ctx, outputs, noutput);
}
2567
/**
 * Forward all outputs from the vertex shader to the TES. This is only used
 * for the fixed function TCS.
 *
 * Reads each input selected by ff_tcs_inputs_to_copy from LDS (where the
 * LS stage stored it) and writes it to the offchip TES buffer.
 */
static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef invocation_id, buffer, buffer_offset;
	LLVMValueRef lds_vertex_stride, lds_vertex_offset, lds_base;
	uint64_t inputs;

	invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
	buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
	buffer_offset = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);

	/* LDS address of this invocation's vertex inputs:
	 * current patch base + invocation_id * per-vertex stride. */
	lds_vertex_stride = unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
	lds_vertex_offset = LLVMBuildMul(gallivm->builder, invocation_id,
					 lds_vertex_stride, "");
	lds_base = get_tcs_in_current_patch_offset(ctx);
	lds_base = LLVMBuildAdd(gallivm->builder, lds_base, lds_vertex_offset, "");

	/* One bit per input slot to copy. */
	inputs = ctx->shader->key.mono.u.ff_tcs_inputs_to_copy;
	while (inputs) {
		unsigned i = u_bit_scan64(&inputs);

		/* Each slot is a vec4 = 4 dwords in LDS. */
		LLVMValueRef lds_ptr = LLVMBuildAdd(gallivm->builder, lds_base,
						    LLVMConstInt(ctx->i32, 4 * i, 0),
						    "");

		LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx,
								      get_rel_patch_id(ctx),
								      invocation_id,
								      LLVMConstInt(ctx->i32, i, 0));

		/* Load the whole vec4 (writemask ~0) from LDS... */
		LLVMValueRef value = lds_load(bld_base, TGSI_TYPE_SIGNED, ~0,
					      lds_ptr);

		/* ...and store it to the offchip buffer. */
		ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buffer_addr,
					    buffer_offset, 0, 1, 0, true, false);
	}
}
2610
/* Write the tessellation factors for the current patch to the tess factor
 * ring buffer (and, if the TES reads them, to the offchip buffer as well).
 * Only invocation 0 of each patch performs the stores.
 */
static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
				  LLVMValueRef rel_patch_id,
				  LLVMValueRef invocation_id,
				  LLVMValueRef tcs_out_current_patch_data_offset)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct si_shader *shader = ctx->shader;
	unsigned tess_inner_index, tess_outer_index;
	LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
	LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4];
	unsigned stride, outer_comps, inner_comps, i, offset;
	struct lp_build_if_state if_ctx, inner_if_ctx;

	/* Make sure all invocations have finished writing tess factors to
	 * LDS before invocation 0 reads them below. */
	si_llvm_emit_barrier(NULL, bld_base, NULL);

	/* Do this only for invocation 0, because the tess levels are per-patch,
	 * not per-vertex.
	 *
	 * This can't jump, because invocation 0 executes this. It should
	 * at least mask out the loads and stores for other invocations.
	 */
	lp_build_if(&if_ctx, gallivm,
		    LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
				  invocation_id, ctx->i32_0, ""));

	/* Determine the layout of one tess factor element in the buffer. */
	switch (shader->key.part.tcs.epilog.prim_mode) {
	case PIPE_PRIM_LINES:
		stride = 2; /* 2 dwords, 1 vec2 store */
		outer_comps = 2;
		inner_comps = 0;
		break;
	case PIPE_PRIM_TRIANGLES:
		stride = 4; /* 4 dwords, 1 vec4 store */
		outer_comps = 3;
		inner_comps = 1;
		break;
	case PIPE_PRIM_QUADS:
		stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
		outer_comps = 4;
		inner_comps = 2;
		break;
	default:
		assert(0);
		return;
	}

	/* Load tess_inner and tess_outer from LDS.
	 * Any invocation can write them, so we can't get them from a temporary.
	 */
	tess_inner_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0);
	tess_outer_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0);

	lds_base = tcs_out_current_patch_data_offset;
	lds_inner = LLVMBuildAdd(gallivm->builder, lds_base,
				 LLVMConstInt(ctx->i32,
					      tess_inner_index * 4, 0), "");
	lds_outer = LLVMBuildAdd(gallivm->builder, lds_base,
				 LLVMConstInt(ctx->i32,
					      tess_outer_index * 4, 0), "");

	for (i = 0; i < 4; i++) {
		inner[i] = LLVMGetUndef(ctx->i32);
		outer[i] = LLVMGetUndef(ctx->i32);
	}

	if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) {
		/* For isolines, the hardware expects tess factors in the
		 * reverse order from what GLSL / TGSI specify.
		 */
		outer[0] = out[1] = lds_load(bld_base, TGSI_TYPE_SIGNED, 0, lds_outer);
		outer[1] = out[0] = lds_load(bld_base, TGSI_TYPE_SIGNED, 1, lds_outer);
	} else {
		for (i = 0; i < outer_comps; i++) {
			outer[i] = out[i] =
				lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer);
		}
		for (i = 0; i < inner_comps; i++) {
			inner[i] = out[outer_comps+i] =
				lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_inner);
		}
	}

	/* Convert the outputs to vectors for stores. */
	vec0 = lp_build_gather_values(gallivm, out, MIN2(stride, 4));
	vec1 = NULL;

	/* Quads need a second store for the trailing 2 dwords. */
	if (stride > 4)
		vec1 = lp_build_gather_values(gallivm, out+4, stride - 4);

	/* Get the buffer. */
	buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_factor_addr_base64k);

	/* Get the offset. */
	tf_base = LLVMGetParam(ctx->main_fn,
			       ctx->param_tcs_factor_offset);
	byteoffset = LLVMBuildMul(gallivm->builder, rel_patch_id,
				  LLVMConstInt(ctx->i32, 4 * stride, 0), "");

	/* Only the first patch writes the control word. */
	lp_build_if(&inner_if_ctx, gallivm,
		    LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
				  rel_patch_id, ctx->i32_0, ""));

	/* Store the dynamic HS control word. */
	offset = 0;
	if (ctx->screen->b.chip_class <= VI) {
		ac_build_buffer_store_dword(&ctx->ac, buffer,
					    LLVMConstInt(ctx->i32, 0x80000000, 0),
					    1, ctx->i32_0, tf_base,
					    offset, 1, 0, true, false);
		offset += 4;
	}

	lp_build_endif(&inner_if_ctx);

	/* Store the tessellation factors. */
	ac_build_buffer_store_dword(&ctx->ac, buffer, vec0,
				    MIN2(stride, 4), byteoffset, tf_base,
				    offset, 1, 0, true, false);
	offset += 16;
	if (vec1)
		ac_build_buffer_store_dword(&ctx->ac, buffer, vec1,
					    stride - 4, byteoffset, tf_base,
					    offset, 1, 0, true, false);

	/* Store the tess factors into the offchip buffer if TES reads them. */
	if (shader->key.part.tcs.epilog.tes_reads_tess_factors) {
		LLVMValueRef buf, base, inner_vec, outer_vec, tf_outer_offset;
		LLVMValueRef tf_inner_offset;
		unsigned param_outer, param_inner;

		buf = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
		base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);

		param_outer = si_shader_io_get_unique_index_patch(
				      TGSI_SEMANTIC_TESSOUTER, 0);
		tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
					LLVMConstInt(ctx->i32, param_outer, 0));

		outer_vec = lp_build_gather_values(gallivm, outer,
						   util_next_power_of_two(outer_comps));

		ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec,
					    outer_comps, tf_outer_offset,
					    base, 0, 1, 0, true, false);
		if (inner_comps) {
			param_inner = si_shader_io_get_unique_index_patch(
					      TGSI_SEMANTIC_TESSINNER, 0);
			tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
					LLVMConstInt(ctx->i32, param_inner, 0));

			inner_vec = inner_comps == 1 ? inner[0] :
				    lp_build_gather_values(gallivm, inner, inner_comps);
			ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec,
						    inner_comps, tf_inner_offset,
						    base, 0, 1, 0, true, false);
		}
	}

	lp_build_endif(&if_ctx);
}
2773
2774 static LLVMValueRef
2775 si_insert_input_ret(struct si_shader_context *ctx, LLVMValueRef ret,
2776 unsigned param, unsigned return_index)
2777 {
2778 return LLVMBuildInsertValue(ctx->gallivm.builder, ret,
2779 LLVMGetParam(ctx->main_fn, param),
2780 return_index, "");
2781 }
2782
2783 static LLVMValueRef
2784 si_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueRef ret,
2785 unsigned param, unsigned return_index)
2786 {
2787 LLVMBuilderRef builder = ctx->gallivm.builder;
2788 LLVMValueRef p = LLVMGetParam(ctx->main_fn, param);
2789
2790 return LLVMBuildInsertValue(builder, ret,
2791 LLVMBuildBitCast(builder, p, ctx->f32, ""),
2792 return_index, "");
2793 }
2794
2795 static LLVMValueRef
2796 si_insert_input_ptr_as_2xi32(struct si_shader_context *ctx, LLVMValueRef ret,
2797 unsigned param, unsigned return_index)
2798 {
2799 LLVMBuilderRef builder = ctx->gallivm.builder;
2800 LLVMValueRef ptr, lo, hi;
2801
2802 ptr = LLVMGetParam(ctx->main_fn, param);
2803 ptr = LLVMBuildPtrToInt(builder, ptr, ctx->i64, "");
2804 ptr = LLVMBuildBitCast(builder, ptr, ctx->v2i32, "");
2805 lo = LLVMBuildExtractElement(builder, ptr, ctx->i32_0, "");
2806 hi = LLVMBuildExtractElement(builder, ptr, ctx->i32_1, "");
2807 ret = LLVMBuildInsertValue(builder, ret, lo, return_index, "");
2808 return LLVMBuildInsertValue(builder, ret, hi, return_index + 1, "");
2809 }
2810
/* This only writes the tessellation factor levels.
 *
 * Copies the fixed-function TCS inputs, then packs the SGPR/VGPR values the
 * TCS epilog part expects into the function's aggregate return value.
 */
static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMBuilderRef builder = ctx->gallivm.builder;
	LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;

	si_copy_tcs_inputs(bld_base);

	rel_patch_id = get_rel_patch_id(ctx);
	invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
	tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);

	if (ctx->screen->b.chip_class >= GFX9) {
		/* On GFX9, LS and HS are merged into one shader; the values
		 * above were computed inside the wrapped "if". Close it and
		 * build phis so they are valid on the fall-through path too.
		 */
		LLVMBasicBlockRef blocks[2] = {
			LLVMGetInsertBlock(builder),
			ctx->merged_wrap_if_state.entry_block
		};
		LLVMValueRef values[2];

		lp_build_endif(&ctx->merged_wrap_if_state);

		values[0] = rel_patch_id;
		values[1] = LLVMGetUndef(ctx->i32);
		rel_patch_id = build_phi(&ctx->ac, ctx->i32, 2, values, blocks);

		values[0] = tf_lds_offset;
		values[1] = LLVMGetUndef(ctx->i32);
		tf_lds_offset = build_phi(&ctx->ac, ctx->i32, 2, values, blocks);

		values[0] = invocation_id;
		values[1] = ctx->i32_1; /* cause the epilog to skip threads */
		invocation_id = build_phi(&ctx->ac, ctx->i32, 2, values, blocks);
	}

	/* Return epilog parameters from this function. */
	LLVMValueRef ret = ctx->return_value;
	unsigned vgpr;

	if (ctx->screen->b.chip_class >= GFX9) {
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
					  8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k,
					  8 + GFX9_SGPR_TCS_OFFCHIP_ADDR_BASE64K);
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k,
					  8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K);
		/* Tess offchip and tess factor offsets are at the beginning. */
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2);
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4);
		vgpr = 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K + 1;
	} else {
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
					  GFX6_SGPR_TCS_OFFCHIP_LAYOUT);
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k,
					  GFX6_SGPR_TCS_OFFCHIP_ADDR_BASE64K);
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k,
					  GFX6_SGPR_TCS_FACTOR_ADDR_BASE64K);
		/* Tess offchip and tess factor offsets are after user SGPRs. */
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset,
					  GFX6_TCS_NUM_USER_SGPR);
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset,
					  GFX6_TCS_NUM_USER_SGPR + 1);
		vgpr = GFX6_TCS_NUM_USER_SGPR + 2;
	}

	/* VGPRs */
	rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id);
	invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id);
	tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset);

	/* Leave a hole corresponding to the two input VGPRs. This ensures that
	 * the invocation_id output does not alias the param_tcs_rel_ids input,
	 * which saves a V_MOV on gfx9.
	 */
	vgpr += 2;

	ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
	ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
	ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
	ctx->return_value = ret;
}
2892
/* Pass TCS inputs from LS to TCS on GFX9.
 *
 * On GFX9 the LS and HS stages are merged; the LS part returns all values
 * the TCS part needs (ring buffers, offsets, layout SGPRs and the two
 * input VGPRs) via the function return value at fixed slot positions.
 * Slot numbers here must match the merged-shader function signature.
 */
static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx)
{
	LLVMValueRef ret = ctx->return_value;

	/* SGPRs 0-5: descriptors and offsets shared by the merged stages. */
	ret = si_insert_input_ptr_as_2xi32(ctx, ret, ctx->param_rw_buffers, 0);
	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2);
	ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4);
	ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);

	/* User SGPRs start at slot 8. */
	ret = si_insert_input_ret(ctx, ret, ctx->param_vs_state_bits,
				  8 + SI_SGPR_VS_STATE_BITS);
	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
				  8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_offsets,
				  8 + GFX9_SGPR_TCS_OUT_OFFSETS);
	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_layout,
				  8 + GFX9_SGPR_TCS_OUT_LAYOUT);
	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k,
				  8 + GFX9_SGPR_TCS_OFFCHIP_ADDR_BASE64K);
	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k,
				  8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K);

	/* The TCS descriptor pointers follow two params after
	 * tcs_factor_addr_base64k; pass them through as 2x i32.
	 */
	unsigned desc_param = ctx->param_tcs_factor_addr_base64k + 2;
	ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param,
					   8 + GFX9_SGPR_TCS_CONST_AND_SHADER_BUFFERS);
	ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 1,
					   8 + GFX9_SGPR_TCS_SAMPLERS_AND_IMAGES);

	/* VGPRs: patch id and rel ids, returned as floats. */
	unsigned vgpr = 8 + GFX9_TCS_NUM_USER_SGPR;
	ret = si_insert_input_ret_float(ctx, ret,
					ctx->param_tcs_patch_id, vgpr++);
	ret = si_insert_input_ret_float(ctx, ret,
					ctx->param_tcs_rel_ids, vgpr++);
	ctx->return_value = ret;
}
2930
/* Pass GS inputs from ES to GS on GFX9.
 *
 * Merged ES/GS counterpart of si_set_ls_return_value_for_tcs: the ES part
 * returns the SGPRs and the five vertex-offset/id VGPRs that the GS part
 * consumes. Slot numbers must match the merged-shader signature.
 */
static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
{
	LLVMValueRef ret = ctx->return_value;

	ret = si_insert_input_ptr_as_2xi32(ctx, ret, ctx->param_rw_buffers, 0);
	ret = si_insert_input_ret(ctx, ret, ctx->param_gs2vs_offset, 2);
	ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);

	ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);

	/* GS descriptor pointers follow vs_state_bits; pass as 2x i32. */
	unsigned desc_param = ctx->param_vs_state_bits + 1;
	ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param,
					   8 + GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS);
	ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 1,
					   8 + GFX9_SGPR_GS_SAMPLERS_AND_IMAGES);

	/* VGPRs: gs_vtx01_offset and the following four GS input VGPRs. */
	unsigned vgpr = 8 + GFX9_GS_NUM_USER_SGPR;
	for (unsigned i = 0; i < 5; i++) {
		unsigned param = ctx->param_gs_vtx01_offset + i;
		ret = si_insert_input_ret_float(ctx, ret, param, vgpr++);
	}
	ctx->return_value = ret;
}
2955
/* VS-as-LS epilog: store all vertex outputs to LDS so the next stage
 * (TCS aka HS) can read them, addressed by vertex id * per-vertex stride.
 * On GFX9 it also fills the merged-shader return value for the TCS part.
 */
static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct tgsi_shader_info *info = &shader->selector->info;
	struct gallivm_state *gallivm = &ctx->gallivm;
	unsigned i, chan;
	LLVMValueRef vertex_id = LLVMGetParam(ctx->main_fn,
					      ctx->param_rel_auto_id);
	/* Per-vertex LDS stride in dwords is in bits [24:31] of vs_state_bits. */
	LLVMValueRef vertex_dw_stride =
		unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
	LLVMValueRef base_dw_addr = LLVMBuildMul(gallivm->builder, vertex_id,
						 vertex_dw_stride, "");

	/* Write outputs to LDS. The next shader (TCS aka HS) will read
	 * its inputs from it. */
	for (i = 0; i < info->num_outputs; i++) {
		LLVMValueRef *out_ptr = ctx->outputs[i];
		unsigned name = info->output_semantic_name[i];
		unsigned index = info->output_semantic_index[i];

		/* The ARB_shader_viewport_layer_array spec contains the
		 * following issue:
		 *
		 *    2) What happens if gl_ViewportIndex or gl_Layer is
		 *    written in the vertex shader and a geometry shader is
		 *    present?
		 *
		 *    RESOLVED: The value written by the last vertex processing
		 *    stage is used. If the last vertex processing stage
		 *    (vertex, tessellation evaluation or geometry) does not
		 *    statically assign to gl_ViewportIndex or gl_Layer, index
		 *    or layer zero is assumed.
		 *
		 * So writes to those outputs in VS-as-LS are simply ignored.
		 */
		if (name == TGSI_SEMANTIC_LAYER ||
		    name == TGSI_SEMANTIC_VIEWPORT_INDEX)
			continue;

		/* Each output occupies 4 dwords at a slot derived from its
		 * semantic, so the TCS can find it without remapping.
		 */
		int param = si_shader_io_get_unique_index(name, index);
		LLVMValueRef dw_addr = LLVMBuildAdd(gallivm->builder, base_dw_addr,
					LLVMConstInt(ctx->i32, param * 4, 0), "");

		for (chan = 0; chan < 4; chan++) {
			lds_store(bld_base, chan, dw_addr,
				  LLVMBuildLoad(gallivm->builder, out_ptr[chan], ""));
		}
	}

	if (ctx->screen->b.chip_class >= GFX9)
		si_set_ls_return_value_for_tcs(ctx);
}
3009
/* VS/TES-as-ES epilog: write all outputs to the ESGS ring so the GS can
 * read them. On GFX9 the ESGS ring lives in LDS (merged ES/GS) and the
 * merged-shader return value is filled for the GS part; on older chips
 * the outputs go to a memory ring buffer addressed by es2gs_offset.
 */
static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct si_shader *es = ctx->shader;
	struct tgsi_shader_info *info = &es->selector->info;
	LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
					    ctx->param_es2gs_offset);
	LLVMValueRef lds_base = NULL;
	unsigned chan;
	int i;

	if (ctx->screen->b.chip_class >= GFX9 && info->num_outputs) {
		unsigned itemsize_dw = es->selector->esgs_itemsize / 4;
		/* Compute a wave-global vertex index: thread id within the
		 * wave OR'd with wave index * 64 (wave index is in bits
		 * [24:27] of merged_wave_info).
		 */
		LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac);
		LLVMValueRef wave_idx = unpack_param(ctx, ctx->param_merged_wave_info, 24, 4);
		vertex_idx = LLVMBuildOr(gallivm->builder, vertex_idx,
					 LLVMBuildMul(gallivm->builder, wave_idx,
						      LLVMConstInt(ctx->i32, 64, false), ""), "");
		lds_base = LLVMBuildMul(gallivm->builder, vertex_idx,
					LLVMConstInt(ctx->i32, itemsize_dw, 0), "");
	}

	for (i = 0; i < info->num_outputs; i++) {
		LLVMValueRef *out_ptr = ctx->outputs[i];
		int param;

		/* Same rationale as in si_llvm_emit_ls_epilogue: writes to
		 * these in a non-last vertex stage are ignored.
		 */
		if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
		    info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
			continue;

		param = si_shader_io_get_unique_index(info->output_semantic_name[i],
						      info->output_semantic_index[i]);

		for (chan = 0; chan < 4; chan++) {
			LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
			out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");

			/* GFX9 has the ESGS ring in LDS. */
			if (ctx->screen->b.chip_class >= GFX9) {
				lds_store(bld_base, param * 4 + chan, lds_base, out_val);
				continue;
			}

			ac_build_buffer_store_dword(&ctx->ac,
						    ctx->esgs_ring,
						    out_val, 1, NULL, soffset,
						    (4 * param + chan) * 4,
						    1, 1, true, true);
		}
	}

	if (ctx->screen->b.chip_class >= GFX9)
		si_set_es_return_value_for_gs(ctx);
}
3065
3066 static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx)
3067 {
3068 if (ctx->screen->b.chip_class >= GFX9)
3069 return unpack_param(ctx, ctx->param_merged_wave_info, 16, 8);
3070 else
3071 return LLVMGetParam(ctx->main_fn, ctx->param_gs_wave_id);
3072 }
3073
3074 static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
3075 {
3076 struct si_shader_context *ctx = si_shader_context(bld_base);
3077
3078 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE,
3079 si_get_gs_wave_id(ctx));
3080
3081 if (ctx->screen->b.chip_class >= GFX9)
3082 lp_build_endif(&ctx->merged_wrap_if_state);
3083 }
3084
/* VS epilog (hardware VS stage): optionally clamp vertex colors, gather
 * all outputs from their allocas, run streamout, optionally append
 * PrimitiveID, and emit the position/parameter exports.
 *
 * \param abi          shader ABI, used to recover the si_shader_context
 * \param max_outputs  capacity of addrs in units of 4-channel outputs
 * \param addrs        alloca addresses, 4 consecutive entries per output
 */
static void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi,
				     unsigned max_outputs,
				     LLVMValueRef *addrs)
{
	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	struct si_shader_output_values *outputs = NULL;
	int i,j;

	assert(!ctx->shader->is_gs_copy_shader);
	assert(info->num_outputs <= max_outputs);

	/* +1 reserves room for the optional PrimitiveID export below. */
	outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));

	/* Vertex color clamping.
	 *
	 * This uses a state constant loaded in a user data SGPR and
	 * an IF statement is added that clamps all colors if the constant
	 * is true.
	 */
	if (ctx->type == PIPE_SHADER_VERTEX) {
		struct lp_build_if_state if_ctx;
		LLVMValueRef cond = NULL;
		LLVMValueRef addr, val;

		for (i = 0; i < info->num_outputs; i++) {
			if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR &&
			    info->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR)
				continue;

			/* We've found a color. */
			if (!cond) {
				/* The state is in the first bit of the user SGPR. */
				cond = LLVMGetParam(ctx->main_fn,
						    ctx->param_vs_state_bits);
				cond = LLVMBuildTrunc(gallivm->builder, cond,
						      ctx->i1, "");
				lp_build_if(&if_ctx, gallivm, cond);
			}

			/* Clamp all 4 channels of this color output in place. */
			for (j = 0; j < 4; j++) {
				addr = addrs[4 * i + j];
				val = LLVMBuildLoad(gallivm->builder, addr, "");
				val = ac_build_clamp(&ctx->ac, val);
				LLVMBuildStore(gallivm->builder, val, addr);
			}
		}

		if (cond)
			lp_build_endif(&if_ctx);
	}

	/* Load every output channel into the export array, along with its
	 * per-channel vertex stream (2 bits per channel in output_streams).
	 */
	for (i = 0; i < info->num_outputs; i++) {
		outputs[i].semantic_name = info->output_semantic_name[i];
		outputs[i].semantic_index = info->output_semantic_index[i];

		for (j = 0; j < 4; j++) {
			outputs[i].values[j] =
				LLVMBuildLoad(gallivm->builder,
					      addrs[4 * i + j],
					      "");
			outputs[i].vertex_stream[j] =
				(info->output_streams[i] >> (2 * j)) & 3;
		}
	}

	if (ctx->shader->selector->so.num_outputs)
		si_llvm_emit_streamout(ctx, outputs, i, 0);

	/* Export PrimitiveID. */
	if (ctx->shader->key.mono.u.vs_export_prim_id) {
		outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID;
		outputs[i].semantic_index = 0;
		outputs[i].values[0] = LLVMBuildBitCast(gallivm->builder,
				get_primitive_id(ctx, 0), ctx->f32, "");
		for (j = 1; j < 4; j++)
			outputs[i].values[j] = LLVMConstReal(ctx->f32, 0);

		memset(outputs[i].vertex_stream, 0,
		       sizeof(outputs[i].vertex_stream));
		i++;
	}

	si_llvm_export_vs(&ctx->bld_base, outputs, i);
	FREE(outputs);
}
3172
3173 static void si_tgsi_emit_epilogue(struct lp_build_tgsi_context *bld_base)
3174 {
3175 struct si_shader_context *ctx = si_shader_context(bld_base);
3176
3177 ctx->abi.emit_outputs(&ctx->abi, RADEON_LLVM_MAX_OUTPUTS,
3178 &ctx->outputs[0][0]);
3179 }
3180
/* Queue of pending pixel-shader exports, flushed by si_emit_ps_exports. */
struct si_ps_exports {
	unsigned num;			/* number of valid entries in args[] */
	struct ac_export_args args[10];	/* queued color/MRTZ exports */
};
3185
3186 unsigned si_get_spi_shader_z_format(bool writes_z, bool writes_stencil,
3187 bool writes_samplemask)
3188 {
3189 if (writes_z) {
3190 /* Z needs 32 bits. */
3191 if (writes_samplemask)
3192 return V_028710_SPI_SHADER_32_ABGR;
3193 else if (writes_stencil)
3194 return V_028710_SPI_SHADER_32_GR;
3195 else
3196 return V_028710_SPI_SHADER_32_R;
3197 } else if (writes_stencil || writes_samplemask) {
3198 /* Both stencil and sample mask need only 16 bits. */
3199 return V_028710_SPI_SHADER_UINT16_ABGR;
3200 } else {
3201 return V_028710_SPI_SHADER_ZERO;
3202 }
3203 }
3204
/* Queue the MRTZ (depth/stencil/samplemask) export.
 *
 * The export format and channel layout depend on which of the three
 * values are present (see si_get_spi_shader_z_format). The export is
 * only queued in \p exp; it is emitted later by si_emit_ps_exports.
 */
static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
			    LLVMValueRef depth, LLVMValueRef stencil,
			    LLVMValueRef samplemask, struct si_ps_exports *exp)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	struct ac_export_args args;
	unsigned mask = 0;
	unsigned format = si_get_spi_shader_z_format(depth != NULL,
						     stencil != NULL,
						     samplemask != NULL);

	assert(depth || stencil || samplemask);

	args.valid_mask = 1; /* whether the EXEC mask is valid */
	args.done = 1; /* DONE bit */

	/* Specify the target we are exporting */
	args.target = V_008DFC_SQ_EXP_MRTZ;

	args.compr = 0; /* COMPR flag */
	args.out[0] = base->undef; /* R, depth */
	args.out[1] = base->undef; /* G, stencil test value[0:7], stencil op value[8:15] */
	args.out[2] = base->undef; /* B, sample mask */
	args.out[3] = base->undef; /* A, alpha to mask */

	if (format == V_028710_SPI_SHADER_UINT16_ABGR) {
		assert(!depth);
		args.compr = 1; /* COMPR flag */

		if (stencil) {
			/* Stencil should be in X[23:16]. */
			stencil = bitcast(bld_base, TGSI_TYPE_UNSIGNED, stencil);
			stencil = LLVMBuildShl(ctx->gallivm.builder, stencil,
					       LLVMConstInt(ctx->i32, 16, 0), "");
			args.out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT, stencil);
			/* Compressed export: 0x3 enables the low 16-bit pair. */
			mask |= 0x3;
		}
		if (samplemask) {
			/* SampleMask should be in Y[15:0]. */
			args.out[1] = samplemask;
			mask |= 0xc;
		}
	} else {
		if (depth) {
			args.out[0] = depth;
			mask |= 0x1;
		}
		if (stencil) {
			args.out[1] = stencil;
			mask |= 0x2;
		}
		if (samplemask) {
			args.out[2] = samplemask;
			mask |= 0x4;
		}
	}

	/* SI (except OLAND and HAINAN) has a bug that it only looks
	 * at the X writemask component. */
	if (ctx->screen->b.chip_class == SI &&
	    ctx->screen->b.family != CHIP_OLAND &&
	    ctx->screen->b.family != CHIP_HAINAN)
		mask |= 0x1;

	/* Specify which components to enable */
	args.enabled_channels = mask;

	memcpy(&exp->args[exp->num++], &args, sizeof(args));
}
3275
/* Queue the color export(s) for one color output.
 *
 * Applies the PS epilog key transforms in order: color clamp, alpha-to-one,
 * alpha test (MRT0 only), and line/polygon smoothing. If the key requests
 * FS_COLOR0_WRITES_ALL_CBUFS, the single color is broadcast to all bound
 * color buffers. Exports are queued in \p exp, not emitted directly.
 *
 * \param color             4 channel values (may be modified in place)
 * \param index             color buffer index of this output
 * \param samplemask_param  function param index of the sample coverage input
 * \param is_last           true for the final color export (sets DONE/valid)
 */
static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
				LLVMValueRef *color, unsigned index,
				unsigned samplemask_param,
				bool is_last, struct si_ps_exports *exp)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	int i;

	/* Clamp color */
	if (ctx->shader->key.part.ps.epilog.clamp_color)
		for (i = 0; i < 4; i++)
			color[i] = ac_build_clamp(&ctx->ac, color[i]);

	/* Alpha to one */
	if (ctx->shader->key.part.ps.epilog.alpha_to_one)
		color[3] = base->one;

	/* Alpha test */
	if (index == 0 &&
	    ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
		si_alpha_test(bld_base, color[3]);

	/* Line & polygon smoothing */
	if (ctx->shader->key.part.ps.epilog.poly_line_smoothing)
		color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3],
							 samplemask_param);

	/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
	if (ctx->shader->key.part.ps.epilog.last_cbuf > 0) {
		struct ac_export_args args[8];
		int c, last = -1;

		/* Get the export arguments, also find out what the last one is. */
		for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
			si_llvm_init_export_args(bld_base, color,
						 V_008DFC_SQ_EXP_MRT + c, &args[c]);
			if (args[c].enabled_channels)
				last = c;
		}

		/* Emit all exports. */
		for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
			if (is_last && last == c) {
				args[c].valid_mask = 1; /* whether the EXEC mask is valid */
				args[c].done = 1; /* DONE bit */
			} else if (!args[c].enabled_channels)
				continue; /* unnecessary NULL export */

			memcpy(&exp->args[exp->num++], &args[c], sizeof(args[c]));
		}
	} else {
		struct ac_export_args args;

		/* Export */
		si_llvm_init_export_args(bld_base, color, V_008DFC_SQ_EXP_MRT + index,
					 &args);
		if (is_last) {
			args.valid_mask = 1; /* whether the EXEC mask is valid */
			args.done = 1; /* DONE bit */
		} else if (!args.enabled_channels)
			return; /* unnecessary NULL export */

		memcpy(&exp->args[exp->num++], &args, sizeof(args));
	}
}
3342
3343 static void si_emit_ps_exports(struct si_shader_context *ctx,
3344 struct si_ps_exports *exp)
3345 {
3346 for (unsigned i = 0; i < exp->num; i++)
3347 ac_build_export(&ctx->ac, &exp->args[i]);
3348 }
3349
3350 static void si_export_null(struct lp_build_tgsi_context *bld_base)
3351 {
3352 struct si_shader_context *ctx = si_shader_context(bld_base);
3353 struct lp_build_context *base = &bld_base->base;
3354 struct ac_export_args args;
3355
3356 args.enabled_channels = 0x0; /* enabled channels */
3357 args.valid_mask = 1; /* whether the EXEC mask is valid */
3358 args.done = 1; /* DONE bit */
3359 args.target = V_008DFC_SQ_EXP_NULL;
3360 args.compr = 0; /* COMPR flag (0 = 32-bit export) */
3361 args.out[0] = base->undef; /* R */
3362 args.out[1] = base->undef; /* G */
3363 args.out[2] = base->undef; /* B */
3364 args.out[3] = base->undef; /* A */
3365
3366 ac_build_export(&ctx->ac, &args);
3367 }
3368
/**
 * Return PS outputs in this order:
 *
 * v[0:3] = color0.xyzw
 * v[4:7] = color1.xyzw
 * ...
 * vN+0 = Depth
 * vN+1 = Stencil
 * vN+2 = SampleMask
 * vN+3 = SampleMaskIn (used for OpenGL smoothing)
 *
 * The alpha-ref SGPR is returned via its original location.
 */
static void si_llvm_return_fs_outputs(struct ac_shader_abi *abi,
				      unsigned max_outputs,
				      LLVMValueRef *addrs)
{
	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
	struct si_shader *shader = ctx->shader;
	struct tgsi_shader_info *info = &shader->selector->info;
	LLVMBuilderRef builder = ctx->gallivm.builder;
	unsigned i, j, first_vgpr, vgpr;

	LLVMValueRef color[8][4] = {};
	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
	LLVMValueRef ret;

	/* Apply a discard that was postponed until the end of the shader. */
	if (ctx->postponed_kill)
		ac_build_kill(&ctx->ac, LLVMBuildLoad(builder, ctx->postponed_kill, ""));

	/* Read the output values. */
	for (i = 0; i < info->num_outputs; i++) {
		unsigned semantic_name = info->output_semantic_name[i];
		unsigned semantic_index = info->output_semantic_index[i];

		switch (semantic_name) {
		case TGSI_SEMANTIC_COLOR:
			assert(semantic_index < 8);
			for (j = 0; j < 4; j++) {
				LLVMValueRef ptr = addrs[4 * i + j];
				LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
				color[semantic_index][j] = result;
			}
			break;
		case TGSI_SEMANTIC_POSITION:
			/* Only the Z channel of position is exported as depth. */
			depth = LLVMBuildLoad(builder,
					      addrs[4 * i + 2], "");
			break;
		case TGSI_SEMANTIC_STENCIL:
			/* Stencil is in the Y channel. */
			stencil = LLVMBuildLoad(builder,
						addrs[4 * i + 1], "");
			break;
		case TGSI_SEMANTIC_SAMPLEMASK:
			samplemask = LLVMBuildLoad(builder,
						   addrs[4 * i + 0], "");
			break;
		default:
			fprintf(stderr, "Warning: SI unhandled fs output type:%d\n",
				semantic_name);
		}
	}

	/* Fill the return structure. */
	ret = ctx->return_value;

	/* Set SGPRs. */
	ret = LLVMBuildInsertValue(builder, ret,
				   LLVMBuildBitCast(ctx->ac.builder,
						LLVMGetParam(ctx->main_fn,
							SI_PARAM_ALPHA_REF),
						ctx->i32, ""),
				   SI_SGPR_ALPHA_REF, "");

	/* Set VGPRs */
	first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
	for (i = 0; i < ARRAY_SIZE(color); i++) {
		/* Skip color outputs that were never written. */
		if (!color[i][0])
			continue;

		for (j = 0; j < 4; j++)
			ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
	}
	if (depth)
		ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
	if (stencil)
		ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
	if (samplemask)
		ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");

	/* Add the input sample mask for smoothing at the end. */
	if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
		vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
	ret = LLVMBuildInsertValue(builder, ret,
				   LLVMGetParam(ctx->main_fn,
						SI_PARAM_SAMPLE_COVERAGE), vgpr++, "");

	ctx->return_value = ret;
}
3467
/* Prevent optimizations (at least of memory accesses) across the current
 * point in the program by emitting empty inline assembly that is marked as
 * having side effects.
 *
 * Optionally, a value can be passed through the inline assembly to prevent
 * LLVM from hoisting calls to ReadNone functions.
 *
 * \param pvgpr  if non-NULL, the pointed-to value is routed through the
 *               asm ("=v,0" constraint) and replaced with the tied result.
 */
static void emit_optimization_barrier(struct si_shader_context *ctx,
				      LLVMValueRef *pvgpr)
{
	/* Makes each barrier's asm string unique so LLVM cannot merge them.
	 * NOTE(review): plain int incremented via p_atomic_inc_return;
	 * uniqueness across threads relies on that atomic.
	 */
	static int counter = 0;

	LLVMBuilderRef builder = ctx->gallivm.builder;
	char code[16];

	snprintf(code, sizeof(code), "; %d", p_atomic_inc_return(&counter));

	if (!pvgpr) {
		/* Pure barrier: void asm with side effects, no operands. */
		LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
		LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);
		LLVMBuildCall(builder, inlineasm, NULL, 0, "");
	} else {
		/* Pass the first dword of *pvgpr through the asm so the value
		 * (and anything computing it) cannot be hoisted past here.
		 */
		LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
		LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "=v,0", true, false);
		LLVMValueRef vgpr = *pvgpr;
		LLVMTypeRef vgpr_type = LLVMTypeOf(vgpr);
		unsigned vgpr_size = llvm_get_type_size(vgpr_type);
		LLVMValueRef vgpr0;

		assert(vgpr_size % 4 == 0);

		/* View the value as a vector of i32, thread element 0 through
		 * the asm, then reassemble and cast back to the original type.
		 */
		vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), "");
		vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, "");
		vgpr0 = LLVMBuildCall(builder, inlineasm, &vgpr0, 1, "");
		vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, "");
		vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, "");

		*pvgpr = vgpr;
	}
}
3508
3509 void si_emit_waitcnt(struct si_shader_context *ctx, unsigned simm16)
3510 {
3511 struct gallivm_state *gallivm = &ctx->gallivm;
3512 LLVMBuilderRef builder = gallivm->builder;
3513 LLVMValueRef args[1] = {
3514 LLVMConstInt(ctx->i32, simm16, 0)
3515 };
3516 lp_build_intrinsic(builder, "llvm.amdgcn.s.waitcnt",
3517 ctx->voidt, args, 1, 0);
3518 }
3519
/* Emit a TGSI MEMBAR as an s_waitcnt.
 *
 * waitcnt starts as NOOP_WAITCNT and each barrier flag ANDs in a mask;
 * presumably clearing a counter field makes s_waitcnt wait for that
 * counter (VM_CNT for memory, LGKM_CNT for LDS/constant) — semantics
 * come from the waitcnt encoding, not visible here. If no flag applies,
 * no instruction is emitted.
 */
static void membar_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	/* The barrier flags are an immediate in src0. */
	LLVMValueRef src0 = lp_build_emit_fetch(bld_base, emit_data->inst, 0, 0);
	unsigned flags = LLVMConstIntGetZExtValue(src0);
	unsigned waitcnt = NOOP_WAITCNT;

	if (flags & TGSI_MEMBAR_THREAD_GROUP)
		waitcnt &= VM_CNT & LGKM_CNT;

	if (flags & (TGSI_MEMBAR_ATOMIC_BUFFER |
		     TGSI_MEMBAR_SHADER_BUFFER |
		     TGSI_MEMBAR_SHADER_IMAGE))
		waitcnt &= VM_CNT;

	if (flags & TGSI_MEMBAR_SHARED)
		waitcnt &= LGKM_CNT;

	if (waitcnt != NOOP_WAITCNT)
		si_emit_waitcnt(ctx, waitcnt);
}
3544
3545 static void clock_emit(
3546 const struct lp_build_tgsi_action *action,
3547 struct lp_build_tgsi_context *bld_base,
3548 struct lp_build_emit_data *emit_data)
3549 {
3550 struct si_shader_context *ctx = si_shader_context(bld_base);
3551 struct gallivm_state *gallivm = &ctx->gallivm;
3552 LLVMValueRef tmp;
3553
3554 tmp = lp_build_intrinsic(gallivm->builder, "llvm.readcyclecounter",
3555 ctx->i64, NULL, 0, 0);
3556 tmp = LLVMBuildBitCast(gallivm->builder, tmp, ctx->v2i32, "");
3557
3558 emit_data->output[0] =
3559 LLVMBuildExtractElement(gallivm->builder, tmp, ctx->i32_0, "");
3560 emit_data->output[1] =
3561 LLVMBuildExtractElement(gallivm->builder, tmp, ctx->i32_1, "");
3562 }
3563
3564 LLVMTypeRef si_const_array(LLVMTypeRef elem_type, int num_elements)
3565 {
3566 return LLVMPointerType(LLVMArrayType(elem_type, num_elements),
3567 CONST_ADDR_SPACE);
3568 }
3569
/* Emit DDX/DDY (coarse and _FINE variants) on the first argument.
 *
 * The TID mask selects which lane each quad reads its reference value
 * from; idx selects the neighbor lane (next X pixel for DDX, next Y
 * pixel for DDY). The heavy lifting is done by ac_build_ddxy.
 */
static void si_llvm_emit_ddxy(
	const struct lp_build_tgsi_action *action,
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	unsigned opcode = emit_data->info->opcode;
	LLVMValueRef val;
	int idx;
	unsigned mask;

	if (opcode == TGSI_OPCODE_DDX_FINE)
		mask = AC_TID_MASK_LEFT;
	else if (opcode == TGSI_OPCODE_DDY_FINE)
		mask = AC_TID_MASK_TOP;
	else
		mask = AC_TID_MASK_TOP_LEFT;

	/* for DDX we want the next X pixel, DDY the next Y pixel. */
	idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;

	val = LLVMBuildBitCast(gallivm->builder, emit_data->args[0], ctx->i32, "");
	val = ac_build_ddxy(&ctx->ac, ctx->screen->has_ds_bpermute,
			    mask, idx, ctx->lds, val);
	emit_data->output[emit_data->chan] = val;
}
3597
3598 /*
3599 * this takes an I,J coordinate pair,
3600 * and works out the X and Y derivatives.
3601 * it returns DDX(I), DDX(J), DDY(I), DDY(J).
3602 */
3603 static LLVMValueRef si_llvm_emit_ddxy_interp(
3604 struct lp_build_tgsi_context *bld_base,
3605 LLVMValueRef interp_ij)
3606 {
3607 struct si_shader_context *ctx = si_shader_context(bld_base);
3608 struct gallivm_state *gallivm = &ctx->gallivm;
3609 LLVMValueRef result[4], a;
3610 unsigned i;
3611
3612 for (i = 0; i < 2; i++) {
3613 a = LLVMBuildExtractElement(gallivm->builder, interp_ij,
3614 LLVMConstInt(ctx->i32, i, 0), "");
3615 result[i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDX, a);
3616 result[2+i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDY, a);
3617 }
3618
3619 return lp_build_gather_values(gallivm, result, 4);
3620 }
3621
/* Fetch the auxiliary arguments for INTERP_OFFSET / INTERP_SAMPLE.
 *
 * INTERP_OFFSET: args[0..1] = the X/Y offset taken from src1.
 * INTERP_SAMPLE: args[0..1] = the position of the requested sample
 * (looked up from its sample id) re-centered by subtracting 0.5.
 * INTERP_CENTROID takes no extra arguments, so nothing is fetched.
 */
static void interp_fetch_args(
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	const struct tgsi_full_instruction *inst = emit_data->inst;

	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
		/* offset is in second src, first two channels */
		emit_data->args[0] = lp_build_emit_fetch(bld_base,
							 emit_data->inst, 1,
							 TGSI_CHAN_X);
		emit_data->args[1] = lp_build_emit_fetch(bld_base,
							 emit_data->inst, 1,
							 TGSI_CHAN_Y);
		emit_data->arg_count = 2;
	} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		LLVMValueRef sample_position;
		LLVMValueRef sample_id;
		LLVMValueRef halfval = LLVMConstReal(ctx->f32, 0.5f);

		/* fetch sample ID, then fetch its sample position,
		 * and place into first two channels.
		 */
		sample_id = lp_build_emit_fetch(bld_base,
						emit_data->inst, 1, TGSI_CHAN_X);
		sample_id = LLVMBuildBitCast(gallivm->builder, sample_id,
					     ctx->i32, "");
		sample_position = load_sample_position(ctx, sample_id);

		emit_data->args[0] = LLVMBuildExtractElement(gallivm->builder,
							     sample_position,
							     ctx->i32_0, "");

		emit_data->args[0] = LLVMBuildFSub(gallivm->builder, emit_data->args[0], halfval, "");
		emit_data->args[1] = LLVMBuildExtractElement(gallivm->builder,
							     sample_position,
							     ctx->i32_1, "");
		emit_data->args[1] = LLVMBuildFSub(gallivm->builder, emit_data->args[1], halfval, "");
		emit_data->arg_count = 2;
	}
}
3665
/* Emit INTERP_CENTROID / INTERP_SAMPLE / INTERP_OFFSET.
 *
 * Resolves the (possibly indirectly indexed) input being interpolated,
 * selects the base barycentric parameter for the requested location,
 * adjusts I/J with the offset or sample position (using DDX/DDY of the
 * barycentrics), and interpolates each channel with si_build_fs_interp.
 * For indirect inputs, every element of the input array is interpolated
 * into a gather vector and the wanted element is extracted at the end.
 */
static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
				struct lp_build_tgsi_context *bld_base,
				struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct gallivm_state *gallivm = &ctx->gallivm;
	const struct tgsi_shader_info *info = &shader->selector->info;
	LLVMValueRef interp_param;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	const struct tgsi_full_src_register *input = &inst->Src[0];
	int input_base, input_array_size;
	int chan;
	int i;
	LLVMValueRef prim_mask = LLVMGetParam(ctx->main_fn, SI_PARAM_PRIM_MASK);
	LLVMValueRef array_idx;
	int interp_param_idx;
	unsigned interp;
	unsigned location;

	assert(input->Register.File == TGSI_FILE_INPUT);

	/* Work out the range of inputs a (possibly indirect) index can hit. */
	if (input->Register.Indirect) {
		unsigned array_id = input->Indirect.ArrayID;

		if (array_id) {
			input_base = info->input_array_first[array_id];
			input_array_size = info->input_array_last[array_id] - input_base + 1;
		} else {
			input_base = inst->Src[0].Register.Index;
			input_array_size = info->num_inputs - input_base;
		}

		array_idx = si_get_indirect_index(ctx, &input->Indirect,
						  input->Register.Index - input_base);
	} else {
		input_base = inst->Src[0].Register.Index;
		input_array_size = 1;
		array_idx = ctx->i32_0;
	}

	interp = shader->selector->info.input_interpolate[input_base];

	/* OFFSET and SAMPLE start from the center barycentrics;
	 * INTERP_CENTROID uses the centroid ones.
	 */
	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
	    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE)
		location = TGSI_INTERPOLATE_LOC_CENTER;
	else
		location = TGSI_INTERPOLATE_LOC_CENTROID;

	interp_param_idx = lookup_interp_param_index(interp, location);
	if (interp_param_idx == -1)
		return;
	else if (interp_param_idx)
		interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
	else
		interp_param = NULL; /* flat interpolation: no barycentrics */

	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
	    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		LLVMValueRef ij_out[2];
		LLVMValueRef ddxy_out = si_llvm_emit_ddxy_interp(bld_base, interp_param);

		/*
		 * take the I then J parameters, and the DDX/Y for it, and
		 * calculate the IJ inputs for the interpolator.
		 * temp1 = ddx * offset/sample.x + I;
		 * interp_param.I = ddy * offset/sample.y + temp1;
		 * temp1 = ddx * offset/sample.x + J;
		 * interp_param.J = ddy * offset/sample.y + temp1;
		 */
		for (i = 0; i < 2; i++) {
			LLVMValueRef ix_ll = LLVMConstInt(ctx->i32, i, 0);
			LLVMValueRef iy_ll = LLVMConstInt(ctx->i32, i + 2, 0);
			LLVMValueRef ddx_el = LLVMBuildExtractElement(gallivm->builder,
								      ddxy_out, ix_ll, "");
			LLVMValueRef ddy_el = LLVMBuildExtractElement(gallivm->builder,
								      ddxy_out, iy_ll, "");
			LLVMValueRef interp_el = LLVMBuildExtractElement(gallivm->builder,
									 interp_param, ix_ll, "");
			LLVMValueRef temp1, temp2;

			interp_el = LLVMBuildBitCast(gallivm->builder, interp_el,
						     ctx->f32, "");

			temp1 = LLVMBuildFMul(gallivm->builder, ddx_el, emit_data->args[0], "");

			temp1 = LLVMBuildFAdd(gallivm->builder, temp1, interp_el, "");

			temp2 = LLVMBuildFMul(gallivm->builder, ddy_el, emit_data->args[1], "");

			ij_out[i] = LLVMBuildFAdd(gallivm->builder, temp2, temp1, "");
		}
		interp_param = lp_build_gather_values(gallivm, ij_out, 2);
	}

	if (interp_param) {
		interp_param = LLVMBuildBitCast(gallivm->builder,
			interp_param, LLVMVectorType(ctx->f32, 2), "");
	}

	for (chan = 0; chan < 4; chan++) {
		/* Interpolate every array element, then pick array_idx. */
		LLVMValueRef gather = LLVMGetUndef(LLVMVectorType(ctx->f32, input_array_size));
		unsigned schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan);

		for (unsigned idx = 0; idx < input_array_size; ++idx) {
			LLVMValueRef v, i = NULL, j = NULL;

			if (interp_param) {
				interp_param = LLVMBuildBitCast(gallivm->builder,
					interp_param, LLVMVectorType(ctx->f32, 2), "");
				i = LLVMBuildExtractElement(
					gallivm->builder, interp_param, ctx->i32_0, "");
				j = LLVMBuildExtractElement(
					gallivm->builder, interp_param, ctx->i32_1, "");
			}
			v = si_build_fs_interp(ctx, input_base + idx, schan,
					       prim_mask, i, j);

			gather = LLVMBuildInsertElement(gallivm->builder,
				gather, v, LLVMConstInt(ctx->i32, idx, false), "");
		}

		emit_data->output[chan] = LLVMBuildExtractElement(
			gallivm->builder, gather, array_idx, "");
	}
}
3792
3793 static LLVMValueRef si_emit_ballot(struct si_shader_context *ctx,
3794 LLVMValueRef value)
3795 {
3796 struct gallivm_state *gallivm = &ctx->gallivm;
3797 LLVMValueRef args[3] = {
3798 value,
3799 ctx->i32_0,
3800 LLVMConstInt(ctx->i32, LLVMIntNE, 0)
3801 };
3802
3803 /* We currently have no other way to prevent LLVM from lifting the icmp
3804 * calls to a dominating basic block.
3805 */
3806 emit_optimization_barrier(ctx, &args[0]);
3807
3808 if (LLVMTypeOf(args[0]) != ctx->i32)
3809 args[0] = LLVMBuildBitCast(gallivm->builder, args[0], ctx->i32, "");
3810
3811 return lp_build_intrinsic(gallivm->builder,
3812 "llvm.amdgcn.icmp.i32",
3813 ctx->i64, args, 3,
3814 LP_FUNC_ATTR_NOUNWIND |
3815 LP_FUNC_ATTR_READNONE |
3816 LP_FUNC_ATTR_CONVERGENT);
3817 }
3818
3819 static void vote_all_emit(
3820 const struct lp_build_tgsi_action *action,
3821 struct lp_build_tgsi_context *bld_base,
3822 struct lp_build_emit_data *emit_data)
3823 {
3824 struct si_shader_context *ctx = si_shader_context(bld_base);
3825 struct gallivm_state *gallivm = &ctx->gallivm;
3826 LLVMValueRef active_set, vote_set;
3827 LLVMValueRef tmp;
3828
3829 active_set = si_emit_ballot(ctx, ctx->i32_1);
3830 vote_set = si_emit_ballot(ctx, emit_data->args[0]);
3831
3832 tmp = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, vote_set, active_set, "");
3833 emit_data->output[emit_data->chan] =
3834 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
3835 }
3836
3837 static void vote_any_emit(
3838 const struct lp_build_tgsi_action *action,
3839 struct lp_build_tgsi_context *bld_base,
3840 struct lp_build_emit_data *emit_data)
3841 {
3842 struct si_shader_context *ctx = si_shader_context(bld_base);
3843 struct gallivm_state *gallivm = &ctx->gallivm;
3844 LLVMValueRef vote_set;
3845 LLVMValueRef tmp;
3846
3847 vote_set = si_emit_ballot(ctx, emit_data->args[0]);
3848
3849 tmp = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
3850 vote_set, LLVMConstInt(ctx->i64, 0, 0), "");
3851 emit_data->output[emit_data->chan] =
3852 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
3853 }
3854
3855 static void vote_eq_emit(
3856 const struct lp_build_tgsi_action *action,
3857 struct lp_build_tgsi_context *bld_base,
3858 struct lp_build_emit_data *emit_data)
3859 {
3860 struct si_shader_context *ctx = si_shader_context(bld_base);
3861 struct gallivm_state *gallivm = &ctx->gallivm;
3862 LLVMValueRef active_set, vote_set;
3863 LLVMValueRef all, none, tmp;
3864
3865 active_set = si_emit_ballot(ctx, ctx->i32_1);
3866 vote_set = si_emit_ballot(ctx, emit_data->args[0]);
3867
3868 all = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, vote_set, active_set, "");
3869 none = LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
3870 vote_set, LLVMConstInt(ctx->i64, 0, 0), "");
3871 tmp = LLVMBuildOr(gallivm->builder, all, none, "");
3872 emit_data->output[emit_data->chan] =
3873 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
3874 }
3875
3876 static void ballot_emit(
3877 const struct lp_build_tgsi_action *action,
3878 struct lp_build_tgsi_context *bld_base,
3879 struct lp_build_emit_data *emit_data)
3880 {
3881 struct si_shader_context *ctx = si_shader_context(bld_base);
3882 LLVMBuilderRef builder = ctx->gallivm.builder;
3883 LLVMValueRef tmp;
3884
3885 tmp = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
3886 tmp = si_emit_ballot(ctx, tmp);
3887 tmp = LLVMBuildBitCast(builder, tmp, ctx->v2i32, "");
3888
3889 emit_data->output[0] = LLVMBuildExtractElement(builder, tmp, ctx->i32_0, "");
3890 emit_data->output[1] = LLVMBuildExtractElement(builder, tmp, ctx->i32_1, "");
3891 }
3892
/* Fetch the operands of TGSI READ_INVOC:
 * src0 = the value to read, src1 = the source invocation (lane) index.
 */
static void read_invoc_fetch_args(
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	/* The value is fetched from the channel currently being emitted. */
	emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst,
						 0, emit_data->src_chan);

	/* Always read the source invocation (= lane) from the X channel. */
	emit_data->args[1] = lp_build_emit_fetch(bld_base, emit_data->inst,
						 1, TGSI_CHAN_X);
	emit_data->arg_count = 2;
}
3905
3906 static void read_lane_emit(
3907 const struct lp_build_tgsi_action *action,
3908 struct lp_build_tgsi_context *bld_base,
3909 struct lp_build_emit_data *emit_data)
3910 {
3911 struct si_shader_context *ctx = si_shader_context(bld_base);
3912 LLVMBuilderRef builder = ctx->gallivm.builder;
3913
3914 /* We currently have no other way to prevent LLVM from lifting the icmp
3915 * calls to a dominating basic block.
3916 */
3917 emit_optimization_barrier(ctx, &emit_data->args[0]);
3918
3919 for (unsigned i = 0; i < emit_data->arg_count; ++i) {
3920 emit_data->args[i] = LLVMBuildBitCast(builder, emit_data->args[i],
3921 ctx->i32, "");
3922 }
3923
3924 emit_data->output[emit_data->chan] =
3925 ac_build_intrinsic(&ctx->ac, action->intr_name,
3926 ctx->i32, emit_data->args, emit_data->arg_count,
3927 AC_FUNC_ATTR_READNONE |
3928 AC_FUNC_ATTR_CONVERGENT);
3929 }
3930
3931 static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base,
3932 struct lp_build_emit_data *emit_data)
3933 {
3934 struct si_shader_context *ctx = si_shader_context(bld_base);
3935 struct tgsi_src_register src0 = emit_data->inst->Src[0].Register;
3936 LLVMValueRef imm;
3937 unsigned stream;
3938
3939 assert(src0.File == TGSI_FILE_IMMEDIATE);
3940
3941 imm = ctx->imms[src0.Index * TGSI_NUM_CHANNELS + src0.SwizzleX];
3942 stream = LLVMConstIntGetZExtValue(imm) & 0x3;
3943 return stream;
3944 }
3945
/* Emit one vertex from the geometry shader */
static void si_llvm_emit_vertex(
	const struct lp_build_tgsi_action *action,
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *uint = &bld_base->uint_bld;
	struct si_shader *shader = ctx->shader;
	struct tgsi_shader_info *info = &shader->selector->info;
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct lp_build_if_state if_state;
	/* Per-wave GS->VS ring offset, passed in as an SGPR. */
	LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
					    ctx->param_gs2vs_offset);
	LLVMValueRef gs_next_vertex;
	LLVMValueRef can_emit, kill;
	unsigned chan, offset;
	int i;
	unsigned stream;

	stream = si_llvm_get_stream(bld_base, emit_data);

	/* Write vertex attribute values to GSVS ring */
	gs_next_vertex = LLVMBuildLoad(gallivm->builder,
				       ctx->gs_next_vertex[stream],
				       "");

	/* If this thread has already emitted the declared maximum number of
	 * vertices, skip the write: excessive vertex emissions are not
	 * supposed to have any effect.
	 *
	 * If the shader has no writes to memory, kill it instead. This skips
	 * further memory loads and may allow LLVM to skip to the end
	 * altogether.
	 */
	can_emit = LLVMBuildICmp(gallivm->builder, LLVMIntULT, gs_next_vertex,
				 LLVMConstInt(ctx->i32,
					      shader->selector->gs_max_out_vertices, 0), "");

	bool use_kill = !info->writes_memory;
	if (use_kill) {
		/* +1.0 keeps the thread alive, -1.0 discards it. */
		kill = lp_build_select(&bld_base->base, can_emit,
				       LLVMConstReal(ctx->f32, 1.0f),
				       LLVMConstReal(ctx->f32, -1.0f));

		ac_build_kill(&ctx->ac, kill);
	} else {
		lp_build_if(&if_state, gallivm, can_emit);
	}

	/* Store every enabled channel of this stream to the GSVS ring.
	 * The ring is attribute-major: "offset" counts stored channels, and
	 * each channel slot is gs_max_out_vertices entries wide, indexed by
	 * the current vertex.
	 */
	offset = 0;
	for (i = 0; i < info->num_outputs; i++) {
		LLVMValueRef *out_ptr = ctx->outputs[i];

		for (chan = 0; chan < 4; chan++) {
			/* Skip unused channels and channels bound to a
			 * different vertex stream. */
			if (!(info->output_usagemask[i] & (1 << chan)) ||
			    ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
				continue;

			LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
			LLVMValueRef voffset =
				LLVMConstInt(ctx->i32, offset *
					     shader->selector->gs_max_out_vertices, 0);
			offset++;

			voffset = lp_build_add(uint, voffset, gs_next_vertex);
			/* Convert the dword index to a byte offset. */
			voffset = lp_build_mul_imm(uint, voffset, 4);

			out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");

			ac_build_buffer_store_dword(&ctx->ac,
						    ctx->gsvs_ring[stream],
						    out_val, 1,
						    voffset, soffset, 0,
						    1, 1, true, true);
		}
	}

	/* Bump the per-stream emitted-vertex counter. */
	gs_next_vertex = lp_build_add(uint, gs_next_vertex,
				      ctx->i32_1);

	LLVMBuildStore(gallivm->builder, gs_next_vertex, ctx->gs_next_vertex[stream]);

	/* Signal vertex emission */
	ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
			 si_get_gs_wave_id(ctx));
	if (!use_kill)
		lp_build_endif(&if_state);
}
4035
4036 /* Cut one primitive from the geometry shader */
4037 static void si_llvm_emit_primitive(
4038 const struct lp_build_tgsi_action *action,
4039 struct lp_build_tgsi_context *bld_base,
4040 struct lp_build_emit_data *emit_data)
4041 {
4042 struct si_shader_context *ctx = si_shader_context(bld_base);
4043 unsigned stream;
4044
4045 /* Signal primitive cut */
4046 stream = si_llvm_get_stream(bld_base, emit_data);
4047 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
4048 si_get_gs_wave_id(ctx));
4049 }
4050
/* Emit a workgroup barrier (TGSI BARRIER). */
static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
				 struct lp_build_tgsi_context *bld_base,
				 struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;

	/* SI only (thanks to a hw bug workaround):
	 * The real barrier instruction isn't needed, because an entire patch
	 * always fits into a single wave.
	 */
	if (ctx->screen->b.chip_class == SI &&
	    ctx->type == PIPE_SHADER_TESS_CTRL) {
		/* NOTE: "&" is intentional, not a typo for "|": waitcnt
		 * masks are combined by AND-ing (each mask clears only its
		 * own counter field), so this waits for both LGKM and VM.
		 */
		si_emit_waitcnt(ctx, LGKM_CNT & VM_CNT);
		return;
	}

	lp_build_intrinsic(gallivm->builder,
			   "llvm.amdgcn.s.barrier",
			   ctx->voidt, NULL, 0, LP_FUNC_ATTR_CONVERGENT);
}
4072
/* Shared fetch/emit pair for the TGSI interpolation opcodes
 * (INTERP_CENTROID / INTERP_SAMPLE / INTERP_OFFSET).
 */
static const struct lp_build_tgsi_action interp_action = {
	.fetch_args = interp_fetch_args,
	.emit = build_interp_intrinsic,
};
4077
/* Create the main LLVM function for a shader part and apply the parameter
 * and function attributes the backend relies on.
 *
 * \param returns             return value types (used by merged shaders and
 *                            parts that pass values to an epilog)
 * \param fninfo              parameter types, SGPR count, and optional
 *                            assignment targets for parameter values
 * \param max_workgroup_size  if non-zero, recorded as the
 *                            "amdgpu-max-work-group-size" attribute
 */
static void si_create_function(struct si_shader_context *ctx,
			       const char *name,
			       LLVMTypeRef *returns, unsigned num_returns,
			       struct si_function_info *fninfo,
			       unsigned max_workgroup_size)
{
	int i;

	si_llvm_create_func(ctx, name, returns, num_returns,
			    fninfo->types, fninfo->num_params);
	ctx->return_value = LLVMGetUndef(ctx->return_type);

	for (i = 0; i < fninfo->num_sgpr_params; ++i) {
		LLVMValueRef P = LLVMGetParam(ctx->main_fn, i);

		/* The combination of:
		 * - ByVal
		 * - dereferenceable
		 * - invariant.load
		 * allows the optimization passes to move loads and reduces
		 * SGPR spilling significantly.
		 */
		if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
			/* Attribute indices are 1-based; 0 is the return value. */
			lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_BYVAL);
			lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_NOALIAS);
			ac_add_attr_dereferenceable(P, UINT64_MAX);
		} else
			/* Non-pointer SGPR parameters are marked inreg. */
			lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_INREG);
	}

	/* Store each parameter value wherever the declaration asked for it. */
	for (i = 0; i < fninfo->num_params; ++i) {
		if (fninfo->assign[i])
			*fninfo->assign[i] = LLVMGetParam(ctx->main_fn, i);
	}

	if (max_workgroup_size) {
		si_llvm_add_attribute(ctx->main_fn, "amdgpu-max-work-group-size",
				      max_workgroup_size);
	}
	LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
					   "no-signed-zeros-fp-math",
					   "true");

	if (ctx->screen->b.debug_flags & DBG_UNSAFE_MATH) {
		/* These were copied from some LLVM test. */
		LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
						   "less-precise-fpmad",
						   "true");
		LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
						   "no-infs-fp-math",
						   "true");
		LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
						   "no-nans-fp-math",
						   "true");
		LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
						   "unsafe-fp-math",
						   "true");
	}
}
4137
4138 static void declare_streamout_params(struct si_shader_context *ctx,
4139 struct pipe_stream_output_info *so,
4140 struct si_function_info *fninfo)
4141 {
4142 int i;
4143
4144 /* Streamout SGPRs. */
4145 if (so->num_outputs) {
4146 if (ctx->type != PIPE_SHADER_TESS_EVAL)
4147 ctx->param_streamout_config = add_arg(fninfo, ARG_SGPR, ctx->ac.i32);
4148 else
4149 ctx->param_streamout_config = fninfo->num_params - 1;
4150
4151 ctx->param_streamout_write_index = add_arg(fninfo, ARG_SGPR, ctx->ac.i32);
4152 }
4153 /* A streamout buffer offset is loaded if the stride is non-zero. */
4154 for (i = 0; i < 4; i++) {
4155 if (!so->stride[i])
4156 continue;
4157
4158 ctx->param_streamout_offset[i] = add_arg(fninfo, ARG_SGPR, ctx->ac.i32);
4159 }
4160 }
4161
4162 static unsigned llvm_get_type_size(LLVMTypeRef type)
4163 {
4164 LLVMTypeKind kind = LLVMGetTypeKind(type);
4165
4166 switch (kind) {
4167 case LLVMIntegerTypeKind:
4168 return LLVMGetIntTypeWidth(type) / 8;
4169 case LLVMFloatTypeKind:
4170 return 4;
4171 case LLVMPointerTypeKind:
4172 return 8;
4173 case LLVMVectorTypeKind:
4174 return LLVMGetVectorSize(type) *
4175 llvm_get_type_size(LLVMGetElementType(type));
4176 case LLVMArrayTypeKind:
4177 return LLVMGetArrayLength(type) *
4178 llvm_get_type_size(LLVMGetElementType(type));
4179 default:
4180 assert(0);
4181 return 0;
4182 }
4183 }
4184
4185 static void declare_lds_as_pointer(struct si_shader_context *ctx)
4186 {
4187 struct gallivm_state *gallivm = &ctx->gallivm;
4188
4189 unsigned lds_size = ctx->screen->b.chip_class >= CIK ? 65536 : 32768;
4190 ctx->lds = LLVMBuildIntToPtr(gallivm->builder, ctx->i32_0,
4191 LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), LOCAL_ADDR_SPACE),
4192 "lds");
4193 }
4194
4195 static unsigned si_get_max_workgroup_size(const struct si_shader *shader)
4196 {
4197 switch (shader->selector->type) {
4198 case PIPE_SHADER_TESS_CTRL:
4199 /* Return this so that LLVM doesn't remove s_barrier
4200 * instructions on chips where we use s_barrier. */
4201 return shader->selector->screen->b.chip_class >= CIK ? 128 : 64;
4202
4203 case PIPE_SHADER_GEOMETRY:
4204 return shader->selector->screen->b.chip_class >= GFX9 ? 128 : 64;
4205
4206 case PIPE_SHADER_COMPUTE:
4207 break; /* see below */
4208
4209 default:
4210 return 0;
4211 }
4212
4213 const unsigned *properties = shader->selector->info.properties;
4214 unsigned max_work_group_size =
4215 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
4216 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
4217 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
4218
4219 if (!max_work_group_size) {
4220 /* This is a variable group size compute shader,
4221 * compile it for the maximum possible group size.
4222 */
4223 max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
4224 }
4225 return max_work_group_size;
4226 }
4227
4228 static void declare_per_stage_desc_pointers(struct si_shader_context *ctx,
4229 struct si_function_info *fninfo,
4230 bool assign_params)
4231 {
4232 unsigned const_and_shader_buffers =
4233 add_arg(fninfo, ARG_SGPR,
4234 si_const_array(ctx->v4i32,
4235 SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS));
4236 unsigned samplers_and_images =
4237 add_arg(fninfo, ARG_SGPR,
4238 si_const_array(ctx->v8i32,
4239 SI_NUM_IMAGES + SI_NUM_SAMPLERS * 2));
4240
4241 if (assign_params) {
4242 ctx->param_const_and_shader_buffers = const_and_shader_buffers;
4243 ctx->param_samplers_and_images = samplers_and_images;
4244 }
4245 }
4246
/* Declare the descriptor SGPRs common to non-merged shaders: the RW buffer
 * array followed by the per-stage descriptor arrays.
 */
static void declare_default_desc_pointers(struct si_shader_context *ctx,
					  struct si_function_info *fninfo)
{
	ctx->param_rw_buffers = add_arg(fninfo, ARG_SGPR,
		si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS));
	declare_per_stage_desc_pointers(ctx, fninfo, true);
}
4254
/* Declare the SGPR inputs specific to vertex shaders: vertex buffer
 * descriptors and the draw parameters. The add_arg order defines the user
 * SGPR layout.
 */
static void declare_vs_specific_input_sgprs(struct si_shader_context *ctx,
					    struct si_function_info *fninfo)
{
	ctx->param_vertex_buffers = add_arg(fninfo, ARG_SGPR,
		si_const_array(ctx->v4i32, SI_NUM_VERTEX_BUFFERS));
	add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.base_vertex);
	add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.start_instance);
	add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.draw_id);
	ctx->param_vs_state_bits = add_arg(fninfo, ARG_SGPR, ctx->i32);
}
4265
/* Declare the VGPR inputs of a vertex shader. The declaration order defines
 * the VGPR layout: VertexID, then either RelAutoID+InstanceID (as_ls) or
 * InstanceID+PrimID, one unused VGPR, and finally one vertex-load index per
 * shader input when a prolog fetches vertices.
 */
static void declare_vs_input_vgprs(struct si_shader_context *ctx,
				   struct si_function_info *fninfo,
				   unsigned *num_prolog_vgprs)
{
	struct si_shader *shader = ctx->shader;

	add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.vertex_id);
	if (shader->key.as_ls) {
		ctx->param_rel_auto_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
		add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.instance_id);
	} else {
		add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.instance_id);
		ctx->param_vs_prim_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
	}
	add_arg(fninfo, ARG_VGPR, ctx->i32); /* unused */

	if (!shader->is_gs_copy_shader) {
		/* Vertex load indices. */
		ctx->param_vertex_index0 = fninfo->num_params;
		for (unsigned i = 0; i < shader->selector->info.num_inputs; i++)
			add_arg(fninfo, ARG_VGPR, ctx->i32);
		/* These VGPRs are filled in by the prolog, not by callers. */
		*num_prolog_vgprs += shader->selector->info.num_inputs;
	}
}
4290
/* Declare the VGPR inputs of a tessellation evaluation shader:
 * the (u, v) tess coordinates, the relative patch ID, and the patch ID.
 */
static void declare_tes_input_vgprs(struct si_shader_context *ctx,
				    struct si_function_info *fninfo)
{
	ctx->param_tes_u = add_arg(fninfo, ARG_VGPR, ctx->f32);
	ctx->param_tes_v = add_arg(fninfo, ARG_VGPR, ctx->f32);
	ctx->param_tes_rel_patch_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
	ctx->param_tes_patch_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
}
4299
enum {
	/* Convenient merged shader definitions.
	 *
	 * GFX9 merges LS+HS and ES+GS into single hardware stages; these
	 * values extend the PIPE_SHADER_* namespace so create_function can
	 * switch over the merged cases as well.
	 */
	SI_SHADER_MERGED_VERTEX_TESSCTRL = PIPE_SHADER_TYPES,
	SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY,
};
4305
4306 static void create_function(struct si_shader_context *ctx)
4307 {
4308 struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
4309 struct gallivm_state *gallivm = &ctx->gallivm;
4310 struct si_shader *shader = ctx->shader;
4311 struct si_function_info fninfo;
4312 LLVMTypeRef returns[16+32*4];
4313 unsigned i, num_return_sgprs;
4314 unsigned num_returns = 0;
4315 unsigned num_prolog_vgprs = 0;
4316 unsigned type = ctx->type;
4317
4318 si_init_function_info(&fninfo);
4319
4320 /* Set MERGED shaders. */
4321 if (ctx->screen->b.chip_class >= GFX9) {
4322 if (shader->key.as_ls || type == PIPE_SHADER_TESS_CTRL)
4323 type = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
4324 else if (shader->key.as_es || type == PIPE_SHADER_GEOMETRY)
4325 type = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
4326 }
4327
4328 LLVMTypeRef v3i32 = LLVMVectorType(ctx->i32, 3);
4329
4330 switch (type) {
4331 case PIPE_SHADER_VERTEX:
4332 declare_default_desc_pointers(ctx, &fninfo);
4333 declare_vs_specific_input_sgprs(ctx, &fninfo);
4334
4335 if (shader->key.as_es) {
4336 assert(!shader->selector->nir);
4337 ctx->param_es2gs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4338 } else if (shader->key.as_ls) {
4339 assert(!shader->selector->nir);
4340 /* no extra parameters */
4341 } else {
4342 if (shader->is_gs_copy_shader) {
4343 fninfo.num_params = ctx->param_rw_buffers + 1;
4344 fninfo.num_sgpr_params = fninfo.num_params;
4345 }
4346
4347 /* The locations of the other parameters are assigned dynamically. */
4348 declare_streamout_params(ctx, &shader->selector->so,
4349 &fninfo);
4350 }
4351
4352 /* VGPRs */
4353 declare_vs_input_vgprs(ctx, &fninfo, &num_prolog_vgprs);
4354 break;
4355
4356 case PIPE_SHADER_TESS_CTRL: /* SI-CI-VI */
4357 declare_default_desc_pointers(ctx, &fninfo);
4358 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4359 ctx->param_tcs_out_lds_offsets = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4360 ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4361 ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4362 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4363 ctx->param_tcs_factor_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4364 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4365 ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4366
4367 /* VGPRs */
4368 ctx->param_tcs_patch_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4369 ctx->param_tcs_rel_ids = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4370
4371 /* param_tcs_offchip_offset and param_tcs_factor_offset are
4372 * placed after the user SGPRs.
4373 */
4374 for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
4375 returns[num_returns++] = ctx->i32; /* SGPRs */
4376 for (i = 0; i < 5; i++)
4377 returns[num_returns++] = ctx->f32; /* VGPRs */
4378 break;
4379
4380 case SI_SHADER_MERGED_VERTEX_TESSCTRL:
4381 /* Merged stages have 8 system SGPRs at the beginning. */
4382 ctx->param_rw_buffers = /* SPI_SHADER_USER_DATA_ADDR_LO_HS */
4383 add_arg(&fninfo, ARG_SGPR, si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS));
4384 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4385 ctx->param_merged_wave_info = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4386 ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4387 ctx->param_merged_scratch_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4388 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4389 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4390
4391 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4392 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4393 declare_per_stage_desc_pointers(ctx, &fninfo,
4394 ctx->type == PIPE_SHADER_VERTEX);
4395 declare_vs_specific_input_sgprs(ctx, &fninfo);
4396
4397 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4398 ctx->param_tcs_out_lds_offsets = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4399 ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4400 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4401 ctx->param_tcs_factor_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4402 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4403
4404 declare_per_stage_desc_pointers(ctx, &fninfo,
4405 ctx->type == PIPE_SHADER_TESS_CTRL);
4406
4407 /* VGPRs (first TCS, then VS) */
4408 ctx->param_tcs_patch_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4409 ctx->param_tcs_rel_ids = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4410
4411 if (ctx->type == PIPE_SHADER_VERTEX) {
4412 declare_vs_input_vgprs(ctx, &fninfo,
4413 &num_prolog_vgprs);
4414
4415 /* LS return values are inputs to the TCS main shader part. */
4416 for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
4417 returns[num_returns++] = ctx->i32; /* SGPRs */
4418 for (i = 0; i < 2; i++)
4419 returns[num_returns++] = ctx->f32; /* VGPRs */
4420 } else {
4421 /* TCS return values are inputs to the TCS epilog.
4422 *
4423 * param_tcs_offchip_offset, param_tcs_factor_offset,
4424 * param_tcs_offchip_layout, and param_rw_buffers
4425 * should be passed to the epilog.
4426 */
4427 for (i = 0; i <= 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K; i++)
4428 returns[num_returns++] = ctx->i32; /* SGPRs */
4429 for (i = 0; i < 5; i++)
4430 returns[num_returns++] = ctx->f32; /* VGPRs */
4431 }
4432 break;
4433
4434 case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
4435 /* Merged stages have 8 system SGPRs at the beginning. */
4436 ctx->param_rw_buffers = /* SPI_SHADER_USER_DATA_ADDR_LO_GS */
4437 add_arg(&fninfo, ARG_SGPR, si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS));
4438 ctx->param_gs2vs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4439 ctx->param_merged_wave_info = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4440 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4441 ctx->param_merged_scratch_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4442 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_PGM_LO/HI_GS << 8) */
4443 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */
4444
4445 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4446 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4447 declare_per_stage_desc_pointers(ctx, &fninfo,
4448 (ctx->type == PIPE_SHADER_VERTEX ||
4449 ctx->type == PIPE_SHADER_TESS_EVAL));
4450 if (ctx->type == PIPE_SHADER_VERTEX) {
4451 declare_vs_specific_input_sgprs(ctx, &fninfo);
4452 } else {
4453 /* TESS_EVAL (and also GEOMETRY):
4454 * Declare as many input SGPRs as the VS has. */
4455 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4456 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4457 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4458 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4459 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4460 ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4461 }
4462
4463 declare_per_stage_desc_pointers(ctx, &fninfo,
4464 ctx->type == PIPE_SHADER_GEOMETRY);
4465
4466 /* VGPRs (first GS, then VS/TES) */
4467 ctx->param_gs_vtx01_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4468 ctx->param_gs_vtx23_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4469 ctx->param_gs_prim_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4470 ctx->param_gs_instance_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4471 ctx->param_gs_vtx45_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4472
4473 if (ctx->type == PIPE_SHADER_VERTEX) {
4474 declare_vs_input_vgprs(ctx, &fninfo,
4475 &num_prolog_vgprs);
4476 } else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
4477 declare_tes_input_vgprs(ctx, &fninfo);
4478 }
4479
4480 if (ctx->type == PIPE_SHADER_VERTEX ||
4481 ctx->type == PIPE_SHADER_TESS_EVAL) {
4482 /* ES return values are inputs to GS. */
4483 for (i = 0; i < 8 + GFX9_GS_NUM_USER_SGPR; i++)
4484 returns[num_returns++] = ctx->i32; /* SGPRs */
4485 for (i = 0; i < 5; i++)
4486 returns[num_returns++] = ctx->f32; /* VGPRs */
4487 }
4488 break;
4489
4490 case PIPE_SHADER_TESS_EVAL:
4491 declare_default_desc_pointers(ctx, &fninfo);
4492 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4493 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4494
4495 if (shader->key.as_es) {
4496 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4497 add_arg(&fninfo, ARG_SGPR, ctx->i32);
4498 ctx->param_es2gs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4499 } else {
4500 add_arg(&fninfo, ARG_SGPR, ctx->i32);
4501 declare_streamout_params(ctx, &shader->selector->so,
4502 &fninfo);
4503 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4504 }
4505
4506 /* VGPRs */
4507 declare_tes_input_vgprs(ctx, &fninfo);
4508 break;
4509
4510 case PIPE_SHADER_GEOMETRY:
4511 declare_default_desc_pointers(ctx, &fninfo);
4512 ctx->param_gs2vs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4513 ctx->param_gs_wave_id = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4514
4515 /* VGPRs */
4516 ctx->param_gs_vtx0_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4517 ctx->param_gs_vtx1_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4518 ctx->param_gs_prim_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4519 ctx->param_gs_vtx2_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4520 ctx->param_gs_vtx3_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4521 ctx->param_gs_vtx4_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4522 ctx->param_gs_vtx5_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4523 ctx->param_gs_instance_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4524 break;
4525
4526 case PIPE_SHADER_FRAGMENT:
4527 declare_default_desc_pointers(ctx, &fninfo);
4528 add_arg_checked(&fninfo, ARG_SGPR, ctx->f32, SI_PARAM_ALPHA_REF);
4529 add_arg_checked(&fninfo, ARG_SGPR, ctx->i32, SI_PARAM_PRIM_MASK);
4530
4531 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_SAMPLE);
4532 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_CENTER);
4533 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_CENTROID);
4534 add_arg_checked(&fninfo, ARG_VGPR, v3i32, SI_PARAM_PERSP_PULL_MODEL);
4535 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_SAMPLE);
4536 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_CENTER);
4537 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_CENTROID);
4538 add_arg_checked(&fninfo, ARG_VGPR, ctx->f32, SI_PARAM_LINE_STIPPLE_TEX);
4539 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4540 &ctx->abi.frag_pos[0], SI_PARAM_POS_X_FLOAT);
4541 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4542 &ctx->abi.frag_pos[1], SI_PARAM_POS_Y_FLOAT);
4543 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4544 &ctx->abi.frag_pos[2], SI_PARAM_POS_Z_FLOAT);
4545 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4546 &ctx->abi.frag_pos[3], SI_PARAM_POS_W_FLOAT);
4547 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->i32,
4548 &ctx->abi.front_face, SI_PARAM_FRONT_FACE);
4549 shader->info.face_vgpr_index = 20;
4550 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->i32,
4551 &ctx->abi.ancillary, SI_PARAM_ANCILLARY);
4552 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4553 &ctx->abi.sample_coverage, SI_PARAM_SAMPLE_COVERAGE);
4554 add_arg_checked(&fninfo, ARG_VGPR, ctx->i32, SI_PARAM_POS_FIXED_PT);
4555
4556 /* Color inputs from the prolog. */
4557 if (shader->selector->info.colors_read) {
4558 unsigned num_color_elements =
4559 util_bitcount(shader->selector->info.colors_read);
4560
4561 assert(fninfo.num_params + num_color_elements <= ARRAY_SIZE(fninfo.types));
4562 for (i = 0; i < num_color_elements; i++)
4563 add_arg(&fninfo, ARG_VGPR, ctx->f32);
4564
4565 num_prolog_vgprs += num_color_elements;
4566 }
4567
4568 /* Outputs for the epilog. */
4569 num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
4570 num_returns =
4571 num_return_sgprs +
4572 util_bitcount(shader->selector->info.colors_written) * 4 +
4573 shader->selector->info.writes_z +
4574 shader->selector->info.writes_stencil +
4575 shader->selector->info.writes_samplemask +
4576 1 /* SampleMaskIn */;
4577
4578 num_returns = MAX2(num_returns,
4579 num_return_sgprs +
4580 PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
4581
4582 for (i = 0; i < num_return_sgprs; i++)
4583 returns[i] = ctx->i32;
4584 for (; i < num_returns; i++)
4585 returns[i] = ctx->f32;
4586 break;
4587
4588 case PIPE_SHADER_COMPUTE:
4589 declare_default_desc_pointers(ctx, &fninfo);
4590 if (shader->selector->info.uses_grid_size)
4591 ctx->param_grid_size = add_arg(&fninfo, ARG_SGPR, v3i32);
4592 if (shader->selector->info.uses_block_size)
4593 ctx->param_block_size = add_arg(&fninfo, ARG_SGPR, v3i32);
4594
4595 for (i = 0; i < 3; i++) {
4596 ctx->param_block_id[i] = -1;
4597 if (shader->selector->info.uses_block_id[i])
4598 ctx->param_block_id[i] = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4599 }
4600
4601 ctx->param_thread_id = add_arg(&fninfo, ARG_VGPR, v3i32);
4602 break;
4603 default:
4604 assert(0 && "unimplemented shader");
4605 return;
4606 }
4607
4608 si_create_function(ctx, "main", returns, num_returns, &fninfo,
4609 si_get_max_workgroup_size(shader));
4610
4611 /* Reserve register locations for VGPR inputs the PS prolog may need. */
4612 if (ctx->type == PIPE_SHADER_FRAGMENT &&
4613 ctx->separate_prolog) {
4614 si_llvm_add_attribute(ctx->main_fn,
4615 "InitialPSInputAddr",
4616 S_0286D0_PERSP_SAMPLE_ENA(1) |
4617 S_0286D0_PERSP_CENTER_ENA(1) |
4618 S_0286D0_PERSP_CENTROID_ENA(1) |
4619 S_0286D0_LINEAR_SAMPLE_ENA(1) |
4620 S_0286D0_LINEAR_CENTER_ENA(1) |
4621 S_0286D0_LINEAR_CENTROID_ENA(1) |
4622 S_0286D0_FRONT_FACE_ENA(1) |
4623 S_0286D0_POS_FIXED_PT_ENA(1));
4624 }
4625
4626 shader->info.num_input_sgprs = 0;
4627 shader->info.num_input_vgprs = 0;
4628
4629 for (i = 0; i < fninfo.num_sgpr_params; ++i)
4630 shader->info.num_input_sgprs += llvm_get_type_size(fninfo.types[i]) / 4;
4631
4632 for (; i < fninfo.num_params; ++i)
4633 shader->info.num_input_vgprs += llvm_get_type_size(fninfo.types[i]) / 4;
4634
4635 assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
4636 shader->info.num_input_vgprs -= num_prolog_vgprs;
4637
4638 if (!ctx->screen->has_ds_bpermute &&
4639 bld_base->info &&
4640 (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 ||
4641 bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 ||
4642 bld_base->info->opcode_count[TGSI_OPCODE_DDX_FINE] > 0 ||
4643 bld_base->info->opcode_count[TGSI_OPCODE_DDY_FINE] > 0 ||
4644 bld_base->info->opcode_count[TGSI_OPCODE_INTERP_OFFSET] > 0 ||
4645 bld_base->info->opcode_count[TGSI_OPCODE_INTERP_SAMPLE] > 0))
4646 ctx->lds =
4647 LLVMAddGlobalInAddressSpace(gallivm->module,
4648 LLVMArrayType(ctx->i32, 64),
4649 "ddxy_lds",
4650 LOCAL_ADDR_SPACE);
4651
4652 if (shader->key.as_ls ||
4653 ctx->type == PIPE_SHADER_TESS_CTRL ||
4654 /* GFX9 has the ESGS ring buffer in LDS. */
4655 (ctx->screen->b.chip_class >= GFX9 &&
4656 (shader->key.as_es ||
4657 ctx->type == PIPE_SHADER_GEOMETRY)))
4658 declare_lds_as_pointer(ctx);
4659 }
4660
4661 /**
4662 * Load ESGS and GSVS ring buffer resource descriptors and save the variables
4663 * for later use.
4664 */
static void preload_ring_buffers(struct si_shader_context *ctx)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMBuilderRef builder = gallivm->builder;

	/* All ring descriptors are loaded from the RW buffer descriptor array. */
	LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
					    ctx->param_rw_buffers);

	/* ESGS ring: only used on <= VI (GFX9 keeps the ESGS ring in LDS
	 * instead).  ES and GS use different descriptor slots. */
	if (ctx->screen->b.chip_class <= VI &&
	    (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY)) {
		unsigned ring =
			ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS
							  : SI_ES_RING_ESGS;
		LLVMValueRef offset = LLVMConstInt(ctx->i32, ring, 0);

		ctx->esgs_ring =
			ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
	}

	if (ctx->shader->is_gs_copy_shader) {
		/* The GS copy shader only reads via gsvs_ring[0]. */
		LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);

		ctx->gsvs_ring[0] =
			ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
	} else if (ctx->type == PIPE_SHADER_GEOMETRY) {
		const struct si_shader_selector *sel = ctx->shader->selector;
		LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);
		LLVMValueRef base_ring;

		base_ring = ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);

		/* The conceptual layout of the GSVS ring is
		 *   v0c0 .. vLv0 v0c1 .. vLc1 ..
		 * but the real memory layout is swizzled across
		 * threads:
		 *   t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
		 *   t16v0c0 ..
		 * Override the buffer descriptor accordingly.
		 */
		LLVMTypeRef v2i64 = LLVMVectorType(ctx->i64, 2);
		uint64_t stream_offset = 0;

		/* Build a separate, patched descriptor per vertex stream
		 * that actually has stream outputs. */
		for (unsigned stream = 0; stream < 4; ++stream) {
			unsigned num_components;
			unsigned stride;
			unsigned num_records;
			LLVMValueRef ring, tmp;

			num_components = sel->info.num_stream_output_components[stream];
			if (!num_components)
				continue;

			stride = 4 * num_components * sel->gs_max_out_vertices;

			/* Limit on the stride field for <= CIK. */
			assert(stride < (1 << 14));

			num_records = 64;

			/* Dwords 0-1: 64-bit base address; add this
			 * stream's byte offset into the ring. */
			ring = LLVMBuildBitCast(builder, base_ring, v2i64, "");
			tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_0, "");
			tmp = LLVMBuildAdd(builder, tmp,
					   LLVMConstInt(ctx->i64,
							stream_offset, 0), "");
			stream_offset += stride * 64;

			ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_0, "");
			ring = LLVMBuildBitCast(builder, ring, ctx->v4i32, "");
			/* Dword 1: OR in the stride and swizzle enable. */
			tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_1, "");
			tmp = LLVMBuildOr(builder, tmp,
					  LLVMConstInt(ctx->i32,
						       S_008F04_STRIDE(stride) |
						       S_008F04_SWIZZLE_ENABLE(1), 0), "");
			ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_1, "");
			/* Dword 2: number of records. */
			ring = LLVMBuildInsertElement(builder, ring,
					LLVMConstInt(ctx->i32, num_records, 0),
					LLVMConstInt(ctx->i32, 2, 0), "");
			/* Dword 3: data format and swizzle parameters. */
			ring = LLVMBuildInsertElement(builder, ring,
				LLVMConstInt(ctx->i32,
					     S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
					     S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
					     S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
					     S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
					     S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
					     S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
					     S_008F0C_ELEMENT_SIZE(1) | /* element_size = 4 (bytes) */
					     S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
					     S_008F0C_ADD_TID_ENABLE(1),
					     0),
				LLVMConstInt(ctx->i32, 3, 0), "");

			ctx->gsvs_ring[stream] = ring;
		}
	}
}
4760
/* Emit code that kills the current fragment when the corresponding bit
 * of the 32x32 polygon stipple pattern is 0.  The pattern is read from
 * the SI_PS_CONST_POLY_STIPPLE buffer; the fragment position comes from
 * the fixed-point position parameter (param_pos_fixed_pt).
 */
static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
					 LLVMValueRef param_rw_buffers,
					 unsigned param_pos_fixed_pt)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	LLVMValueRef slot, desc, offset, row, bit, address[2];

	/* Use the fixed-point gl_FragCoord input.
	 * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
	 * per coordinate to get the repeating effect.
	 */
	address[0] = unpack_param(ctx, param_pos_fixed_pt, 0, 5);
	address[1] = unpack_param(ctx, param_pos_fixed_pt, 16, 5);

	/* Load the buffer descriptor. */
	slot = LLVMConstInt(ctx->i32, SI_PS_CONST_POLY_STIPPLE, 0);
	desc = ac_build_indexed_load_const(&ctx->ac, param_rw_buffers, slot);

	/* The stipple pattern is 32x32, each row has 32 bits. */
	offset = LLVMBuildMul(builder, address[1],
			      LLVMConstInt(ctx->i32, 4, 0), "");
	row = buffer_load_const(ctx, desc, offset);
	row = LLVMBuildBitCast(builder, row, ctx->i32, "");
	bit = LLVMBuildLShr(builder, row, address[0], "");
	bit = LLVMBuildTrunc(builder, bit, ctx->i1, "");

	/* The intrinsic kills the thread if arg < 0. */
	bit = LLVMBuildSelect(builder, bit, LLVMConstReal(ctx->f32, 0),
			      LLVMConstReal(ctx->f32, -1), "");
	ac_build_kill(&ctx->ac, bit);
}
4793
/* Parse the config register section emitted into the shader binary and
 * fill in *conf: SGPR/VGPR counts, float mode, LDS size, scratch size
 * per wave, SPI PS input masks and spill statistics.
 *
 * The config section is a sequence of (register offset, value) dword
 * pairs starting at the given symbol offset.
 */
void si_shader_binary_read_config(struct ac_shader_binary *binary,
				  struct si_shader_config *conf,
				  unsigned symbol_offset)
{
	unsigned i;
	const unsigned char *config =
		ac_shader_binary_config_start(binary, symbol_offset);
	bool really_needs_scratch = false;

	/* LLVM adds SGPR spills to the scratch size.
	 * Find out if we really need the scratch buffer.
	 */
	for (i = 0; i < binary->reloc_count; i++) {
		const struct ac_shader_reloc *reloc = &binary->relocs[i];

		if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
		    !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
			really_needs_scratch = true;
			break;
		}
	}

	/* XXX: We may be able to emit some of these values directly rather than
	 * extracting fields to be emitted later.
	 */

	for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
		unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
		unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
		switch (reg) {
		case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
		case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
		case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
		case R_00B428_SPI_SHADER_PGM_RSRC1_HS:
		case R_00B848_COMPUTE_PGM_RSRC1:
			/* The hw fields are in granules of 8 SGPRs / 4 VGPRs,
			 * biased by one granule. */
			conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
			conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
			conf->float_mode = G_00B028_FLOAT_MODE(value);
			conf->rsrc1 = value;
			break;
		case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
			conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
			break;
		case R_00B84C_COMPUTE_PGM_RSRC2:
			conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
			conf->rsrc2 = value;
			break;
		case R_0286CC_SPI_PS_INPUT_ENA:
			conf->spi_ps_input_ena = value;
			break;
		case R_0286D0_SPI_PS_INPUT_ADDR:
			conf->spi_ps_input_addr = value;
			break;
		case R_0286E8_SPI_TMPRING_SIZE:
		case R_00B860_COMPUTE_TMPRING_SIZE:
			/* WAVESIZE is in units of 256 dwords. */
			if (really_needs_scratch)
				conf->scratch_bytes_per_wave =
					G_00B860_WAVESIZE(value) * 256 * 4;
			break;
		case 0x4: /* SPILLED_SGPRS */
			conf->spilled_sgprs = value;
			break;
		case 0x8: /* SPILLED_VGPRS */
			conf->spilled_vgprs = value;
			break;
		default:
		{
			/* Warn only once per process to avoid log spam. */
			static bool printed;

			if (!printed) {
				fprintf(stderr, "Warning: LLVM emitted unknown "
					"config register: 0x%x\n", reg);
				printed = true;
			}
		}
		break;
		}
	}

	/* If the compiler didn't emit INPUT_ADDR, fall back to INPUT_ENA. */
	if (!conf->spi_ps_input_addr)
		conf->spi_ps_input_addr = conf->spi_ps_input_ena;
}
4877
/* Patch the two dwords of the scratch buffer resource descriptor
 * (base address low/high) into the shader machine code at the
 * relocation sites recorded by the compiler.
 *
 * \param scratch_va  GPU virtual address of the scratch buffer.
 */
void si_shader_apply_scratch_relocs(struct si_shader *shader,
				    uint64_t scratch_va)
{
	unsigned i;
	uint32_t scratch_rsrc_dword0 = scratch_va;
	uint32_t scratch_rsrc_dword1 =
		S_008F04_BASE_ADDRESS_HI(scratch_va >> 32);

	/* Enable scratch coalescing. */
	scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1);

	for (i = 0 ; i < shader->binary.reloc_count; i++) {
		const struct ac_shader_reloc *reloc =
					&shader->binary.relocs[i];
		if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
			util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
			&scratch_rsrc_dword0, 4);
		} else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
			util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
			&scratch_rsrc_dword1, 4);
		}
	}
}
4901
4902 static unsigned si_get_shader_binary_size(const struct si_shader *shader)
4903 {
4904 unsigned size = shader->binary.code_size;
4905
4906 if (shader->prolog)
4907 size += shader->prolog->binary.code_size;
4908 if (shader->previous_stage)
4909 size += shader->previous_stage->binary.code_size;
4910 if (shader->prolog2)
4911 size += shader->prolog2->binary.code_size;
4912 if (shader->epilog)
4913 size += shader->epilog->binary.code_size;
4914 return size;
4915 }
4916
4917 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
4918 {
4919 const struct ac_shader_binary *prolog =
4920 shader->prolog ? &shader->prolog->binary : NULL;
4921 const struct ac_shader_binary *previous_stage =
4922 shader->previous_stage ? &shader->previous_stage->binary : NULL;
4923 const struct ac_shader_binary *prolog2 =
4924 shader->prolog2 ? &shader->prolog2->binary : NULL;
4925 const struct ac_shader_binary *epilog =
4926 shader->epilog ? &shader->epilog->binary : NULL;
4927 const struct ac_shader_binary *mainb = &shader->binary;
4928 unsigned bo_size = si_get_shader_binary_size(shader) +
4929 (!epilog ? mainb->rodata_size : 0);
4930 unsigned char *ptr;
4931
4932 assert(!prolog || !prolog->rodata_size);
4933 assert(!previous_stage || !previous_stage->rodata_size);
4934 assert(!prolog2 || !prolog2->rodata_size);
4935 assert((!prolog && !previous_stage && !prolog2 && !epilog) ||
4936 !mainb->rodata_size);
4937 assert(!epilog || !epilog->rodata_size);
4938
4939 r600_resource_reference(&shader->bo, NULL);
4940 shader->bo = (struct r600_resource*)
4941 pipe_buffer_create(&sscreen->b.b, 0,
4942 PIPE_USAGE_IMMUTABLE,
4943 align(bo_size, SI_CPDMA_ALIGNMENT));
4944 if (!shader->bo)
4945 return -ENOMEM;
4946
4947 /* Upload. */
4948 ptr = sscreen->b.ws->buffer_map(shader->bo->buf, NULL,
4949 PIPE_TRANSFER_READ_WRITE |
4950 PIPE_TRANSFER_UNSYNCHRONIZED);
4951
4952 /* Don't use util_memcpy_cpu_to_le32. LLVM binaries are
4953 * endian-independent. */
4954 if (prolog) {
4955 memcpy(ptr, prolog->code, prolog->code_size);
4956 ptr += prolog->code_size;
4957 }
4958 if (previous_stage) {
4959 memcpy(ptr, previous_stage->code, previous_stage->code_size);
4960 ptr += previous_stage->code_size;
4961 }
4962 if (prolog2) {
4963 memcpy(ptr, prolog2->code, prolog2->code_size);
4964 ptr += prolog2->code_size;
4965 }
4966
4967 memcpy(ptr, mainb->code, mainb->code_size);
4968 ptr += mainb->code_size;
4969
4970 if (epilog)
4971 memcpy(ptr, epilog->code, epilog->code_size);
4972 else if (mainb->rodata_size > 0)
4973 memcpy(ptr, mainb->rodata, mainb->rodata_size);
4974
4975 sscreen->b.ws->buffer_unmap(shader->bo->buf);
4976 return 0;
4977 }
4978
/* Print the disassembly (or a raw hex dump when no disassembly string
 * is available) of one shader part to \p file, and optionally forward
 * it line by line through the debug callback. */
static void si_shader_dump_disassembly(const struct ac_shader_binary *binary,
				       struct pipe_debug_callback *debug,
				       const char *name, FILE *file)
{
	char *line, *p;
	unsigned i, count;

	if (binary->disasm_string) {
		fprintf(file, "Shader %s disassembly:\n", name);
		fprintf(file, "%s", binary->disasm_string);

		if (debug && debug->debug_message) {
			/* Very long debug messages are cut off, so send the
			 * disassembly one line at a time. This causes more
			 * overhead, but on the plus side it simplifies
			 * parsing of resulting logs.
			 */
			pipe_debug_message(debug, SHADER_INFO,
					   "Shader Disassembly Begin");

			line = binary->disasm_string;
			while (*line) {
				p = util_strchrnul(line, '\n');
				count = p - line;

				if (count) {
					pipe_debug_message(debug, SHADER_INFO,
							   "%.*s", count, line);
				}

				/* Stop at the terminating NUL (no trailing '\n'). */
				if (!*p)
					break;
				line = p + 1;
			}

			pipe_debug_message(debug, SHADER_INFO,
					   "Shader Disassembly End");
		}
	} else {
		/* No disassembly available: dump the raw code dwords
		 * (bytes reversed to print each dword MSB first). */
		fprintf(file, "Shader %s binary:\n", name);
		for (i = 0; i < binary->code_size; i += 4) {
			fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i,
				binary->code[i + 3], binary->code[i + 2],
				binary->code[i + 1], binary->code[i]);
		}
	}
}
5026
/* Compute and print shader resource statistics (register usage, LDS,
 * scratch, estimated max waves per SIMD) to \p file and to the debug
 * callback.  When \p check_debug_option is set, printing to the file is
 * gated on the per-stage shader-dump debug option. */
static void si_shader_dump_stats(struct si_screen *sscreen,
				 const struct si_shader *shader,
				 struct pipe_debug_callback *debug,
				 unsigned processor,
				 FILE *file,
				 bool check_debug_option)
{
	const struct si_shader_config *conf = &shader->config;
	unsigned num_inputs = shader->selector ? shader->selector->info.num_inputs : 0;
	unsigned code_size = si_get_shader_binary_size(shader);
	/* LDS allocation granularity: 512 bytes on CIK+, 256 before. */
	unsigned lds_increment = sscreen->b.chip_class >= CIK ? 512 : 256;
	unsigned lds_per_wave = 0;
	unsigned max_simd_waves = 10;

	/* Compute LDS usage for PS. */
	switch (processor) {
	case PIPE_SHADER_FRAGMENT:
		/* The minimum usage per wave is (num_inputs * 48). The maximum
		 * usage is (num_inputs * 48 * 16).
		 * We can get anything in between and it varies between waves.
		 *
		 * The 48 bytes per input for a single primitive is equal to
		 * 4 bytes/component * 4 components/input * 3 points.
		 *
		 * Other stages don't know the size at compile time or don't
		 * allocate LDS per wave, but instead they do it per thread group.
		 */
		lds_per_wave = conf->lds_size * lds_increment +
			       align(num_inputs * 48, lds_increment);
		break;
	case PIPE_SHADER_COMPUTE:
		if (shader->selector) {
			/* Divide the per-threadgroup LDS among its waves. */
			unsigned max_workgroup_size =
				si_get_max_workgroup_size(shader);
			lds_per_wave = (conf->lds_size * lds_increment) /
				       DIV_ROUND_UP(max_workgroup_size, 64);
		}
		break;
	}

	/* Compute the per-SIMD wave counts. */
	if (conf->num_sgprs) {
		/* SGPR file size per SIMD: 800 on VI+, 512 before. */
		if (sscreen->b.chip_class >= VI)
			max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs);
		else
			max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs);
	}

	if (conf->num_vgprs)
		max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs);

	/* LDS is 64KB per CU (4 SIMDs), which is 16KB per SIMD (usage above
	 * 16KB makes some SIMDs unoccupied). */
	if (lds_per_wave)
		max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);

	if (!check_debug_option ||
	    r600_can_dump_shader(&sscreen->b, processor)) {
		if (processor == PIPE_SHADER_FRAGMENT) {
			fprintf(file, "*** SHADER CONFIG ***\n"
				"SPI_PS_INPUT_ADDR = 0x%04x\n"
				"SPI_PS_INPUT_ENA  = 0x%04x\n",
				conf->spi_ps_input_addr, conf->spi_ps_input_ena);
		}

		fprintf(file, "*** SHADER STATS ***\n"
			"SGPRS: %d\n"
			"VGPRS: %d\n"
			"Spilled SGPRs: %d\n"
			"Spilled VGPRs: %d\n"
			"Private memory VGPRs: %d\n"
			"Code Size: %d bytes\n"
			"LDS: %d blocks\n"
			"Scratch: %d bytes per wave\n"
			"Max Waves: %d\n"
			"********************\n\n\n",
			conf->num_sgprs, conf->num_vgprs,
			conf->spilled_sgprs, conf->spilled_vgprs,
			conf->private_mem_vgprs, code_size,
			conf->lds_size, conf->scratch_bytes_per_wave,
			max_simd_waves);
	}

	/* Always report the stats through the debug callback as well. */
	pipe_debug_message(debug, SHADER_INFO,
			   "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
			   "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
			   "Spilled VGPRs: %d PrivMem VGPRs: %d",
			   conf->num_sgprs, conf->num_vgprs, code_size,
			   conf->lds_size, conf->scratch_bytes_per_wave,
			   max_simd_waves, conf->spilled_sgprs,
			   conf->spilled_vgprs, conf->private_mem_vgprs);
}
5119
5120 const char *si_get_shader_name(const struct si_shader *shader, unsigned processor)
5121 {
5122 switch (processor) {
5123 case PIPE_SHADER_VERTEX:
5124 if (shader->key.as_es)
5125 return "Vertex Shader as ES";
5126 else if (shader->key.as_ls)
5127 return "Vertex Shader as LS";
5128 else
5129 return "Vertex Shader as VS";
5130 case PIPE_SHADER_TESS_CTRL:
5131 return "Tessellation Control Shader";
5132 case PIPE_SHADER_TESS_EVAL:
5133 if (shader->key.as_es)
5134 return "Tessellation Evaluation Shader as ES";
5135 else
5136 return "Tessellation Evaluation Shader as VS";
5137 case PIPE_SHADER_GEOMETRY:
5138 if (shader->is_gs_copy_shader)
5139 return "GS Copy Shader as VS";
5140 else
5141 return "Geometry Shader";
5142 case PIPE_SHADER_FRAGMENT:
5143 return "Pixel Shader";
5144 case PIPE_SHADER_COMPUTE:
5145 return "Compute Shader";
5146 default:
5147 return "Unknown Shader";
5148 }
5149 }
5150
/* Dump everything known about a compiled shader to \p file: the shader
 * key, the recorded LLVM IR (if any), the disassembly of every part
 * (prolog, previous stage, prolog2, main, epilog) and the resource
 * statistics.  When \p check_debug_option is set, output is gated on
 * the per-stage shader-dump debug option. */
void si_shader_dump(struct si_screen *sscreen, const struct si_shader *shader,
		    struct pipe_debug_callback *debug, unsigned processor,
		    FILE *file, bool check_debug_option)
{
	if (!check_debug_option ||
	    r600_can_dump_shader(&sscreen->b, processor))
		si_dump_shader_key(processor, shader, file);

	/* LLVM IR is only recorded when sscreen->record_llvm_ir is set. */
	if (!check_debug_option && shader->binary.llvm_ir_string) {
		if (shader->previous_stage &&
		    shader->previous_stage->binary.llvm_ir_string) {
			fprintf(file, "\n%s - previous stage - LLVM IR:\n\n",
				si_get_shader_name(shader, processor));
			fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string);
		}

		fprintf(file, "\n%s - main shader part - LLVM IR:\n\n",
			si_get_shader_name(shader, processor));
		fprintf(file, "%s\n", shader->binary.llvm_ir_string);
	}

	if (!check_debug_option ||
	    (r600_can_dump_shader(&sscreen->b, processor) &&
	     !(sscreen->b.debug_flags & DBG_NO_ASM))) {
		fprintf(file, "\n%s:\n", si_get_shader_name(shader, processor));

		/* Dump the parts in the order they are concatenated in
		 * the binary (see si_shader_binary_upload). */
		if (shader->prolog)
			si_shader_dump_disassembly(&shader->prolog->binary,
						   debug, "prolog", file);
		if (shader->previous_stage)
			si_shader_dump_disassembly(&shader->previous_stage->binary,
						   debug, "previous stage", file);
		if (shader->prolog2)
			si_shader_dump_disassembly(&shader->prolog2->binary,
						   debug, "prolog2", file);

		si_shader_dump_disassembly(&shader->binary, debug, "main", file);

		if (shader->epilog)
			si_shader_dump_disassembly(&shader->epilog->binary,
						   debug, "epilog", file);
		fprintf(file, "\n");
	}

	si_shader_dump_stats(sscreen, shader, debug, processor, file,
			     check_debug_option);
}
5198
5199 static int si_compile_llvm(struct si_screen *sscreen,
5200 struct ac_shader_binary *binary,
5201 struct si_shader_config *conf,
5202 LLVMTargetMachineRef tm,
5203 LLVMModuleRef mod,
5204 struct pipe_debug_callback *debug,
5205 unsigned processor,
5206 const char *name)
5207 {
5208 int r = 0;
5209 unsigned count = p_atomic_inc_return(&sscreen->b.num_compilations);
5210
5211 if (r600_can_dump_shader(&sscreen->b, processor)) {
5212 fprintf(stderr, "radeonsi: Compiling shader %d\n", count);
5213
5214 if (!(sscreen->b.debug_flags & (DBG_NO_IR | DBG_PREOPT_IR))) {
5215 fprintf(stderr, "%s LLVM IR:\n\n", name);
5216 ac_dump_module(mod);
5217 fprintf(stderr, "\n");
5218 }
5219 }
5220
5221 if (sscreen->record_llvm_ir) {
5222 char *ir = LLVMPrintModuleToString(mod);
5223 binary->llvm_ir_string = strdup(ir);
5224 LLVMDisposeMessage(ir);
5225 }
5226
5227 if (!si_replace_shader(count, binary)) {
5228 r = si_llvm_compile(mod, binary, tm, debug);
5229 if (r)
5230 return r;
5231 }
5232
5233 si_shader_binary_read_config(binary, conf, 0);
5234
5235 /* Enable 64-bit and 16-bit denormals, because there is no performance
5236 * cost.
5237 *
5238 * If denormals are enabled, all floating-point output modifiers are
5239 * ignored.
5240 *
5241 * Don't enable denormals for 32-bit floats, because:
5242 * - Floating-point output modifiers would be ignored by the hw.
5243 * - Some opcodes don't support denormals, such as v_mad_f32. We would
5244 * have to stop using those.
5245 * - SI & CI would be very slow.
5246 */
5247 conf->float_mode |= V_00B028_FP_64_DENORMS;
5248
5249 FREE(binary->config);
5250 FREE(binary->global_symbol_offsets);
5251 binary->config = NULL;
5252 binary->global_symbol_offsets = NULL;
5253
5254 /* Some shaders can't have rodata because their binaries can be
5255 * concatenated.
5256 */
5257 if (binary->rodata_size &&
5258 (processor == PIPE_SHADER_VERTEX ||
5259 processor == PIPE_SHADER_TESS_CTRL ||
5260 processor == PIPE_SHADER_TESS_EVAL ||
5261 processor == PIPE_SHADER_FRAGMENT)) {
5262 fprintf(stderr, "radeonsi: The shader can't have rodata.");
5263 return -EINVAL;
5264 }
5265
5266 return r;
5267 }
5268
5269 static void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret)
5270 {
5271 if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
5272 LLVMBuildRetVoid(ctx->gallivm.builder);
5273 else
5274 LLVMBuildRet(ctx->gallivm.builder, ret);
5275 }
5276
/* Generate code for the hardware VS shader stage to go with a geometry shader.
 *
 * The copy shader reads the GS outputs back from the GSVS ring buffer
 * and exports them as a hardware VS would: one switch case per vertex
 * stream, with streamout for all streams and position/param exports for
 * stream 0 only.  Returns the compiled shader, or NULL on failure
 * (allocation or compile error).
 */
struct si_shader *
si_generate_gs_copy_shader(struct si_screen *sscreen,
			   LLVMTargetMachineRef tm,
			   struct si_shader_selector *gs_selector,
			   struct pipe_debug_callback *debug)
{
	struct si_shader_context ctx;
	struct si_shader *shader;
	/* Only the address is taken here; ctx is initialized below by
	 * si_init_shader_ctx before gallivm is dereferenced. */
	struct gallivm_state *gallivm = &ctx.gallivm;
	LLVMBuilderRef builder;
	struct lp_build_tgsi_context *bld_base = &ctx.bld_base;
	struct lp_build_context *uint = &bld_base->uint_bld;
	struct si_shader_output_values *outputs;
	struct tgsi_shader_info *gsinfo = &gs_selector->info;
	int i, r;

	outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0]));

	if (!outputs)
		return NULL;

	shader = CALLOC_STRUCT(si_shader);
	if (!shader) {
		FREE(outputs);
		return NULL;
	}


	shader->selector = gs_selector;
	shader->is_gs_copy_shader = true;

	/* The copy shader is compiled as a hardware vertex shader. */
	si_init_shader_ctx(&ctx, sscreen, tm);
	ctx.shader = shader;
	ctx.type = PIPE_SHADER_VERTEX;

	builder = gallivm->builder;

	create_function(&ctx);
	preload_ring_buffers(&ctx);

	/* Byte offset of this vertex's data in the GSVS ring. */
	LLVMValueRef voffset =
		lp_build_mul_imm(uint, ctx.abi.vertex_id, 4);

	/* Fetch the vertex stream ID.*/
	LLVMValueRef stream_id;

	if (gs_selector->so.num_outputs)
		stream_id = unpack_param(&ctx, ctx.param_streamout_config, 24, 2);
	else
		stream_id = ctx.i32_0;

	/* Fill in output information. */
	for (i = 0; i < gsinfo->num_outputs; ++i) {
		outputs[i].semantic_name = gsinfo->output_semantic_name[i];
		outputs[i].semantic_index = gsinfo->output_semantic_index[i];

		/* Each channel's 2-bit stream ID is packed in output_streams. */
		for (int chan = 0; chan < 4; chan++) {
			outputs[i].vertex_stream[chan] =
				(gsinfo->output_streams[i] >> (2 * chan)) & 3;
		}
	}

	/* Emit one switch case per vertex stream; unknown stream IDs
	 * fall through to the end block. */
	LLVMBasicBlockRef end_bb;
	LLVMValueRef switch_inst;

	end_bb = LLVMAppendBasicBlockInContext(gallivm->context, ctx.main_fn, "end");
	switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4);

	for (int stream = 0; stream < 4; stream++) {
		LLVMBasicBlockRef bb;
		unsigned offset;

		if (!gsinfo->num_stream_output_components[stream])
			continue;

		/* Streams > 0 are only useful for streamout. */
		if (stream > 0 && !gs_selector->so.num_outputs)
			continue;

		bb = LLVMInsertBasicBlockInContext(gallivm->context, end_bb, "out");
		LLVMAddCase(switch_inst, LLVMConstInt(ctx.i32, stream, 0), bb);
		LLVMPositionBuilderAtEnd(builder, bb);

		/* Fetch vertex data from GSVS ring */
		offset = 0;
		for (i = 0; i < gsinfo->num_outputs; ++i) {
			for (unsigned chan = 0; chan < 4; chan++) {
				/* Channels not written, or belonging to a
				 * different stream, stay undefined. */
				if (!(gsinfo->output_usagemask[i] & (1 << chan)) ||
				    outputs[i].vertex_stream[chan] != stream) {
					outputs[i].values[chan] = ctx.bld_base.base.undef;
					continue;
				}

				LLVMValueRef soffset = LLVMConstInt(ctx.i32,
					offset * gs_selector->gs_max_out_vertices * 16 * 4, 0);
				offset++;

				outputs[i].values[chan] =
					ac_build_buffer_load(&ctx.ac,
							     ctx.gsvs_ring[0], 1,
							     ctx.i32_0, voffset,
							     soffset, 0, 1, 1,
							     true, false);
			}
		}

		/* Streamout and exports. */
		if (gs_selector->so.num_outputs) {
			si_llvm_emit_streamout(&ctx, outputs,
					       gsinfo->num_outputs,
					       stream);
		}

		/* Only stream 0 is exported to the pixel shader. */
		if (stream == 0)
			si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs);

		LLVMBuildBr(builder, end_bb);
	}

	LLVMPositionBuilderAtEnd(builder, end_bb);

	LLVMBuildRetVoid(gallivm->builder);

	ctx.type = PIPE_SHADER_GEOMETRY; /* override for shader dumping */
	si_llvm_optimize_module(&ctx);

	r = si_compile_llvm(sscreen, &ctx.shader->binary,
			    &ctx.shader->config, ctx.tm,
			    ctx.gallivm.module,
			    debug, PIPE_SHADER_GEOMETRY,
			    "GS Copy Shader");
	if (!r) {
		if (r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
			fprintf(stderr, "GS Copy Shader:\n");
		si_shader_dump(sscreen, ctx.shader, debug,
			       PIPE_SHADER_GEOMETRY, stderr, true);
		r = si_shader_binary_upload(sscreen, ctx.shader);
	}

	si_llvm_dispose(&ctx);

	FREE(outputs);

	if (r != 0) {
		FREE(shader);
		shader = NULL;
	}
	return shader;
}
5426
/* Print the VS-prolog portion of a shader key (instance divisor flags
 * and fix_fetch codes).  \p prefix names the sub-struct being dumped so
 * the same helper serves VS, TCS(LS) and GS(ES) keys. */
static void si_dump_shader_key_vs(const struct si_shader_key *key,
				  const struct si_vs_prolog_bits *prolog,
				  const char *prefix, FILE *f)
{
	fprintf(f, " %s.instance_divisor_is_one = %u\n",
		prefix, prolog->instance_divisor_is_one);
	fprintf(f, " %s.instance_divisor_is_fetched = %u\n",
		prefix, prolog->instance_divisor_is_fetched);

	fprintf(f, " mono.vs.fix_fetch = {");
	for (int i = 0; i < SI_MAX_ATTRIBS; i++)
		fprintf(f, !i ? "%u" : ", %u", key->mono.vs_fix_fetch[i]);
	fprintf(f, "}\n");
}
5441
/* Print the shader key fields relevant to the given stage to \p f,
 * for shader dumps (see si_shader_dump). */
static void si_dump_shader_key(unsigned processor, const struct si_shader *shader,
			       FILE *f)
{
	const struct si_shader_key *key = &shader->key;

	fprintf(f, "SHADER KEY\n");

	switch (processor) {
	case PIPE_SHADER_VERTEX:
		si_dump_shader_key_vs(key, &key->part.vs.prolog,
				      "part.vs.prolog", f);
		fprintf(f, " as_es = %u\n", key->as_es);
		fprintf(f, " as_ls = %u\n", key->as_ls);
		fprintf(f, " mono.u.vs_export_prim_id = %u\n",
			key->mono.u.vs_export_prim_id);
		break;

	case PIPE_SHADER_TESS_CTRL:
		/* On GFX9 the TCS is merged with an LS, so it also carries
		 * the VS prolog key. */
		if (shader->selector->screen->b.chip_class >= GFX9) {
			si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog,
					      "part.tcs.ls_prolog", f);
		}
		fprintf(f, " part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode);
		fprintf(f, " mono.u.ff_tcs_inputs_to_copy = 0x%"PRIx64"\n", key->mono.u.ff_tcs_inputs_to_copy);
		break;

	case PIPE_SHADER_TESS_EVAL:
		fprintf(f, " as_es = %u\n", key->as_es);
		fprintf(f, " mono.u.vs_export_prim_id = %u\n",
			key->mono.u.vs_export_prim_id);
		break;

	case PIPE_SHADER_GEOMETRY:
		/* The GS copy shader has no key of its own. */
		if (shader->is_gs_copy_shader)
			break;

		/* On GFX9 the GS is merged with an ES, so it may also carry
		 * a VS prolog key. */
		if (shader->selector->screen->b.chip_class >= GFX9 &&
		    key->part.gs.es->type == PIPE_SHADER_VERTEX) {
			si_dump_shader_key_vs(key, &key->part.gs.vs_prolog,
					      "part.gs.vs_prolog", f);
		}
		fprintf(f, " part.gs.prolog.tri_strip_adj_fix = %u\n", key->part.gs.prolog.tri_strip_adj_fix);
		break;

	case PIPE_SHADER_COMPUTE:
		break;

	case PIPE_SHADER_FRAGMENT:
		fprintf(f, " part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side);
		fprintf(f, " part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors);
		fprintf(f, " part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple);
		fprintf(f, " part.ps.prolog.force_persp_sample_interp = %u\n", key->part.ps.prolog.force_persp_sample_interp);
		fprintf(f, " part.ps.prolog.force_linear_sample_interp = %u\n", key->part.ps.prolog.force_linear_sample_interp);
		fprintf(f, " part.ps.prolog.force_persp_center_interp = %u\n", key->part.ps.prolog.force_persp_center_interp);
		fprintf(f, " part.ps.prolog.force_linear_center_interp = %u\n", key->part.ps.prolog.force_linear_center_interp);
		fprintf(f, " part.ps.prolog.bc_optimize_for_persp = %u\n", key->part.ps.prolog.bc_optimize_for_persp);
		fprintf(f, " part.ps.prolog.bc_optimize_for_linear = %u\n", key->part.ps.prolog.bc_optimize_for_linear);
		fprintf(f, " part.ps.epilog.spi_shader_col_format = 0x%x\n", key->part.ps.epilog.spi_shader_col_format);
		fprintf(f, " part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8);
		fprintf(f, " part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10);
		fprintf(f, " part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf);
		fprintf(f, " part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func);
		fprintf(f, " part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one);
		fprintf(f, " part.ps.epilog.poly_line_smoothing = %u\n", key->part.ps.epilog.poly_line_smoothing);
		fprintf(f, " part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color);
		break;

	default:
		assert(0);
	}

	/* Fields shared by all stages that run as a hardware VS. */
	if ((processor == PIPE_SHADER_GEOMETRY ||
	     processor == PIPE_SHADER_TESS_EVAL ||
	     processor == PIPE_SHADER_VERTEX) &&
	    !key->as_es && !key->as_ls) {
		fprintf(f, " opt.kill_outputs = 0x%"PRIx64"\n", key->opt.kill_outputs);
		fprintf(f, " opt.clip_disable = %u\n", key->opt.clip_disable);
	}
}
5521
/* Initialize the shader compilation context: set up the LLVM/gallivm
 * state and register the radeonsi-specific TGSI opcode emitters and
 * fetch functions on top of the common gallivm defaults. */
static void si_init_shader_ctx(struct si_shader_context *ctx,
			       struct si_screen *sscreen,
			       LLVMTargetMachineRef tm)
{
	struct lp_build_tgsi_context *bld_base;

	ctx->abi.chip_class = sscreen->b.chip_class;

	si_llvm_context_init(ctx, sscreen, tm);

	bld_base = &ctx->bld_base;
	bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;

	/* Interpolation opcodes (fragment shaders). */
	bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action;
	bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE] = interp_action;
	bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET] = interp_action;

	bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit;

	bld_base->op_actions[TGSI_OPCODE_CLOCK].emit = clock_emit;

	/* Derivatives share a single emitter. */
	bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy;
	bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy;
	bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy;
	bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy;

	/* Subgroup/wave operations. */
	bld_base->op_actions[TGSI_OPCODE_VOTE_ALL].emit = vote_all_emit;
	bld_base->op_actions[TGSI_OPCODE_VOTE_ANY].emit = vote_any_emit;
	bld_base->op_actions[TGSI_OPCODE_VOTE_EQ].emit = vote_eq_emit;
	bld_base->op_actions[TGSI_OPCODE_BALLOT].emit = ballot_emit;
	bld_base->op_actions[TGSI_OPCODE_READ_FIRST].intr_name = "llvm.amdgcn.readfirstlane";
	bld_base->op_actions[TGSI_OPCODE_READ_FIRST].emit = read_lane_emit;
	bld_base->op_actions[TGSI_OPCODE_READ_INVOC].intr_name = "llvm.amdgcn.readlane";
	bld_base->op_actions[TGSI_OPCODE_READ_INVOC].fetch_args = read_invoc_fetch_args;
	bld_base->op_actions[TGSI_OPCODE_READ_INVOC].emit = read_lane_emit;

	/* Geometry shader vertex/primitive emission and barriers. */
	bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_llvm_emit_vertex;
	bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive;
	bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier;
}
5562
5563 static void si_optimize_vs_outputs(struct si_shader_context *ctx)
5564 {
5565 struct si_shader *shader = ctx->shader;
5566 struct tgsi_shader_info *info = &shader->selector->info;
5567
5568 if ((ctx->type != PIPE_SHADER_VERTEX &&
5569 ctx->type != PIPE_SHADER_TESS_EVAL) ||
5570 shader->key.as_ls ||
5571 shader->key.as_es)
5572 return;
5573
5574 ac_optimize_vs_outputs(&ctx->ac,
5575 ctx->main_fn,
5576 shader->info.vs_output_param_offset,
5577 info->num_outputs,
5578 &shader->info.nr_param_exports);
5579 }
5580
5581 static void si_count_scratch_private_memory(struct si_shader_context *ctx)
5582 {
5583 ctx->shader->config.private_mem_vgprs = 0;
5584
5585 /* Process all LLVM instructions. */
5586 LLVMBasicBlockRef bb = LLVMGetFirstBasicBlock(ctx->main_fn);
5587 while (bb) {
5588 LLVMValueRef next = LLVMGetFirstInstruction(bb);
5589
5590 while (next) {
5591 LLVMValueRef inst = next;
5592 next = LLVMGetNextInstruction(next);
5593
5594 if (LLVMGetInstructionOpcode(inst) != LLVMAlloca)
5595 continue;
5596
5597 LLVMTypeRef type = LLVMGetElementType(LLVMTypeOf(inst));
5598 /* No idea why LLVM aligns allocas to 4 elements. */
5599 unsigned alignment = LLVMGetAlignment(inst);
5600 unsigned dw_size = align(llvm_get_type_size(type) / 4, alignment);
5601 ctx->shader->config.private_mem_vgprs += dw_size;
5602 }
5603 bb = LLVMGetNextBasicBlock(bb);
5604 }
5605 }
5606
5607 static void si_init_exec_full_mask(struct si_shader_context *ctx)
5608 {
5609 LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
5610 lp_build_intrinsic(ctx->gallivm.builder,
5611 "llvm.amdgcn.init.exec", ctx->voidt,
5612 &full_mask, 1, LP_FUNC_ATTR_CONVERGENT);
5613 }
5614
5615 static void si_init_exec_from_input(struct si_shader_context *ctx,
5616 unsigned param, unsigned bitoffset)
5617 {
5618 LLVMValueRef args[] = {
5619 LLVMGetParam(ctx->main_fn, param),
5620 LLVMConstInt(ctx->i32, bitoffset, 0),
5621 };
5622 lp_build_intrinsic(ctx->gallivm.builder,
5623 "llvm.amdgcn.init.exec.from.input",
5624 ctx->voidt, args, 2, LP_FUNC_ATTR_CONVERGENT);
5625 }
5626
/**
 * Translate the main part of the shader (TGSI or NIR) into LLVM IR.
 *
 * Installs per-stage input-fetch and epilogue callbacks, creates the
 * LLVM function, preloads ring buffers, emits the GFX9 merged-shader
 * EXEC/barrier handling, and finally runs the TGSI->LLVM (or NIR->LLVM)
 * translation.
 *
 * \param ctx            shader context; ctx->type selects the stage
 * \param is_monolithic  true if prologs/epilogs are compiled into the
 *                       same wrapper function
 * \return true on success, false if translation failed.
 */
static bool si_compile_tgsi_main(struct si_shader_context *ctx,
				 bool is_monolithic)
{
	struct si_shader *shader = ctx->shader;
	struct si_shader_selector *sel = shader->selector;
	struct lp_build_tgsi_context *bld_base = &ctx->bld_base;

	/* Select stage-specific input fetch and epilogue callbacks. */
	// TODO clean all this up!
	switch (ctx->type) {
	case PIPE_SHADER_VERTEX:
		ctx->load_input = declare_input_vs;
		if (shader->key.as_ls)
			bld_base->emit_epilogue = si_llvm_emit_ls_epilogue;
		else if (shader->key.as_es)
			bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
		else {
			ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
			bld_base->emit_epilogue = si_tgsi_emit_epilogue;
		}
		break;
	case PIPE_SHADER_TESS_CTRL:
		bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs;
		bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = fetch_output_tcs;
		bld_base->emit_store = store_output_tcs;
		bld_base->emit_epilogue = si_llvm_emit_tcs_epilogue;
		break;
	case PIPE_SHADER_TESS_EVAL:
		bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes;
		if (shader->key.as_es)
			bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
		else {
			ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
			bld_base->emit_epilogue = si_tgsi_emit_epilogue;
		}
		break;
	case PIPE_SHADER_GEOMETRY:
		bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs;
		bld_base->emit_epilogue = si_llvm_emit_gs_epilogue;
		break;
	case PIPE_SHADER_FRAGMENT:
		ctx->load_input = declare_input_fs;
		ctx->abi.emit_outputs = si_llvm_return_fs_outputs;
		bld_base->emit_epilogue = si_tgsi_emit_epilogue;
		break;
	case PIPE_SHADER_COMPUTE:
		ctx->declare_memory_region = declare_compute_memory;
		break;
	default:
		assert(!"Unsupported shader type");
		return false;
	}

	ctx->abi.load_ubo = load_ubo;
	ctx->abi.load_ssbo = load_ssbo;

	create_function(ctx);
	preload_ring_buffers(ctx);

	/* For GFX9 merged shaders:
	 * - Set EXEC for the first shader. If the prolog is present, set
	 *   EXEC there instead.
	 * - Add a barrier before the second shader.
	 * - In the second shader, reset EXEC to ~0 and wrap the main part in
	 *   an if-statement. This is required for correctness in geometry
	 *   shaders, to ensure that empty GS waves do not send GS_EMIT and
	 *   GS_CUT messages.
	 *
	 * For monolithic merged shaders, the first shader is wrapped in an
	 * if-block together with its prolog in si_build_wrapper_function.
	 */
	if (ctx->screen->b.chip_class >= GFX9) {
		if (!is_monolithic &&
		    sel->info.num_instructions > 1 && /* not empty shader */
		    (shader->key.as_es || shader->key.as_ls) &&
		    (ctx->type == PIPE_SHADER_TESS_EVAL ||
		     (ctx->type == PIPE_SHADER_VERTEX &&
		      !sel->vs_needs_prolog))) {
			/* First half of a merged shader with no prolog:
			 * initialize EXEC from the merged wave info SGPR. */
			si_init_exec_from_input(ctx,
						ctx->param_merged_wave_info, 0);
		} else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
			   ctx->type == PIPE_SHADER_GEOMETRY) {
			if (!is_monolithic)
				si_init_exec_full_mask(ctx);

			/* The barrier must execute for all shaders in a
			 * threadgroup.
			 */
			si_llvm_emit_barrier(NULL, bld_base, NULL);

			/* Execute the main part only in threads that belong
			 * to the second-stage wave (bits 8..15 of the merged
			 * wave info hold its thread count). */
			LLVMValueRef num_threads = unpack_param(ctx, ctx->param_merged_wave_info, 8, 8);
			LLVMValueRef ena =
				LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
					    ac_get_thread_id(&ctx->ac), num_threads, "");
			lp_build_if(&ctx->merged_wrap_if_state, &ctx->gallivm, ena);
		}
	}

	if (ctx->type == PIPE_SHADER_GEOMETRY) {
		/* Per-stream counters of emitted vertices (4 GS streams). */
		int i;
		for (i = 0; i < 4; i++) {
			ctx->gs_next_vertex[i] =
				lp_build_alloca(&ctx->gallivm,
						ctx->i32, "");
		}
	}

	if (ctx->type == PIPE_SHADER_FRAGMENT && sel->info.uses_kill &&
	    ctx->screen->b.debug_flags & DBG_FS_CORRECT_DERIVS_AFTER_KILL) {
		/* This is initialized to 0.0 = not kill. */
		ctx->postponed_kill = lp_build_alloca(&ctx->gallivm, ctx->f32, "");
	}

	/* Translate the shader body: TGSI tokens if present, NIR otherwise. */
	if (sel->tokens) {
		if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) {
			fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
			return false;
		}
	} else {
		if (!si_nir_build_llvm(ctx, sel->nir)) {
			fprintf(stderr, "Failed to translate shader from NIR to LLVM\n");
			return false;
		}
	}

	si_llvm_build_ret(ctx, ctx->return_value);
	return true;
}
5754
5755 /**
5756 * Compute the VS prolog key, which contains all the information needed to
5757 * build the VS prolog function, and set shader->info bits where needed.
5758 *
5759 * \param info Shader info of the vertex shader.
5760 * \param num_input_sgprs Number of input SGPRs for the vertex shader.
5761 * \param prolog_key Key of the VS prolog
5762 * \param shader_out The vertex shader, or the next shader if merging LS+HS or ES+GS.
5763 * \param key Output shader part key.
5764 */
5765 static void si_get_vs_prolog_key(const struct tgsi_shader_info *info,
5766 unsigned num_input_sgprs,
5767 const struct si_vs_prolog_bits *prolog_key,
5768 struct si_shader *shader_out,
5769 union si_shader_part_key *key)
5770 {
5771 memset(key, 0, sizeof(*key));
5772 key->vs_prolog.states = *prolog_key;
5773 key->vs_prolog.num_input_sgprs = num_input_sgprs;
5774 key->vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
5775 key->vs_prolog.as_ls = shader_out->key.as_ls;
5776
5777 if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) {
5778 key->vs_prolog.as_ls = 1;
5779 key->vs_prolog.num_merged_next_stage_vgprs = 2;
5780 } else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY) {
5781 key->vs_prolog.num_merged_next_stage_vgprs = 5;
5782 }
5783
5784 /* Enable loading the InstanceID VGPR. */
5785 uint16_t input_mask = u_bit_consecutive(0, info->num_inputs);
5786
5787 if ((key->vs_prolog.states.instance_divisor_is_one |
5788 key->vs_prolog.states.instance_divisor_is_fetched) & input_mask)
5789 shader_out->info.uses_instanceid = true;
5790 }
5791
5792 /**
5793 * Compute the PS prolog key, which contains all the information needed to
5794 * build the PS prolog function, and set related bits in shader->config.
5795 */
static void si_get_ps_prolog_key(struct si_shader *shader,
				 union si_shader_part_key *key,
				 bool separate_prolog)
{
	struct tgsi_shader_info *info = &shader->selector->info;

	memset(key, 0, sizeof(*key));
	key->ps_prolog.states = shader->key.part.ps.prolog;
	key->ps_prolog.colors_read = info->colors_read;
	key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
	key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
	/* The prolog needs whole-quad mode (WQM) if the shader uses
	 * derivatives and the prolog touches interpolation state. */
	key->ps_prolog.wqm = info->uses_derivatives &&
		(key->ps_prolog.colors_read ||
		 key->ps_prolog.states.force_persp_sample_interp ||
		 key->ps_prolog.states.force_linear_sample_interp ||
		 key->ps_prolog.states.force_persp_center_interp ||
		 key->ps_prolog.states.force_linear_center_interp ||
		 key->ps_prolog.states.bc_optimize_for_persp ||
		 key->ps_prolog.states.bc_optimize_for_linear);

	if (info->colors_read) {
		unsigned *color = shader->selector->color_attr_index;

		if (shader->key.part.ps.prolog.color_two_side) {
			/* BCOLORs are stored after the last input. */
			key->ps_prolog.num_interp_inputs = info->num_inputs;
			key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
			shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
		}

		/* Classify both color inputs: record which input slot and
		 * which interpolation VGPR each one uses, and enable the
		 * matching SPI PS inputs. */
		for (unsigned i = 0; i < 2; i++) {
			unsigned interp = info->input_interpolate[color[i]];
			unsigned location = info->input_interpolate_loc[color[i]];

			/* Skip colors whose components are not read. */
			if (!(info->colors_read & (0xf << i*4)))
				continue;

			key->ps_prolog.color_attr_index[i] = color[i];

			if (shader->key.part.ps.prolog.flatshade_colors &&
			    interp == TGSI_INTERPOLATE_COLOR)
				interp = TGSI_INTERPOLATE_CONSTANT;

			switch (interp) {
			case TGSI_INTERPOLATE_CONSTANT:
				/* Flat shading: no interpolation VGPR needed. */
				key->ps_prolog.color_interp_vgpr_index[i] = -1;
				break;
			case TGSI_INTERPOLATE_PERSPECTIVE:
			case TGSI_INTERPOLATE_COLOR:
				/* Force the interpolation location for colors here. */
				if (shader->key.part.ps.prolog.force_persp_sample_interp)
					location = TGSI_INTERPOLATE_LOC_SAMPLE;
				if (shader->key.part.ps.prolog.force_persp_center_interp)
					location = TGSI_INTERPOLATE_LOC_CENTER;

				/* VGPR indices 0/2/4 are the perspective
				 * sample/center/centroid I,J pairs. */
				switch (location) {
				case TGSI_INTERPOLATE_LOC_SAMPLE:
					key->ps_prolog.color_interp_vgpr_index[i] = 0;
					shader->config.spi_ps_input_ena |=
						S_0286CC_PERSP_SAMPLE_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTER:
					key->ps_prolog.color_interp_vgpr_index[i] = 2;
					shader->config.spi_ps_input_ena |=
						S_0286CC_PERSP_CENTER_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTROID:
					key->ps_prolog.color_interp_vgpr_index[i] = 4;
					shader->config.spi_ps_input_ena |=
						S_0286CC_PERSP_CENTROID_ENA(1);
					break;
				default:
					assert(0);
				}
				break;
			case TGSI_INTERPOLATE_LINEAR:
				/* Force the interpolation location for colors here. */
				if (shader->key.part.ps.prolog.force_linear_sample_interp)
					location = TGSI_INTERPOLATE_LOC_SAMPLE;
				if (shader->key.part.ps.prolog.force_linear_center_interp)
					location = TGSI_INTERPOLATE_LOC_CENTER;

				/* The VGPR assignment for non-monolithic shaders
				 * works because InitialPSInputAddr is set on the
				 * main shader and PERSP_PULL_MODEL is never used.
				 */
				switch (location) {
				case TGSI_INTERPOLATE_LOC_SAMPLE:
					key->ps_prolog.color_interp_vgpr_index[i] =
						separate_prolog ? 6 : 9;
					shader->config.spi_ps_input_ena |=
						S_0286CC_LINEAR_SAMPLE_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTER:
					key->ps_prolog.color_interp_vgpr_index[i] =
						separate_prolog ? 8 : 11;
					shader->config.spi_ps_input_ena |=
						S_0286CC_LINEAR_CENTER_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTROID:
					key->ps_prolog.color_interp_vgpr_index[i] =
						separate_prolog ? 10 : 13;
					shader->config.spi_ps_input_ena |=
						S_0286CC_LINEAR_CENTROID_ENA(1);
					break;
				default:
					assert(0);
				}
				break;
			default:
				assert(0);
			}
		}
	}
}
5911
5912 /**
5913 * Check whether a PS prolog is required based on the key.
5914 */
5915 static bool si_need_ps_prolog(const union si_shader_part_key *key)
5916 {
5917 return key->ps_prolog.colors_read ||
5918 key->ps_prolog.states.force_persp_sample_interp ||
5919 key->ps_prolog.states.force_linear_sample_interp ||
5920 key->ps_prolog.states.force_persp_center_interp ||
5921 key->ps_prolog.states.force_linear_center_interp ||
5922 key->ps_prolog.states.bc_optimize_for_persp ||
5923 key->ps_prolog.states.bc_optimize_for_linear ||
5924 key->ps_prolog.states.poly_stipple;
5925 }
5926
5927 /**
5928 * Compute the PS epilog key, which contains all the information needed to
5929 * build the PS epilog function.
5930 */
static void si_get_ps_epilog_key(struct si_shader *shader,
				 union si_shader_part_key *key)
{
	struct tgsi_shader_info *info = &shader->selector->info;
	memset(key, 0, sizeof(*key));
	/* What the main part writes (colors, Z, stencil, sample mask)
	 * determines which exports the epilog must emit. */
	key->ps_epilog.colors_written = info->colors_written;
	key->ps_epilog.writes_z = info->writes_z;
	key->ps_epilog.writes_stencil = info->writes_stencil;
	key->ps_epilog.writes_samplemask = info->writes_samplemask;
	/* State-dependent behavior (color format, alpha test, etc.) comes
	 * straight from the shader key. */
	key->ps_epilog.states = shader->key.part.ps.epilog;
}
5942
5943 /**
5944 * Build the GS prolog function. Rotate the input vertices for triangle strips
5945 * with adjacency.
5946 */
static void si_build_gs_prolog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key)
{
	unsigned num_sgprs, num_vgprs;
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct si_function_info fninfo;
	LLVMBuilderRef builder = gallivm->builder;
	LLVMTypeRef returns[48];
	LLVMValueRef func, ret;

	si_init_function_info(&fninfo);

	/* Choose the input GPR layout for this chip generation. */
	if (ctx->screen->b.chip_class >= GFX9) {
		num_sgprs = 8 + GFX9_GS_NUM_USER_SGPR;
		num_vgprs = 5; /* ES inputs are not needed by GS */
	} else {
		num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
		num_vgprs = 8;
	}

	/* Declare inputs and matching return slots: SGPRs as i32,
	 * VGPRs returned as f32. */
	for (unsigned i = 0; i < num_sgprs; ++i) {
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		returns[i] = ctx->i32;
	}

	for (unsigned i = 0; i < num_vgprs; ++i) {
		add_arg(&fninfo, ARG_VGPR, ctx->i32);
		returns[num_sgprs + i] = ctx->f32;
	}

	/* Create the function. */
	si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs,
			   &fninfo, 0);
	func = ctx->main_fn;

	/* Set the full EXEC mask for the prolog, because we are only fiddling
	 * with registers here. The main shader part will set the correct EXEC
	 * mask.
	 */
	if (ctx->screen->b.chip_class >= GFX9 && !key->gs_prolog.is_monolithic)
		si_init_exec_full_mask(ctx);

	/* Copy inputs to outputs. This should be no-op, as the registers match,
	 * but it will prevent the compiler from overwriting them unintentionally.
	 */
	ret = ctx->return_value;
	for (unsigned i = 0; i < num_sgprs; i++) {
		LLVMValueRef p = LLVMGetParam(func, i);
		ret = LLVMBuildInsertValue(builder, ret, p, i, "");
	}
	for (unsigned i = 0; i < num_vgprs; i++) {
		LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
		p = LLVMBuildBitCast(builder, p, ctx->f32, "");
		ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, "");
	}

	if (key->gs_prolog.states.tri_strip_adj_fix) {
		/* Remap the input vertices for every other primitive. */
		const unsigned gfx6_vtx_params[6] = {
			num_sgprs,
			num_sgprs + 1,
			num_sgprs + 3,
			num_sgprs + 4,
			num_sgprs + 5,
			num_sgprs + 6
		};
		/* On GFX9 each VGPR packs two 16-bit vertex indices. */
		const unsigned gfx9_vtx_params[3] = {
			num_sgprs,
			num_sgprs + 1,
			num_sgprs + 4,
		};
		LLVMValueRef vtx_in[6], vtx_out[6];
		LLVMValueRef prim_id, rotate;

		/* Unpack the 6 vertex indices. */
		if (ctx->screen->b.chip_class >= GFX9) {
			for (unsigned i = 0; i < 3; i++) {
				vtx_in[i*2] = unpack_param(ctx, gfx9_vtx_params[i], 0, 16);
				vtx_in[i*2+1] = unpack_param(ctx, gfx9_vtx_params[i], 16, 16);
			}
		} else {
			for (unsigned i = 0; i < 6; i++)
				vtx_in[i] = LLVMGetParam(func, gfx6_vtx_params[i]);
		}

		/* Rotate when bit 0 of the primitive ID is set, i.e. on
		 * every other primitive. */
		prim_id = LLVMGetParam(func, num_sgprs + 2);
		rotate = LLVMBuildTrunc(builder, prim_id, ctx->i1, "");

		/* Select between the original and rotated-by-4 ordering. */
		for (unsigned i = 0; i < 6; ++i) {
			LLVMValueRef base, rotated;
			base = vtx_in[i];
			rotated = vtx_in[(i + 4) % 6];
			vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, "");
		}

		/* Repack the (possibly rotated) indices into the return value. */
		if (ctx->screen->b.chip_class >= GFX9) {
			for (unsigned i = 0; i < 3; i++) {
				LLVMValueRef hi, out;

				hi = LLVMBuildShl(builder, vtx_out[i*2+1],
						  LLVMConstInt(ctx->i32, 16, 0), "");
				out = LLVMBuildOr(builder, vtx_out[i*2], hi, "");
				out = LLVMBuildBitCast(builder, out, ctx->f32, "");
				ret = LLVMBuildInsertValue(builder, ret, out,
							   gfx9_vtx_params[i], "");
			}
		} else {
			for (unsigned i = 0; i < 6; i++) {
				LLVMValueRef out;

				out = LLVMBuildBitCast(builder, vtx_out[i], ctx->f32, "");
				ret = LLVMBuildInsertValue(builder, ret, out,
							   gfx6_vtx_params[i], "");
			}
		}
	}

	LLVMBuildRet(builder, ret);
}
6065
6066 /**
6067 * Given a list of shader part functions, build a wrapper function that
6068 * runs them in sequence to form a monolithic shader.
6069 */
static void si_build_wrapper_function(struct si_shader_context *ctx,
				      LLVMValueRef *parts,
				      unsigned num_parts,
				      unsigned main_part,
				      unsigned next_shader_first_part)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMBuilderRef builder = ctx->gallivm.builder;
	/* PS epilog has one arg per color component; gfx9 merged shader
	 * prologs need to forward 32 user SGPRs.
	 */
	struct si_function_info fninfo;
	LLVMValueRef initial[64], out[64];
	LLVMTypeRef function_type;
	unsigned num_first_params;
	unsigned num_out, initial_num_out;
	MAYBE_UNUSED unsigned num_out_sgpr; /* used in debug checks */
	MAYBE_UNUSED unsigned initial_num_out_sgpr; /* used in debug checks */
	unsigned num_sgprs, num_vgprs;
	unsigned gprs;
	struct lp_build_if_state if_state;

	si_init_function_info(&fninfo);

	/* Force all parts to be inlined into the wrapper and hide them. */
	for (unsigned i = 0; i < num_parts; ++i) {
		lp_add_function_attr(parts[i], -1, LP_FUNC_ATTR_ALWAYSINLINE);
		LLVMSetLinkage(parts[i], LLVMPrivateLinkage);
	}

	/* The parameters of the wrapper function correspond to those of the
	 * first part in terms of SGPRs and VGPRs, but we use the types of the
	 * main part to get the right types. This is relevant for the
	 * dereferenceable attribute on descriptor table pointers.
	 */
	num_sgprs = 0;
	num_vgprs = 0;

	function_type = LLVMGetElementType(LLVMTypeOf(parts[0]));
	num_first_params = LLVMCountParamTypes(function_type);

	/* Count the SGPR and VGPR dwords of the first part's parameters.
	 * SGPR params always come first. */
	for (unsigned i = 0; i < num_first_params; ++i) {
		LLVMValueRef param = LLVMGetParam(parts[0], i);

		if (ac_is_sgpr_param(param)) {
			assert(num_vgprs == 0);
			num_sgprs += llvm_get_type_size(LLVMTypeOf(param)) / 4;
		} else {
			num_vgprs += llvm_get_type_size(LLVMTypeOf(param)) / 4;
		}
	}

	/* Declare wrapper params using the main part's types, consuming the
	 * dword budget computed above. */
	gprs = 0;
	while (gprs < num_sgprs + num_vgprs) {
		LLVMValueRef param = LLVMGetParam(parts[main_part], fninfo.num_params);
		LLVMTypeRef type = LLVMTypeOf(param);
		unsigned size = llvm_get_type_size(type) / 4;

		add_arg(&fninfo, gprs < num_sgprs ? ARG_SGPR : ARG_VGPR, type);

		assert(ac_is_sgpr_param(param) == (gprs < num_sgprs));
		assert(gprs + size <= num_sgprs + num_vgprs &&
		       (gprs >= num_sgprs || gprs + size <= num_sgprs));

		gprs += size;
	}

	si_create_function(ctx, "wrapper", NULL, 0, &fninfo,
			   si_get_max_workgroup_size(ctx->shader));

	if (is_merged_shader(ctx->shader))
		si_init_exec_full_mask(ctx);

	/* Record the arguments of the function as if they were an output of
	 * a previous part.
	 */
	num_out = 0;
	num_out_sgpr = 0;

	/* Flatten the wrapper params into scalar i32/f32 "out" values that
	 * can be re-gathered into each part's parameter types. */
	for (unsigned i = 0; i < fninfo.num_params; ++i) {
		LLVMValueRef param = LLVMGetParam(ctx->main_fn, i);
		LLVMTypeRef param_type = LLVMTypeOf(param);
		LLVMTypeRef out_type = i < fninfo.num_sgpr_params ? ctx->i32 : ctx->f32;
		unsigned size = llvm_get_type_size(param_type) / 4;

		if (size == 1) {
			if (param_type != out_type)
				param = LLVMBuildBitCast(builder, param, out_type, "");
			out[num_out++] = param;
		} else {
			LLVMTypeRef vector_type = LLVMVectorType(out_type, size);

			if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
				param = LLVMBuildPtrToInt(builder, param, ctx->i64, "");
				param_type = ctx->i64;
			}

			if (param_type != vector_type)
				param = LLVMBuildBitCast(builder, param, vector_type, "");

			for (unsigned j = 0; j < size; ++j)
				out[num_out++] = LLVMBuildExtractElement(
					builder, param, LLVMConstInt(ctx->i32, j, 0), "");
		}

		if (i < fninfo.num_sgpr_params)
			num_out_sgpr = num_out;
	}

	/* Keep a copy of the flattened inputs for the second merged half. */
	memcpy(initial, out, sizeof(out));
	initial_num_out = num_out;
	initial_num_out_sgpr = num_out_sgpr;

	/* Now chain the parts. */
	for (unsigned part = 0; part < num_parts; ++part) {
		LLVMValueRef in[48];
		LLVMValueRef ret;
		LLVMTypeRef ret_type;
		unsigned out_idx = 0;
		unsigned num_params = LLVMCountParams(parts[part]);

		/* Merged shaders are executed conditionally depending
		 * on the number of enabled threads passed in the input SGPRs. */
		if (is_merged_shader(ctx->shader) && part == 0) {
			LLVMValueRef ena, count = initial[3];

			count = LLVMBuildAnd(builder, count,
					     LLVMConstInt(ctx->i32, 0x7f, 0), "");
			ena = LLVMBuildICmp(builder, LLVMIntULT,
					    ac_get_thread_id(&ctx->ac), count, "");
			lp_build_if(&if_state, &ctx->gallivm, ena);
		}

		/* Derive arguments for the next part from outputs of the
		 * previous one.
		 */
		for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) {
			LLVMValueRef param;
			LLVMTypeRef param_type;
			bool is_sgpr;
			unsigned param_size;
			LLVMValueRef arg = NULL;

			param = LLVMGetParam(parts[part], param_idx);
			param_type = LLVMTypeOf(param);
			param_size = llvm_get_type_size(param_type) / 4;
			is_sgpr = ac_is_sgpr_param(param);

			if (is_sgpr) {
				/* byval doesn't apply once the part is inlined;
				 * mark the param inreg (SGPR) instead. */
#if HAVE_LLVM < 0x0400
				LLVMRemoveAttribute(param, LLVMByValAttribute);
#else
				unsigned kind_id = LLVMGetEnumAttributeKindForName("byval", 5);
				LLVMRemoveEnumAttributeAtIndex(parts[part], param_idx + 1, kind_id);
#endif
				lp_add_function_attr(parts[part], param_idx + 1, LP_FUNC_ATTR_INREG);
			}

			assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out));
			assert(is_sgpr || out_idx >= num_out_sgpr);

			if (param_size == 1)
				arg = out[out_idx];
			else
				arg = lp_build_gather_values(gallivm, &out[out_idx], param_size);

			if (LLVMTypeOf(arg) != param_type) {
				if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
					arg = LLVMBuildBitCast(builder, arg, ctx->i64, "");
					arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
				} else {
					arg = LLVMBuildBitCast(builder, arg, param_type, "");
				}
			}

			in[param_idx] = arg;
			out_idx += param_size;
		}

		ret = LLVMBuildCall(builder, parts[part], in, num_params, "");

		if (is_merged_shader(ctx->shader) &&
		    part + 1 == next_shader_first_part) {
			lp_build_endif(&if_state);

			/* The second half of the merged shader should use
			 * the inputs from the toplevel (wrapper) function,
			 * not the return value from the last call.
			 *
			 * That's because the last call was executed condi-
			 * tionally, so we can't consume it in the main
			 * block.
			 */
			memcpy(out, initial, sizeof(initial));
			num_out = initial_num_out;
			num_out_sgpr = initial_num_out_sgpr;
			continue;
		}

		/* Extract the returned GPRs. */
		ret_type = LLVMTypeOf(ret);
		num_out = 0;
		num_out_sgpr = 0;

		if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) {
			assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind);

			unsigned ret_size = LLVMCountStructElementTypes(ret_type);

			for (unsigned i = 0; i < ret_size; ++i) {
				LLVMValueRef val =
					LLVMBuildExtractValue(builder, ret, i, "");

				assert(num_out < ARRAY_SIZE(out));
				out[num_out++] = val;

				/* i32 elements are SGPR returns; they must all
				 * precede the VGPR (f32) returns. */
				if (LLVMTypeOf(val) == ctx->i32) {
					assert(num_out_sgpr + 1 == num_out);
					num_out_sgpr = num_out;
				}
			}
		}
	}

	LLVMBuildRetVoid(builder);
}
6295
6296 int si_compile_tgsi_shader(struct si_screen *sscreen,
6297 LLVMTargetMachineRef tm,
6298 struct si_shader *shader,
6299 bool is_monolithic,
6300 struct pipe_debug_callback *debug)
6301 {
6302 struct si_shader_selector *sel = shader->selector;
6303 struct si_shader_context ctx;
6304 int r = -1;
6305
6306 /* Dump TGSI code before doing TGSI->LLVM conversion in case the
6307 * conversion fails. */
6308 if (r600_can_dump_shader(&sscreen->b, sel->info.processor) &&
6309 !(sscreen->b.debug_flags & DBG_NO_TGSI)) {
6310 if (sel->tokens)
6311 tgsi_dump(sel->tokens, 0);
6312 else
6313 nir_print_shader(sel->nir, stderr);
6314 si_dump_streamout(&sel->so);
6315 }
6316
6317 si_init_shader_ctx(&ctx, sscreen, tm);
6318 si_llvm_context_set_tgsi(&ctx, shader);
6319 ctx.separate_prolog = !is_monolithic;
6320
6321 memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
6322 sizeof(shader->info.vs_output_param_offset));
6323
6324 shader->info.uses_instanceid = sel->info.uses_instanceid;
6325
6326 ctx.load_system_value = declare_system_value;
6327
6328 if (!si_compile_tgsi_main(&ctx, is_monolithic)) {
6329 si_llvm_dispose(&ctx);
6330 return -1;
6331 }
6332
6333 if (is_monolithic && ctx.type == PIPE_SHADER_VERTEX) {
6334 LLVMValueRef parts[2];
6335 bool need_prolog = sel->vs_needs_prolog;
6336
6337 parts[1] = ctx.main_fn;
6338
6339 if (need_prolog) {
6340 union si_shader_part_key prolog_key;
6341 si_get_vs_prolog_key(&sel->info,
6342 shader->info.num_input_sgprs,
6343 &shader->key.part.vs.prolog,
6344 shader, &prolog_key);
6345 si_build_vs_prolog_function(&ctx, &prolog_key);
6346 parts[0] = ctx.main_fn;
6347 }
6348
6349 si_build_wrapper_function(&ctx, parts + !need_prolog,
6350 1 + need_prolog, need_prolog, 0);
6351 } else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
6352 if (sscreen->b.chip_class >= GFX9) {
6353 struct si_shader_selector *ls = shader->key.part.tcs.ls;
6354 LLVMValueRef parts[4];
6355
6356 /* TCS main part */
6357 parts[2] = ctx.main_fn;
6358
6359 /* TCS epilog */
6360 union si_shader_part_key tcs_epilog_key;
6361 memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key));
6362 tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
6363 si_build_tcs_epilog_function(&ctx, &tcs_epilog_key);
6364 parts[3] = ctx.main_fn;
6365
6366 /* VS prolog */
6367 if (ls->vs_needs_prolog) {
6368 union si_shader_part_key vs_prolog_key;
6369 si_get_vs_prolog_key(&ls->info,
6370 shader->info.num_input_sgprs,
6371 &shader->key.part.tcs.ls_prolog,
6372 shader, &vs_prolog_key);
6373 vs_prolog_key.vs_prolog.is_monolithic = true;
6374 si_build_vs_prolog_function(&ctx, &vs_prolog_key);
6375 parts[0] = ctx.main_fn;
6376 }
6377
6378 /* VS as LS main part */
6379 struct si_shader shader_ls = {};
6380 shader_ls.selector = ls;
6381 shader_ls.key.as_ls = 1;
6382 shader_ls.key.mono = shader->key.mono;
6383 shader_ls.key.opt = shader->key.opt;
6384 si_llvm_context_set_tgsi(&ctx, &shader_ls);
6385
6386 if (!si_compile_tgsi_main(&ctx, true)) {
6387 si_llvm_dispose(&ctx);
6388 return -1;
6389 }
6390 shader->info.uses_instanceid |= ls->info.uses_instanceid;
6391 parts[1] = ctx.main_fn;
6392
6393 /* Reset the shader context. */
6394 ctx.shader = shader;
6395 ctx.type = PIPE_SHADER_TESS_CTRL;
6396
6397 si_build_wrapper_function(&ctx,
6398 parts + !ls->vs_needs_prolog,
6399 4 - !ls->vs_needs_prolog, 0,
6400 ls->vs_needs_prolog ? 2 : 1);
6401 } else {
6402 LLVMValueRef parts[2];
6403 union si_shader_part_key epilog_key;
6404
6405 parts[0] = ctx.main_fn;
6406
6407 memset(&epilog_key, 0, sizeof(epilog_key));
6408 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
6409 si_build_tcs_epilog_function(&ctx, &epilog_key);
6410 parts[1] = ctx.main_fn;
6411
6412 si_build_wrapper_function(&ctx, parts, 2, 0, 0);
6413 }
6414 } else if (is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) {
6415 if (ctx.screen->b.chip_class >= GFX9) {
6416 struct si_shader_selector *es = shader->key.part.gs.es;
6417 LLVMValueRef es_prolog = NULL;
6418 LLVMValueRef es_main = NULL;
6419 LLVMValueRef gs_prolog = NULL;
6420 LLVMValueRef gs_main = ctx.main_fn;
6421
6422 /* GS prolog */
6423 union si_shader_part_key gs_prolog_key;
6424 memset(&gs_prolog_key, 0, sizeof(gs_prolog_key));
6425 gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
6426 gs_prolog_key.gs_prolog.is_monolithic = true;
6427 si_build_gs_prolog_function(&ctx, &gs_prolog_key);
6428 gs_prolog = ctx.main_fn;
6429
6430 /* ES prolog */
6431 if (es->vs_needs_prolog) {
6432 union si_shader_part_key vs_prolog_key;
6433 si_get_vs_prolog_key(&es->info,
6434 shader->info.num_input_sgprs,
6435 &shader->key.part.tcs.ls_prolog,
6436 shader, &vs_prolog_key);
6437 vs_prolog_key.vs_prolog.is_monolithic = true;
6438 si_build_vs_prolog_function(&ctx, &vs_prolog_key);
6439 es_prolog = ctx.main_fn;
6440 }
6441
6442 /* ES main part */
6443 struct si_shader shader_es = {};
6444 shader_es.selector = es;
6445 shader_es.key.as_es = 1;
6446 shader_es.key.mono = shader->key.mono;
6447 shader_es.key.opt = shader->key.opt;
6448 si_llvm_context_set_tgsi(&ctx, &shader_es);
6449
6450 if (!si_compile_tgsi_main(&ctx, true)) {
6451 si_llvm_dispose(&ctx);
6452 return -1;
6453 }
6454 shader->info.uses_instanceid |= es->info.uses_instanceid;
6455 es_main = ctx.main_fn;
6456
6457 /* Reset the shader context. */
6458 ctx.shader = shader;
6459 ctx.type = PIPE_SHADER_GEOMETRY;
6460
6461 /* Prepare the array of shader parts. */
6462 LLVMValueRef parts[4];
6463 unsigned num_parts = 0, main_part, next_first_part;
6464
6465 if (es_prolog)
6466 parts[num_parts++] = es_prolog;
6467
6468 parts[main_part = num_parts++] = es_main;
6469 parts[next_first_part = num_parts++] = gs_prolog;
6470 parts[num_parts++] = gs_main;
6471
6472 si_build_wrapper_function(&ctx, parts, num_parts,
6473 main_part, next_first_part);
6474 } else {
6475 LLVMValueRef parts[2];
6476 union si_shader_part_key prolog_key;
6477
6478 parts[1] = ctx.main_fn;
6479
6480 memset(&prolog_key, 0, sizeof(prolog_key));
6481 prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
6482 si_build_gs_prolog_function(&ctx, &prolog_key);
6483 parts[0] = ctx.main_fn;
6484
6485 si_build_wrapper_function(&ctx, parts, 2, 1, 0);
6486 }
6487 } else if (is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) {
6488 LLVMValueRef parts[3];
6489 union si_shader_part_key prolog_key;
6490 union si_shader_part_key epilog_key;
6491 bool need_prolog;
6492
6493 si_get_ps_prolog_key(shader, &prolog_key, false);
6494 need_prolog = si_need_ps_prolog(&prolog_key);
6495
6496 parts[need_prolog ? 1 : 0] = ctx.main_fn;
6497
6498 if (need_prolog) {
6499 si_build_ps_prolog_function(&ctx, &prolog_key);
6500 parts[0] = ctx.main_fn;
6501 }
6502
6503 si_get_ps_epilog_key(shader, &epilog_key);
6504 si_build_ps_epilog_function(&ctx, &epilog_key);
6505 parts[need_prolog ? 2 : 1] = ctx.main_fn;
6506
6507 si_build_wrapper_function(&ctx, parts, need_prolog ? 3 : 2,
6508 need_prolog ? 1 : 0, 0);
6509 }
6510
6511 si_llvm_optimize_module(&ctx);
6512
6513 /* Post-optimization transformations and analysis. */
6514 si_optimize_vs_outputs(&ctx);
6515
6516 if ((debug && debug->debug_message) ||
6517 r600_can_dump_shader(&sscreen->b, ctx.type))
6518 si_count_scratch_private_memory(&ctx);
6519
6520 /* Compile to bytecode. */
6521 r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm,
6522 ctx.gallivm.module, debug, ctx.type, "TGSI shader");
6523 si_llvm_dispose(&ctx);
6524 if (r) {
6525 fprintf(stderr, "LLVM failed to compile shader\n");
6526 return r;
6527 }
6528
6529 /* Validate SGPR and VGPR usage for compute to detect compiler bugs.
6530 * LLVM 3.9svn has this bug.
6531 */
6532 if (sel->type == PIPE_SHADER_COMPUTE) {
6533 unsigned wave_size = 64;
6534 unsigned max_vgprs = 256;
6535 unsigned max_sgprs = sscreen->b.chip_class >= VI ? 800 : 512;
6536 unsigned max_sgprs_per_wave = 128;
6537 unsigned max_block_threads = si_get_max_workgroup_size(shader);
6538 unsigned min_waves_per_cu = DIV_ROUND_UP(max_block_threads, wave_size);
6539 unsigned min_waves_per_simd = DIV_ROUND_UP(min_waves_per_cu, 4);
6540
6541 max_vgprs = max_vgprs / min_waves_per_simd;
6542 max_sgprs = MIN2(max_sgprs / min_waves_per_simd, max_sgprs_per_wave);
6543
6544 if (shader->config.num_sgprs > max_sgprs ||
6545 shader->config.num_vgprs > max_vgprs) {
6546 fprintf(stderr, "LLVM failed to compile a shader correctly: "
6547 "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
6548 shader->config.num_sgprs, shader->config.num_vgprs,
6549 max_sgprs, max_vgprs);
6550
6551 /* Just terminate the process, because dependent
6552 * shaders can hang due to bad input data, but use
6553 * the env var to allow shader-db to work.
6554 */
6555 if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
6556 abort();
6557 }
6558 }
6559
6560 /* Add the scratch offset to input SGPRs. */
6561 if (shader->config.scratch_bytes_per_wave && !is_merged_shader(shader))
6562 shader->info.num_input_sgprs += 1; /* scratch byte offset */
6563
6564 /* Calculate the number of fragment input VGPRs. */
6565 if (ctx.type == PIPE_SHADER_FRAGMENT) {
6566 shader->info.num_input_vgprs = 0;
6567 shader->info.face_vgpr_index = -1;
6568
6569 if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr))
6570 shader->info.num_input_vgprs += 2;
6571 if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr))
6572 shader->info.num_input_vgprs += 2;
6573 if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr))
6574 shader->info.num_input_vgprs += 2;
6575 if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr))
6576 shader->info.num_input_vgprs += 3;
6577 if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr))
6578 shader->info.num_input_vgprs += 2;
6579 if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr))
6580 shader->info.num_input_vgprs += 2;
6581 if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr))
6582 shader->info.num_input_vgprs += 2;
6583 if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr))
6584 shader->info.num_input_vgprs += 1;
6585 if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr))
6586 shader->info.num_input_vgprs += 1;
6587 if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr))
6588 shader->info.num_input_vgprs += 1;
6589 if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr))
6590 shader->info.num_input_vgprs += 1;
6591 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr))
6592 shader->info.num_input_vgprs += 1;
6593 if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) {
6594 shader->info.face_vgpr_index = shader->info.num_input_vgprs;
6595 shader->info.num_input_vgprs += 1;
6596 }
6597 if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr))
6598 shader->info.num_input_vgprs += 1;
6599 if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr))
6600 shader->info.num_input_vgprs += 1;
6601 if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr))
6602 shader->info.num_input_vgprs += 1;
6603 }
6604
6605 return 0;
6606 }
6607
6608 /**
6609 * Create, compile and return a shader part (prolog or epilog).
6610 *
6611 * \param sscreen screen
6612 * \param list list of shader parts of the same category
6613 * \param type shader type
6614 * \param key shader part key
6615 * \param prolog whether the part being requested is a prolog
6616 * \param tm LLVM target machine
6617 * \param debug debug callback
6618 * \param build the callback responsible for building the main function
6619 * \return non-NULL on success
6620 */
6621 static struct si_shader_part *
6622 si_get_shader_part(struct si_screen *sscreen,
6623 struct si_shader_part **list,
6624 enum pipe_shader_type type,
6625 bool prolog,
6626 union si_shader_part_key *key,
6627 LLVMTargetMachineRef tm,
6628 struct pipe_debug_callback *debug,
6629 void (*build)(struct si_shader_context *,
6630 union si_shader_part_key *),
6631 const char *name)
6632 {
6633 struct si_shader_part *result;
6634
6635 mtx_lock(&sscreen->shader_parts_mutex);
6636
6637 /* Find existing. */
6638 for (result = *list; result; result = result->next) {
6639 if (memcmp(&result->key, key, sizeof(*key)) == 0) {
6640 mtx_unlock(&sscreen->shader_parts_mutex);
6641 return result;
6642 }
6643 }
6644
6645 /* Compile a new one. */
6646 result = CALLOC_STRUCT(si_shader_part);
6647 result->key = *key;
6648
6649 struct si_shader shader = {};
6650 struct si_shader_context ctx;
6651 struct gallivm_state *gallivm = &ctx.gallivm;
6652
6653 si_init_shader_ctx(&ctx, sscreen, tm);
6654 ctx.shader = &shader;
6655 ctx.type = type;
6656
6657 switch (type) {
6658 case PIPE_SHADER_VERTEX:
6659 break;
6660 case PIPE_SHADER_TESS_CTRL:
6661 assert(!prolog);
6662 shader.key.part.tcs.epilog = key->tcs_epilog.states;
6663 break;
6664 case PIPE_SHADER_GEOMETRY:
6665 assert(prolog);
6666 break;
6667 case PIPE_SHADER_FRAGMENT:
6668 if (prolog)
6669 shader.key.part.ps.prolog = key->ps_prolog.states;
6670 else
6671 shader.key.part.ps.epilog = key->ps_epilog.states;
6672 break;
6673 default:
6674 unreachable("bad shader part");
6675 }
6676
6677 build(&ctx, key);
6678
6679 /* Compile. */
6680 si_llvm_optimize_module(&ctx);
6681
6682 if (si_compile_llvm(sscreen, &result->binary, &result->config, tm,
6683 gallivm->module, debug, ctx.type, name)) {
6684 FREE(result);
6685 result = NULL;
6686 goto out;
6687 }
6688
6689 result->next = *list;
6690 *list = result;
6691
6692 out:
6693 si_llvm_dispose(&ctx);
6694 mtx_unlock(&sscreen->shader_parts_mutex);
6695 return result;
6696 }
6697
6698 static LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx)
6699 {
6700 struct gallivm_state *gallivm = &ctx->gallivm;
6701 LLVMValueRef ptr[2], list;
6702
6703 /* Get the pointer to rw buffers. */
6704 ptr[0] = LLVMGetParam(ctx->main_fn, SI_SGPR_RW_BUFFERS);
6705 ptr[1] = LLVMGetParam(ctx->main_fn, SI_SGPR_RW_BUFFERS_HI);
6706 list = lp_build_gather_values(gallivm, ptr, 2);
6707 list = LLVMBuildBitCast(gallivm->builder, list, ctx->i64, "");
6708 list = LLVMBuildIntToPtr(gallivm->builder, list,
6709 si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS), "");
6710 return list;
6711 }
6712
/**
 * Build the vertex shader prolog function.
 *
 * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
 * All inputs are returned unmodified. The vertex load indices are
 * stored after them, which will be used by the API VS for fetching inputs.
 *
 * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
 *   input_v0,
 *   input_v1,
 *   input_v2,
 *   input_v3,
 *   (VertexID + BaseVertex),
 *   (InstanceID + StartInstance),
 *   (InstanceID / 2 + StartInstance)
 */
static void si_build_vs_prolog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct si_function_info fninfo;
	LLVMTypeRef *returns;
	LLVMValueRef ret, func;
	int num_returns, i;
	/* Index of the first VS system-value VGPR among the function params:
	 * all input SGPRs come first, then (on merged GFX9 stages) the VGPRs
	 * belonging to the next stage. */
	unsigned first_vs_vgpr = key->vs_prolog.num_input_sgprs +
				 key->vs_prolog.num_merged_next_stage_vgprs;
	/* The VS itself always receives 4 system-value VGPRs. */
	unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4;
	unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs +
				      num_input_vgprs;
	/* On merged stages user SGPRs start after 8 system SGPRs —
	 * presumably the merged-shader ABI; confirm against si_create_function
	 * callers. */
	unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;

	si_init_function_info(&fninfo);

	/* 4 preloaded VGPRs + vertex load indices as prolog outputs */
	returns = alloca((num_all_input_regs + key->vs_prolog.last_input + 1) *
			 sizeof(LLVMTypeRef));
	num_returns = 0;

	/* Declare input and output SGPRs.
	 * Every input SGPR is also returned so the main part sees them. */
	for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		returns[num_returns++] = ctx->i32;
	}

	/* Preloaded VGPRs (outputs must be floats) */
	for (i = 0; i < num_input_vgprs; i++) {
		add_arg(&fninfo, ARG_VGPR, ctx->i32);
		returns[num_returns++] = ctx->f32;
	}

	/* Remember where VertexID and InstanceID live; as_ls shifts
	 * InstanceID by one extra VGPR (REL_PATCH_ID sits in between —
	 * TODO confirm against the LS input layout). */
	fninfo.assign[first_vs_vgpr] = &ctx->abi.vertex_id;
	fninfo.assign[first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1)] = &ctx->abi.instance_id;

	/* Vertex load indices (one f32 return per used vertex attribute). */
	for (i = 0; i <= key->vs_prolog.last_input; i++)
		returns[num_returns++] = ctx->f32;

	/* Create the function. */
	si_create_function(ctx, "vs_prolog", returns, num_returns, &fninfo, 0);
	func = ctx->main_fn;

	/* For a non-monolithic merged shader, initialize EXEC from the
	 * wave-info input SGPR (arg 3, bit-offset 0). */
	if (key->vs_prolog.num_merged_next_stage_vgprs &&
	    !key->vs_prolog.is_monolithic)
		si_init_exec_from_input(ctx, 3, 0);

	/* Copy inputs to outputs. This should be no-op, as the registers match,
	 * but it will prevent the compiler from overwriting them unintentionally.
	 */
	ret = ctx->return_value;
	for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
		LLVMValueRef p = LLVMGetParam(func, i);
		ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
	}
	for (; i < fninfo.num_params; i++) {
		LLVMValueRef p = LLVMGetParam(func, i);
		/* VGPR returns are declared as f32; reinterpret the i32. */
		p = LLVMBuildBitCast(gallivm->builder, p, ctx->f32, "");
		ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
	}

	/* Compute vertex load indices from instance divisors. */
	LLVMValueRef instance_divisor_constbuf = NULL;

	/* Divisors that aren't 0 or 1 are fetched from a constant buffer. */
	if (key->vs_prolog.states.instance_divisor_is_fetched) {
		LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
		LLVMValueRef buf_index =
			LLVMConstInt(ctx->i32, SI_VS_CONST_INSTANCE_DIVISORS, 0);
		instance_divisor_constbuf =
			ac_build_indexed_load_const(&ctx->ac, list, buf_index);
	}

	for (i = 0; i <= key->vs_prolog.last_input; i++) {
		bool divisor_is_one =
			key->vs_prolog.states.instance_divisor_is_one & (1u << i);
		bool divisor_is_fetched =
			key->vs_prolog.states.instance_divisor_is_fetched & (1u << i);
		LLVMValueRef index;

		if (divisor_is_one || divisor_is_fetched) {
			LLVMValueRef divisor = ctx->i32_1;

			if (divisor_is_fetched) {
				/* Each divisor is a dword at offset i*4. */
				divisor = buffer_load_const(ctx, instance_divisor_constbuf,
							    LLVMConstInt(ctx->i32, i * 4, 0));
				divisor = LLVMBuildBitCast(gallivm->builder, divisor,
							   ctx->i32, "");
			}

			/* InstanceID / Divisor + StartInstance */
			index = get_instance_index_for_fetch(ctx,
							     user_sgpr_base +
							     SI_SGPR_START_INSTANCE,
							     divisor);
		} else {
			/* VertexID + BaseVertex */
			index = LLVMBuildAdd(gallivm->builder,
					     ctx->abi.vertex_id,
					     LLVMGetParam(func, user_sgpr_base +
							  SI_SGPR_BASE_VERTEX), "");
		}

		/* Load indices are returned as f32, after all inputs. */
		index = LLVMBuildBitCast(gallivm->builder, index, ctx->f32, "");
		ret = LLVMBuildInsertValue(gallivm->builder, ret, index,
					   fninfo.num_params + i, "");
	}

	si_llvm_build_ret(ctx, ret);
}
6840
6841 static bool si_get_vs_prolog(struct si_screen *sscreen,
6842 LLVMTargetMachineRef tm,
6843 struct si_shader *shader,
6844 struct pipe_debug_callback *debug,
6845 struct si_shader *main_part,
6846 const struct si_vs_prolog_bits *key)
6847 {
6848 struct si_shader_selector *vs = main_part->selector;
6849
6850 /* The prolog is a no-op if there are no inputs. */
6851 if (!vs->vs_needs_prolog)
6852 return true;
6853
6854 /* Get the prolog. */
6855 union si_shader_part_key prolog_key;
6856 si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs,
6857 key, shader, &prolog_key);
6858
6859 shader->prolog =
6860 si_get_shader_part(sscreen, &sscreen->vs_prologs,
6861 PIPE_SHADER_VERTEX, true, &prolog_key, tm,
6862 debug, si_build_vs_prolog_function,
6863 "Vertex Shader Prolog");
6864 return shader->prolog != NULL;
6865 }
6866
6867 /**
6868 * Select and compile (or reuse) vertex shader parts (prolog & epilog).
6869 */
6870 static bool si_shader_select_vs_parts(struct si_screen *sscreen,
6871 LLVMTargetMachineRef tm,
6872 struct si_shader *shader,
6873 struct pipe_debug_callback *debug)
6874 {
6875 return si_get_vs_prolog(sscreen, tm, shader, debug, shader,
6876 &shader->key.part.vs.prolog);
6877 }
6878
/**
 * Compile the TCS epilog function. This writes tesselation factors to memory
 * based on the output primitive type of the tesselator (determined by TES).
 *
 * NOTE(review): the argument declarations below define the epilog's input
 * layout; their order must mirror the main TCS function signature —
 * presumably so the wrapper can pass registers straight through. Confirm
 * against the TCS create_function path before changing anything here.
 */
static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
					 union si_shader_part_key *key)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
	struct si_function_info fninfo;
	LLVMValueRef func;

	si_init_function_info(&fninfo);

	if (ctx->screen->b.chip_class >= GFX9) {
		/* Merged LS+HS input layout. Only the parameters the epilog
		 * actually reads get a ctx->param_* slot; the unnamed args
		 * are placeholders that keep the layout intact. */
		add_arg(&fninfo, ARG_SGPR, ctx->i64);
		ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i32); /* wave info */
		ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i64);
		add_arg(&fninfo, ARG_SGPR, ctx->i64);
		add_arg(&fninfo, ARG_SGPR, ctx->i64);
		add_arg(&fninfo, ARG_SGPR, ctx->i64);
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
		ctx->param_tcs_factor_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
	} else {
		/* Pre-GFX9 standalone TCS input layout. */
		add_arg(&fninfo, ARG_SGPR, ctx->i64);
		add_arg(&fninfo, ARG_SGPR, ctx->i64);
		add_arg(&fninfo, ARG_SGPR, ctx->i64);
		ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
		ctx->param_tcs_factor_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
		ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
		ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
	}

	add_arg(&fninfo, ARG_VGPR, ctx->i32); /* VGPR gap */
	add_arg(&fninfo, ARG_VGPR, ctx->i32); /* VGPR gap */
	unsigned tess_factors_idx =
		add_arg(&fninfo, ARG_VGPR, ctx->i32); /* patch index within the wave (REL_PATCH_ID) */
	add_arg(&fninfo, ARG_VGPR, ctx->i32); /* invocation ID within the patch */
	add_arg(&fninfo, ARG_VGPR, ctx->i32); /* LDS offset where tess factors should be loaded from */

	/* Create the function. */
	si_create_function(ctx, "tcs_epilog", NULL, 0, &fninfo,
			   ctx->screen->b.chip_class >= CIK ? 128 : 64);
	declare_lds_as_pointer(ctx);
	func = ctx->main_fn;

	/* The three consecutive VGPRs starting at tess_factors_idx:
	 * rel_patch_id, invocation_id, tf_lds_offset. */
	si_write_tess_factors(bld_base,
			      LLVMGetParam(func, tess_factors_idx),
			      LLVMGetParam(func, tess_factors_idx + 1),
			      LLVMGetParam(func, tess_factors_idx + 2));

	LLVMBuildRetVoid(gallivm->builder);
}
6948
6949 /**
6950 * Select and compile (or reuse) TCS parts (epilog).
6951 */
6952 static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
6953 LLVMTargetMachineRef tm,
6954 struct si_shader *shader,
6955 struct pipe_debug_callback *debug)
6956 {
6957 if (sscreen->b.chip_class >= GFX9) {
6958 struct si_shader *ls_main_part =
6959 shader->key.part.tcs.ls->main_shader_part_ls;
6960
6961 if (!si_get_vs_prolog(sscreen, tm, shader, debug, ls_main_part,
6962 &shader->key.part.tcs.ls_prolog))
6963 return false;
6964
6965 shader->previous_stage = ls_main_part;
6966 }
6967
6968 /* Get the epilog. */
6969 union si_shader_part_key epilog_key;
6970 memset(&epilog_key, 0, sizeof(epilog_key));
6971 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
6972
6973 shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
6974 PIPE_SHADER_TESS_CTRL, false,
6975 &epilog_key, tm, debug,
6976 si_build_tcs_epilog_function,
6977 "Tessellation Control Shader Epilog");
6978 return shader->epilog != NULL;
6979 }
6980
6981 /**
6982 * Select and compile (or reuse) GS parts (prolog).
6983 */
6984 static bool si_shader_select_gs_parts(struct si_screen *sscreen,
6985 LLVMTargetMachineRef tm,
6986 struct si_shader *shader,
6987 struct pipe_debug_callback *debug)
6988 {
6989 if (sscreen->b.chip_class >= GFX9) {
6990 struct si_shader *es_main_part =
6991 shader->key.part.gs.es->main_shader_part_es;
6992
6993 if (shader->key.part.gs.es->type == PIPE_SHADER_VERTEX &&
6994 !si_get_vs_prolog(sscreen, tm, shader, debug, es_main_part,
6995 &shader->key.part.gs.vs_prolog))
6996 return false;
6997
6998 shader->previous_stage = es_main_part;
6999 }
7000
7001 if (!shader->key.part.gs.prolog.tri_strip_adj_fix)
7002 return true;
7003
7004 union si_shader_part_key prolog_key;
7005 memset(&prolog_key, 0, sizeof(prolog_key));
7006 prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
7007
7008 shader->prolog2 = si_get_shader_part(sscreen, &sscreen->gs_prologs,
7009 PIPE_SHADER_GEOMETRY, true,
7010 &prolog_key, tm, debug,
7011 si_build_gs_prolog_function,
7012 "Geometry Shader Prolog");
7013 return shader->prolog2 != NULL;
7014 }
7015
/**
 * Build the pixel shader prolog function. This handles:
 * - two-side color selection and interpolation
 * - overriding interpolation parameters for the API PS
 * - polygon stippling
 *
 * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
 * overriden by other states. (e.g. per-sample interpolation)
 * Interpolated colors are stored after the preloaded VGPRs.
 *
 * NOTE(review): the fixed VGPR offsets used below (base+0 PERSP_SAMPLE,
 * base+2 PERSP_CENTER, base+4 PERSP_CENTROID, base+6 LINEAR_SAMPLE,
 * base+8 LINEAR_CENTER, base+10 LINEAR_CENTROID) follow the SPI PS input
 * layout — confirm against the SPI_PS_INPUT_* register documentation.
 */
static void si_build_ps_prolog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct si_function_info fninfo;
	LLVMValueRef ret, func;
	int num_returns, i, num_color_channels;

	assert(si_need_ps_prolog(key));

	si_init_function_info(&fninfo);

	/* Declare inputs. */
	for (i = 0; i < key->ps_prolog.num_input_sgprs; i++)
		add_arg(&fninfo, ARG_SGPR, ctx->i32);

	for (i = 0; i < key->ps_prolog.num_input_vgprs; i++)
		add_arg(&fninfo, ARG_VGPR, ctx->f32);

	/* Declare outputs (same as inputs + add colors if needed) */
	num_returns = fninfo.num_params;
	num_color_channels = util_bitcount(key->ps_prolog.colors_read);
	for (i = 0; i < num_color_channels; i++)
		fninfo.types[num_returns++] = ctx->f32;

	/* Create the function. */
	si_create_function(ctx, "ps_prolog", fninfo.types, num_returns,
			   &fninfo, 0);
	func = ctx->main_fn;

	/* Copy inputs to outputs. This should be no-op, as the registers match,
	 * but it will prevent the compiler from overwriting them unintentionally.
	 */
	ret = ctx->return_value;
	for (i = 0; i < fninfo.num_params; i++) {
		LLVMValueRef p = LLVMGetParam(func, i);
		ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
	}

	/* Polygon stippling. */
	if (key->ps_prolog.states.poly_stipple) {
		/* POS_FIXED_PT is always last. */
		unsigned pos = key->ps_prolog.num_input_sgprs +
			       key->ps_prolog.num_input_vgprs - 1;
		LLVMValueRef list = si_prolog_get_rw_buffers(ctx);

		si_llvm_emit_polygon_stipple(ctx, list, pos);
	}

	if (key->ps_prolog.states.bc_optimize_for_persp ||
	    key->ps_prolog.states.bc_optimize_for_linear) {
		unsigned i, base = key->ps_prolog.num_input_sgprs;
		LLVMValueRef center[2], centroid[2], tmp, bc_optimize;

		/* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
		 * The hw doesn't compute CENTROID if the whole wave only
		 * contains fully-covered quads.
		 *
		 * PRIM_MASK is after user SGPRs.
		 */
		bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
		/* Extract bit 31 as an i1 condition. */
		bc_optimize = LLVMBuildLShr(gallivm->builder, bc_optimize,
					    LLVMConstInt(ctx->i32, 31, 0), "");
		bc_optimize = LLVMBuildTrunc(gallivm->builder, bc_optimize,
					     ctx->i1, "");

		if (key->ps_prolog.states.bc_optimize_for_persp) {
			/* Read PERSP_CENTER. */
			for (i = 0; i < 2; i++)
				center[i] = LLVMGetParam(func, base + 2 + i);
			/* Read PERSP_CENTROID. */
			for (i = 0; i < 2; i++)
				centroid[i] = LLVMGetParam(func, base + 4 + i);
			/* Select PERSP_CENTROID. */
			for (i = 0; i < 2; i++) {
				tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
						      center[i], centroid[i], "");
				ret = LLVMBuildInsertValue(gallivm->builder, ret,
							   tmp, base + 4 + i, "");
			}
		}
		if (key->ps_prolog.states.bc_optimize_for_linear) {
			/* Read LINEAR_CENTER. */
			for (i = 0; i < 2; i++)
				center[i] = LLVMGetParam(func, base + 8 + i);
			/* Read LINEAR_CENTROID. */
			for (i = 0; i < 2; i++)
				centroid[i] = LLVMGetParam(func, base + 10 + i);
			/* Select LINEAR_CENTROID. */
			for (i = 0; i < 2; i++) {
				tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
						      center[i], centroid[i], "");
				ret = LLVMBuildInsertValue(gallivm->builder, ret,
							   tmp, base + 10 + i, "");
			}
		}
	}

	/* Force per-sample interpolation: replace CENTER and CENTROID
	 * barycentrics with the SAMPLE ones in the outputs. */
	if (key->ps_prolog.states.force_persp_sample_interp) {
		unsigned i, base = key->ps_prolog.num_input_sgprs;
		LLVMValueRef persp_sample[2];

		/* Read PERSP_SAMPLE. */
		for (i = 0; i < 2; i++)
			persp_sample[i] = LLVMGetParam(func, base + i);
		/* Overwrite PERSP_CENTER. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   persp_sample[i], base + 2 + i, "");
		/* Overwrite PERSP_CENTROID. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   persp_sample[i], base + 4 + i, "");
	}
	if (key->ps_prolog.states.force_linear_sample_interp) {
		unsigned i, base = key->ps_prolog.num_input_sgprs;
		LLVMValueRef linear_sample[2];

		/* Read LINEAR_SAMPLE. */
		for (i = 0; i < 2; i++)
			linear_sample[i] = LLVMGetParam(func, base + 6 + i);
		/* Overwrite LINEAR_CENTER. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   linear_sample[i], base + 8 + i, "");
		/* Overwrite LINEAR_CENTROID. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   linear_sample[i], base + 10 + i, "");
	}

	/* Force center interpolation: replace SAMPLE and CENTROID
	 * barycentrics with the CENTER ones in the outputs. */
	if (key->ps_prolog.states.force_persp_center_interp) {
		unsigned i, base = key->ps_prolog.num_input_sgprs;
		LLVMValueRef persp_center[2];

		/* Read PERSP_CENTER. */
		for (i = 0; i < 2; i++)
			persp_center[i] = LLVMGetParam(func, base + 2 + i);
		/* Overwrite PERSP_SAMPLE. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   persp_center[i], base + i, "");
		/* Overwrite PERSP_CENTROID. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   persp_center[i], base + 4 + i, "");
	}
	if (key->ps_prolog.states.force_linear_center_interp) {
		unsigned i, base = key->ps_prolog.num_input_sgprs;
		LLVMValueRef linear_center[2];

		/* Read LINEAR_CENTER. */
		for (i = 0; i < 2; i++)
			linear_center[i] = LLVMGetParam(func, base + 8 + i);
		/* Overwrite LINEAR_SAMPLE. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   linear_center[i], base + 6 + i, "");
		/* Overwrite LINEAR_CENTROID. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   linear_center[i], base + 10 + i, "");
	}

	/* Interpolate colors. Two iterations: COLOR0 and COLOR1.
	 * Interpolated channels are appended after all pass-through params. */
	unsigned color_out_idx = 0;
	for (i = 0; i < 2; i++) {
		unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
		unsigned face_vgpr = key->ps_prolog.num_input_sgprs +
				     key->ps_prolog.face_vgpr_index;
		LLVMValueRef interp[2], color[4];
		LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL;

		if (!writemask)
			continue;

		/* If the interpolation qualifier is not CONSTANT (-1). */
		if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
			unsigned interp_vgpr = key->ps_prolog.num_input_sgprs +
					       key->ps_prolog.color_interp_vgpr_index[i];

			/* Get the (i,j) updated by bc_optimize handling. */
			interp[0] = LLVMBuildExtractValue(gallivm->builder, ret,
							  interp_vgpr, "");
			interp[1] = LLVMBuildExtractValue(gallivm->builder, ret,
							  interp_vgpr + 1, "");
			interp_ij = lp_build_gather_values(gallivm, interp, 2);
		}

		/* Use the absolute location of the input. */
		prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);

		if (key->ps_prolog.states.color_two_side) {
			face = LLVMGetParam(func, face_vgpr);
			face = LLVMBuildBitCast(gallivm->builder, face, ctx->i32, "");
		}

		interp_fs_input(ctx,
				key->ps_prolog.color_attr_index[i],
				TGSI_SEMANTIC_COLOR, i,
				key->ps_prolog.num_interp_inputs,
				key->ps_prolog.colors_read, interp_ij,
				prim_mask, face, color);

		while (writemask) {
			unsigned chan = u_bit_scan(&writemask);
			ret = LLVMBuildInsertValue(gallivm->builder, ret, color[chan],
						   fninfo.num_params + color_out_idx++, "");
		}
	}

	/* Tell LLVM to insert WQM instruction sequence when needed. */
	if (key->ps_prolog.wqm) {
		LLVMAddTargetDependentFunctionAttr(func,
						   "amdgpu-ps-wqm-outputs", "");
	}

	si_llvm_build_ret(ctx, ret);
}
7247
/**
 * Build the pixel shader epilog function. This handles everything that must be
 * emulated for pixel shader exports. (alpha-test, format conversions, etc)
 */
static void si_build_ps_epilog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
	struct si_function_info fninfo;
	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
	int i;
	struct si_ps_exports exp = {};

	si_init_function_info(&fninfo);

	/* Declare input SGPRs. */
	ctx->param_rw_buffers = add_arg(&fninfo, ARG_SGPR, ctx->i64);
	ctx->param_const_and_shader_buffers = add_arg(&fninfo, ARG_SGPR, ctx->i64);
	ctx->param_samplers_and_images = add_arg(&fninfo, ARG_SGPR, ctx->i64);
	add_arg_checked(&fninfo, ARG_SGPR, ctx->f32, SI_PARAM_ALPHA_REF);

	/* Declare input VGPRs: 4 per written color buffer, plus one each
	 * for Z, stencil, and sample mask if written. */
	unsigned required_num_params =
		     fninfo.num_sgpr_params +
		     util_bitcount(key->ps_epilog.colors_written) * 4 +
		     key->ps_epilog.writes_z +
		     key->ps_epilog.writes_stencil +
		     key->ps_epilog.writes_samplemask;

	/* Pad up to the minimum sample-mask location so the epilog's VGPR
	 * layout always covers it. */
	required_num_params = MAX2(required_num_params,
				   fninfo.num_sgpr_params + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);

	while (fninfo.num_params < required_num_params)
		add_arg(&fninfo, ARG_VGPR, ctx->f32);

	/* Create the function. */
	si_create_function(ctx, "ps_epilog", NULL, 0, &fninfo, 0);
	/* Disable elimination of unused inputs. */
	si_llvm_add_attribute(ctx->main_fn,
			      "InitialPSInputAddr", 0xffffff);

	/* Process colors. */
	unsigned vgpr = fninfo.num_sgpr_params;
	unsigned colors_written = key->ps_epilog.colors_written;
	int last_color_export = -1;

	/* Find the last color export. Only relevant when there is no Z/
	 * stencil/samplemask export, which would otherwise come last. */
	if (!key->ps_epilog.writes_z &&
	    !key->ps_epilog.writes_stencil &&
	    !key->ps_epilog.writes_samplemask) {
		unsigned spi_format = key->ps_epilog.states.spi_shader_col_format;

		/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
		if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
			/* Just set this if any of the colorbuffers are enabled. */
			if (spi_format &
			    ((1ull << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
				last_color_export = 0;
		} else {
			/* Otherwise: the highest MRT with a non-zero export
			 * format among the written colors. */
			for (i = 0; i < 8; i++)
				if (colors_written & (1 << i) &&
				    (spi_format >> (i * 4)) & 0xf)
					last_color_export = i;
		}
	}

	/* Export each written color; VGPRs are consumed 4 at a time in
	 * ascending MRT order. */
	while (colors_written) {
		LLVMValueRef color[4];
		int mrt = u_bit_scan(&colors_written);

		for (i = 0; i < 4; i++)
			color[i] = LLVMGetParam(ctx->main_fn, vgpr++);

		si_export_mrt_color(bld_base, color, mrt,
				    fninfo.num_params - 1,
				    mrt == last_color_export, &exp);
	}

	/* Process depth, stencil, samplemask. */
	if (key->ps_epilog.writes_z)
		depth = LLVMGetParam(ctx->main_fn, vgpr++);
	if (key->ps_epilog.writes_stencil)
		stencil = LLVMGetParam(ctx->main_fn, vgpr++);
	if (key->ps_epilog.writes_samplemask)
		samplemask = LLVMGetParam(ctx->main_fn, vgpr++);

	/* The hw requires at least one export; emit a null export if
	 * nothing else was exported. */
	if (depth || stencil || samplemask)
		si_export_mrt_z(bld_base, depth, stencil, samplemask, &exp);
	else if (last_color_export == -1)
		si_export_null(bld_base);

	if (exp.num)
		si_emit_ps_exports(ctx, &exp);

	/* Compile. */
	LLVMBuildRetVoid(gallivm->builder);
}
7346
7347 /**
7348 * Select and compile (or reuse) pixel shader parts (prolog & epilog).
7349 */
/**
 * Select and compile (or reuse) pixel shader parts (prolog & epilog).
 *
 * Looks up (or builds and caches) the prolog and epilog parts matching this
 * shader's key in the screen-wide part caches, then patches
 * config.spi_ps_input_ena so the register state matches what the selected
 * parts actually consume.
 *
 * \return false on compilation failure of either part.
 */
static bool si_shader_select_ps_parts(struct si_screen *sscreen,
				      LLVMTargetMachineRef tm,
				      struct si_shader *shader,
				      struct pipe_debug_callback *debug)
{
	union si_shader_part_key prolog_key;
	union si_shader_part_key epilog_key;

	/* Get the prolog. */
	si_get_ps_prolog_key(shader, &prolog_key, true);

	/* The prolog is a no-op if these aren't set. */
	if (si_need_ps_prolog(&prolog_key)) {
		shader->prolog =
			si_get_shader_part(sscreen, &sscreen->ps_prologs,
					   PIPE_SHADER_FRAGMENT, true,
					   &prolog_key, tm, debug,
					   si_build_ps_prolog_function,
					   "Fragment Shader Prolog");
		if (!shader->prolog)
			return false;
	}

	/* Get the epilog.  Unlike the prolog, an epilog is always needed
	 * (it performs the color/depth exports). */
	si_get_ps_epilog_key(shader, &epilog_key);

	shader->epilog =
		si_get_shader_part(sscreen, &sscreen->ps_epilogs,
				   PIPE_SHADER_FRAGMENT, false,
				   &epilog_key, tm, debug,
				   si_build_ps_epilog_function,
				   "Fragment Shader Epilog");
	if (!shader->epilog)
		return false;

	/* Enable POS_FIXED_PT if polygon stippling is enabled, because the
	 * prolog reads the fixed-point position to look up the stipple
	 * pattern. */
	if (shader->key.part.ps.prolog.poly_stipple) {
		shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
		assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
	}

	/* Set up the enable bits for per-sample shading if needed.
	 *
	 * Each of the four blocks below rewrites one interpolation mode:
	 * it clears the two modes the prolog is forced to override and
	 * enables the replacement, keeping exactly one mode per
	 * perspective/linear pair requested by the prolog key. */
	if (shader->key.part.ps.prolog.force_persp_sample_interp &&
	    (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
	     G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
		shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
	}
	if (shader->key.part.ps.prolog.force_linear_sample_interp &&
	    (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
	     G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
		shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
	}
	if (shader->key.part.ps.prolog.force_persp_center_interp &&
	    (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
	     G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
		shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
	}
	if (shader->key.part.ps.prolog.force_linear_center_interp &&
	    (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
	     G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
		shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
	}

	/* POS_W_FLOAT requires that one of the perspective weights is enabled.
	 * (0xf covers the PERSP_SAMPLE/CENTER/CENTROID/PULL_MODEL bits.) */
	if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
	    !(shader->config.spi_ps_input_ena & 0xf)) {
		shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
		assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
	}

	/* At least one pair of interpolation weights must be enabled.
	 * (0x7f covers all PERSP_* and LINEAR_* enable bits.) */
	if (!(shader->config.spi_ps_input_ena & 0x7f)) {
		shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
		assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
	}

	/* The sample mask input is always enabled, because the API shader always
	 * passes it through to the epilog. Disable it here if it's unused.
	 */
	if (!shader->key.part.ps.epilog.poly_line_smoothing &&
	    !shader->selector->info.reads_samplemask)
		shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;

	return true;
}
7443
7444 void si_multiwave_lds_size_workaround(struct si_screen *sscreen,
7445 unsigned *lds_size)
7446 {
7447 /* SPI barrier management bug:
7448 * Make sure we have at least 4k of LDS in use to avoid the bug.
7449 * It applies to workgroup sizes of more than one wavefront.
7450 */
7451 if (sscreen->b.family == CHIP_BONAIRE ||
7452 sscreen->b.family == CHIP_KABINI ||
7453 sscreen->b.family == CHIP_MULLINS)
7454 *lds_size = MAX2(*lds_size, 8);
7455 }
7456
7457 static void si_fix_resource_usage(struct si_screen *sscreen,
7458 struct si_shader *shader)
7459 {
7460 unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
7461
7462 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
7463
7464 if (shader->selector->type == PIPE_SHADER_COMPUTE &&
7465 si_get_max_workgroup_size(shader) > 64) {
7466 si_multiwave_lds_size_workaround(sscreen,
7467 &shader->config.lds_size);
7468 }
7469 }
7470
/**
 * Create a complete, uploadable shader variant.
 *
 * Either compiles the whole shader monolithically, or assembles it from the
 * pre-compiled main part plus selected prolog/epilog parts, then merges
 * register usage, dumps the result for debugging, and uploads the binary.
 *
 * \return 0 on success, negative on failure.
 */
int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
		     struct si_shader *shader,
		     struct pipe_debug_callback *debug)
{
	struct si_shader_selector *sel = shader->selector;
	struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key);
	int r;

	/* LS, ES, VS are compiled on demand if the main part hasn't been
	 * compiled for that stage.
	 *
	 * Vertex shaders are compiled on demand when a vertex fetch
	 * workaround must be applied.
	 */
	if (shader->is_monolithic) {
		/* Monolithic shader (compiled as a whole, has many variants,
		 * may take a long time to compile).
		 */
		r = si_compile_tgsi_shader(sscreen, tm, shader, true, debug);
		if (r)
			return r;
	} else {
		/* The shader consists of several parts:
		 *
		 * - the middle part is the user shader, it has 1 variant only
		 *   and it was compiled during the creation of the shader
		 *   selector
		 * - the prolog part is inserted at the beginning
		 * - the epilog part is inserted at the end
		 *
		 * The prolog and epilog have many (but simple) variants.
		 *
		 * Starting with gfx9, geometry and tessellation control
		 * shaders also contain the prolog and user shader parts of
		 * the previous shader stage.
		 */

		if (!mainp)
			return -1;

		/* Copy the compiled TGSI shader data over.  The binary is
		 * shared with (and owned by) the main part; is_binary_shared
		 * tells si_shader_destroy not to free it. */
		shader->is_binary_shared = true;
		shader->binary = mainp->binary;
		shader->config = mainp->config;
		shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
		shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
		shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
		memcpy(shader->info.vs_output_param_offset,
		       mainp->info.vs_output_param_offset,
		       sizeof(mainp->info.vs_output_param_offset));
		shader->info.uses_instanceid = mainp->info.uses_instanceid;
		shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
		shader->info.nr_param_exports = mainp->info.nr_param_exports;

		/* Select prologs and/or epilogs. */
		switch (sel->type) {
		case PIPE_SHADER_VERTEX:
			if (!si_shader_select_vs_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_TESS_CTRL:
			if (!si_shader_select_tcs_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_TESS_EVAL:
			/* TES has no prolog/epilog parts. */
			break;
		case PIPE_SHADER_GEOMETRY:
			if (!si_shader_select_gs_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_FRAGMENT:
			if (!si_shader_select_ps_parts(sscreen, tm, shader, debug))
				return -1;

			/* Make sure we have at least as many VGPRs as there
			 * are allocated inputs.
			 */
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->info.num_input_vgprs);
			break;
		}

		/* Update SGPR and VGPR counts: the final allocation must be
		 * the maximum over all parts, since they execute as one
		 * hardware shader. */
		if (shader->prolog) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->prolog->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->prolog->config.num_vgprs);
		}
		if (shader->previous_stage) {
			/* gfx9 merged shaders: also fold in the previous
			 * stage's register/spill/scratch usage. */
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->previous_stage->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->previous_stage->config.num_vgprs);
			shader->config.spilled_sgprs =
				MAX2(shader->config.spilled_sgprs,
				     shader->previous_stage->config.spilled_sgprs);
			shader->config.spilled_vgprs =
				MAX2(shader->config.spilled_vgprs,
				     shader->previous_stage->config.spilled_vgprs);
			shader->config.private_mem_vgprs =
				MAX2(shader->config.private_mem_vgprs,
				     shader->previous_stage->config.private_mem_vgprs);
			shader->config.scratch_bytes_per_wave =
				MAX2(shader->config.scratch_bytes_per_wave,
				     shader->previous_stage->config.scratch_bytes_per_wave);
			shader->info.uses_instanceid |=
				shader->previous_stage->info.uses_instanceid;
		}
		if (shader->prolog2) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->prolog2->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->prolog2->config.num_vgprs);
		}
		if (shader->epilog) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->epilog->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->epilog->config.num_vgprs);
		}
	}

	si_fix_resource_usage(sscreen, shader);
	si_shader_dump(sscreen, shader, debug, sel->info.processor,
		       stderr, true);

	/* Upload. */
	r = si_shader_binary_upload(sscreen, shader);
	if (r) {
		fprintf(stderr, "LLVM failed to upload shader\n");
		return r;
	}

	return 0;
}
7607
7608 void si_shader_destroy(struct si_shader *shader)
7609 {
7610 if (shader->scratch_bo)
7611 r600_resource_reference(&shader->scratch_bo, NULL);
7612
7613 r600_resource_reference(&shader->bo, NULL);
7614
7615 if (!shader->is_binary_shared)
7616 radeon_shader_binary_clean(&shader->binary);
7617
7618 free(shader->shader_log);
7619 }