src/gallium/drivers/radeonsi/si_shader.c

   1 /*
   2  * Copyright 2012 Advanced Micro Devices, Inc.
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * on the rights to use, copy, modify, merge, publish, distribute, sub
   8  * license, and/or sell copies of the Software, and to permit persons to whom
   9  * the Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  18  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
  19  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  20  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  21  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  22  *
  23  * Authors:
  24  *      Tom Stellard <thomas.stellard@amd.com>
  25  *      Michel Dänzer <michel.daenzer@amd.com>
  26  *      Christian König <christian.koenig@amd.com>
  27  */
  28
  29 #include "gallivm/lp_bld_const.h"
  30 #include "gallivm/lp_bld_gather.h"
  31 #include "gallivm/lp_bld_intr.h"
  32 #include "gallivm/lp_bld_logic.h"
  33 #include "gallivm/lp_bld_arit.h"
  34 #include "gallivm/lp_bld_flow.h"
  35 #include "gallivm/lp_bld_misc.h"
  36 #include "util/u_memory.h"
  37 #include "util/u_string.h"
  38 #include "tgsi/tgsi_build.h"
  39 #include "tgsi/tgsi_util.h"
  40 #include "tgsi/tgsi_dump.h"
  41
  42 #include "ac_binary.h"
  43 #include "ac_llvm_util.h"
  44 #include "ac_exp_param.h"
  45 #include "si_shader_internal.h"
  46 #include "si_pipe.h"
  47 #include "sid.h"
  48
  49
/* Symbol-name strings "SCRATCH_RSRC_DWORD0" and "SCRATCH_RSRC_DWORD1".
 * NOTE(review): presumably compared against symbol names produced by the
 * shader compiler for the scratch buffer resource - confirm against the
 * users of these strings, which are not in this part of the file.
 */
static const char *scratch_rsrc_dword0_symbol =
        "SCRATCH_RSRC_DWORD0";

static const char *scratch_rsrc_dword1_symbol =
        "SCRATCH_RSRC_DWORD1";
  55
/* Values of one shader output together with its TGSI semantic.
 * NOTE(review): values[] and vertex_stream[] both have 4 entries,
 * presumably one per channel (x, y, z, w) - confirm against the users.
 */
struct si_shader_output_values
{
        LLVMValueRef values[4];
        unsigned semantic_name;   /* TGSI_SEMANTIC_* */
        unsigned semantic_index;
        ubyte vertex_stream[4];
};
  63
/* Forward declarations of file-local (static) functions. */
static void si_init_shader_ctx(struct si_shader_context *ctx,
                               struct si_screen *sscreen,
                               LLVMTargetMachineRef tm);

static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
                                 struct lp_build_tgsi_context *bld_base,
                                 struct lp_build_emit_data *emit_data);

static void si_dump_shader_key(unsigned processor, const struct si_shader *shader,
                               FILE *f);

static unsigned llvm_get_type_size(LLVMTypeRef type);

/* Builders for shader parts (prologs / epilogs), each driven by a part key. */
static void si_build_vs_prolog_function(struct si_shader_context *ctx,
                                        union si_shader_part_key *key);
static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
                                         union si_shader_part_key *key);
static void si_build_ps_prolog_function(struct si_shader_context *ctx,
                                        union si_shader_part_key *key);
static void si_build_ps_epilog_function(struct si_shader_context *ctx,
                                        union si_shader_part_key *key);
  85
  86 /* Ideally pass the sample mask input to the PS epilog as v13, which
  87  * is its usual location, so that the shader doesn't have to add v_mov.
  88  */
  89 #define PS_EPILOG_SAMPLEMASK_MIN_LOC 13
  90
/* Address-space numbers.
 * NOTE(review): presumably the LLVM address spaces used for constant
 * memory and LDS (local) memory on this target - confirm.
 */
enum {
        CONST_ADDR_SPACE = 2,
        LOCAL_ADDR_SPACE = 3,
};
  95
  96 static bool is_merged_shader(struct si_shader *shader)
  97 {
  98         if (shader->selector->screen->b.chip_class <= VI)
  99                 return false;
 100
 101         return shader->key.as_ls ||
 102                shader->key.as_es ||
 103                shader->selector->type == PIPE_SHADER_TESS_CTRL ||
 104                shader->selector->type == PIPE_SHADER_GEOMETRY;
 105 }
 106
 107 /**
 108  * Returns a unique index for a per-patch semantic name and index. The index
 109  * must be less than 32, so that a 32-bit bitmask of used inputs or outputs
 110  * can be calculated.
 111  */
 112 unsigned si_shader_io_get_unique_index_patch(unsigned semantic_name, unsigned index)
 113 {
 114         switch (semantic_name) {
 115         case TGSI_SEMANTIC_TESSOUTER:
 116                 return 0;
 117         case TGSI_SEMANTIC_TESSINNER:
 118                 return 1;
 119         case TGSI_SEMANTIC_PATCH:
 120                 assert(index < 30);
 121                 return 2 + index;
 122
 123         default:
 124                 assert(!"invalid semantic name");
 125                 return 0;
 126         }
 127 }
 128
 129 /**
 130  * Returns a unique index for a semantic name and index. The index must be
 131  * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
 132  * calculated.
 133  */
 134 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
 135 {
 136         switch (semantic_name) {
 137         case TGSI_SEMANTIC_POSITION:
 138                 return 0;
 139         case TGSI_SEMANTIC_GENERIC:
 140                 /* Since some shader stages use the the highest used IO index
 141                  * to determine the size to allocate for inputs/outputs
 142                  * (in LDS, tess and GS rings). GENERIC should be placed right
 143                  * after POSITION to make that size as small as possible.
 144                  */
 145                 if (index < SI_MAX_IO_GENERIC)
 146                         return 1 + index;
 147
 148                 assert(!"invalid generic index");
 149                 return 0;
 150         case TGSI_SEMANTIC_PSIZE:
 151                 return SI_MAX_IO_GENERIC + 1;
 152         case TGSI_SEMANTIC_CLIPDIST:
 153                 assert(index <= 1);
 154                 return SI_MAX_IO_GENERIC + 2 + index;
 155         case TGSI_SEMANTIC_FOG:
 156                 return SI_MAX_IO_GENERIC + 4;
 157         case TGSI_SEMANTIC_LAYER:
 158                 return SI_MAX_IO_GENERIC + 5;
 159         case TGSI_SEMANTIC_VIEWPORT_INDEX:
 160                 return SI_MAX_IO_GENERIC + 6;
 161         case TGSI_SEMANTIC_PRIMID:
 162                 return SI_MAX_IO_GENERIC + 7;
 163         case TGSI_SEMANTIC_COLOR: /* these alias */
 164         case TGSI_SEMANTIC_BCOLOR:
 165                 assert(index < 2);
 166                 return SI_MAX_IO_GENERIC + 8 + index;
 167         case TGSI_SEMANTIC_TEXCOORD:
 168                 assert(index < 8);
 169                 assert(SI_MAX_IO_GENERIC + 10 + index < 64);
 170                 return SI_MAX_IO_GENERIC + 10 + index;
 171         default:
 172                 assert(!"invalid semantic name");
 173                 return 0;
 174         }
 175 }
 176
 177 /**
 178  * Get the value of a shader input parameter and extract a bitfield.
 179  */
 180 static LLVMValueRef unpack_param(struct si_shader_context *ctx,
 181                                  unsigned param, unsigned rshift,
 182                                  unsigned bitwidth)
 183 {
 184         struct gallivm_state *gallivm = &ctx->gallivm;
 185         LLVMValueRef value = LLVMGetParam(ctx->main_fn,
 186                                           param);
 187
 188         if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
 189                 value = bitcast(&ctx->bld_base,
 190                                 TGSI_TYPE_UNSIGNED, value);
 191
 192         if (rshift)
 193                 value = LLVMBuildLShr(gallivm->builder, value,
 194                                       LLVMConstInt(ctx->i32, rshift, 0), "");
 195
 196         if (rshift + bitwidth < 32) {
 197                 unsigned mask = (1 << bitwidth) - 1;
 198                 value = LLVMBuildAnd(gallivm->builder, value,
 199                                      LLVMConstInt(ctx->i32, mask, 0), "");
 200         }
 201
 202         return value;
 203 }
 204
 205 static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx)
 206 {
 207         switch (ctx->type) {
 208         case PIPE_SHADER_TESS_CTRL:
 209                 return unpack_param(ctx, ctx->param_tcs_rel_ids, 0, 8);
 210
 211         case PIPE_SHADER_TESS_EVAL:
 212                 return LLVMGetParam(ctx->main_fn,
 213                                     ctx->param_tes_rel_patch_id);
 214
 215         default:
 216                 assert(0);
 217                 return NULL;
 218         }
 219 }
 220
 221 /* Tessellation shaders pass outputs to the next shader using LDS.
 222  *
 223  * LS outputs = TCS inputs
 224  * TCS outputs = TES inputs
 225  *
 226  * The LDS layout is:
 227  * - TCS inputs for patch 0
 228  * - TCS inputs for patch 1
 229  * - TCS inputs for patch 2             = get_tcs_in_current_patch_offset (if RelPatchID==2)
 230  * - ...
 231  * - TCS outputs for patch 0            = get_tcs_out_patch0_offset
 232  * - Per-patch TCS outputs for patch 0  = get_tcs_out_patch0_patch_data_offset
 233  * - TCS outputs for patch 1
 234  * - Per-patch TCS outputs for patch 1
 235  * - TCS outputs for patch 2            = get_tcs_out_current_patch_offset (if RelPatchID==2)
 236  * - Per-patch TCS outputs for patch 2  = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
 237  * - ...
 238  *
 239  * All three shaders VS(LS), TCS, TES share the same LDS space.
 240  */
 241
 242 static LLVMValueRef
 243 get_tcs_in_patch_stride(struct si_shader_context *ctx)
 244 {
 245         return unpack_param(ctx, ctx->param_vs_state_bits, 8, 13);
 246 }
 247
 248 static LLVMValueRef
 249 get_tcs_out_patch_stride(struct si_shader_context *ctx)
 250 {
 251         return unpack_param(ctx, ctx->param_tcs_out_lds_layout, 0, 13);
 252 }
 253
 254 static LLVMValueRef
 255 get_tcs_out_patch0_offset(struct si_shader_context *ctx)
 256 {
 257         return lp_build_mul_imm(&ctx->bld_base.uint_bld,
 258                                 unpack_param(ctx,
 259                                              ctx->param_tcs_out_lds_offsets,
 260                                              0, 16),
 261                                 4);
 262 }
 263
 264 static LLVMValueRef
 265 get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
 266 {
 267         return lp_build_mul_imm(&ctx->bld_base.uint_bld,
 268                                 unpack_param(ctx,
 269                                              ctx->param_tcs_out_lds_offsets,
 270                                              16, 16),
 271                                 4);
 272 }
 273
 274 static LLVMValueRef
 275 get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
 276 {
 277         struct gallivm_state *gallivm = &ctx->gallivm;
 278         LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
 279         LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
 280
 281         return LLVMBuildMul(gallivm->builder, patch_stride, rel_patch_id, "");
 282 }
 283
 284 static LLVMValueRef
 285 get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
 286 {
 287         struct gallivm_state *gallivm = &ctx->gallivm;
 288         LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
 289         LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
 290         LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
 291
 292         return LLVMBuildAdd(gallivm->builder, patch0_offset,
 293                             LLVMBuildMul(gallivm->builder, patch_stride,
 294                                          rel_patch_id, ""),
 295                             "");
 296 }
 297
 298 static LLVMValueRef
 299 get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
 300 {
 301         struct gallivm_state *gallivm = &ctx->gallivm;
 302         LLVMValueRef patch0_patch_data_offset =
 303                 get_tcs_out_patch0_patch_data_offset(ctx);
 304         LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
 305         LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
 306
 307         return LLVMBuildAdd(gallivm->builder, patch0_patch_data_offset,
 308                             LLVMBuildMul(gallivm->builder, patch_stride,
 309                                          rel_patch_id, ""),
 310                             "");
 311 }
 312
 313 static LLVMValueRef get_instance_index_for_fetch(
 314         struct si_shader_context *ctx,
 315         unsigned param_start_instance, LLVMValueRef divisor)
 316 {
 317         struct gallivm_state *gallivm = &ctx->gallivm;
 318
 319         LLVMValueRef result = LLVMGetParam(ctx->main_fn,
 320                                            ctx->param_instance_id);
 321
 322         /* The division must be done before START_INSTANCE is added. */
 323         if (divisor != ctx->i32_1)
 324                 result = LLVMBuildUDiv(gallivm->builder, result, divisor, "");
 325
 326         return LLVMBuildAdd(gallivm->builder, result,
 327                             LLVMGetParam(ctx->main_fn, param_start_instance), "");
 328 }
 329
 330 /* Bitcast <4 x float> to <2 x double>, extract the component, and convert
 331  * to float. */
 332 static LLVMValueRef extract_double_to_float(struct si_shader_context *ctx,
 333                                             LLVMValueRef vec4,
 334                                             unsigned double_index)
 335 {
 336         LLVMBuilderRef builder = ctx->gallivm.builder;
 337         LLVMTypeRef f64 = LLVMDoubleTypeInContext(ctx->gallivm.context);
 338         LLVMValueRef dvec2 = LLVMBuildBitCast(builder, vec4,
 339                                               LLVMVectorType(f64, 2), "");
 340         LLVMValueRef index = LLVMConstInt(ctx->i32, double_index, 0);
 341         LLVMValueRef value = LLVMBuildExtractElement(builder, dvec2, index, "");
 342         return LLVMBuildFPTrunc(builder, value, ctx->f32, "");
 343 }
 344
/**
 * Emit the loads for one vertex shader input and write its four channels
 * to \p out.
 *
 * The buffer descriptor is loaded from the vertex-buffer list at
 * \p input_index, the data is fetched with one or more buffer format
 * loads, and the result is post-processed according to the shader key's
 * vs_fix_fetch[input_index] value.
 *
 * \param ctx          shader context
 * \param input_index  index of the input; selects the descriptor, the
 *                     vertex index parameter and the fix_fetch entry
 * \param decl         not referenced by this function body
 * \param out          receives the four channel values
 */
static void declare_input_vs(
        struct si_shader_context *ctx,
        unsigned input_index,
        const struct tgsi_full_declaration *decl,
        LLVMValueRef out[4])
{
        struct gallivm_state *gallivm = &ctx->gallivm;

        unsigned chan;
        unsigned fix_fetch;
        unsigned num_fetches;
        unsigned fetch_stride;

        LLVMValueRef t_list_ptr;
        LLVMValueRef t_offset;
        LLVMValueRef t_list;
        LLVMValueRef vertex_index;
        /* At most 3 fetches are issued (see the switch below). */
        LLVMValueRef input[3];

        /* Load the T list */
        t_list_ptr = LLVMGetParam(ctx->main_fn, ctx->param_vertex_buffers);

        t_offset = LLVMConstInt(ctx->i32, input_index, 0);

        t_list = ac_build_indexed_load_const(&ctx->ac, t_list_ptr, t_offset);

        /* Each input has its own vertex index parameter, laid out
         * consecutively starting at param_vertex_index0.
         */
        vertex_index = LLVMGetParam(ctx->main_fn,
                                    ctx->param_vertex_index0 +
                                    input_index);

        fix_fetch = ctx->shader->key.mono.vs_fix_fetch[input_index];

        /* Do multiple loads for special formats. */
        switch (fix_fetch) {
        case SI_FIX_FETCH_RGB_64_FLOAT:
                num_fetches = 3; /* 3 2-dword loads */
                fetch_stride = 8;
                break;
        case SI_FIX_FETCH_RGBA_64_FLOAT:
                num_fetches = 2; /* 2 4-dword loads */
                fetch_stride = 16;
                break;
        case SI_FIX_FETCH_RGB_8:
        case SI_FIX_FETCH_RGB_8_INT:
                num_fetches = 3;
                fetch_stride = 1;
                break;
        case SI_FIX_FETCH_RGB_16:
        case SI_FIX_FETCH_RGB_16_INT:
                num_fetches = 3;
                fetch_stride = 2;
                break;
        default:
                /* Everything else is a single fetch at offset 0. */
                num_fetches = 1;
                fetch_stride = 0;
        }

        /* Fetch i uses the offset fetch_stride * i. */
        for (unsigned i = 0; i < num_fetches; i++) {
                LLVMValueRef voffset = LLVMConstInt(ctx->i32, fetch_stride * i, 0);

                input[i] = ac_build_buffer_load_format(&ctx->ac, t_list,
                                                       vertex_index, voffset,
                                                       true);
        }

        /* Break up the vec4 into individual components */
        for (chan = 0; chan < 4; chan++) {
                LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);
                out[chan] = LLVMBuildExtractElement(gallivm->builder,
                                                    input[0], llvm_chan, "");
        }

        /* Format-specific fix-ups. Values of fix_fetch without a case
         * below keep the channels extracted from input[0] unchanged.
         */
        switch (fix_fetch) {
        case SI_FIX_FETCH_A2_SNORM:
        case SI_FIX_FETCH_A2_SSCALED:
        case SI_FIX_FETCH_A2_SINT: {
                /* The hardware returns an unsigned value; convert it to a
                 * signed one.
                 */
                LLVMValueRef tmp = out[3];
                LLVMValueRef c30 = LLVMConstInt(ctx->i32, 30, 0);

                /* First, recover the sign-extended signed integer value. */
                if (fix_fetch == SI_FIX_FETCH_A2_SSCALED)
                        tmp = LLVMBuildFPToUI(gallivm->builder, tmp, ctx->i32, "");
                else
                        tmp = LLVMBuildBitCast(gallivm->builder, tmp, ctx->i32, "");

                /* For the integer-like cases, do a natural sign extension.
                 *
                 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
                 * and happen to contain 0, 1, 2, 3 as the two LSBs of the
                 * exponent.
                 */
                tmp = LLVMBuildShl(gallivm->builder, tmp,
                                   fix_fetch == SI_FIX_FETCH_A2_SNORM ?
                                   LLVMConstInt(ctx->i32, 7, 0) : c30, "");
                tmp = LLVMBuildAShr(gallivm->builder, tmp, c30, "");

                /* Convert back to the right type. */
                if (fix_fetch == SI_FIX_FETCH_A2_SNORM) {
                        LLVMValueRef clamp;
                        LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
                        tmp = LLVMBuildSIToFP(gallivm->builder, tmp, ctx->f32, "");
                        /* Clamp values below -1.0 up to -1.0. */
                        clamp = LLVMBuildFCmp(gallivm->builder, LLVMRealULT, tmp, neg_one, "");
                        tmp = LLVMBuildSelect(gallivm->builder, clamp, neg_one, tmp, "");
                } else if (fix_fetch == SI_FIX_FETCH_A2_SSCALED) {
                        tmp = LLVMBuildSIToFP(gallivm->builder, tmp, ctx->f32, "");
                }

                out[3] = tmp;
                break;
        }
        case SI_FIX_FETCH_RGBA_32_UNORM:
        case SI_FIX_FETCH_RGBX_32_UNORM:
                /* Reinterpret as unsigned integers and scale by 1/UINT_MAX. */
                for (chan = 0; chan < 4; chan++) {
                        out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
                                                     ctx->i32, "");
                        out[chan] = LLVMBuildUIToFP(gallivm->builder,
                                                    out[chan], ctx->f32, "");
                        out[chan] = LLVMBuildFMul(gallivm->builder, out[chan],
                                                  LLVMConstReal(ctx->f32, 1.0 / UINT_MAX), "");
                }
                /* RGBX UINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
                if (fix_fetch == SI_FIX_FETCH_RGBX_32_UNORM)
                        out[3] = LLVMConstReal(ctx->f32, 1);
                break;
        case SI_FIX_FETCH_RGBA_32_SNORM:
        case SI_FIX_FETCH_RGBX_32_SNORM:
        case SI_FIX_FETCH_RGBA_32_FIXED:
        case SI_FIX_FETCH_RGBX_32_FIXED: {
                double scale;
                /* NOTE(review): this comparison relies on both *_FIXED
                 * values being ordered after the *_SNORM values in the
                 * SI_FIX_FETCH_* definitions - confirm.
                 */
                if (fix_fetch >= SI_FIX_FETCH_RGBA_32_FIXED)
                        scale = 1.0 / 0x10000;
                else
                        scale = 1.0 / INT_MAX;

                /* Reinterpret as signed integers and apply the scale. */
                for (chan = 0; chan < 4; chan++) {
                        out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
                                                     ctx->i32, "");
                        out[chan] = LLVMBuildSIToFP(gallivm->builder,
                                                    out[chan], ctx->f32, "");
                        out[chan] = LLVMBuildFMul(gallivm->builder, out[chan],
                                                  LLVMConstReal(ctx->f32, scale), "");
                }
                /* RGBX SINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
                if (fix_fetch == SI_FIX_FETCH_RGBX_32_SNORM ||
                    fix_fetch == SI_FIX_FETCH_RGBX_32_FIXED)
                        out[3] = LLVMConstReal(ctx->f32, 1);
                break;
        }
        case SI_FIX_FETCH_RGBA_32_USCALED:
                /* Unsigned integer -> float, no scaling. */
                for (chan = 0; chan < 4; chan++) {
                        out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
                                                     ctx->i32, "");
                        out[chan] = LLVMBuildUIToFP(gallivm->builder,
                                                    out[chan], ctx->f32, "");
                }
                break;
        case SI_FIX_FETCH_RGBA_32_SSCALED:
                /* Signed integer -> float, no scaling. */
                for (chan = 0; chan < 4; chan++) {
                        out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
                                                     ctx->i32, "");
                        out[chan] = LLVMBuildSIToFP(gallivm->builder,
                                                    out[chan], ctx->f32, "");
                }
                break;
        case SI_FIX_FETCH_RG_64_FLOAT:
                /* Both doubles come from the single fetch. */
                for (chan = 0; chan < 2; chan++)
                        out[chan] = extract_double_to_float(ctx, input[0], chan);

                out[2] = LLVMConstReal(ctx->f32, 0);
                out[3] = LLVMConstReal(ctx->f32, 1);
                break;
        case SI_FIX_FETCH_RGB_64_FLOAT:
                /* One double per fetch; take the first double of each. */
                for (chan = 0; chan < 3; chan++)
                        out[chan] = extract_double_to_float(ctx, input[chan], 0);

                out[3] = LLVMConstReal(ctx->f32, 1);
                break;
        case SI_FIX_FETCH_RGBA_64_FLOAT:
                /* Two doubles per fetch: x,y from input[0], z,w from input[1]. */
                for (chan = 0; chan < 4; chan++) {
                        out[chan] = extract_double_to_float(ctx, input[chan / 2],
                                                            chan % 2);
                }
                break;
        case SI_FIX_FETCH_RGB_8:
        case SI_FIX_FETCH_RGB_8_INT:
        case SI_FIX_FETCH_RGB_16:
        case SI_FIX_FETCH_RGB_16_INT:
                /* One fetch per channel; element 0 of each fetch is the
                 * channel value.
                 */
                for (chan = 0; chan < 3; chan++) {
                        out[chan] = LLVMBuildExtractElement(gallivm->builder,
                                                            input[chan],
                                                            ctx->i32_0, "");
                }
                /* Alpha is 1.0 for the non-INT formats and the integer 1
                 * (bitcast to float) for the INT formats.
                 */
                if (fix_fetch == SI_FIX_FETCH_RGB_8 ||
                    fix_fetch == SI_FIX_FETCH_RGB_16) {
                        out[3] = LLVMConstReal(ctx->f32, 1);
                } else {
                        out[3] = LLVMBuildBitCast(gallivm->builder, ctx->i32_1,
                                                  ctx->f32, "");
                }
                break;
        }
}
 550
 551 static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base,
 552                                      unsigned swizzle)
 553 {
 554         struct si_shader_context *ctx = si_shader_context(bld_base);
 555
 556         if (swizzle > 0)
 557                 return ctx->i32_0;
 558
 559         switch (ctx->type) {
 560         case PIPE_SHADER_VERTEX:
 561                 return LLVMGetParam(ctx->main_fn,
 562                                     ctx->param_vs_prim_id);
 563         case PIPE_SHADER_TESS_CTRL:
 564                 return LLVMGetParam(ctx->main_fn,
 565                                     ctx->param_tcs_patch_id);
 566         case PIPE_SHADER_TESS_EVAL:
 567                 return LLVMGetParam(ctx->main_fn,
 568                                     ctx->param_tes_patch_id);
 569         case PIPE_SHADER_GEOMETRY:
 570                 return LLVMGetParam(ctx->main_fn,
 571                                     ctx->param_gs_prim_id);
 572         default:
 573                 assert(0);
 574                 return ctx->i32_0;
 575         }
 576 }
 577
 578 /**
 579  * Return the value of tgsi_ind_register for indexing.
 580  * This is the indirect index with the constant offset added to it.
 581  */
 582 static LLVMValueRef get_indirect_index(struct si_shader_context *ctx,
 583                                        const struct tgsi_ind_register *ind,
 584                                        int rel_index)
 585 {
 586         struct gallivm_state *gallivm = &ctx->gallivm;
 587         LLVMValueRef result;
 588
 589         result = ctx->addrs[ind->Index][ind->Swizzle];
 590         result = LLVMBuildLoad(gallivm->builder, result, "");
 591         result = LLVMBuildAdd(gallivm->builder, result,
 592                               LLVMConstInt(ctx->i32, rel_index, 0), "");
 593         return result;
 594 }
 595
 596 /**
 597  * Like get_indirect_index, but restricts the return value to a (possibly
 598  * undefined) value inside [0..num).
 599  */
 600 LLVMValueRef si_get_bounded_indirect_index(struct si_shader_context *ctx,
 601                                            const struct tgsi_ind_register *ind,
 602                                            int rel_index, unsigned num)
 603 {
 604         LLVMValueRef result = get_indirect_index(ctx, ind, rel_index);
 605
 606         return si_llvm_bound_index(ctx, result, num);
 607 }
 608
 609
 610 /**
 611  * Calculate a dword address given an input or output register and a stride.
 612  */
 613 static LLVMValueRef get_dw_address(struct si_shader_context *ctx,
 614                                    const struct tgsi_full_dst_register *dst,
 615                                    const struct tgsi_full_src_register *src,
 616                                    LLVMValueRef vertex_dw_stride,
 617                                    LLVMValueRef base_addr)
 618 {
 619         struct gallivm_state *gallivm = &ctx->gallivm;
 620         struct tgsi_shader_info *info = &ctx->shader->selector->info;
 621         ubyte *name, *index, *array_first;
 622         int first, param;
 623         struct tgsi_full_dst_register reg;
 624
 625         /* Set the register description. The address computation is the same
 626          * for sources and destinations. */
 627         if (src) {
 628                 reg.Register.File = src->Register.File;
 629                 reg.Register.Index = src->Register.Index;
 630                 reg.Register.Indirect = src->Register.Indirect;
 631                 reg.Register.Dimension = src->Register.Dimension;
 632                 reg.Indirect = src->Indirect;
 633                 reg.Dimension = src->Dimension;
 634                 reg.DimIndirect = src->DimIndirect;
 635         } else
 636                 reg = *dst;
 637
 638         /* If the register is 2-dimensional (e.g. an array of vertices
 639          * in a primitive), calculate the base address of the vertex. */
 640         if (reg.Register.Dimension) {
 641                 LLVMValueRef index;
 642
 643                 if (reg.Dimension.Indirect)
 644                         index = get_indirect_index(ctx, &reg.DimIndirect,
 645                                                    reg.Dimension.Index);
 646                 else
 647                         index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0);
 648
 649                 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
 650                                          LLVMBuildMul(gallivm->builder, index,
 651                                                       vertex_dw_stride, ""), "");
 652         }
 653
 654         /* Get information about the register. */
 655         if (reg.Register.File == TGSI_FILE_INPUT) {
 656                 name = info->input_semantic_name;
 657                 index = info->input_semantic_index;
 658                 array_first = info->input_array_first;
 659         } else if (reg.Register.File == TGSI_FILE_OUTPUT) {
 660                 name = info->output_semantic_name;
 661                 index = info->output_semantic_index;
 662                 array_first = info->output_array_first;
 663         } else {
 664                 assert(0);
 665                 return NULL;
 666         }
 667
 668         if (reg.Register.Indirect) {
 669                 /* Add the relative address of the element. */
 670                 LLVMValueRef ind_index;
 671
 672                 if (reg.Indirect.ArrayID)
 673                         first = array_first[reg.Indirect.ArrayID];
 674                 else
 675                         first = reg.Register.Index;
 676
 677                 ind_index = get_indirect_index(ctx, &reg.Indirect,
 678                                            reg.Register.Index - first);
 679
 680                 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
 681                                     LLVMBuildMul(gallivm->builder, ind_index,
 682                                                  LLVMConstInt(ctx->i32, 4, 0), ""), "");
 683
 684                 param = reg.Register.Dimension ?
 685                         si_shader_io_get_unique_index(name[first], index[first]) :
 686                         si_shader_io_get_unique_index_patch(name[first], index[first]);
 687         } else {
 688                 param = reg.Register.Dimension ?
 689                         si_shader_io_get_unique_index(name[reg.Register.Index],
 690                                                       index[reg.Register.Index]) :
 691                         si_shader_io_get_unique_index_patch(name[reg.Register.Index],
 692                                                             index[reg.Register.Index]);
 693         }
 694
 695         /* Add the base address of the element. */
 696         return LLVMBuildAdd(gallivm->builder, base_addr,
 697                             LLVMConstInt(ctx->i32, param * 4, 0), "");
 698 }
 699
 700 /* The offchip buffer layout for TCS->TES is
 701  *
 702  * - attribute 0 of patch 0 vertex 0
 703  * - attribute 0 of patch 0 vertex 1
 704  * - attribute 0 of patch 0 vertex 2
 705  *   ...
 706  * - attribute 0 of patch 1 vertex 0
 707  * - attribute 0 of patch 1 vertex 1
 708  *   ...
 709  * - attribute 1 of patch 0 vertex 0
 710  * - attribute 1 of patch 0 vertex 1
 711  *   ...
 712  * - per patch attribute 0 of patch 0
 713  * - per patch attribute 0 of patch 1
 714  *   ...
 715  *
 716  * Note that every attribute has 4 components.
 717  */
 718 static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
 719                                                LLVMValueRef rel_patch_id,
 720                                                LLVMValueRef vertex_index,
 721                                                LLVMValueRef param_index)
 722 {
 723         struct gallivm_state *gallivm = &ctx->gallivm;
 724         LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
 725         LLVMValueRef param_stride, constant16;
 726
 727         vertices_per_patch = unpack_param(ctx, ctx->param_tcs_offchip_layout, 6, 6);
 728         num_patches = unpack_param(ctx, ctx->param_tcs_offchip_layout, 0, 6);
 729         total_vertices = LLVMBuildMul(gallivm->builder, vertices_per_patch,
 730                                       num_patches, "");
 731
 732         constant16 = LLVMConstInt(ctx->i32, 16, 0);
 733         if (vertex_index) {
 734                 base_addr = LLVMBuildMul(gallivm->builder, rel_patch_id,
 735                                          vertices_per_patch, "");
 736
 737                 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
 738                                          vertex_index, "");
 739
 740                 param_stride = total_vertices;
 741         } else {
 742                 base_addr = rel_patch_id;
 743                 param_stride = num_patches;
 744         }
 745
 746         base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
 747                                  LLVMBuildMul(gallivm->builder, param_index,
 748                                               param_stride, ""), "");
 749
 750         base_addr = LLVMBuildMul(gallivm->builder, base_addr, constant16, "");
 751
 752         if (!vertex_index) {
 753                 LLVMValueRef patch_data_offset =
 754                            unpack_param(ctx, ctx->param_tcs_offchip_layout, 12, 20);
 755
 756                 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
 757                                          patch_data_offset, "");
 758         }
 759         return base_addr;
 760 }
 761
 762 static LLVMValueRef get_tcs_tes_buffer_address_from_reg(
 763                                        struct si_shader_context *ctx,
 764                                        const struct tgsi_full_dst_register *dst,
 765                                        const struct tgsi_full_src_register *src)
 766 {
 767         struct gallivm_state *gallivm = &ctx->gallivm;
 768         struct tgsi_shader_info *info = &ctx->shader->selector->info;
 769         ubyte *name, *index, *array_first;
 770         struct tgsi_full_src_register reg;
 771         LLVMValueRef vertex_index = NULL;
 772         LLVMValueRef param_index = NULL;
 773         unsigned param_index_base, param_base;
 774
 775         reg = src ? *src : tgsi_full_src_register_from_dst(dst);
 776
 777         if (reg.Register.Dimension) {
 778
 779                 if (reg.Dimension.Indirect)
 780                         vertex_index = get_indirect_index(ctx, &reg.DimIndirect,
 781                                                           reg.Dimension.Index);
 782                 else
 783                         vertex_index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0);
 784         }
 785
 786         /* Get information about the register. */
 787         if (reg.Register.File == TGSI_FILE_INPUT) {
 788                 name = info->input_semantic_name;
 789                 index = info->input_semantic_index;
 790                 array_first = info->input_array_first;
 791         } else if (reg.Register.File == TGSI_FILE_OUTPUT) {
 792                 name = info->output_semantic_name;
 793                 index = info->output_semantic_index;
 794                 array_first = info->output_array_first;
 795         } else {
 796                 assert(0);
 797                 return NULL;
 798         }
 799
 800         if (reg.Register.Indirect) {
 801                 if (reg.Indirect.ArrayID)
 802                         param_base = array_first[reg.Indirect.ArrayID];
 803                 else
 804                         param_base = reg.Register.Index;
 805
 806                 param_index = get_indirect_index(ctx, &reg.Indirect,
 807                                                  reg.Register.Index - param_base);
 808
 809         } else {
 810                 param_base = reg.Register.Index;
 811                 param_index = ctx->i32_0;
 812         }
 813
 814         param_index_base = reg.Register.Dimension ?
 815                 si_shader_io_get_unique_index(name[param_base], index[param_base]) :
 816                 si_shader_io_get_unique_index_patch(name[param_base], index[param_base]);
 817
 818         param_index = LLVMBuildAdd(gallivm->builder, param_index,
 819                                    LLVMConstInt(ctx->i32, param_index_base, 0),
 820                                    "");
 821
 822         return get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx),
 823                                           vertex_index, param_index);
 824 }
 825
 826 static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base,
 827                                 enum tgsi_opcode_type type, unsigned swizzle,
 828                                 LLVMValueRef buffer, LLVMValueRef offset,
 829                                 LLVMValueRef base, bool can_speculate)
 830 {
 831         struct si_shader_context *ctx = si_shader_context(bld_base);
 832         struct gallivm_state *gallivm = &ctx->gallivm;
 833         LLVMValueRef value, value2;
 834         LLVMTypeRef llvm_type = tgsi2llvmtype(bld_base, type);
 835         LLVMTypeRef vec_type = LLVMVectorType(llvm_type, 4);
 836
 837         if (swizzle == ~0) {
 838                 value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
 839                                              0, 1, 0, can_speculate, false);
 840
 841                 return LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
 842         }
 843
 844         if (!tgsi_type_is_64bit(type)) {
 845                 value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
 846                                              0, 1, 0, can_speculate, false);
 847
 848                 value = LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
 849                 return LLVMBuildExtractElement(gallivm->builder, value,
 850                                     LLVMConstInt(ctx->i32, swizzle, 0), "");
 851         }
 852
 853         value = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
 854                                   swizzle * 4, 1, 0, can_speculate, false);
 855
 856         value2 = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
 857                                    swizzle * 4 + 4, 1, 0, can_speculate, false);
 858
 859         return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
 860 }
 861
 862 /**
 863  * Load from LDS.
 864  *
 865  * \param type          output value type
 866  * \param swizzle       offset (typically 0..3); it can be ~0, which loads a vec4
 867  * \param dw_addr       address in dwords
 868  */
 869 static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
 870                              enum tgsi_opcode_type type, unsigned swizzle,
 871                              LLVMValueRef dw_addr)
 872 {
 873         struct si_shader_context *ctx = si_shader_context(bld_base);
 874         struct gallivm_state *gallivm = &ctx->gallivm;
 875         LLVMValueRef value;
 876
 877         if (swizzle == ~0) {
 878                 LLVMValueRef values[TGSI_NUM_CHANNELS];
 879
 880                 for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++)
 881                         values[chan] = lds_load(bld_base, type, chan, dw_addr);
 882
 883                 return lp_build_gather_values(gallivm, values,
 884                                               TGSI_NUM_CHANNELS);
 885         }
 886
 887         dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
 888                             LLVMConstInt(ctx->i32, swizzle, 0));
 889
 890         value = ac_build_indexed_load(&ctx->ac, ctx->lds, dw_addr, false);
 891         if (tgsi_type_is_64bit(type)) {
 892                 LLVMValueRef value2;
 893                 dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
 894                                        ctx->i32_1);
 895                 value2 = ac_build_indexed_load(&ctx->ac, ctx->lds, dw_addr, false);
 896                 return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
 897         }
 898
 899         return LLVMBuildBitCast(gallivm->builder, value,
 900                                 tgsi2llvmtype(bld_base, type), "");
 901 }
 902
 903 /**
 904  * Store to LDS.
 905  *
 906  * \param swizzle       offset (typically 0..3)
 907  * \param dw_addr       address in dwords
 908  * \param value         value to store
 909  */
 910 static void lds_store(struct lp_build_tgsi_context *bld_base,
 911                       unsigned dw_offset_imm, LLVMValueRef dw_addr,
 912                       LLVMValueRef value)
 913 {
 914         struct si_shader_context *ctx = si_shader_context(bld_base);
 915         struct gallivm_state *gallivm = &ctx->gallivm;
 916
 917         dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
 918                             LLVMConstInt(ctx->i32, dw_offset_imm, 0));
 919
 920         value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
 921         ac_build_indexed_store(&ctx->ac, ctx->lds,
 922                                dw_addr, value);
 923 }
 924
 925 static LLVMValueRef desc_from_addr_base64k(struct si_shader_context *ctx,
 926                                                   unsigned param)
 927 {
 928         LLVMBuilderRef builder = ctx->gallivm.builder;
 929
 930         LLVMValueRef addr = LLVMGetParam(ctx->main_fn, param);
 931         addr = LLVMBuildZExt(builder, addr, ctx->i64, "");
 932         addr = LLVMBuildShl(builder, addr, LLVMConstInt(ctx->i64, 16, 0), "");
 933
 934         uint64_t desc2 = 0xffffffff;
 935         uint64_t desc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
 936                          S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
 937                          S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
 938                          S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
 939                          S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
 940                          S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
 941         LLVMValueRef hi = LLVMConstInt(ctx->i64, desc2 | (desc3 << 32), 0);
 942
 943         LLVMValueRef desc = LLVMGetUndef(LLVMVectorType(ctx->i64, 2));
 944         desc = LLVMBuildInsertElement(builder, desc, addr, ctx->i32_0, "");
 945         desc = LLVMBuildInsertElement(builder, desc, hi, ctx->i32_1, "");
 946         return LLVMBuildBitCast(builder, desc, ctx->v4i32, "");
 947 }
 948
 949 static LLVMValueRef fetch_input_tcs(
 950         struct lp_build_tgsi_context *bld_base,
 951         const struct tgsi_full_src_register *reg,
 952         enum tgsi_opcode_type type, unsigned swizzle)
 953 {
 954         struct si_shader_context *ctx = si_shader_context(bld_base);
 955         LLVMValueRef dw_addr, stride;
 956
 957         stride = unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
 958         dw_addr = get_tcs_in_current_patch_offset(ctx);
 959         dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
 960
 961         return lds_load(bld_base, type, swizzle, dw_addr);
 962 }
 963
 964 static LLVMValueRef fetch_output_tcs(
 965                 struct lp_build_tgsi_context *bld_base,
 966                 const struct tgsi_full_src_register *reg,
 967                 enum tgsi_opcode_type type, unsigned swizzle)
 968 {
 969         struct si_shader_context *ctx = si_shader_context(bld_base);
 970         LLVMValueRef dw_addr, stride;
 971
 972         if (reg->Register.Dimension) {
 973                 stride = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 13, 8);
 974                 dw_addr = get_tcs_out_current_patch_offset(ctx);
 975                 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
 976         } else {
 977                 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
 978                 dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr);
 979         }
 980
 981         return lds_load(bld_base, type, swizzle, dw_addr);
 982 }
 983
 984 static LLVMValueRef fetch_input_tes(
 985         struct lp_build_tgsi_context *bld_base,
 986         const struct tgsi_full_src_register *reg,
 987         enum tgsi_opcode_type type, unsigned swizzle)
 988 {
 989         struct si_shader_context *ctx = si_shader_context(bld_base);
 990         LLVMValueRef buffer, base, addr;
 991
 992         buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
 993
 994         base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
 995         addr = get_tcs_tes_buffer_address_from_reg(ctx, NULL, reg);
 996
 997         return buffer_load(bld_base, type, swizzle, buffer, base, addr, true);
 998 }
 999
1000 static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
1001                              const struct tgsi_full_instruction *inst,
1002                              const struct tgsi_opcode_info *info,
1003                              LLVMValueRef dst[4])
1004 {
1005         struct si_shader_context *ctx = si_shader_context(bld_base);
1006         struct gallivm_state *gallivm = &ctx->gallivm;
1007         const struct tgsi_full_dst_register *reg = &inst->Dst[0];
1008         const struct tgsi_shader_info *sh_info = &ctx->shader->selector->info;
1009         unsigned chan_index;
1010         LLVMValueRef dw_addr, stride;
1011         LLVMValueRef buffer, base, buf_addr;
1012         LLVMValueRef values[4];
1013         bool skip_lds_store;
1014         bool is_tess_factor = false;
1015
1016         /* Only handle per-patch and per-vertex outputs here.
1017          * Vectors will be lowered to scalars and this function will be called again.
1018          */
1019         if (reg->Register.File != TGSI_FILE_OUTPUT ||
1020             (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) {
1021                 si_llvm_emit_store(bld_base, inst, info, dst);
1022                 return;
1023         }
1024
1025         if (reg->Register.Dimension) {
1026                 stride = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 13, 8);
1027                 dw_addr = get_tcs_out_current_patch_offset(ctx);
1028                 dw_addr = get_dw_address(ctx, reg, NULL, stride, dw_addr);
1029                 skip_lds_store = !sh_info->reads_pervertex_outputs;
1030         } else {
1031                 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
1032                 dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr);
1033                 skip_lds_store = !sh_info->reads_perpatch_outputs;
1034
1035                 if (!reg->Register.Indirect) {
1036                         int name = sh_info->output_semantic_name[reg->Register.Index];
1037
1038                         /* Always write tess factors into LDS for the TCS epilog. */
1039                         if (name == TGSI_SEMANTIC_TESSINNER ||
1040                             name == TGSI_SEMANTIC_TESSOUTER) {
1041                                 skip_lds_store = false;
1042                                 is_tess_factor = true;
1043                         }
1044                 }
1045         }
1046
1047         buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
1048
1049         base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
1050         buf_addr = get_tcs_tes_buffer_address_from_reg(ctx, reg, NULL);
1051
1052
1053         TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan_index) {
1054                 LLVMValueRef value = dst[chan_index];
1055
1056                 if (inst->Instruction.Saturate)
1057                         value = ac_build_clamp(&ctx->ac, value);
1058
1059                 /* Skip LDS stores if there is no LDS read of this output. */
1060                 if (!skip_lds_store)
1061                         lds_store(bld_base, chan_index, dw_addr, value);
1062
1063                 value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
1064                 values[chan_index] = value;
1065
1066                 if (inst->Dst[0].Register.WriteMask != 0xF && !is_tess_factor) {
1067                         ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1,
1068                                                     buf_addr, base,
1069                                                     4 * chan_index, 1, 0, true, false);
1070                 }
1071         }
1072
1073         if (inst->Dst[0].Register.WriteMask == 0xF && !is_tess_factor) {
1074                 LLVMValueRef value = lp_build_gather_values(gallivm,
1075                                                             values, 4);
1076                 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buf_addr,
1077                                             base, 0, 1, 0, true, false);
1078         }
1079 }
1080
1081 static LLVMValueRef fetch_input_gs(
1082         struct lp_build_tgsi_context *bld_base,
1083         const struct tgsi_full_src_register *reg,
1084         enum tgsi_opcode_type type,
1085         unsigned swizzle)
1086 {
1087         struct si_shader_context *ctx = si_shader_context(bld_base);
1088         struct si_shader *shader = ctx->shader;
1089         struct lp_build_context *uint = &ctx->bld_base.uint_bld;
1090         struct gallivm_state *gallivm = &ctx->gallivm;
1091         LLVMValueRef vtx_offset, soffset;
1092         struct tgsi_shader_info *info = &shader->selector->info;
1093         unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
1094         unsigned semantic_index = info->input_semantic_index[reg->Register.Index];
1095         unsigned param;
1096         LLVMValueRef value;
1097
1098         if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
1099                 return get_primitive_id(bld_base, swizzle);
1100
1101         if (!reg->Register.Dimension)
1102                 return NULL;
1103
1104         param = si_shader_io_get_unique_index(semantic_name, semantic_index);
1105
1106         /* GFX9 has the ESGS ring in LDS. */
1107         if (ctx->screen->b.chip_class >= GFX9) {
1108                 unsigned index = reg->Dimension.Index;
1109
1110                 switch (index / 2) {
1111                 case 0:
1112                         vtx_offset = unpack_param(ctx, ctx->param_gs_vtx01_offset,
1113                                                   index % 2 ? 16 : 0, 16);
1114                         break;
1115                 case 1:
1116                         vtx_offset = unpack_param(ctx, ctx->param_gs_vtx23_offset,
1117                                                   index % 2 ? 16 : 0, 16);
1118                         break;
1119                 case 2:
1120                         vtx_offset = unpack_param(ctx, ctx->param_gs_vtx45_offset,
1121                                                   index % 2 ? 16 : 0, 16);
1122                         break;
1123                 default:
1124                         assert(0);
1125                         return NULL;
1126                 }
1127
1128                 vtx_offset = LLVMBuildAdd(gallivm->builder, vtx_offset,
1129                                           LLVMConstInt(ctx->i32, param * 4, 0), "");
1130                 return lds_load(bld_base, type, swizzle, vtx_offset);
1131         }
1132
1133         /* GFX6: input load from the ESGS ring in memory. */
1134         if (swizzle == ~0) {
1135                 LLVMValueRef values[TGSI_NUM_CHANNELS];
1136                 unsigned chan;
1137                 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1138                         values[chan] = fetch_input_gs(bld_base, reg, type, chan);
1139                 }
1140                 return lp_build_gather_values(gallivm, values,
1141                                               TGSI_NUM_CHANNELS);
1142         }
1143
1144         /* Get the vertex offset parameter on GFX6. */
1145         unsigned vtx_offset_param = reg->Dimension.Index;
1146         if (vtx_offset_param < 2) {
1147                 vtx_offset_param += ctx->param_gs_vtx0_offset;
1148         } else {
1149                 assert(vtx_offset_param < 6);
1150                 vtx_offset_param += ctx->param_gs_vtx2_offset - 2;
1151         }
1152         vtx_offset = lp_build_mul_imm(uint,
1153                                       LLVMGetParam(ctx->main_fn,
1154                                                    vtx_offset_param),
1155                                       4);
1156
1157         soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle) * 256, 0);
1158
1159         value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->i32_0,
1160                                      vtx_offset, soffset, 0, 1, 0, true, false);
1161         if (tgsi_type_is_64bit(type)) {
1162                 LLVMValueRef value2;
1163                 soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle + 1) * 256, 0);
1164
1165                 value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1,
1166                                               ctx->i32_0, vtx_offset, soffset,
1167                                               0, 1, 0, true, false);
1168                 return si_llvm_emit_fetch_64bit(bld_base, type,
1169                                                 value, value2);
1170         }
1171         return LLVMBuildBitCast(gallivm->builder,
1172                                 value,
1173                                 tgsi2llvmtype(bld_base, type), "");
1174 }
1175
1176 static int lookup_interp_param_index(unsigned interpolate, unsigned location)
1177 {
1178         switch (interpolate) {
1179         case TGSI_INTERPOLATE_CONSTANT:
1180                 return 0;
1181
1182         case TGSI_INTERPOLATE_LINEAR:
1183                 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1184                         return SI_PARAM_LINEAR_SAMPLE;
1185                 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1186                         return SI_PARAM_LINEAR_CENTROID;
1187                 else
1188                         return SI_PARAM_LINEAR_CENTER;
1189                 break;
1190         case TGSI_INTERPOLATE_COLOR:
1191         case TGSI_INTERPOLATE_PERSPECTIVE:
1192                 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1193                         return SI_PARAM_PERSP_SAMPLE;
1194                 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1195                         return SI_PARAM_PERSP_CENTROID;
1196                 else
1197                         return SI_PARAM_PERSP_CENTER;
1198                 break;
1199         default:
1200                 fprintf(stderr, "Warning: Unhandled interpolation mode.\n");
1201                 return -1;
1202         }
1203 }
1204
1205 /**
1206  * Interpolate a fragment shader input.
1207  *
1208  * @param ctx           context
1209  * @param input_index           index of the input in hardware
1210  * @param semantic_name         TGSI_SEMANTIC_*
1211  * @param semantic_index        semantic index
1212  * @param num_interp_inputs     number of all interpolated inputs (= BCOLOR offset)
1213  * @param colors_read_mask      color components read (4 bits for each color, 8 bits in total)
1214  * @param interp_param          interpolation weights (i,j)
1215  * @param prim_mask             SI_PARAM_PRIM_MASK
1216  * @param face                  SI_PARAM_FRONT_FACE
1217  * @param result                the return value (4 components)
1218  */
1219 static void interp_fs_input(struct si_shader_context *ctx,
1220                             unsigned input_index,
1221                             unsigned semantic_name,
1222                             unsigned semantic_index,
1223                             unsigned num_interp_inputs,
1224                             unsigned colors_read_mask,
1225                             LLVMValueRef interp_param,
1226                             LLVMValueRef prim_mask,
1227                             LLVMValueRef face,
1228                             LLVMValueRef result[4])
1229 {
1230         struct gallivm_state *gallivm = &ctx->gallivm;
1231         LLVMValueRef attr_number;
1232         LLVMValueRef i, j;
1233
1234         unsigned chan;
1235
1236         /* fs.constant returns the param from the middle vertex, so it's not
1237          * really useful for flat shading. It's meant to be used for custom
1238          * interpolation (but the intrinsic can't fetch from the other two
1239          * vertices).
1240          *
1241          * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
1242          * to do the right thing. The only reason we use fs.constant is that
1243          * fs.interp cannot be used on integers, because they can be equal
1244          * to NaN.
1245          *
1246          * When interp is false we will use fs.constant or for newer llvm,
1247          * amdgcn.interp.mov.
1248          */
1249         bool interp = interp_param != NULL;
1250
1251         attr_number = LLVMConstInt(ctx->i32, input_index, 0);
1252
1253         if (interp) {
1254                 interp_param = LLVMBuildBitCast(gallivm->builder, interp_param,
1255                                                 LLVMVectorType(ctx->f32, 2), "");
1256
1257                 i = LLVMBuildExtractElement(gallivm->builder, interp_param,
1258                                                 ctx->i32_0, "");
1259                 j = LLVMBuildExtractElement(gallivm->builder, interp_param,
1260                                                 ctx->i32_1, "");
1261         }
1262
1263         if (semantic_name == TGSI_SEMANTIC_COLOR &&
1264             ctx->shader->key.part.ps.prolog.color_two_side) {
1265                 LLVMValueRef is_face_positive;
1266                 LLVMValueRef back_attr_number;
1267
1268                 /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
1269                  * otherwise it's at offset "num_inputs".
1270                  */
1271                 unsigned back_attr_offset = num_interp_inputs;
1272                 if (semantic_index == 1 && colors_read_mask & 0xf)
1273                         back_attr_offset += 1;
1274
1275                 back_attr_number = LLVMConstInt(ctx->i32, back_attr_offset, 0);
1276
1277                 is_face_positive = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
1278                                                  face, ctx->i32_0, "");
1279
1280                 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1281                         LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);
1282                         LLVMValueRef front, back;
1283
1284                         if (interp) {
1285                                 front = ac_build_fs_interp(&ctx->ac, llvm_chan,
1286                                                         attr_number, prim_mask,
1287                                                         i, j);
1288                                 back = ac_build_fs_interp(&ctx->ac, llvm_chan,
1289                                                         back_attr_number, prim_mask,
1290                                                         i, j);
1291                         } else {
1292                                 front = ac_build_fs_interp_mov(&ctx->ac,
1293                                         LLVMConstInt(ctx->i32, 2, 0), /* P0 */
1294                                         llvm_chan, attr_number, prim_mask);
1295                                 back = ac_build_fs_interp_mov(&ctx->ac,
1296                                         LLVMConstInt(ctx->i32, 2, 0), /* P0 */
1297                                         llvm_chan, back_attr_number, prim_mask);
1298                         }
1299
1300                         result[chan] = LLVMBuildSelect(gallivm->builder,
1301                                                 is_face_positive,
1302                                                 front,
1303                                                 back,
1304                                                 "");
1305                 }
1306         } else if (semantic_name == TGSI_SEMANTIC_FOG) {
1307                 if (interp) {
1308                         result[0] = ac_build_fs_interp(&ctx->ac, ctx->i32_0,
1309                                                        attr_number, prim_mask, i, j);
1310                 } else {
1311                         result[0] = ac_build_fs_interp_mov(&ctx->ac, ctx->i32_0,
1312                                                            LLVMConstInt(ctx->i32, 2, 0), /* P0 */
1313                                                            attr_number, prim_mask);
1314                 }
1315                 result[1] =
1316                 result[2] = LLVMConstReal(ctx->f32, 0.0f);
1317                 result[3] = LLVMConstReal(ctx->f32, 1.0f);
1318         } else {
1319                 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1320                         LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);
1321
1322                         if (interp) {
1323                                 result[chan] = ac_build_fs_interp(&ctx->ac,
1324                                         llvm_chan, attr_number, prim_mask, i, j);
1325                         } else {
1326                                 result[chan] = ac_build_fs_interp_mov(&ctx->ac,
1327                                         LLVMConstInt(ctx->i32, 2, 0), /* P0 */
1328                                         llvm_chan, attr_number, prim_mask);
1329                         }
1330                 }
1331         }
1332 }
1333
1334 static void declare_input_fs(
1335         struct si_shader_context *ctx,
1336         unsigned input_index,
1337         const struct tgsi_full_declaration *decl,
1338         LLVMValueRef out[4])
1339 {
1340         struct lp_build_context *base = &ctx->bld_base.base;
1341         struct si_shader *shader = ctx->shader;
1342         LLVMValueRef main_fn = ctx->main_fn;
1343         LLVMValueRef interp_param = NULL;
1344         int interp_param_idx;
1345
1346         /* Get colors from input VGPRs (set by the prolog). */
1347         if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR) {
1348                 unsigned i = decl->Semantic.Index;
1349                 unsigned colors_read = shader->selector->info.colors_read;
1350                 unsigned mask = colors_read >> (i * 4);
1351                 unsigned offset = SI_PARAM_POS_FIXED_PT + 1 +
1352                                   (i ? util_bitcount(colors_read & 0xf) : 0);
1353
1354                 out[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef;
1355                 out[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef;
1356                 out[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef;
1357                 out[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef;
1358                 return;
1359         }
1360
1361         interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate,
1362                                                      decl->Interp.Location);
1363         if (interp_param_idx == -1)
1364                 return;
1365         else if (interp_param_idx) {
1366                 interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
1367         }
1368
1369         interp_fs_input(ctx, input_index, decl->Semantic.Name,
1370                         decl->Semantic.Index, shader->selector->info.num_inputs,
1371                         shader->selector->info.colors_read, interp_param,
1372                         LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK),
1373                         LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE),
1374                         &out[0]);
1375 }
1376
1377 static LLVMValueRef get_sample_id(struct si_shader_context *ctx)
1378 {
1379         return unpack_param(ctx, SI_PARAM_ANCILLARY, 8, 4);
1380 }
1381
1382
1383 /**
1384  * Load a dword from a constant buffer.
1385  */
1386 static LLVMValueRef buffer_load_const(struct si_shader_context *ctx,
1387                                       LLVMValueRef resource,
1388                                       LLVMValueRef offset)
1389 {
1390         return ac_build_buffer_load(&ctx->ac, resource, 1, NULL, offset, NULL,
1391                                     0, 0, 0, true, true);
1392 }
1393
1394 static LLVMValueRef load_sample_position(struct si_shader_context *ctx, LLVMValueRef sample_id)
1395 {
1396         struct lp_build_context *uint_bld = &ctx->bld_base.uint_bld;
1397         struct gallivm_state *gallivm = &ctx->gallivm;
1398         LLVMBuilderRef builder = gallivm->builder;
1399         LLVMValueRef desc = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
1400         LLVMValueRef buf_index = LLVMConstInt(ctx->i32, SI_PS_CONST_SAMPLE_POSITIONS, 0);
1401         LLVMValueRef resource = ac_build_indexed_load_const(&ctx->ac, desc, buf_index);
1402
1403         /* offset = sample_id * 8  (8 = 2 floats containing samplepos.xy) */
1404         LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, sample_id, 8);
1405         LLVMValueRef offset1 = LLVMBuildAdd(builder, offset0, LLVMConstInt(ctx->i32, 4, 0), "");
1406
1407         LLVMValueRef pos[4] = {
1408                 buffer_load_const(ctx, resource, offset0),
1409                 buffer_load_const(ctx, resource, offset1),
1410                 LLVMConstReal(ctx->f32, 0),
1411                 LLVMConstReal(ctx->f32, 0)
1412         };
1413
1414         return lp_build_gather_values(gallivm, pos, 4);
1415 }
1416
1417 static void declare_system_value(struct si_shader_context *ctx,
1418                                  unsigned index,
1419                                  const struct tgsi_full_declaration *decl)
1420 {
1421         struct lp_build_context *bld = &ctx->bld_base.base;
1422         struct gallivm_state *gallivm = &ctx->gallivm;
1423         LLVMValueRef value = 0;
1424
1425         assert(index < RADEON_LLVM_MAX_SYSTEM_VALUES);
1426
1427         switch (decl->Semantic.Name) {
1428         case TGSI_SEMANTIC_INSTANCEID:
1429                 value = LLVMGetParam(ctx->main_fn,
1430                                      ctx->param_instance_id);
1431                 break;
1432
1433         case TGSI_SEMANTIC_VERTEXID:
1434                 value = LLVMBuildAdd(gallivm->builder,
1435                                      LLVMGetParam(ctx->main_fn,
1436                                                   ctx->param_vertex_id),
1437                                      LLVMGetParam(ctx->main_fn,
1438                                                   ctx->param_base_vertex), "");
1439                 break;
1440
1441         case TGSI_SEMANTIC_VERTEXID_NOBASE:
1442                 /* Unused. Clarify the meaning in indexed vs. non-indexed
1443                  * draws if this is ever used again. */
1444                 assert(false);
1445                 break;
1446
1447         case TGSI_SEMANTIC_BASEVERTEX:
1448         {
1449                 /* For non-indexed draws, the base vertex set by the driver
1450                  * (for direct draws) or the CP (for indirect draws) is the
1451                  * first vertex ID, but GLSL expects 0 to be returned.
1452                  */
1453                 LLVMValueRef vs_state = LLVMGetParam(ctx->main_fn, ctx->param_vs_state_bits);
1454                 LLVMValueRef indexed;
1455
1456                 indexed = LLVMBuildLShr(gallivm->builder, vs_state, ctx->i32_1, "");
1457                 indexed = LLVMBuildTrunc(gallivm->builder, indexed, ctx->i1, "");
1458
1459                 value = LLVMBuildSelect(gallivm->builder, indexed,
1460                                         LLVMGetParam(ctx->main_fn, ctx->param_base_vertex),
1461                                         ctx->i32_0, "");
1462                 break;
1463         }
1464
1465         case TGSI_SEMANTIC_BASEINSTANCE:
1466                 value = LLVMGetParam(ctx->main_fn, ctx->param_start_instance);
1467                 break;
1468
1469         case TGSI_SEMANTIC_DRAWID:
1470                 value = LLVMGetParam(ctx->main_fn, ctx->param_draw_id);
1471                 break;
1472
1473         case TGSI_SEMANTIC_INVOCATIONID:
1474                 if (ctx->type == PIPE_SHADER_TESS_CTRL)
1475                         value = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
1476                 else if (ctx->type == PIPE_SHADER_GEOMETRY)
1477                         value = LLVMGetParam(ctx->main_fn,
1478                                              ctx->param_gs_instance_id);
1479                 else
1480                         assert(!"INVOCATIONID not implemented");
1481                 break;
1482
1483         case TGSI_SEMANTIC_POSITION:
1484         {
1485                 LLVMValueRef pos[4] = {
1486                         LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT),
1487                         LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT),
1488                         LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Z_FLOAT),
1489                         lp_build_emit_llvm_unary(&ctx->bld_base, TGSI_OPCODE_RCP,
1490                                                  LLVMGetParam(ctx->main_fn,
1491                                                               SI_PARAM_POS_W_FLOAT)),
1492                 };
1493                 value = lp_build_gather_values(gallivm, pos, 4);
1494                 break;
1495         }
1496
1497         case TGSI_SEMANTIC_FACE:
1498                 value = LLVMGetParam(ctx->main_fn, SI_PARAM_FRONT_FACE);
1499                 break;
1500
1501         case TGSI_SEMANTIC_SAMPLEID:
1502                 value = get_sample_id(ctx);
1503                 break;
1504
1505         case TGSI_SEMANTIC_SAMPLEPOS: {
1506                 LLVMValueRef pos[4] = {
1507                         LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT),
1508                         LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT),
1509                         LLVMConstReal(ctx->f32, 0),
1510                         LLVMConstReal(ctx->f32, 0)
1511                 };
1512                 pos[0] = lp_build_emit_llvm_unary(&ctx->bld_base,
1513                                                   TGSI_OPCODE_FRC, pos[0]);
1514                 pos[1] = lp_build_emit_llvm_unary(&ctx->bld_base,
1515                                                   TGSI_OPCODE_FRC, pos[1]);
1516                 value = lp_build_gather_values(gallivm, pos, 4);
1517                 break;
1518         }
1519
1520         case TGSI_SEMANTIC_SAMPLEMASK:
1521                 /* This can only occur with the OpenGL Core profile, which
1522                  * doesn't support smoothing.
1523                  */
1524                 value = LLVMGetParam(ctx->main_fn, SI_PARAM_SAMPLE_COVERAGE);
1525                 break;
1526
1527         case TGSI_SEMANTIC_TESSCOORD:
1528         {
1529                 LLVMValueRef coord[4] = {
1530                         LLVMGetParam(ctx->main_fn, ctx->param_tes_u),
1531                         LLVMGetParam(ctx->main_fn, ctx->param_tes_v),
1532                         bld->zero,
1533                         bld->zero
1534                 };
1535
1536                 /* For triangles, the vector should be (u, v, 1-u-v). */
1537                 if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
1538                     PIPE_PRIM_TRIANGLES)
1539                         coord[2] = lp_build_sub(bld, bld->one,
1540                                                 lp_build_add(bld, coord[0], coord[1]));
1541
1542                 value = lp_build_gather_values(gallivm, coord, 4);
1543                 break;
1544         }
1545
1546         case TGSI_SEMANTIC_VERTICESIN:
1547                 if (ctx->type == PIPE_SHADER_TESS_CTRL)
1548                         value = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 26, 6);
1549                 else if (ctx->type == PIPE_SHADER_TESS_EVAL)
1550                         value = unpack_param(ctx, ctx->param_tcs_offchip_layout, 6, 6);
1551                 else
1552                         assert(!"invalid shader stage for TGSI_SEMANTIC_VERTICESIN");
1553                 break;
1554
1555         case TGSI_SEMANTIC_TESSINNER:
1556         case TGSI_SEMANTIC_TESSOUTER:
1557         {
1558                 LLVMValueRef buffer, base, addr;
1559                 int param = si_shader_io_get_unique_index_patch(decl->Semantic.Name, 0);
1560
1561                 buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
1562
1563                 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
1564                 addr = get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), NULL,
1565                                           LLVMConstInt(ctx->i32, param, 0));
1566
1567                 value = buffer_load(&ctx->bld_base, TGSI_TYPE_FLOAT,
1568                                     ~0, buffer, base, addr, true);
1569
1570                 break;
1571         }
1572
1573         case TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI:
1574         case TGSI_SEMANTIC_DEFAULT_TESSINNER_SI:
1575         {
1576                 LLVMValueRef buf, slot, val[4];
1577                 int i, offset;
1578
1579                 slot = LLVMConstInt(ctx->i32, SI_HS_CONST_DEFAULT_TESS_LEVELS, 0);
1580                 buf = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
1581                 buf = ac_build_indexed_load_const(&ctx->ac, buf, slot);
1582                 offset = decl->Semantic.Name == TGSI_SEMANTIC_DEFAULT_TESSINNER_SI ? 4 : 0;
1583
1584                 for (i = 0; i < 4; i++)
1585                         val[i] = buffer_load_const(ctx, buf,
1586                                                    LLVMConstInt(ctx->i32, (offset + i) * 4, 0));
1587                 value = lp_build_gather_values(gallivm, val, 4);
1588                 break;
1589         }
1590
1591         case TGSI_SEMANTIC_PRIMID:
1592                 value = get_primitive_id(&ctx->bld_base, 0);
1593                 break;
1594
1595         case TGSI_SEMANTIC_GRID_SIZE:
1596                 value = LLVMGetParam(ctx->main_fn, ctx->param_grid_size);
1597                 break;
1598
1599         case TGSI_SEMANTIC_BLOCK_SIZE:
1600         {
1601                 LLVMValueRef values[3];
1602                 unsigned i;
1603                 unsigned *properties = ctx->shader->selector->info.properties;
1604
1605                 if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) {
1606                         unsigned sizes[3] = {
1607                                 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
1608                                 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
1609                                 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
1610                         };
1611
1612                         for (i = 0; i < 3; ++i)
1613                                 values[i] = LLVMConstInt(ctx->i32, sizes[i], 0);
1614
1615                         value = lp_build_gather_values(gallivm, values, 3);
1616                 } else {
1617                         value = LLVMGetParam(ctx->main_fn, ctx->param_block_size);
1618                 }
1619                 break;
1620         }
1621
1622         case TGSI_SEMANTIC_BLOCK_ID:
1623         {
1624                 LLVMValueRef values[3];
1625
1626                 for (int i = 0; i < 3; i++) {
1627                         values[i] = ctx->i32_0;
1628                         if (ctx->param_block_id[i] >= 0) {
1629                                 values[i] = LLVMGetParam(ctx->main_fn,
1630                                                          ctx->param_block_id[i]);
1631                         }
1632                 }
1633                 value = lp_build_gather_values(gallivm, values, 3);
1634                 break;
1635         }
1636
1637         case TGSI_SEMANTIC_THREAD_ID:
1638                 value = LLVMGetParam(ctx->main_fn, ctx->param_thread_id);
1639                 break;
1640
1641         case TGSI_SEMANTIC_HELPER_INVOCATION:
1642                 value = lp_build_intrinsic(gallivm->builder,
1643                                            "llvm.amdgcn.ps.live",
1644                                            ctx->i1, NULL, 0,
1645                                            LP_FUNC_ATTR_READNONE);
1646                 value = LLVMBuildNot(gallivm->builder, value, "");
1647                 value = LLVMBuildSExt(gallivm->builder, value, ctx->i32, "");
1648                 break;
1649
1650         case TGSI_SEMANTIC_SUBGROUP_SIZE:
1651                 value = LLVMConstInt(ctx->i32, 64, 0);
1652                 break;
1653
1654         case TGSI_SEMANTIC_SUBGROUP_INVOCATION:
1655                 value = ac_get_thread_id(&ctx->ac);
1656                 break;
1657
1658         case TGSI_SEMANTIC_SUBGROUP_EQ_MASK:
1659         {
1660                 LLVMValueRef id = ac_get_thread_id(&ctx->ac);
1661                 id = LLVMBuildZExt(gallivm->builder, id, ctx->i64, "");
1662                 value = LLVMBuildShl(gallivm->builder, LLVMConstInt(ctx->i64, 1, 0), id, "");
1663                 value = LLVMBuildBitCast(gallivm->builder, value, ctx->v2i32, "");
1664                 break;
1665         }
1666
1667         case TGSI_SEMANTIC_SUBGROUP_GE_MASK:
1668         case TGSI_SEMANTIC_SUBGROUP_GT_MASK:
1669         case TGSI_SEMANTIC_SUBGROUP_LE_MASK:
1670         case TGSI_SEMANTIC_SUBGROUP_LT_MASK:
1671         {
1672                 LLVMValueRef id = ac_get_thread_id(&ctx->ac);
1673                 if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_GT_MASK ||
1674                     decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK) {
1675                         /* All bits set except LSB */
1676                         value = LLVMConstInt(ctx->i64, -2, 0);
1677                 } else {
1678                         /* All bits set */
1679                         value = LLVMConstInt(ctx->i64, -1, 0);
1680                 }
1681                 id = LLVMBuildZExt(gallivm->builder, id, ctx->i64, "");
1682                 value = LLVMBuildShl(gallivm->builder, value, id, "");
1683                 if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK ||
1684                     decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LT_MASK)
1685                         value = LLVMBuildNot(gallivm->builder, value, "");
1686                 value = LLVMBuildBitCast(gallivm->builder, value, ctx->v2i32, "");
1687                 break;
1688         }
1689
1690         default:
1691                 assert(!"unknown system value");
1692                 return;
1693         }
1694
1695         ctx->system_values[index] = value;
1696 }
1697
1698 static void declare_compute_memory(struct si_shader_context *ctx,
1699                                    const struct tgsi_full_declaration *decl)
1700 {
1701         struct si_shader_selector *sel = ctx->shader->selector;
1702         struct gallivm_state *gallivm = &ctx->gallivm;
1703
1704         LLVMTypeRef i8p = LLVMPointerType(ctx->i8, LOCAL_ADDR_SPACE);
1705         LLVMValueRef var;
1706
1707         assert(decl->Declaration.MemType == TGSI_MEMORY_TYPE_SHARED);
1708         assert(decl->Range.First == decl->Range.Last);
1709         assert(!ctx->shared_memory);
1710
1711         var = LLVMAddGlobalInAddressSpace(gallivm->module,
1712                                           LLVMArrayType(ctx->i8, sel->local_size),
1713                                           "compute_lds",
1714                                           LOCAL_ADDR_SPACE);
1715         LLVMSetAlignment(var, 4);
1716
1717         ctx->shared_memory = LLVMBuildBitCast(gallivm->builder, var, i8p, "");
1718 }
1719
1720 static LLVMValueRef load_const_buffer_desc(struct si_shader_context *ctx, int i)
1721 {
1722         LLVMValueRef list_ptr = LLVMGetParam(ctx->main_fn,
1723                                              ctx->param_const_and_shader_buffers);
1724
1725         return ac_build_indexed_load_const(&ctx->ac, list_ptr,
1726                         LLVMConstInt(ctx->i32, si_get_constbuf_slot(i), 0));
1727 }
1728
1729 static LLVMValueRef fetch_constant(
1730         struct lp_build_tgsi_context *bld_base,
1731         const struct tgsi_full_src_register *reg,
1732         enum tgsi_opcode_type type,
1733         unsigned swizzle)
1734 {
1735         struct si_shader_context *ctx = si_shader_context(bld_base);
1736         struct lp_build_context *base = &bld_base->base;
1737         const struct tgsi_ind_register *ireg = &reg->Indirect;
1738         unsigned buf, idx;
1739
1740         LLVMValueRef addr, bufp;
1741         LLVMValueRef result;
1742
1743         if (swizzle == LP_CHAN_ALL) {
1744                 unsigned chan;
1745                 LLVMValueRef values[4];
1746                 for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
1747                         values[chan] = fetch_constant(bld_base, reg, type, chan);
1748
1749                 return lp_build_gather_values(&ctx->gallivm, values, 4);
1750         }
1751
1752         buf = reg->Register.Dimension ? reg->Dimension.Index : 0;
1753         idx = reg->Register.Index * 4 + swizzle;
1754
1755         if (reg->Register.Dimension && reg->Dimension.Indirect) {
1756                 LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers);
1757                 LLVMValueRef index;
1758                 index = si_get_bounded_indirect_index(ctx, &reg->DimIndirect,
1759                                                       reg->Dimension.Index,
1760                                                       ctx->num_const_buffers);
1761                 index = LLVMBuildAdd(ctx->gallivm.builder, index,
1762                                      LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS, 0), "");
1763                 bufp = ac_build_indexed_load_const(&ctx->ac, ptr, index);
1764         } else
1765                 bufp = load_const_buffer_desc(ctx, buf);
1766
1767         if (reg->Register.Indirect) {
1768                 addr = ctx->addrs[ireg->Index][ireg->Swizzle];
1769                 addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg");
1770                 addr = lp_build_mul_imm(&bld_base->uint_bld, addr, 16);
1771                 addr = lp_build_add(&bld_base->uint_bld, addr,
1772                                     LLVMConstInt(ctx->i32, idx * 4, 0));
1773         } else {
1774                 addr = LLVMConstInt(ctx->i32, idx * 4, 0);
1775         }
1776
1777         result = buffer_load_const(ctx, bufp, addr);
1778
1779         if (!tgsi_type_is_64bit(type))
1780                 result = bitcast(bld_base, type, result);
1781         else {
1782                 LLVMValueRef addr2, result2;
1783
1784                 addr2 = lp_build_add(&bld_base->uint_bld, addr,
1785                                      LLVMConstInt(ctx->i32, 4, 0));
1786                 result2 = buffer_load_const(ctx, bufp, addr2);
1787
1788                 result = si_llvm_emit_fetch_64bit(bld_base, type,
1789                                                   result, result2);
1790         }
1791         return result;
1792 }
1793
1794 /* Upper 16 bits must be zero. */
1795 static LLVMValueRef si_llvm_pack_two_int16(struct si_shader_context *ctx,
1796                                            LLVMValueRef val[2])
1797 {
1798         return LLVMBuildOr(ctx->gallivm.builder, val[0],
1799                            LLVMBuildShl(ctx->gallivm.builder, val[1],
1800                                         LLVMConstInt(ctx->i32, 16, 0),
1801                                         ""), "");
1802 }
1803
1804 /* Upper 16 bits are ignored and will be dropped. */
1805 static LLVMValueRef si_llvm_pack_two_int32_as_int16(struct si_shader_context *ctx,
1806                                                     LLVMValueRef val[2])
1807 {
1808         LLVMValueRef v[2] = {
1809                 LLVMBuildAnd(ctx->gallivm.builder, val[0],
1810                              LLVMConstInt(ctx->i32, 0xffff, 0), ""),
1811                 val[1],
1812         };
1813         return si_llvm_pack_two_int16(ctx, v);
1814 }
1815
1816 /* Initialize arguments for the shader export intrinsic */
1817 static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
1818                                      LLVMValueRef *values,
1819                                      unsigned target,
1820                                      struct ac_export_args *args)
1821 {
1822         struct si_shader_context *ctx = si_shader_context(bld_base);
1823         struct lp_build_context *base = &bld_base->base;
1824         LLVMBuilderRef builder = ctx->gallivm.builder;
1825         LLVMValueRef val[4];
1826         unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR;
1827         unsigned chan;
1828         bool is_int8, is_int10;
1829
1830         /* Default is 0xf. Adjusted below depending on the format. */
1831         args->enabled_channels = 0xf; /* writemask */
1832
1833         /* Specify whether the EXEC mask represents the valid mask */
1834         args->valid_mask = 0;
1835
1836         /* Specify whether this is the last export */
1837         args->done = 0;
1838
1839         /* Specify the target we are exporting */
1840         args->target = target;
1841
1842         if (ctx->type == PIPE_SHADER_FRAGMENT) {
1843                 const struct si_shader_key *key = &ctx->shader->key;
1844                 unsigned col_formats = key->part.ps.epilog.spi_shader_col_format;
1845                 int cbuf = target - V_008DFC_SQ_EXP_MRT;
1846
1847                 assert(cbuf >= 0 && cbuf < 8);
1848                 spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
1849                 is_int8 = (key->part.ps.epilog.color_is_int8 >> cbuf) & 0x1;
1850                 is_int10 = (key->part.ps.epilog.color_is_int10 >> cbuf) & 0x1;
1851         }
1852
1853         args->compr = false;
1854         args->out[0] = base->undef;
1855         args->out[1] = base->undef;
1856         args->out[2] = base->undef;
1857         args->out[3] = base->undef;
1858
1859         switch (spi_shader_col_format) {
1860         case V_028714_SPI_SHADER_ZERO:
1861                 args->enabled_channels = 0; /* writemask */
1862                 args->target = V_008DFC_SQ_EXP_NULL;
1863                 break;
1864
1865         case V_028714_SPI_SHADER_32_R:
1866                 args->enabled_channels = 1; /* writemask */
1867                 args->out[0] = values[0];
1868                 break;
1869
1870         case V_028714_SPI_SHADER_32_GR:
1871                 args->enabled_channels = 0x3; /* writemask */
1872                 args->out[0] = values[0];
1873                 args->out[1] = values[1];
1874                 break;
1875
1876         case V_028714_SPI_SHADER_32_AR:
1877                 args->enabled_channels = 0x9; /* writemask */
1878                 args->out[0] = values[0];
1879                 args->out[3] = values[3];
1880                 break;
1881
1882         case V_028714_SPI_SHADER_FP16_ABGR:
1883                 args->compr = 1; /* COMPR flag */
1884
1885                 for (chan = 0; chan < 2; chan++) {
1886                         LLVMValueRef pack_args[2] = {
1887                                 values[2 * chan],
1888                                 values[2 * chan + 1]
1889                         };
1890                         LLVMValueRef packed;
1891
1892                         packed = ac_build_cvt_pkrtz_f16(&ctx->ac, pack_args);
1893                         args->out[chan] =
1894                                 LLVMBuildBitCast(ctx->gallivm.builder,
1895                                                  packed, ctx->f32, "");
1896                 }
1897                 break;
1898
1899         case V_028714_SPI_SHADER_UNORM16_ABGR:
1900                 for (chan = 0; chan < 4; chan++) {
1901                         val[chan] = ac_build_clamp(&ctx->ac, values[chan]);
1902                         val[chan] = LLVMBuildFMul(builder, val[chan],
1903                                                   LLVMConstReal(ctx->f32, 65535), "");
1904                         val[chan] = LLVMBuildFAdd(builder, val[chan],
1905                                                   LLVMConstReal(ctx->f32, 0.5), "");
1906                         val[chan] = LLVMBuildFPToUI(builder, val[chan],
1907                                                     ctx->i32, "");
1908                 }
1909
1910                 args->compr = 1; /* COMPR flag */
1911                 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1912                                   si_llvm_pack_two_int16(ctx, val));
1913                 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1914                                   si_llvm_pack_two_int16(ctx, val+2));
1915                 break;
1916
1917         case V_028714_SPI_SHADER_SNORM16_ABGR:
1918                 for (chan = 0; chan < 4; chan++) {
1919                         /* Clamp between [-1, 1]. */
1920                         val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MIN,
1921                                                               values[chan],
1922                                                               LLVMConstReal(ctx->f32, 1));
1923                         val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MAX,
1924                                                               val[chan],
1925                                                               LLVMConstReal(ctx->f32, -1));
1926                         /* Convert to a signed integer in [-32767, 32767]. */
1927                         val[chan] = LLVMBuildFMul(builder, val[chan],
1928                                                   LLVMConstReal(ctx->f32, 32767), "");
1929                         /* If positive, add 0.5, else add -0.5. */
1930                         val[chan] = LLVMBuildFAdd(builder, val[chan],
1931                                         LLVMBuildSelect(builder,
1932                                                 LLVMBuildFCmp(builder, LLVMRealOGE,
1933                                                               val[chan], base->zero, ""),
1934                                                 LLVMConstReal(ctx->f32, 0.5),
1935                                                 LLVMConstReal(ctx->f32, -0.5), ""), "");
1936                         val[chan] = LLVMBuildFPToSI(builder, val[chan], ctx->i32, "");
1937                 }
1938
1939                 args->compr = 1; /* COMPR flag */
1940                 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1941                                   si_llvm_pack_two_int32_as_int16(ctx, val));
1942                 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1943                                   si_llvm_pack_two_int32_as_int16(ctx, val+2));
1944                 break;
1945
1946         case V_028714_SPI_SHADER_UINT16_ABGR: {
1947                 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
1948                         is_int8 ? 255 : is_int10 ? 1023 : 65535, 0);
1949                 LLVMValueRef max_alpha =
1950                         !is_int10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);
1951
1952                 /* Clamp. */
1953                 for (chan = 0; chan < 4; chan++) {
1954                         val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
1955                         val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_UMIN,
1956                                         val[chan],
1957                                         chan == 3 ? max_alpha : max_rgb);
1958                 }
1959
1960                 args->compr = 1; /* COMPR flag */
1961                 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1962                                   si_llvm_pack_two_int16(ctx, val));
1963                 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1964                                   si_llvm_pack_two_int16(ctx, val+2));
1965                 break;
1966         }
1967
1968         case V_028714_SPI_SHADER_SINT16_ABGR: {
1969                 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
1970                         is_int8 ? 127 : is_int10 ? 511 : 32767, 0);
1971                 LLVMValueRef min_rgb = LLVMConstInt(ctx->i32,
1972                         is_int8 ? -128 : is_int10 ? -512 : -32768, 0);
1973                 LLVMValueRef max_alpha =
1974                         !is_int10 ? max_rgb : ctx->i32_1;
1975                 LLVMValueRef min_alpha =
1976                         !is_int10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);
1977
1978                 /* Clamp. */
1979                 for (chan = 0; chan < 4; chan++) {
1980                         val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
1981                         val[chan] = lp_build_emit_llvm_binary(bld_base,
1982                                         TGSI_OPCODE_IMIN,
1983                                         val[chan], chan == 3 ? max_alpha : max_rgb);
1984                         val[chan] = lp_build_emit_llvm_binary(bld_base,
1985                                         TGSI_OPCODE_IMAX,
1986                                         val[chan], chan == 3 ? min_alpha : min_rgb);
1987                 }
1988
1989                 args->compr = 1; /* COMPR flag */
1990                 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1991                                   si_llvm_pack_two_int32_as_int16(ctx, val));
1992                 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1993                                   si_llvm_pack_two_int32_as_int16(ctx, val+2));
1994                 break;
1995         }
1996
1997         case V_028714_SPI_SHADER_32_ABGR:
1998                 memcpy(&args->out[0], values, sizeof(values[0]) * 4);
1999                 break;
2000         }
2001 }
2002
2003 static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
2004                           LLVMValueRef alpha)
2005 {
2006         struct si_shader_context *ctx = si_shader_context(bld_base);
2007
2008         if (ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_NEVER) {
2009                 LLVMValueRef alpha_ref = LLVMGetParam(ctx->main_fn,
2010                                 SI_PARAM_ALPHA_REF);
2011
2012                 LLVMValueRef alpha_pass =
2013                         lp_build_cmp(&bld_base->base,
2014                                      ctx->shader->key.part.ps.epilog.alpha_func,
2015                                      alpha, alpha_ref);
2016                 LLVMValueRef arg =
2017                         lp_build_select(&bld_base->base,
2018                                         alpha_pass,
2019                                         LLVMConstReal(ctx->f32, 1.0f),
2020                                         LLVMConstReal(ctx->f32, -1.0f));
2021
2022                 ac_build_kill(&ctx->ac, arg);
2023         } else {
2024                 ac_build_kill(&ctx->ac, NULL);
2025         }
2026 }
2027
2028 static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
2029                                                   LLVMValueRef alpha,
2030                                                   unsigned samplemask_param)
2031 {
2032         struct si_shader_context *ctx = si_shader_context(bld_base);
2033         struct gallivm_state *gallivm = &ctx->gallivm;
2034         LLVMValueRef coverage;
2035
2036         /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
2037         coverage = LLVMGetParam(ctx->main_fn,
2038                                 samplemask_param);
2039         coverage = bitcast(bld_base, TGSI_TYPE_SIGNED, coverage);
2040
2041         coverage = lp_build_intrinsic(gallivm->builder, "llvm.ctpop.i32",
2042                                    ctx->i32,
2043                                    &coverage, 1, LP_FUNC_ATTR_READNONE);
2044
2045         coverage = LLVMBuildUIToFP(gallivm->builder, coverage,
2046                                    ctx->f32, "");
2047
2048         coverage = LLVMBuildFMul(gallivm->builder, coverage,
2049                                  LLVMConstReal(ctx->f32,
2050                                         1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
2051
2052         return LLVMBuildFMul(gallivm->builder, alpha, coverage, "");
2053 }
2054
2055 static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context *bld_base,
2056                                     struct ac_export_args *pos, LLVMValueRef *out_elts)
2057 {
2058         struct si_shader_context *ctx = si_shader_context(bld_base);
2059         struct lp_build_context *base = &bld_base->base;
2060         unsigned reg_index;
2061         unsigned chan;
2062         unsigned const_chan;
2063         LLVMValueRef base_elt;
2064         LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
2065         LLVMValueRef constbuf_index = LLVMConstInt(ctx->i32,
2066                                                    SI_VS_CONST_CLIP_PLANES, 0);
2067         LLVMValueRef const_resource = ac_build_indexed_load_const(&ctx->ac, ptr, constbuf_index);
2068
2069         for (reg_index = 0; reg_index < 2; reg_index ++) {
2070                 struct ac_export_args *args = &pos[2 + reg_index];
2071
2072                 args->out[0] =
2073                 args->out[1] =
2074                 args->out[2] =
2075                 args->out[3] = LLVMConstReal(ctx->f32, 0.0f);
2076
2077                 /* Compute dot products of position and user clip plane vectors */
2078                 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2079                         for (const_chan = 0; const_chan < TGSI_NUM_CHANNELS; const_chan++) {
2080                                 LLVMValueRef addr =
2081                                         LLVMConstInt(ctx->i32, ((reg_index * 4 + chan) * 4 +
2082                                                                 const_chan) * 4, 0);
2083                                 base_elt = buffer_load_const(ctx, const_resource,
2084                                                              addr);
2085                                 args->out[chan] =
2086                                         lp_build_add(base, args->out[chan],
2087                                                      lp_build_mul(base, base_elt,
2088                                                                   out_elts[const_chan]));
2089                         }
2090                 }
2091
2092                 args->enabled_channels = 0xf;
2093                 args->valid_mask = 0;
2094                 args->done = 0;
2095                 args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index;
2096                 args->compr = 0;
2097         }
2098 }
2099
2100 static void si_dump_streamout(struct pipe_stream_output_info *so)
2101 {
2102         unsigned i;
2103
2104         if (so->num_outputs)
2105                 fprintf(stderr, "STREAMOUT\n");
2106
2107         for (i = 0; i < so->num_outputs; i++) {
2108                 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
2109                                 so->output[i].start_component;
2110                 fprintf(stderr, "  %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
2111                         i, so->output[i].output_buffer,
2112                         so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
2113                         so->output[i].register_index,
2114                         mask & 1 ? "x" : "",
2115                         mask & 2 ? "y" : "",
2116                         mask & 4 ? "z" : "",
2117                         mask & 8 ? "w" : "");
2118         }
2119 }
2120
2121 static void emit_streamout_output(struct si_shader_context *ctx,
2122                                   LLVMValueRef const *so_buffers,
2123                                   LLVMValueRef const *so_write_offsets,
2124                                   struct pipe_stream_output *stream_out,
2125                                   struct si_shader_output_values *shader_out)
2126 {
2127         struct gallivm_state *gallivm = &ctx->gallivm;
2128         LLVMBuilderRef builder = gallivm->builder;
2129         unsigned buf_idx = stream_out->output_buffer;
2130         unsigned start = stream_out->start_component;
2131         unsigned num_comps = stream_out->num_components;
2132         LLVMValueRef out[4];
2133
2134         assert(num_comps && num_comps <= 4);
2135         if (!num_comps || num_comps > 4)
2136                 return;
2137
2138         /* Load the output as int. */
2139         for (int j = 0; j < num_comps; j++) {
2140                 assert(stream_out->stream == shader_out->vertex_stream[start + j]);
2141
2142                 out[j] = LLVMBuildBitCast(builder,
2143                                           shader_out->values[start + j],
2144                                 ctx->i32, "");
2145         }
2146
2147         /* Pack the output. */
2148         LLVMValueRef vdata = NULL;
2149
2150         switch (num_comps) {
2151         case 1: /* as i32 */
2152                 vdata = out[0];
2153                 break;
2154         case 2: /* as v2i32 */
2155         case 3: /* as v4i32 (aligned to 4) */
2156         case 4: /* as v4i32 */
2157                 vdata = LLVMGetUndef(LLVMVectorType(ctx->i32, util_next_power_of_two(num_comps)));
2158                 for (int j = 0; j < num_comps; j++) {
2159                         vdata = LLVMBuildInsertElement(builder, vdata, out[j],
2160                                                        LLVMConstInt(ctx->i32, j, 0), "");
2161                 }
2162                 break;
2163         }
2164
2165         ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx],
2166                                     vdata, num_comps,
2167                                     so_write_offsets[buf_idx],
2168                                     ctx->i32_0,
2169                                     stream_out->dst_offset * 4, 1, 1, true, false);
2170 }
2171
2172 /**
2173  * Write streamout data to buffers for vertex stream @p stream (different
2174  * vertex streams can occur for GS copy shaders).
2175  */
2176 static void si_llvm_emit_streamout(struct si_shader_context *ctx,
2177                                    struct si_shader_output_values *outputs,
2178                                    unsigned noutput, unsigned stream)
2179 {
2180         struct si_shader_selector *sel = ctx->shader->selector;
2181         struct pipe_stream_output_info *so = &sel->so;
2182         struct gallivm_state *gallivm = &ctx->gallivm;
2183         LLVMBuilderRef builder = gallivm->builder;
2184         int i;
2185         struct lp_build_if_state if_ctx;
2186
2187         /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
2188         LLVMValueRef so_vtx_count =
2189                 unpack_param(ctx, ctx->param_streamout_config, 16, 7);
2190
2191         LLVMValueRef tid = ac_get_thread_id(&ctx->ac);
2192
2193         /* can_emit = tid < so_vtx_count; */
2194         LLVMValueRef can_emit =
2195                 LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");
2196
2197         /* Emit the streamout code conditionally. This actually avoids
2198          * out-of-bounds buffer access. The hw tells us via the SGPR
2199          * (so_vtx_count) which threads are allowed to emit streamout data. */
2200         lp_build_if(&if_ctx, gallivm, can_emit);
2201         {
2202                 /* The buffer offset is computed as follows:
2203                  *   ByteOffset = streamout_offset[buffer_id]*4 +
2204                  *                (streamout_write_index + thread_id)*stride[buffer_id] +
2205                  *                attrib_offset
2206                  */
2207
2208                 LLVMValueRef so_write_index =
2209                         LLVMGetParam(ctx->main_fn,
2210                                      ctx->param_streamout_write_index);
2211
2212                 /* Compute (streamout_write_index + thread_id). */
2213                 so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");
2214
2215                 /* Load the descriptor and compute the write offset for each
2216                  * enabled buffer. */
2217                 LLVMValueRef so_write_offset[4] = {};
2218                 LLVMValueRef so_buffers[4];
2219                 LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
2220                                                     ctx->param_rw_buffers);
2221
2222                 for (i = 0; i < 4; i++) {
2223                         if (!so->stride[i])
2224                                 continue;
2225
2226                         LLVMValueRef offset = LLVMConstInt(ctx->i32,
2227                                                            SI_VS_STREAMOUT_BUF0 + i, 0);
2228
2229                         so_buffers[i] = ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
2230
2231                         LLVMValueRef so_offset = LLVMGetParam(ctx->main_fn,
2232                                                               ctx->param_streamout_offset[i]);
2233                         so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), "");
2234
2235                         so_write_offset[i] = LLVMBuildMul(builder, so_write_index,
2236                                                           LLVMConstInt(ctx->i32, so->stride[i]*4, 0), "");
2237                         so_write_offset[i] = LLVMBuildAdd(builder, so_write_offset[i], so_offset, "");
2238                 }
2239
2240                 /* Write streamout data. */
2241                 for (i = 0; i < so->num_outputs; i++) {
2242                         unsigned reg = so->output[i].register_index;
2243
2244                         if (reg >= noutput)
2245                                 continue;
2246
2247                         if (stream != so->output[i].stream)
2248                                 continue;
2249
2250                         emit_streamout_output(ctx, so_buffers, so_write_offset,
2251                                               &so->output[i], &outputs[reg]);
2252                 }
2253         }
2254         lp_build_endif(&if_ctx);
2255 }
2256
2257
2258 /* Generate export instructions for hardware VS shader stage.
      *
      * Walks the noutput entries of outputs and sorts them into position
      * exports (collected in pos_args and emitted at the end) and parameter
      * exports (emitted immediately, with consecutive PARAM targets). Point
      * size, edge flag, layer and viewport index are gathered into the misc
      * vector that is exported through pos_args[1].
      *
      * Updates shader->info.nr_param_exports, shader->info.nr_pos_exports and
      * shader->info.vs_output_param_offset[].
      */
2259 static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base,
2260                               struct si_shader_output_values *outputs,
2261                               unsigned noutput)
2262 {
2263         struct si_shader_context *ctx = si_shader_context(bld_base);
2264         struct si_shader *shader = ctx->shader;
2265         struct lp_build_context *base = &bld_base->base;
2266         struct ac_export_args args, pos_args[4] = {};
2267         LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
2268         unsigned semantic_name, semantic_index;
2269         unsigned target;
2270         unsigned param_count = 0;
2271         unsigned pos_idx;
2272         int i;
2273
2274         for (i = 0; i < noutput; i++) {
2275                 semantic_name = outputs[i].semantic_name;
2276                 semantic_index = outputs[i].semantic_index;
2277                 bool export_param = true;
2278
                     /* Outputs whose unique index is set in the key's kill_outputs
                      * mask don't get a parameter export. */
2279                 switch (semantic_name) {
2280                 case TGSI_SEMANTIC_POSITION: /* ignore these */
2281                 case TGSI_SEMANTIC_PSIZE:
2282                 case TGSI_SEMANTIC_CLIPVERTEX:
2283                 case TGSI_SEMANTIC_EDGEFLAG:
2284                         break;
2285                 case TGSI_SEMANTIC_GENERIC:
2286                         /* don't process indices the function can't handle */
2287                         if (semantic_index >= SI_MAX_IO_GENERIC)
2288                                 break;
2289                         /* fall through */
2290                 default:
2291                         if (shader->key.opt.kill_outputs &
2292                             (1ull << si_shader_io_get_unique_index(semantic_name, semantic_index)))
2293                                 export_param = false;
2294                 }
2295
                     /* Outputs with no component in vertex stream 0 aren't exported
                      * as parameters either. */
2296                 if (outputs[i].vertex_stream[0] != 0 &&
2297                     outputs[i].vertex_stream[1] != 0 &&
2298                     outputs[i].vertex_stream[2] != 0 &&
2299                     outputs[i].vertex_stream[3] != 0)
2300                         export_param = false;
2301
                     /* Re-entered via goto with semantic_name changed to GENERIC when
                      * an output also has to go through the parameter export path. */
2302 handle_semantic:
2303                 /* Select the correct target */
2304                 switch(semantic_name) {
2305                 case TGSI_SEMANTIC_PSIZE:
2306                         psize_value = outputs[i].values[0];
2307                         continue;
2308                 case TGSI_SEMANTIC_EDGEFLAG:
2309                         edgeflag_value = outputs[i].values[0];
2310                         continue;
2311                 case TGSI_SEMANTIC_LAYER:
2312                         layer_value = outputs[i].values[0];
2313                         semantic_name = TGSI_SEMANTIC_GENERIC;
2314                         goto handle_semantic;
2315                 case TGSI_SEMANTIC_VIEWPORT_INDEX:
2316                         viewport_index_value = outputs[i].values[0];
2317                         semantic_name = TGSI_SEMANTIC_GENERIC;
2318                         goto handle_semantic;
2319                 case TGSI_SEMANTIC_POSITION:
2320                         target = V_008DFC_SQ_EXP_POS;
2321                         break;
2322                 case TGSI_SEMANTIC_CLIPDIST:
2323                         if (shader->key.opt.clip_disable) {
2324                                 semantic_name = TGSI_SEMANTIC_GENERIC;
2325                                 goto handle_semantic;
2326                         }
2327                         target = V_008DFC_SQ_EXP_POS + 2 + semantic_index;
2328                         break;
2329                 case TGSI_SEMANTIC_CLIPVERTEX:
2330                         if (shader->key.opt.clip_disable)
2331                                 continue;
2332                         si_llvm_emit_clipvertex(bld_base, pos_args, outputs[i].values);
2333                         continue;
2334                 case TGSI_SEMANTIC_COLOR:
2335                 case TGSI_SEMANTIC_BCOLOR:
2336                 case TGSI_SEMANTIC_PRIMID:
2337                 case TGSI_SEMANTIC_FOG:
2338                 case TGSI_SEMANTIC_TEXCOORD:
2339                 case TGSI_SEMANTIC_GENERIC:
2340                         if (!export_param)
2341                                 continue;
2342                         target = V_008DFC_SQ_EXP_PARAM + param_count;
2343                         assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
2344                         shader->info.vs_output_param_offset[i] = param_count;
2345                         param_count++;
2346                         break;
2347                 default:
2348                         target = 0;
2349                         fprintf(stderr,
2350                                 "Warning: SI unhandled vs output type:%d\n",
2351                                 semantic_name);
2352                 }
2353
2354                 si_llvm_init_export_args(bld_base, outputs[i].values, target, &args);
2355
                     /* Position exports are only recorded here and emitted after the
                      * loop; everything else is exported right away. */
2356                 if (target >= V_008DFC_SQ_EXP_POS &&
2357                     target <= (V_008DFC_SQ_EXP_POS + 3)) {
2358                         memcpy(&pos_args[target - V_008DFC_SQ_EXP_POS],
2359                                &args, sizeof(args));
2360                 } else {
2361                         ac_build_export(&ctx->ac, &args);
2362                 }
2363
                     /* A clip distance was recorded as a position export above;
                      * handle it once more as GENERIC so that it can also get a
                      * parameter export. */
2364                 if (semantic_name == TGSI_SEMANTIC_CLIPDIST) {
2365                         semantic_name = TGSI_SEMANTIC_GENERIC;
2366                         goto handle_semantic;
2367                 }
2368         }
2369
2370         shader->info.nr_param_exports = param_count;
2371
2372         /* We need to add the position output manually if it's missing. */
2373         if (!pos_args[0].out[0]) {
2374                 pos_args[0].enabled_channels = 0xf; /* writemask */
2375                 pos_args[0].valid_mask = 0; /* EXEC mask */
2376                 pos_args[0].done = 0; /* last export? */
2377                 pos_args[0].target = V_008DFC_SQ_EXP_POS;
2378                 pos_args[0].compr = 0; /* COMPR flag */
2379                 pos_args[0].out[0] = base->zero; /* X */
2380                 pos_args[0].out[1] = base->zero; /* Y */
2381                 pos_args[0].out[2] = base->zero; /* Z */
2382                 pos_args[0].out[3] = base->one;  /* W */
2383         }
2384
2385         /* Write the misc vector (point size, edgeflag, layer, viewport). */
2386         if (shader->selector->info.writes_psize ||
2387             shader->selector->info.writes_edgeflag ||
2388             shader->selector->info.writes_viewport_index ||
2389             shader->selector->info.writes_layer) {
                     /* X = point size, Y = edge flag, Z = layer; the viewport index
                      * channel is enabled further below depending on the chip. */
2390                 pos_args[1].enabled_channels = shader->selector->info.writes_psize |
2391                                                (shader->selector->info.writes_edgeflag << 1) |
2392                                                (shader->selector->info.writes_layer << 2);
2393
2394                 pos_args[1].valid_mask = 0; /* EXEC mask */
2395                 pos_args[1].done = 0; /* last export? */
2396                 pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
2397                 pos_args[1].compr = 0; /* COMPR flag */
2398                 pos_args[1].out[0] = base->zero; /* X */
2399                 pos_args[1].out[1] = base->zero; /* Y */
2400                 pos_args[1].out[2] = base->zero; /* Z */
2401                 pos_args[1].out[3] = base->zero; /* W */
2402
2403                 if (shader->selector->info.writes_psize)
2404                         pos_args[1].out[0] = psize_value;
2405
2406                 if (shader->selector->info.writes_edgeflag) {
2407                         /* The output is a float, but the hw expects an integer
2408                          * with the first bit containing the edge flag. */
2409                         edgeflag_value = LLVMBuildFPToUI(ctx->gallivm.builder,
2410                                                          edgeflag_value,
2411                                                          ctx->i32, "");
2412                         edgeflag_value = lp_build_min(&bld_base->int_bld,
2413                                                       edgeflag_value,
2414                                                       ctx->i32_1);
2415
2416                         /* The LLVM intrinsic expects a float. */
2417                         pos_args[1].out[1] = LLVMBuildBitCast(ctx->gallivm.builder,
2418                                                           edgeflag_value,
2419                                                           ctx->f32, "");
2420                 }
2421
2422                 if (ctx->screen->b.chip_class >= GFX9) {
2423                         /* GFX9 has the layer in out.z[10:0] and the viewport
2424                          * index in out.z[19:16].
2425                          */
2426                         if (shader->selector->info.writes_layer)
2427                                 pos_args[1].out[2] = layer_value;
2428
2429                         if (shader->selector->info.writes_viewport_index) {
2430                                 LLVMValueRef v = viewport_index_value;
2431
                                     /* z = (viewport_index << 16) | current z */
2432                                 v = bitcast(bld_base, TGSI_TYPE_UNSIGNED, v);
2433                                 v = LLVMBuildShl(ctx->gallivm.builder, v,
2434                                                  LLVMConstInt(ctx->i32, 16, 0), "");
2435                                 v = LLVMBuildOr(ctx->gallivm.builder, v,
2436                                                 bitcast(bld_base, TGSI_TYPE_UNSIGNED,
2437                                                         pos_args[1].out[2]), "");
2438                                 pos_args[1].out[2] = bitcast(bld_base, TGSI_TYPE_FLOAT, v);
2439                                 pos_args[1].enabled_channels |= 1 << 2;
2440                         }
2441                 } else {
                             /* Before GFX9: layer in Z, viewport index in W. */
2442                         if (shader->selector->info.writes_layer)
2443                                 pos_args[1].out[2] = layer_value;
2444
2445                         if (shader->selector->info.writes_viewport_index) {
2446                                 pos_args[1].out[3] = viewport_index_value;
2447                                 pos_args[1].enabled_channels |= 1 << 3;
2448                         }
2449                 }
2450         }
2451
             /* Count the position exports that are in use, i.e. have out[0] set.
              * The count is added to the current value of nr_pos_exports. */
2452         for (i = 0; i < 4; i++)
2453                 if (pos_args[i].out[0])
2454                         shader->info.nr_pos_exports++;
2455
             /* Emit the used position exports with consecutive targets. */
2456         pos_idx = 0;
2457         for (i = 0; i < 4; i++) {
2458                 if (!pos_args[i].out[0])
2459                         continue;
2460
2461                 /* Specify the target we are exporting */
2462                 pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;
2463
2464                 if (pos_idx == shader->info.nr_pos_exports)
2465                         /* Specify that this is the last export */
2466                         pos_args[i].done = 1;
2467
2468                 ac_build_export(&ctx->ac, &pos_args[i]);
2469         }
2470 }
2471
2472 /**
2473  * Forward all outputs from the vertex shader to the TES. This is only used
2474  * for the fixed function TCS.
2475  */
2476 static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
2477 {
2478         struct si_shader_context *ctx = si_shader_context(bld_base);
2479         struct gallivm_state *gallivm = &ctx->gallivm;
2480         LLVMValueRef invocation_id, buffer, buffer_offset;
2481         LLVMValueRef lds_vertex_stride, lds_vertex_offset, lds_base;
2482         uint64_t inputs;
2483
2484         invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
2485         buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
2486         buffer_offset = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
2487
2488         lds_vertex_stride = unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
2489         lds_vertex_offset = LLVMBuildMul(gallivm->builder, invocation_id,
2490                                          lds_vertex_stride, "");
2491         lds_base = get_tcs_in_current_patch_offset(ctx);
2492         lds_base = LLVMBuildAdd(gallivm->builder, lds_base, lds_vertex_offset, "");
2493
2494         inputs = ctx->shader->key.mono.u.ff_tcs_inputs_to_copy;
2495         while (inputs) {
2496                 unsigned i = u_bit_scan64(&inputs);
2497
2498                 LLVMValueRef lds_ptr = LLVMBuildAdd(gallivm->builder, lds_base,
2499                                             LLVMConstInt(ctx->i32, 4 * i, 0),
2500                                              "");
2501
2502                 LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx,
2503                                               get_rel_patch_id(ctx),
2504                                               invocation_id,
2505                                               LLVMConstInt(ctx->i32, i, 0));
2506
2507                 LLVMValueRef value = lds_load(bld_base, TGSI_TYPE_SIGNED, ~0,
2508                                               lds_ptr);
2509
2510                 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buffer_addr,
2511                                             buffer_offset, 0, 1, 0, true, false);
2512         }
2513 }
2514
/**
 * Write the tessellation factors of the current patch.
 *
 * After a barrier, and only for invocation_id == 0, the outer/inner tess
 * levels are loaded from LDS (relative to tcs_out_current_patch_data_offset)
 * and stored to the buffer described by param_tcs_factor_addr_base64k, at a
 * byte offset of rel_patch_id * 4 * stride. The number of dwords per patch
 * (stride) depends on the epilog key's prim_mode.
 *
 * On chip classes up to VI, the patch with rel_patch_id == 0 also stores the
 * dynamic HS control word, and all tess factor stores use a constant offset
 * of 4 after it.
 *
 * If the epilog key says that the TES reads the tess factors, they are
 * additionally stored to the off-chip buffer.
 */
2515 static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
2516                                   LLVMValueRef rel_patch_id,
2517                                   LLVMValueRef invocation_id,
2518                                   LLVMValueRef tcs_out_current_patch_data_offset)
2519 {
2520         struct si_shader_context *ctx = si_shader_context(bld_base);
2521         struct gallivm_state *gallivm = &ctx->gallivm;
2522         struct si_shader *shader = ctx->shader;
2523         unsigned tess_inner_index, tess_outer_index;
2524         LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
2525         LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4];
2526         unsigned stride, outer_comps, inner_comps, i, offset;
2527         struct lp_build_if_state if_ctx, inner_if_ctx;
2528
2529         si_llvm_emit_barrier(NULL, bld_base, NULL);
2530
2531         /* Do this only for invocation 0, because the tess levels are per-patch,
2532          * not per-vertex.
2533          *
2534          * This can't jump, because invocation 0 executes this. It should
2535          * at least mask out the loads and stores for other invocations.
2536          */
2537         lp_build_if(&if_ctx, gallivm,
2538                     LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
2539                                   invocation_id, ctx->i32_0, ""));
2540
2541         /* Determine the layout of one tess factor element in the buffer. */
2542         switch (shader->key.part.tcs.epilog.prim_mode) {
2543         case PIPE_PRIM_LINES:
2544                 stride = 2; /* 2 dwords, 1 vec2 store */
2545                 outer_comps = 2;
2546                 inner_comps = 0;
2547                 break;
2548         case PIPE_PRIM_TRIANGLES:
2549                 stride = 4; /* 4 dwords, 1 vec4 store */
2550                 outer_comps = 3;
2551                 inner_comps = 1;
2552                 break;
2553         case PIPE_PRIM_QUADS:
2554                 stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
2555                 outer_comps = 4;
2556                 inner_comps = 2;
2557                 break;
2558         default:
                     /* NOTE(review): this returns with the lp_build_if() above still
                      * open (no matching lp_build_endif) -- presumably unreachable
                      * given the assert; confirm. */
2559                 assert(0);
2560                 return;
2561         }
2562
2563         /* Load tess_inner and tess_outer from LDS.
2564          * Any invocation can write them, so we can't get them from a temporary.
2565          */
2566         tess_inner_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0);
2567         tess_outer_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0);
2568
2569         lds_base = tcs_out_current_patch_data_offset;
2570         lds_inner = LLVMBuildAdd(gallivm->builder, lds_base,
2571                                  LLVMConstInt(ctx->i32,
2572                                               tess_inner_index * 4, 0), "");
2573         lds_outer = LLVMBuildAdd(gallivm->builder, lds_base,
2574                                  LLVMConstInt(ctx->i32,
2575                                               tess_outer_index * 4, 0), "");
2576
             /* Components that aren't loaded below stay undefined. */
2577         for (i = 0; i < 4; i++) {
2578                 inner[i] = LLVMGetUndef(ctx->i32);
2579                 outer[i] = LLVMGetUndef(ctx->i32);
2580         }
2581
             /* out[] holds the values in the order they are stored to the tess
              * factor buffer; outer[]/inner[] keep them separately for the
              * optional off-chip stores further below. */
2582         if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) {
2583                 /* For isolines, the hardware expects tess factors in the
2584                  * reverse order from what GLSL / TGSI specify.
2585                  */
2586                 outer[0] = out[1] = lds_load(bld_base, TGSI_TYPE_SIGNED, 0, lds_outer);
2587                 outer[1] = out[0] = lds_load(bld_base, TGSI_TYPE_SIGNED, 1, lds_outer);
2588         } else {
2589                 for (i = 0; i < outer_comps; i++) {
2590                         outer[i] = out[i] =
2591                                 lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer);
2592                 }
2593                 for (i = 0; i < inner_comps; i++) {
2594                         inner[i] = out[outer_comps+i] =
2595                                 lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_inner);
2596                 }
2597         }
2598
2599         /* Convert the outputs to vectors for stores. */
2600         vec0 = lp_build_gather_values(gallivm, out, MIN2(stride, 4));
2601         vec1 = NULL;
2602
2603         if (stride > 4)
2604                 vec1 = lp_build_gather_values(gallivm, out+4, stride - 4);
2605
2606         /* Get the buffer. */
2607         buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_factor_addr_base64k);
2608
2609         /* Get the offset. */
2610         tf_base = LLVMGetParam(ctx->main_fn,
2611                                ctx->param_tcs_factor_offset);
2612         byteoffset = LLVMBuildMul(gallivm->builder, rel_patch_id,
2613                                   LLVMConstInt(ctx->i32, 4 * stride, 0), "");
2614
             /* Only the patch with rel_patch_id == 0 writes the control word. */
2615         lp_build_if(&inner_if_ctx, gallivm,
2616                     LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
2617                                   rel_patch_id, ctx->i32_0, ""));
2618
2619         /* Store the dynamic HS control word. */
2620         offset = 0;
2621         if (ctx->screen->b.chip_class <= VI) {
2622                 ac_build_buffer_store_dword(&ctx->ac, buffer,
2623                                             LLVMConstInt(ctx->i32, 0x80000000, 0),
2624                                             1, ctx->i32_0, tf_base,
2625                                             offset, 1, 0, true, false);
                     /* The tess factors of every patch are placed after this dword. */
2626                 offset += 4;
2627         }
2628
2629         lp_build_endif(&inner_if_ctx);
2630
2631         /* Store the tessellation factors. */
2632         ac_build_buffer_store_dword(&ctx->ac, buffer, vec0,
2633                                     MIN2(stride, 4), byteoffset, tf_base,
2634                                     offset, 1, 0, true, false);
             /* vec1 is only set when stride > 4, i.e. when vec0 holds 4 dwords
              * (16 bytes); it is stored right after them. */
2635         offset += 16;
2636         if (vec1)
2637                 ac_build_buffer_store_dword(&ctx->ac, buffer, vec1,
2638                                             stride - 4, byteoffset, tf_base,
2639                                             offset, 1, 0, true, false);
2640
2641         /* Store the tess factors into the offchip buffer if TES reads them. */
2642         if (shader->key.part.tcs.epilog.tes_reads_tess_factors) {
2643                 LLVMValueRef buf, base, inner_vec, outer_vec, tf_outer_offset;
2644                 LLVMValueRef tf_inner_offset;
2645                 unsigned param_outer, param_inner;
2646
2647                 buf = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
2648                 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
2649
2650                 param_outer = si_shader_io_get_unique_index_patch(
2651                                       TGSI_SEMANTIC_TESSOUTER, 0);
2652                 tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
2653                                         LLVMConstInt(ctx->i32, param_outer, 0));
2654
                     /* The vector length is rounded up to a power of two, but only
                      * outer_comps dwords are passed as the store size. */
2655                 outer_vec = lp_build_gather_values(gallivm, outer,
2656                                                    util_next_power_of_two(outer_comps));
2657
2658                 ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec,
2659                                             outer_comps, tf_outer_offset,
2660                                             base, 0, 1, 0, true, false);
2661                 if (inner_comps) {
2662                         param_inner = si_shader_io_get_unique_index_patch(
2663                                               TGSI_SEMANTIC_TESSINNER, 0);
2664                         tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
2665                                         LLVMConstInt(ctx->i32, param_inner, 0));
2666
2667                         inner_vec = inner_comps == 1 ? inner[0] :
2668                                     lp_build_gather_values(gallivm, inner, inner_comps);
2669                         ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec,
2670                                                     inner_comps, tf_inner_offset,
2671                                                     base, 0, 1, 0, true, false);
2672                 }
2673         }
2674
2675         lp_build_endif(&if_ctx);
2676 }
2677
2678 static LLVMValueRef
2679 si_insert_input_ret(struct si_shader_context *ctx, LLVMValueRef ret,
2680                     unsigned param, unsigned return_index)
2681 {
2682         return LLVMBuildInsertValue(ctx->gallivm.builder, ret,
2683                                     LLVMGetParam(ctx->main_fn, param),
2684                                     return_index, "");
2685 }
2686
2687 static LLVMValueRef
2688 si_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueRef ret,
2689                           unsigned param, unsigned return_index)
2690 {
2691         LLVMBuilderRef builder = ctx->gallivm.builder;
2692         LLVMValueRef p = LLVMGetParam(ctx->main_fn, param);
2693
2694         return LLVMBuildInsertValue(builder, ret,
2695                                     LLVMBuildBitCast(builder, p, ctx->f32, ""),
2696                                     return_index, "");
2697 }
2698
2699 static LLVMValueRef
2700 si_insert_input_ptr_as_2xi32(struct si_shader_context *ctx, LLVMValueRef ret,
2701                              unsigned param, unsigned return_index)
2702 {
2703         LLVMBuilderRef builder = ctx->gallivm.builder;
2704         LLVMValueRef ptr, lo, hi;
2705
2706         ptr = LLVMGetParam(ctx->main_fn, param);
2707         ptr = LLVMBuildPtrToInt(builder, ptr, ctx->i64, "");
2708         ptr = LLVMBuildBitCast(builder, ptr, ctx->v2i32, "");
2709         lo = LLVMBuildExtractElement(builder, ptr, ctx->i32_0, "");
2710         hi = LLVMBuildExtractElement(builder, ptr, ctx->i32_1, "");
2711         ret = LLVMBuildInsertValue(builder, ret, lo, return_index, "");
2712         return LLVMBuildInsertValue(builder, ret, hi, return_index + 1, "");
2713 }
2714
2715 /* This only writes the tessellation factor levels. */
2716 static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
2717 {
2718         struct si_shader_context *ctx = si_shader_context(bld_base);
2719         LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;
2720
2721         si_copy_tcs_inputs(bld_base);
2722
2723         rel_patch_id = get_rel_patch_id(ctx);
2724         invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
2725         tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);
2726
2727         /* Return epilog parameters from this function. */
2728         LLVMBuilderRef builder = ctx->gallivm.builder;
2729         LLVMValueRef ret = ctx->return_value;
2730         unsigned vgpr;
2731
2732         if (ctx->screen->b.chip_class >= GFX9) {
2733                 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
2734                                           8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
2735                 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k,
2736                                           8 + GFX9_SGPR_TCS_OFFCHIP_ADDR_BASE64K);
2737                 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k,
2738                                           8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K);
2739                 /* Tess offchip and tess factor offsets are at the beginning. */
2740                 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2);
2741                 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4);
2742                 vgpr = 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K + 1;
2743         } else {
2744                 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
2745                                           GFX6_SGPR_TCS_OFFCHIP_LAYOUT);
2746                 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k,
2747                                           GFX6_SGPR_TCS_OFFCHIP_ADDR_BASE64K);
2748                 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k,
2749                                           GFX6_SGPR_TCS_FACTOR_ADDR_BASE64K);
2750                 /* Tess offchip and tess factor offsets are after user SGPRs. */
2751                 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset,
2752                                           GFX6_TCS_NUM_USER_SGPR);
2753                 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset,
2754                                           GFX6_TCS_NUM_USER_SGPR + 1);
2755                 vgpr = GFX6_TCS_NUM_USER_SGPR + 2;
2756         }
2757
2758         /* VGPRs */
2759         rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id);
2760         invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id);
2761         tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset);
2762
2763         ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
2764         ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
2765         ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
2766         ctx->return_value = ret;
2767 }
2768
2769 /* Pass TCS inputs from LS to TCS on GFX9. */
2770 static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx)
2771 {
2772         LLVMValueRef ret = ctx->return_value;
2773
2774         ret = si_insert_input_ptr_as_2xi32(ctx, ret, ctx->param_rw_buffers, 0);
2775         ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2);
2776         ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
2777         ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4);
2778         ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);
2779
2780         ret = si_insert_input_ret(ctx, ret, ctx->param_vs_state_bits,
2781                                   8 + SI_SGPR_VS_STATE_BITS);
2782         ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
2783                                   8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
2784         ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_offsets,
2785                                   8 + GFX9_SGPR_TCS_OUT_OFFSETS);
2786         ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_layout,
2787                                   8 + GFX9_SGPR_TCS_OUT_LAYOUT);
2788         ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k,
2789                                   8 + GFX9_SGPR_TCS_OFFCHIP_ADDR_BASE64K);
2790         ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k,
2791                                   8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K);
2792
2793         unsigned desc_param = ctx->param_tcs_factor_addr_base64k + 2;
2794         ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param,
2795                                            8 + GFX9_SGPR_TCS_CONST_AND_SHADER_BUFFERS);
2796         ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 1,
2797                                            8 + GFX9_SGPR_TCS_SAMPLERS_AND_IMAGES);
2798
2799         unsigned vgpr = 8 + GFX9_TCS_NUM_USER_SGPR;
2800         ret = si_insert_input_ret_float(ctx, ret,
2801                                         ctx->param_tcs_patch_id, vgpr++);
2802         ret = si_insert_input_ret_float(ctx, ret,
2803                                         ctx->param_tcs_rel_ids, vgpr++);
2804         ctx->return_value = ret;
2805 }
2806
2807 /* Pass GS inputs from ES to GS on GFX9. */
2808 static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
2809 {
2810         LLVMValueRef ret = ctx->return_value;
2811
2812         ret = si_insert_input_ptr_as_2xi32(ctx, ret, ctx->param_rw_buffers, 0);
2813         ret = si_insert_input_ret(ctx, ret, ctx->param_gs2vs_offset, 2);
2814         ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
2815
2816         ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);
2817
2818         unsigned desc_param = ctx->param_vs_state_bits + 1;
2819         ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param,
2820                                            8 + GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS);
2821         ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 1,
2822                                            8 + GFX9_SGPR_GS_SAMPLERS_AND_IMAGES);
2823
2824         unsigned vgpr = 8 + GFX9_GS_NUM_USER_SGPR;
2825         for (unsigned i = 0; i < 5; i++) {
2826                 unsigned param = ctx->param_gs_vtx01_offset + i;
2827                 ret = si_insert_input_ret_float(ctx, ret, param, vgpr++);
2828         }
2829         ctx->return_value = ret;
2830 }
2831
2832 static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base)
2833 {
2834         struct si_shader_context *ctx = si_shader_context(bld_base);
2835         struct si_shader *shader = ctx->shader;
2836         struct tgsi_shader_info *info = &shader->selector->info;
2837         struct gallivm_state *gallivm = &ctx->gallivm;
2838         unsigned i, chan;
2839         LLVMValueRef vertex_id = LLVMGetParam(ctx->main_fn,
2840                                               ctx->param_rel_auto_id);
2841         LLVMValueRef vertex_dw_stride =
2842                 unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
2843         LLVMValueRef base_dw_addr = LLVMBuildMul(gallivm->builder, vertex_id,
2844                                                  vertex_dw_stride, "");
2845
2846         /* Write outputs to LDS. The next shader (TCS aka HS) will read
2847          * its inputs from it. */
2848         for (i = 0; i < info->num_outputs; i++) {
2849                 LLVMValueRef *out_ptr = ctx->outputs[i];
2850                 unsigned name = info->output_semantic_name[i];
2851                 unsigned index = info->output_semantic_index[i];
2852
2853                 /* The ARB_shader_viewport_layer_array spec contains the
2854                  * following issue:
2855                  *
2856                  *    2) What happens if gl_ViewportIndex or gl_Layer is
2857                  *    written in the vertex shader and a geometry shader is
2858                  *    present?
2859                  *
2860                  *    RESOLVED: The value written by the last vertex processing
2861                  *    stage is used. If the last vertex processing stage
2862                  *    (vertex, tessellation evaluation or geometry) does not
2863                  *    statically assign to gl_ViewportIndex or gl_Layer, index
2864                  *    or layer zero is assumed.
2865                  *
2866                  * So writes to those outputs in VS-as-LS are simply ignored.
2867                  */
2868                 if (name == TGSI_SEMANTIC_LAYER ||
2869                     name == TGSI_SEMANTIC_VIEWPORT_INDEX)
2870                         continue;
2871
2872                 int param = si_shader_io_get_unique_index(name, index);
2873                 LLVMValueRef dw_addr = LLVMBuildAdd(gallivm->builder, base_dw_addr,
2874                                         LLVMConstInt(ctx->i32, param * 4, 0), "");
2875
2876                 for (chan = 0; chan < 4; chan++) {
2877                         lds_store(bld_base, chan, dw_addr,
2878                                   LLVMBuildLoad(gallivm->builder, out_ptr[chan], ""));
2879                 }
2880         }
2881
2882         if (ctx->screen->b.chip_class >= GFX9)
2883                 si_set_ls_return_value_for_tcs(ctx);
2884 }
2885
2886 static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context *bld_base)
2887 {
2888         struct si_shader_context *ctx = si_shader_context(bld_base);
2889         struct gallivm_state *gallivm = &ctx->gallivm;
2890         struct si_shader *es = ctx->shader;
2891         struct tgsi_shader_info *info = &es->selector->info;
2892         LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
2893                                             ctx->param_es2gs_offset);
2894         LLVMValueRef lds_base = NULL;
2895         unsigned chan;
2896         int i;
2897
2898         if (ctx->screen->b.chip_class >= GFX9 && info->num_outputs) {
2899                 unsigned itemsize_dw = es->selector->esgs_itemsize / 4;
2900                 lds_base = LLVMBuildMul(gallivm->builder, ac_get_thread_id(&ctx->ac),
2901                                         LLVMConstInt(ctx->i32, itemsize_dw, 0), "");
2902         }
2903
2904         for (i = 0; i < info->num_outputs; i++) {
2905                 LLVMValueRef *out_ptr = ctx->outputs[i];
2906                 int param;
2907
2908                 if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
2909                     info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
2910                         continue;
2911
2912                 param = si_shader_io_get_unique_index(info->output_semantic_name[i],
2913                                                       info->output_semantic_index[i]);
2914
2915                 for (chan = 0; chan < 4; chan++) {
2916                         LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
2917                         out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");
2918
2919                         /* GFX9 has the ESGS ring in LDS. */
2920                         if (ctx->screen->b.chip_class >= GFX9) {
2921                                 lds_store(bld_base, param * 4 + chan, lds_base, out_val);
2922                                 continue;
2923                         }
2924
2925                         ac_build_buffer_store_dword(&ctx->ac,
2926                                                     ctx->esgs_ring,
2927                                                     out_val, 1, NULL, soffset,
2928                                                     (4 * param + chan) * 4,
2929                                                     1, 1, true, true);
2930                 }
2931         }
2932
2933         if (ctx->screen->b.chip_class >= GFX9)
2934                 si_set_es_return_value_for_gs(ctx);
2935 }
2936
2937 static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx)
2938 {
2939         if (ctx->screen->b.chip_class >= GFX9)
2940                 return unpack_param(ctx, ctx->param_merged_wave_info, 16, 8);
2941         else
2942                 return LLVMGetParam(ctx->main_fn, ctx->param_gs_wave_id);
2943 }
2944
2945 static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
2946 {
2947         struct si_shader_context *ctx = si_shader_context(bld_base);
2948
2949         ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE,
2950                          si_get_gs_wave_id(ctx));
2951 }
2952
2953 static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base)
2954 {
2955         struct si_shader_context *ctx = si_shader_context(bld_base);
2956         struct gallivm_state *gallivm = &ctx->gallivm;
2957         struct tgsi_shader_info *info = &ctx->shader->selector->info;
2958         struct si_shader_output_values *outputs = NULL;
2959         int i,j;
2960
2961         assert(!ctx->shader->is_gs_copy_shader);
2962
2963         outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));
2964
2965         /* Vertex color clamping.
2966          *
2967          * This uses a state constant loaded in a user data SGPR and
2968          * an IF statement is added that clamps all colors if the constant
2969          * is true.
2970          */
2971         if (ctx->type == PIPE_SHADER_VERTEX) {
2972                 struct lp_build_if_state if_ctx;
2973                 LLVMValueRef cond = NULL;
2974                 LLVMValueRef addr, val;
2975
2976                 for (i = 0; i < info->num_outputs; i++) {
2977                         if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR &&
2978                             info->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR)
2979                                 continue;
2980
2981                         /* We've found a color. */
2982                         if (!cond) {
2983                                 /* The state is in the first bit of the user SGPR. */
2984                                 cond = LLVMGetParam(ctx->main_fn,
2985                                                     ctx->param_vs_state_bits);
2986                                 cond = LLVMBuildTrunc(gallivm->builder, cond,
2987                                                       ctx->i1, "");
2988                                 lp_build_if(&if_ctx, gallivm, cond);
2989                         }
2990
2991                         for (j = 0; j < 4; j++) {
2992                                 addr = ctx->outputs[i][j];
2993                                 val = LLVMBuildLoad(gallivm->builder, addr, "");
2994                                 val = ac_build_clamp(&ctx->ac, val);
2995                                 LLVMBuildStore(gallivm->builder, val, addr);
2996                         }
2997                 }
2998
2999                 if (cond)
3000                         lp_build_endif(&if_ctx);
3001         }
3002
3003         for (i = 0; i < info->num_outputs; i++) {
3004                 outputs[i].semantic_name = info->output_semantic_name[i];
3005                 outputs[i].semantic_index = info->output_semantic_index[i];
3006
3007                 for (j = 0; j < 4; j++) {
3008                         outputs[i].values[j] =
3009                                 LLVMBuildLoad(gallivm->builder,
3010                                               ctx->outputs[i][j],
3011                                               "");
3012                         outputs[i].vertex_stream[j] =
3013                                 (info->output_streams[i] >> (2 * j)) & 3;
3014                 }
3015         }
3016
3017         if (ctx->shader->selector->so.num_outputs)
3018                 si_llvm_emit_streamout(ctx, outputs, i, 0);
3019
3020         /* Export PrimitiveID. */
3021         if (ctx->shader->key.mono.u.vs_export_prim_id) {
3022                 outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID;
3023                 outputs[i].semantic_index = 0;
3024                 outputs[i].values[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
3025                                                get_primitive_id(bld_base, 0));
3026                 for (j = 1; j < 4; j++)
3027                         outputs[i].values[j] = LLVMConstReal(ctx->f32, 0);
3028
3029                 memset(outputs[i].vertex_stream, 0,
3030                        sizeof(outputs[i].vertex_stream));
3031                 i++;
3032         }
3033
3034         si_llvm_export_vs(bld_base, outputs, i);
3035         FREE(outputs);
3036 }
3037
/* List of pixel shader exports that are recorded first and emitted later.
 * Entries are appended at args[num] and emitted in index order.
 */
struct si_ps_exports {
        unsigned num; /* number of valid entries in args[] */
        struct ac_export_args args[10]; /* recorded export arguments */
};
3042
3043 unsigned si_get_spi_shader_z_format(bool writes_z, bool writes_stencil,
3044                                     bool writes_samplemask)
3045 {
3046         if (writes_z) {
3047                 /* Z needs 32 bits. */
3048                 if (writes_samplemask)
3049                         return V_028710_SPI_SHADER_32_ABGR;
3050                 else if (writes_stencil)
3051                         return V_028710_SPI_SHADER_32_GR;
3052                 else
3053                         return V_028710_SPI_SHADER_32_R;
3054         } else if (writes_stencil || writes_samplemask) {
3055                 /* Both stencil and sample mask need only 16 bits. */
3056                 return V_028710_SPI_SHADER_UINT16_ABGR;
3057         } else {
3058                 return V_028710_SPI_SHADER_ZERO;
3059         }
3060 }
3061
3062 static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
3063                             LLVMValueRef depth, LLVMValueRef stencil,
3064                             LLVMValueRef samplemask, struct si_ps_exports *exp)
3065 {
3066         struct si_shader_context *ctx = si_shader_context(bld_base);
3067         struct lp_build_context *base = &bld_base->base;
3068         struct ac_export_args args;
3069         unsigned mask = 0;
3070         unsigned format = si_get_spi_shader_z_format(depth != NULL,
3071                                                      stencil != NULL,
3072                                                      samplemask != NULL);
3073
3074         assert(depth || stencil || samplemask);
3075
3076         args.valid_mask = 1; /* whether the EXEC mask is valid */
3077         args.done = 1; /* DONE bit */
3078
3079         /* Specify the target we are exporting */
3080         args.target = V_008DFC_SQ_EXP_MRTZ;
3081
3082         args.compr = 0; /* COMP flag */
3083         args.out[0] = base->undef; /* R, depth */
3084         args.out[1] = base->undef; /* G, stencil test value[0:7], stencil op value[8:15] */
3085         args.out[2] = base->undef; /* B, sample mask */
3086         args.out[3] = base->undef; /* A, alpha to mask */
3087
3088         if (format == V_028710_SPI_SHADER_UINT16_ABGR) {
3089                 assert(!depth);
3090                 args.compr = 1; /* COMPR flag */
3091
3092                 if (stencil) {
3093                         /* Stencil should be in X[23:16]. */
3094                         stencil = bitcast(bld_base, TGSI_TYPE_UNSIGNED, stencil);
3095                         stencil = LLVMBuildShl(ctx->gallivm.builder, stencil,
3096                                                LLVMConstInt(ctx->i32, 16, 0), "");
3097                         args.out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT, stencil);
3098                         mask |= 0x3;
3099                 }
3100                 if (samplemask) {
3101                         /* SampleMask should be in Y[15:0]. */
3102                         args.out[1] = samplemask;
3103                         mask |= 0xc;
3104                 }
3105         } else {
3106                 if (depth) {
3107                         args.out[0] = depth;
3108                         mask |= 0x1;
3109                 }
3110                 if (stencil) {
3111                         args.out[1] = stencil;
3112                         mask |= 0x2;
3113                 }
3114                 if (samplemask) {
3115                         args.out[2] = samplemask;
3116                         mask |= 0x4;
3117                 }
3118         }
3119
3120         /* SI (except OLAND and HAINAN) has a bug that it only looks
3121          * at the X writemask component. */
3122         if (ctx->screen->b.chip_class == SI &&
3123             ctx->screen->b.family != CHIP_OLAND &&
3124             ctx->screen->b.family != CHIP_HAINAN)
3125                 mask |= 0x1;
3126
3127         /* Specify which components to enable */
3128         args.enabled_channels = mask;
3129
3130         memcpy(&exp->args[exp->num++], &args, sizeof(args));
3131 }
3132
3133 static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
3134                                 LLVMValueRef *color, unsigned index,
3135                                 unsigned samplemask_param,
3136                                 bool is_last, struct si_ps_exports *exp)
3137 {
3138         struct si_shader_context *ctx = si_shader_context(bld_base);
3139         struct lp_build_context *base = &bld_base->base;
3140         int i;
3141
3142         /* Clamp color */
3143         if (ctx->shader->key.part.ps.epilog.clamp_color)
3144                 for (i = 0; i < 4; i++)
3145                         color[i] = ac_build_clamp(&ctx->ac, color[i]);
3146
3147         /* Alpha to one */
3148         if (ctx->shader->key.part.ps.epilog.alpha_to_one)
3149                 color[3] = base->one;
3150
3151         /* Alpha test */
3152         if (index == 0 &&
3153             ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
3154                 si_alpha_test(bld_base, color[3]);
3155
3156         /* Line & polygon smoothing */
3157         if (ctx->shader->key.part.ps.epilog.poly_line_smoothing)
3158                 color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3],
3159                                                          samplemask_param);
3160
3161         /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
3162         if (ctx->shader->key.part.ps.epilog.last_cbuf > 0) {
3163                 struct ac_export_args args[8];
3164                 int c, last = -1;
3165
3166                 /* Get the export arguments, also find out what the last one is. */
3167                 for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
3168                         si_llvm_init_export_args(bld_base, color,
3169                                                  V_008DFC_SQ_EXP_MRT + c, &args[c]);
3170                         if (args[c].enabled_channels)
3171                                 last = c;
3172                 }
3173
3174                 /* Emit all exports. */
3175                 for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
3176                         if (is_last && last == c) {
3177                                 args[c].valid_mask = 1; /* whether the EXEC mask is valid */
3178                                 args[c].done = 1; /* DONE bit */
3179                         } else if (!args[c].enabled_channels)
3180                                 continue; /* unnecessary NULL export */
3181
3182                         memcpy(&exp->args[exp->num++], &args[c], sizeof(args[c]));
3183                 }
3184         } else {
3185                 struct ac_export_args args;
3186
3187                 /* Export */
3188                 si_llvm_init_export_args(bld_base, color, V_008DFC_SQ_EXP_MRT + index,
3189                                          &args);
3190                 if (is_last) {
3191                         args.valid_mask = 1; /* whether the EXEC mask is valid */
3192                         args.done = 1; /* DONE bit */
3193                 } else if (!args.enabled_channels)
3194                         return; /* unnecessary NULL export */
3195
3196                 memcpy(&exp->args[exp->num++], &args, sizeof(args));
3197         }
3198 }
3199
3200 static void si_emit_ps_exports(struct si_shader_context *ctx,
3201                                struct si_ps_exports *exp)
3202 {
3203         for (unsigned i = 0; i < exp->num; i++)
3204                 ac_build_export(&ctx->ac, &exp->args[i]);
3205 }
3206
3207 static void si_export_null(struct lp_build_tgsi_context *bld_base)
3208 {
3209         struct si_shader_context *ctx = si_shader_context(bld_base);
3210         struct lp_build_context *base = &bld_base->base;
3211         struct ac_export_args args;
3212
3213         args.enabled_channels = 0x0; /* enabled channels */
3214         args.valid_mask = 1; /* whether the EXEC mask is valid */
3215         args.done = 1; /* DONE bit */
3216         args.target = V_008DFC_SQ_EXP_NULL;
3217         args.compr = 0; /* COMPR flag (0 = 32-bit export) */
3218         args.out[0] = base->undef; /* R */
3219         args.out[1] = base->undef; /* G */
3220         args.out[2] = base->undef; /* B */
3221         args.out[3] = base->undef; /* A */
3222
3223         ac_build_export(&ctx->ac, &args);
3224 }
3225
3226 /**
3227  * Return PS outputs in this order:
3228  *
3229  * v[0:3] = color0.xyzw
3230  * v[4:7] = color1.xyzw
3231  * ...
3232  * vN+0 = Depth
3233  * vN+1 = Stencil
3234  * vN+2 = SampleMask
3235  * vN+3 = SampleMaskIn (used for OpenGL smoothing)
3236  *
3237  * The alpha-ref SGPR is returned via its original location.
3238  */
3239 static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base)
3240 {
3241         struct si_shader_context *ctx = si_shader_context(bld_base);
3242         struct si_shader *shader = ctx->shader;
3243         struct tgsi_shader_info *info = &shader->selector->info;
3244         LLVMBuilderRef builder = ctx->gallivm.builder;
3245         unsigned i, j, first_vgpr, vgpr;
3246
3247         LLVMValueRef color[8][4] = {};
3248         LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
3249         LLVMValueRef ret;
3250
3251         if (ctx->postponed_kill)
3252                 ac_build_kill(&ctx->ac, LLVMBuildLoad(builder, ctx->postponed_kill, ""));
3253
3254         /* Read the output values. */
3255         for (i = 0; i < info->num_outputs; i++) {
3256                 unsigned semantic_name = info->output_semantic_name[i];
3257                 unsigned semantic_index = info->output_semantic_index[i];
3258
3259                 switch (semantic_name) {
3260                 case TGSI_SEMANTIC_COLOR:
3261                         assert(semantic_index < 8);
3262                         for (j = 0; j < 4; j++) {
3263                                 LLVMValueRef ptr = ctx->outputs[i][j];
3264                                 LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
3265                                 color[semantic_index][j] = result;
3266                         }
3267                         break;
3268                 case TGSI_SEMANTIC_POSITION:
3269                         depth = LLVMBuildLoad(builder,
3270                                               ctx->outputs[i][2], "");
3271                         break;
3272                 case TGSI_SEMANTIC_STENCIL:
3273                         stencil = LLVMBuildLoad(builder,
3274                                                 ctx->outputs[i][1], "");
3275                         break;
3276                 case TGSI_SEMANTIC_SAMPLEMASK:
3277                         samplemask = LLVMBuildLoad(builder,
3278                                                    ctx->outputs[i][0], "");
3279                         break;
3280                 default:
3281                         fprintf(stderr, "Warning: SI unhandled fs output type:%d\n",
3282                                 semantic_name);
3283                 }
3284         }
3285
3286         /* Fill the return structure. */
3287         ret = ctx->return_value;
3288
3289         /* Set SGPRs. */
3290         ret = LLVMBuildInsertValue(builder, ret,
3291                                    bitcast(bld_base, TGSI_TYPE_SIGNED,
3292                                            LLVMGetParam(ctx->main_fn,
3293                                                         SI_PARAM_ALPHA_REF)),
3294                                    SI_SGPR_ALPHA_REF, "");
3295
3296         /* Set VGPRs */
3297         first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
3298         for (i = 0; i < ARRAY_SIZE(color); i++) {
3299                 if (!color[i][0])
3300                         continue;
3301
3302                 for (j = 0; j < 4; j++)
3303                         ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
3304         }
3305         if (depth)
3306                 ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
3307         if (stencil)
3308                 ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
3309         if (samplemask)
3310                 ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");
3311
3312         /* Add the input sample mask for smoothing at the end. */
3313         if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
3314                 vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
3315         ret = LLVMBuildInsertValue(builder, ret,
3316                                    LLVMGetParam(ctx->main_fn,
3317                                                 SI_PARAM_SAMPLE_COVERAGE), vgpr++, "");
3318
3319         ctx->return_value = ret;
3320 }
3321
3322 /* Prevent optimizations (at least of memory accesses) across the current
3323  * point in the program by emitting empty inline assembly that is marked as
3324  * having side effects.
3325  *
3326  * Optionally, a value can be passed through the inline assembly to prevent
3327  * LLVM from hoisting calls to ReadNone functions.
3328  */
3329 static void emit_optimization_barrier(struct si_shader_context *ctx,
3330                                       LLVMValueRef *pvgpr)
3331 {
3332         static int counter = 0;
3333
3334         LLVMBuilderRef builder = ctx->gallivm.builder;
3335         char code[16];
3336
3337         snprintf(code, sizeof(code), "; %d", p_atomic_inc_return(&counter));
3338
3339         if (!pvgpr) {
3340                 LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
3341                 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);
3342                 LLVMBuildCall(builder, inlineasm, NULL, 0, "");
3343         } else {
3344                 LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
3345                 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "=v,0", true, false);
3346                 LLVMValueRef vgpr = *pvgpr;
3347                 LLVMTypeRef vgpr_type = LLVMTypeOf(vgpr);
3348                 unsigned vgpr_size = llvm_get_type_size(vgpr_type);
3349                 LLVMValueRef vgpr0;
3350
3351                 assert(vgpr_size % 4 == 0);
3352
3353                 vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), "");
3354                 vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, "");
3355                 vgpr0 = LLVMBuildCall(builder, inlineasm, &vgpr0, 1, "");
3356                 vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, "");
3357                 vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, "");
3358
3359                 *pvgpr = vgpr;
3360         }
3361 }
3362
3363 void si_emit_waitcnt(struct si_shader_context *ctx, unsigned simm16)
3364 {
3365         struct gallivm_state *gallivm = &ctx->gallivm;
3366         LLVMBuilderRef builder = gallivm->builder;
3367         LLVMValueRef args[1] = {
3368                 LLVMConstInt(ctx->i32, simm16, 0)
3369         };
3370         lp_build_intrinsic(builder, "llvm.amdgcn.s.waitcnt",
3371                            ctx->voidt, args, 1, 0);
3372 }
3373
3374 static void membar_emit(
3375                 const struct lp_build_tgsi_action *action,
3376                 struct lp_build_tgsi_context *bld_base,
3377                 struct lp_build_emit_data *emit_data)
3378 {
3379         struct si_shader_context *ctx = si_shader_context(bld_base);
3380         LLVMValueRef src0 = lp_build_emit_fetch(bld_base, emit_data->inst, 0, 0);
3381         unsigned flags = LLVMConstIntGetZExtValue(src0);
3382         unsigned waitcnt = NOOP_WAITCNT;
3383
3384         if (flags & TGSI_MEMBAR_THREAD_GROUP)
3385                 waitcnt &= VM_CNT & LGKM_CNT;
3386
3387         if (flags & (TGSI_MEMBAR_ATOMIC_BUFFER |
3388                      TGSI_MEMBAR_SHADER_BUFFER |
3389                      TGSI_MEMBAR_SHADER_IMAGE))
3390                 waitcnt &= VM_CNT;
3391
3392         if (flags & TGSI_MEMBAR_SHARED)
3393                 waitcnt &= LGKM_CNT;
3394
3395         if (waitcnt != NOOP_WAITCNT)
3396                 si_emit_waitcnt(ctx, waitcnt);
3397 }
3398
3399 static void clock_emit(
3400                 const struct lp_build_tgsi_action *action,
3401                 struct lp_build_tgsi_context *bld_base,
3402                 struct lp_build_emit_data *emit_data)
3403 {
3404         struct si_shader_context *ctx = si_shader_context(bld_base);
3405         struct gallivm_state *gallivm = &ctx->gallivm;
3406         LLVMValueRef tmp;
3407
3408         tmp = lp_build_intrinsic(gallivm->builder, "llvm.readcyclecounter",
3409                                  ctx->i64, NULL, 0, 0);
3410         tmp = LLVMBuildBitCast(gallivm->builder, tmp, ctx->v2i32, "");
3411
3412         emit_data->output[0] =
3413                 LLVMBuildExtractElement(gallivm->builder, tmp, ctx->i32_0, "");
3414         emit_data->output[1] =
3415                 LLVMBuildExtractElement(gallivm->builder, tmp, ctx->i32_1, "");
3416 }
3417
3418 LLVMTypeRef si_const_array(LLVMTypeRef elem_type, int num_elements)
3419 {
3420         return LLVMPointerType(LLVMArrayType(elem_type, num_elements),
3421                                CONST_ADDR_SPACE);
3422 }
3423
3424 static void si_llvm_emit_ddxy(
3425         const struct lp_build_tgsi_action *action,
3426         struct lp_build_tgsi_context *bld_base,
3427         struct lp_build_emit_data *emit_data)
3428 {
3429         struct si_shader_context *ctx = si_shader_context(bld_base);
3430         struct gallivm_state *gallivm = &ctx->gallivm;
3431         unsigned opcode = emit_data->info->opcode;
3432         LLVMValueRef val;
3433         int idx;
3434         unsigned mask;
3435
3436         if (opcode == TGSI_OPCODE_DDX_FINE)
3437                 mask = AC_TID_MASK_LEFT;
3438         else if (opcode == TGSI_OPCODE_DDY_FINE)
3439                 mask = AC_TID_MASK_TOP;
3440         else
3441                 mask = AC_TID_MASK_TOP_LEFT;
3442
3443         /* for DDX we want to next X pixel, DDY next Y pixel. */
3444         idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;
3445
3446         val = LLVMBuildBitCast(gallivm->builder, emit_data->args[0], ctx->i32, "");
3447         val = ac_build_ddxy(&ctx->ac, ctx->screen->has_ds_bpermute,
3448                             mask, idx, ctx->lds, val);
3449         emit_data->output[emit_data->chan] = val;
3450 }
3451
3452 /*
3453  * this takes an I,J coordinate pair,
3454  * and works out the X and Y derivatives.
3455  * it returns DDX(I), DDX(J), DDY(I), DDY(J).
3456  */
3457 static LLVMValueRef si_llvm_emit_ddxy_interp(
3458         struct lp_build_tgsi_context *bld_base,
3459         LLVMValueRef interp_ij)
3460 {
3461         struct si_shader_context *ctx = si_shader_context(bld_base);
3462         struct gallivm_state *gallivm = &ctx->gallivm;
3463         LLVMValueRef result[4], a;
3464         unsigned i;
3465
3466         for (i = 0; i < 2; i++) {
3467                 a = LLVMBuildExtractElement(gallivm->builder, interp_ij,
3468                                             LLVMConstInt(ctx->i32, i, 0), "");
3469                 result[i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDX, a);
3470                 result[2+i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDY, a);
3471         }
3472
3473         return lp_build_gather_values(gallivm, result, 4);
3474 }
3475
3476 static void interp_fetch_args(
3477         struct lp_build_tgsi_context *bld_base,
3478         struct lp_build_emit_data *emit_data)
3479 {
3480         struct si_shader_context *ctx = si_shader_context(bld_base);
3481         struct gallivm_state *gallivm = &ctx->gallivm;
3482         const struct tgsi_full_instruction *inst = emit_data->inst;
3483
3484         if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
3485                 /* offset is in second src, first two channels */
3486                 emit_data->args[0] = lp_build_emit_fetch(bld_base,
3487                                                          emit_data->inst, 1,
3488                                                          TGSI_CHAN_X);
3489                 emit_data->args[1] = lp_build_emit_fetch(bld_base,
3490                                                          emit_data->inst, 1,
3491                                                          TGSI_CHAN_Y);
3492                 emit_data->arg_count = 2;
3493         } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
3494                 LLVMValueRef sample_position;
3495                 LLVMValueRef sample_id;
3496                 LLVMValueRef halfval = LLVMConstReal(ctx->f32, 0.5f);
3497
3498                 /* fetch sample ID, then fetch its sample position,
3499                  * and place into first two channels.
3500                  */
3501                 sample_id = lp_build_emit_fetch(bld_base,
3502                                                 emit_data->inst, 1, TGSI_CHAN_X);
3503                 sample_id = LLVMBuildBitCast(gallivm->builder, sample_id,
3504                                              ctx->i32, "");
3505                 sample_position = load_sample_position(ctx, sample_id);
3506
3507                 emit_data->args[0] = LLVMBuildExtractElement(gallivm->builder,
3508                                                              sample_position,
3509                                                              ctx->i32_0, "");
3510
3511                 emit_data->args[0] = LLVMBuildFSub(gallivm->builder, emit_data->args[0], halfval, "");
3512                 emit_data->args[1] = LLVMBuildExtractElement(gallivm->builder,
3513                                                              sample_position,
3514                                                              ctx->i32_1, "");
3515                 emit_data->args[1] = LLVMBuildFSub(gallivm->builder, emit_data->args[1], halfval, "");
3516                 emit_data->arg_count = 2;
3517         }
3518 }
3519
3520 static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
3521                                 struct lp_build_tgsi_context *bld_base,
3522                                 struct lp_build_emit_data *emit_data)
3523 {
3524         struct si_shader_context *ctx = si_shader_context(bld_base);
3525         struct si_shader *shader = ctx->shader;
3526         struct gallivm_state *gallivm = &ctx->gallivm;
3527         const struct tgsi_shader_info *info = &shader->selector->info;
3528         LLVMValueRef interp_param;
3529         const struct tgsi_full_instruction *inst = emit_data->inst;
3530         const struct tgsi_full_src_register *input = &inst->Src[0];
3531         int input_base, input_array_size;
3532         int chan;
3533         int i;
3534         LLVMValueRef params = LLVMGetParam(ctx->main_fn, SI_PARAM_PRIM_MASK);
3535         LLVMValueRef array_idx;
3536         int interp_param_idx;
3537         unsigned interp;
3538         unsigned location;
3539
3540         assert(input->Register.File == TGSI_FILE_INPUT);
3541
3542         if (input->Register.Indirect) {
3543                 unsigned array_id = input->Indirect.ArrayID;
3544
3545                 if (array_id) {
3546                         input_base = info->input_array_first[array_id];
3547                         input_array_size = info->input_array_last[array_id] - input_base + 1;
3548                 } else {
3549                         input_base = inst->Src[0].Register.Index;
3550                         input_array_size = info->num_inputs - input_base;
3551                 }
3552
3553                 array_idx = get_indirect_index(ctx, &input->Indirect,
3554                                                input->Register.Index - input_base);
3555         } else {
3556                 input_base = inst->Src[0].Register.Index;
3557                 input_array_size = 1;
3558                 array_idx = ctx->i32_0;
3559         }
3560
3561         interp = shader->selector->info.input_interpolate[input_base];
3562
3563         if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
3564             inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE)
3565                 location = TGSI_INTERPOLATE_LOC_CENTER;
3566         else
3567                 location = TGSI_INTERPOLATE_LOC_CENTROID;
3568
3569         interp_param_idx = lookup_interp_param_index(interp, location);
3570         if (interp_param_idx == -1)
3571                 return;
3572         else if (interp_param_idx)
3573                 interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
3574         else
3575                 interp_param = NULL;
3576
3577         if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
3578             inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
3579                 LLVMValueRef ij_out[2];
3580                 LLVMValueRef ddxy_out = si_llvm_emit_ddxy_interp(bld_base, interp_param);
3581
3582                 /*
3583                  * take the I then J parameters, and the DDX/Y for it, and
3584                  * calculate the IJ inputs for the interpolator.
3585                  * temp1 = ddx * offset/sample.x + I;
3586                  * interp_param.I = ddy * offset/sample.y + temp1;
3587                  * temp1 = ddx * offset/sample.x + J;
3588                  * interp_param.J = ddy * offset/sample.y + temp1;
3589                  */
3590                 for (i = 0; i < 2; i++) {
3591                         LLVMValueRef ix_ll = LLVMConstInt(ctx->i32, i, 0);
3592                         LLVMValueRef iy_ll = LLVMConstInt(ctx->i32, i + 2, 0);
3593                         LLVMValueRef ddx_el = LLVMBuildExtractElement(gallivm->builder,
3594                                                                       ddxy_out, ix_ll, "");
3595                         LLVMValueRef ddy_el = LLVMBuildExtractElement(gallivm->builder,
3596                                                                       ddxy_out, iy_ll, "");
3597                         LLVMValueRef interp_el = LLVMBuildExtractElement(gallivm->builder,
3598                                                                          interp_param, ix_ll, "");
3599                         LLVMValueRef temp1, temp2;
3600
3601                         interp_el = LLVMBuildBitCast(gallivm->builder, interp_el,
3602                                                      ctx->f32, "");
3603
3604                         temp1 = LLVMBuildFMul(gallivm->builder, ddx_el, emit_data->args[0], "");
3605
3606                         temp1 = LLVMBuildFAdd(gallivm->builder, temp1, interp_el, "");
3607
3608                         temp2 = LLVMBuildFMul(gallivm->builder, ddy_el, emit_data->args[1], "");
3609
3610                         ij_out[i] = LLVMBuildFAdd(gallivm->builder, temp2, temp1, "");
3611                 }
3612                 interp_param = lp_build_gather_values(gallivm, ij_out, 2);
3613         }
3614
3615         if (interp_param) {
3616                 interp_param = LLVMBuildBitCast(gallivm->builder,
3617                         interp_param, LLVMVectorType(ctx->f32, 2), "");
3618         }
3619
3620         for (chan = 0; chan < 4; chan++) {
3621                 LLVMValueRef llvm_chan;
3622                 LLVMValueRef gather = LLVMGetUndef(LLVMVectorType(ctx->f32, input_array_size));
3623                 unsigned schan;
3624
3625                 schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan);
3626                 llvm_chan = LLVMConstInt(ctx->i32, schan, 0);
3627
3628                 for (unsigned i = 0; i < input_array_size; ++i) {
3629                         LLVMValueRef attr_number = LLVMConstInt(ctx->i32, input_base + i, false);
3630                         LLVMValueRef v;
3631
3632                         if (interp_param) {
3633                                 interp_param = LLVMBuildBitCast(gallivm->builder,
3634                                         interp_param, LLVMVectorType(ctx->f32, 2), "");
3635                                 LLVMValueRef i = LLVMBuildExtractElement(
3636                                         gallivm->builder, interp_param, ctx->i32_0, "");
3637                                 LLVMValueRef j = LLVMBuildExtractElement(
3638                                         gallivm->builder, interp_param, ctx->i32_1, "");
3639                                 v = ac_build_fs_interp(&ctx->ac,
3640                                         llvm_chan, attr_number, params,
3641                                         i, j);
3642                         } else {
3643                                 v = ac_build_fs_interp_mov(&ctx->ac,
3644                                         LLVMConstInt(ctx->i32, 2, 0), /* P0 */
3645                                         llvm_chan, attr_number, params);
3646                         }
3647
3648                         gather = LLVMBuildInsertElement(gallivm->builder,
3649                                 gather, v, LLVMConstInt(ctx->i32, i, false), "");
3650                 }
3651
3652                 emit_data->output[chan] = LLVMBuildExtractElement(
3653                         gallivm->builder, gather, array_idx, "");
3654         }
3655 }
3656
3657 static LLVMValueRef si_emit_ballot(struct si_shader_context *ctx,
3658                                    LLVMValueRef value)
3659 {
3660         struct gallivm_state *gallivm = &ctx->gallivm;
3661         LLVMValueRef args[3] = {
3662                 value,
3663                 ctx->i32_0,
3664                 LLVMConstInt(ctx->i32, LLVMIntNE, 0)
3665         };
3666
3667         /* We currently have no other way to prevent LLVM from lifting the icmp
3668          * calls to a dominating basic block.
3669          */
3670         emit_optimization_barrier(ctx, &args[0]);
3671
3672         if (LLVMTypeOf(args[0]) != ctx->i32)
3673                 args[0] = LLVMBuildBitCast(gallivm->builder, args[0], ctx->i32, "");
3674
3675         return lp_build_intrinsic(gallivm->builder,
3676                                   "llvm.amdgcn.icmp.i32",
3677                                   ctx->i64, args, 3,
3678                                   LP_FUNC_ATTR_NOUNWIND |
3679                                   LP_FUNC_ATTR_READNONE |
3680                                   LP_FUNC_ATTR_CONVERGENT);
3681 }
3682
3683 static void vote_all_emit(
3684         const struct lp_build_tgsi_action *action,
3685         struct lp_build_tgsi_context *bld_base,
3686         struct lp_build_emit_data *emit_data)
3687 {
3688         struct si_shader_context *ctx = si_shader_context(bld_base);
3689         struct gallivm_state *gallivm = &ctx->gallivm;
3690         LLVMValueRef active_set, vote_set;
3691         LLVMValueRef tmp;
3692
3693         active_set = si_emit_ballot(ctx, ctx->i32_1);
3694         vote_set = si_emit_ballot(ctx, emit_data->args[0]);
3695
3696         tmp = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, vote_set, active_set, "");
3697         emit_data->output[emit_data->chan] =
3698                 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
3699 }
3700
3701 static void vote_any_emit(
3702         const struct lp_build_tgsi_action *action,
3703         struct lp_build_tgsi_context *bld_base,
3704         struct lp_build_emit_data *emit_data)
3705 {
3706         struct si_shader_context *ctx = si_shader_context(bld_base);
3707         struct gallivm_state *gallivm = &ctx->gallivm;
3708         LLVMValueRef vote_set;
3709         LLVMValueRef tmp;
3710
3711         vote_set = si_emit_ballot(ctx, emit_data->args[0]);
3712
3713         tmp = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
3714                             vote_set, LLVMConstInt(ctx->i64, 0, 0), "");
3715         emit_data->output[emit_data->chan] =
3716                 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
3717 }
3718
3719 static void vote_eq_emit(
3720         const struct lp_build_tgsi_action *action,
3721         struct lp_build_tgsi_context *bld_base,
3722         struct lp_build_emit_data *emit_data)
3723 {
3724         struct si_shader_context *ctx = si_shader_context(bld_base);
3725         struct gallivm_state *gallivm = &ctx->gallivm;
3726         LLVMValueRef active_set, vote_set;
3727         LLVMValueRef all, none, tmp;
3728
3729         active_set = si_emit_ballot(ctx, ctx->i32_1);
3730         vote_set = si_emit_ballot(ctx, emit_data->args[0]);
3731
3732         all = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, vote_set, active_set, "");
3733         none = LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
3734                              vote_set, LLVMConstInt(ctx->i64, 0, 0), "");
3735         tmp = LLVMBuildOr(gallivm->builder, all, none, "");
3736         emit_data->output[emit_data->chan] =
3737                 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
3738 }
3739
3740 static void ballot_emit(
3741         const struct lp_build_tgsi_action *action,
3742         struct lp_build_tgsi_context *bld_base,
3743         struct lp_build_emit_data *emit_data)
3744 {
3745         struct si_shader_context *ctx = si_shader_context(bld_base);
3746         LLVMBuilderRef builder = ctx->gallivm.builder;
3747         LLVMValueRef tmp;
3748
3749         tmp = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
3750         tmp = si_emit_ballot(ctx, tmp);
3751         tmp = LLVMBuildBitCast(builder, tmp, ctx->v2i32, "");
3752
3753         emit_data->output[0] = LLVMBuildExtractElement(builder, tmp, ctx->i32_0, "");
3754         emit_data->output[1] = LLVMBuildExtractElement(builder, tmp, ctx->i32_1, "");
3755 }
3756
3757 static void read_invoc_fetch_args(
3758         struct lp_build_tgsi_context *bld_base,
3759         struct lp_build_emit_data *emit_data)
3760 {
3761         emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst,
3762                                                  0, emit_data->src_chan);
3763
3764         /* Always read the source invocation (= lane) from the X channel. */
3765         emit_data->args[1] = lp_build_emit_fetch(bld_base, emit_data->inst,
3766                                                  1, TGSI_CHAN_X);
3767         emit_data->arg_count = 2;
3768 }
3769
3770 static void read_lane_emit(
3771         const struct lp_build_tgsi_action *action,
3772         struct lp_build_tgsi_context *bld_base,
3773         struct lp_build_emit_data *emit_data)
3774 {
3775         struct si_shader_context *ctx = si_shader_context(bld_base);
3776         LLVMBuilderRef builder = ctx->gallivm.builder;
3777
3778         /* We currently have no other way to prevent LLVM from lifting the icmp
3779          * calls to a dominating basic block.
3780          */
3781         emit_optimization_barrier(ctx, &emit_data->args[0]);
3782
3783         for (unsigned i = 0; i < emit_data->arg_count; ++i) {
3784                 emit_data->args[i] = LLVMBuildBitCast(builder, emit_data->args[i],
3785                                                       ctx->i32, "");
3786         }
3787
3788         emit_data->output[emit_data->chan] =
3789                 ac_build_intrinsic(&ctx->ac, action->intr_name,
3790                                    ctx->i32, emit_data->args, emit_data->arg_count,
3791                                    AC_FUNC_ATTR_READNONE |
3792                                    AC_FUNC_ATTR_CONVERGENT);
3793 }
3794
3795 static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base,
3796                                        struct lp_build_emit_data *emit_data)
3797 {
3798         struct si_shader_context *ctx = si_shader_context(bld_base);
3799         struct tgsi_src_register src0 = emit_data->inst->Src[0].Register;
3800         LLVMValueRef imm;
3801         unsigned stream;
3802
3803         assert(src0.File == TGSI_FILE_IMMEDIATE);
3804
3805         imm = ctx->imms[src0.Index * TGSI_NUM_CHANNELS + src0.SwizzleX];
3806         stream = LLVMConstIntGetZExtValue(imm) & 0x3;
3807         return stream;
3808 }
3809
3810 /* Emit one vertex from the geometry shader */
3811 static void si_llvm_emit_vertex(
3812         const struct lp_build_tgsi_action *action,
3813         struct lp_build_tgsi_context *bld_base,
3814         struct lp_build_emit_data *emit_data)
3815 {
3816         struct si_shader_context *ctx = si_shader_context(bld_base);
3817         struct lp_build_context *uint = &bld_base->uint_bld;
3818         struct si_shader *shader = ctx->shader;
3819         struct tgsi_shader_info *info = &shader->selector->info;
3820         struct gallivm_state *gallivm = &ctx->gallivm;
3821         struct lp_build_if_state if_state;
3822         LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
3823                                             ctx->param_gs2vs_offset);
3824         LLVMValueRef gs_next_vertex;
3825         LLVMValueRef can_emit, kill;
3826         unsigned chan, offset;
3827         int i;
3828         unsigned stream;
3829
3830         stream = si_llvm_get_stream(bld_base, emit_data);
3831
3832         /* Write vertex attribute values to GSVS ring */
3833         gs_next_vertex = LLVMBuildLoad(gallivm->builder,
3834                                        ctx->gs_next_vertex[stream],
3835                                        "");
3836
3837         /* If this thread has already emitted the declared maximum number of
3838          * vertices, skip the write: excessive vertex emissions are not
3839          * supposed to have any effect.
3840          *
3841          * If the shader has no writes to memory, kill it instead. This skips
3842          * further memory loads and may allow LLVM to skip to the end
3843          * altogether.
3844          */
3845         can_emit = LLVMBuildICmp(gallivm->builder, LLVMIntULT, gs_next_vertex,
3846                                  LLVMConstInt(ctx->i32,
3847                                               shader->selector->gs_max_out_vertices, 0), "");
3848
3849         bool use_kill = !info->writes_memory;
3850         if (use_kill) {
3851                 kill = lp_build_select(&bld_base->base, can_emit,
3852                                        LLVMConstReal(ctx->f32, 1.0f),
3853                                        LLVMConstReal(ctx->f32, -1.0f));
3854
3855                 ac_build_kill(&ctx->ac, kill);
3856         } else {
3857                 lp_build_if(&if_state, gallivm, can_emit);
3858         }
3859
3860         offset = 0;
3861         for (i = 0; i < info->num_outputs; i++) {
3862                 LLVMValueRef *out_ptr = ctx->outputs[i];
3863
3864                 for (chan = 0; chan < 4; chan++) {
3865                         if (!(info->output_usagemask[i] & (1 << chan)) ||
3866                             ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
3867                                 continue;
3868
3869                         LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
3870                         LLVMValueRef voffset =
3871                                 LLVMConstInt(ctx->i32, offset *
3872                                              shader->selector->gs_max_out_vertices, 0);
3873                         offset++;
3874
3875                         voffset = lp_build_add(uint, voffset, gs_next_vertex);
3876                         voffset = lp_build_mul_imm(uint, voffset, 4);
3877
3878                         out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");
3879
3880                         ac_build_buffer_store_dword(&ctx->ac,
3881                                                     ctx->gsvs_ring[stream],
3882                                                     out_val, 1,
3883                                                     voffset, soffset, 0,
3884                                                     1, 1, true, true);
3885                 }
3886         }
3887
3888         gs_next_vertex = lp_build_add(uint, gs_next_vertex,
3889                                       ctx->i32_1);
3890
3891         LLVMBuildStore(gallivm->builder, gs_next_vertex, ctx->gs_next_vertex[stream]);
3892
3893         /* Signal vertex emission */
3894         ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
3895                          si_get_gs_wave_id(ctx));
3896         if (!use_kill)
3897                 lp_build_endif(&if_state);
3898 }
3899
3900 /* Cut one primitive from the geometry shader */
3901 static void si_llvm_emit_primitive(
3902         const struct lp_build_tgsi_action *action,
3903         struct lp_build_tgsi_context *bld_base,
3904         struct lp_build_emit_data *emit_data)
3905 {
3906         struct si_shader_context *ctx = si_shader_context(bld_base);
3907         unsigned stream;
3908
3909         /* Signal primitive cut */
3910         stream = si_llvm_get_stream(bld_base, emit_data);
3911         ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
3912                          si_get_gs_wave_id(ctx));
3913 }
3914
3915 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
3916                                  struct lp_build_tgsi_context *bld_base,
3917                                  struct lp_build_emit_data *emit_data)
3918 {
3919         struct si_shader_context *ctx = si_shader_context(bld_base);
3920         struct gallivm_state *gallivm = &ctx->gallivm;
3921
3922         /* SI only (thanks to a hw bug workaround):
3923          * The real barrier instruction isn’t needed, because an entire patch
3924          * always fits into a single wave.
3925          */
3926         if (ctx->screen->b.chip_class == SI &&
3927             ctx->type == PIPE_SHADER_TESS_CTRL) {
3928                 si_emit_waitcnt(ctx, LGKM_CNT & VM_CNT);
3929                 return;
3930         }
3931
3932         lp_build_intrinsic(gallivm->builder,
3933                            "llvm.amdgcn.s.barrier",
3934                            ctx->voidt, NULL, 0, LP_FUNC_ATTR_CONVERGENT);
3935 }
3936
3937 static const struct lp_build_tgsi_action interp_action = {
3938         .fetch_args = interp_fetch_args,
3939         .emit = build_interp_intrinsic,
3940 };
3941
3942 static void si_create_function(struct si_shader_context *ctx,
3943                                const char *name,
3944                                LLVMTypeRef *returns, unsigned num_returns,
3945                                LLVMTypeRef *params, unsigned num_params,
3946                                int last_sgpr, unsigned max_workgroup_size)
3947 {
3948         int i;
3949
3950         si_llvm_create_func(ctx, name, returns, num_returns,
3951                             params, num_params);
3952         ctx->return_value = LLVMGetUndef(ctx->return_type);
3953
3954         for (i = 0; i <= last_sgpr; ++i) {
3955                 LLVMValueRef P = LLVMGetParam(ctx->main_fn, i);
3956
3957                 /* The combination of:
3958                  * - ByVal
3959                  * - dereferenceable
3960                  * - invariant.load
3961                  * allows the optimization passes to move loads and reduces
3962                  * SGPR spilling significantly.
3963                  */
3964                 if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
3965                         lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_BYVAL);
3966                         lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_NOALIAS);
3967                         ac_add_attr_dereferenceable(P, UINT64_MAX);
3968                 } else
3969                         lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_INREG);
3970         }
3971
3972         if (max_workgroup_size) {
3973                 si_llvm_add_attribute(ctx->main_fn, "amdgpu-max-work-group-size",
3974                                       max_workgroup_size);
3975         }
3976         LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
3977                                            "no-signed-zeros-fp-math",
3978                                            "true");
3979
3980         if (ctx->screen->b.debug_flags & DBG_UNSAFE_MATH) {
3981                 /* These were copied from some LLVM test. */
3982                 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
3983                                                    "less-precise-fpmad",
3984                                                    "true");
3985                 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
3986                                                    "no-infs-fp-math",
3987                                                    "true");
3988                 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
3989                                                    "no-nans-fp-math",
3990                                                    "true");
3991                 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
3992                                                    "unsafe-fp-math",
3993                                                    "true");
3994         }
3995 }
3996
3997 static void declare_streamout_params(struct si_shader_context *ctx,
3998                                      struct pipe_stream_output_info *so,
3999                                      LLVMTypeRef *params, LLVMTypeRef i32,
4000                                      unsigned *num_params)
4001 {
4002         int i;
4003
4004         /* Streamout SGPRs. */
4005         if (so->num_outputs) {
4006                 if (ctx->type != PIPE_SHADER_TESS_EVAL)
4007                         params[ctx->param_streamout_config = (*num_params)++] = i32;
4008                 else
4009                         ctx->param_streamout_config = *num_params - 1;
4010
4011                 params[ctx->param_streamout_write_index = (*num_params)++] = i32;
4012         }
4013         /* A streamout buffer offset is loaded if the stride is non-zero. */
4014         for (i = 0; i < 4; i++) {
4015                 if (!so->stride[i])
4016                         continue;
4017
4018                 params[ctx->param_streamout_offset[i] = (*num_params)++] = i32;
4019         }
4020 }
4021
4022 static unsigned llvm_get_type_size(LLVMTypeRef type)
4023 {
4024         LLVMTypeKind kind = LLVMGetTypeKind(type);
4025
4026         switch (kind) {
4027         case LLVMIntegerTypeKind:
4028                 return LLVMGetIntTypeWidth(type) / 8;
4029         case LLVMFloatTypeKind:
4030                 return 4;
4031         case LLVMPointerTypeKind:
4032                 return 8;
4033         case LLVMVectorTypeKind:
4034                 return LLVMGetVectorSize(type) *
4035                        llvm_get_type_size(LLVMGetElementType(type));
4036         case LLVMArrayTypeKind:
4037                 return LLVMGetArrayLength(type) *
4038                        llvm_get_type_size(LLVMGetElementType(type));
4039         default:
4040                 assert(0);
4041                 return 0;
4042         }
4043 }
4044
4045 static void declare_lds_as_pointer(struct si_shader_context *ctx)
4046 {
4047         struct gallivm_state *gallivm = &ctx->gallivm;
4048
4049         unsigned lds_size = ctx->screen->b.chip_class >= CIK ? 65536 : 32768;
4050         ctx->lds = LLVMBuildIntToPtr(gallivm->builder, ctx->i32_0,
4051                 LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), LOCAL_ADDR_SPACE),
4052                 "lds");
4053 }
4054
4055 static unsigned si_get_max_workgroup_size(const struct si_shader *shader)
4056 {
4057         switch (shader->selector->type) {
4058         case PIPE_SHADER_TESS_CTRL:
4059                 /* Return this so that LLVM doesn't remove s_barrier
4060                  * instructions on chips where we use s_barrier. */
4061                 return shader->selector->screen->b.chip_class >= CIK ? 128 : 64;
4062
4063         case PIPE_SHADER_GEOMETRY:
4064                 return shader->selector->screen->b.chip_class >= GFX9 ? 128 : 64;
4065
4066         case PIPE_SHADER_COMPUTE:
4067                 break; /* see below */
4068
4069         default:
4070                 return 0;
4071         }
4072
4073         const unsigned *properties = shader->selector->info.properties;
4074         unsigned max_work_group_size =
4075                        properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
4076                        properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
4077                        properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
4078
4079         if (!max_work_group_size) {
4080                 /* This is a variable group size compute shader,
4081                  * compile it for the maximum possible group size.
4082                  */
4083                 max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
4084         }
4085         return max_work_group_size;
4086 }
4087
4088 static void declare_per_stage_desc_pointers(struct si_shader_context *ctx,
4089                                             LLVMTypeRef *params,
4090                                             unsigned *num_params,
4091                                             bool assign_params)
4092 {
4093         params[(*num_params)++] = si_const_array(ctx->v4i32,
4094                                                  SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS);
4095         params[(*num_params)++] = si_const_array(ctx->v8i32,
4096                                                  SI_NUM_IMAGES + SI_NUM_SAMPLERS * 2);
4097
4098         if (assign_params) {
4099                 ctx->param_const_and_shader_buffers = *num_params - 2;
4100                 ctx->param_samplers_and_images = *num_params - 1;
4101         }
4102 }
4103
4104 static void declare_default_desc_pointers(struct si_shader_context *ctx,
4105                                           LLVMTypeRef *params,
4106                                           unsigned *num_params)
4107 {
4108         params[ctx->param_rw_buffers = (*num_params)++] =
4109                 si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS);
4110         declare_per_stage_desc_pointers(ctx, params, num_params, true);
4111 }
4112
4113 static void declare_vs_specific_input_sgprs(struct si_shader_context *ctx,
4114                                             LLVMTypeRef *params,
4115                                             unsigned *num_params)
4116 {
4117         params[ctx->param_vertex_buffers = (*num_params)++] =
4118                 si_const_array(ctx->v4i32, SI_NUM_VERTEX_BUFFERS);
4119         params[ctx->param_base_vertex = (*num_params)++] = ctx->i32;
4120         params[ctx->param_start_instance = (*num_params)++] = ctx->i32;
4121         params[ctx->param_draw_id = (*num_params)++] = ctx->i32;
4122         params[ctx->param_vs_state_bits = (*num_params)++] = ctx->i32;
4123 }
4124
4125 static void declare_vs_input_vgprs(struct si_shader_context *ctx,
4126                                    LLVMTypeRef *params, unsigned *num_params,
4127                                    unsigned *num_prolog_vgprs)
4128 {
4129         struct si_shader *shader = ctx->shader;
4130
4131         params[ctx->param_vertex_id = (*num_params)++] = ctx->i32;
4132         if (shader->key.as_ls) {
4133                 params[ctx->param_rel_auto_id = (*num_params)++] = ctx->i32;
4134                 params[ctx->param_instance_id = (*num_params)++] = ctx->i32;
4135         } else {
4136                 params[ctx->param_instance_id = (*num_params)++] = ctx->i32;
4137                 params[ctx->param_vs_prim_id = (*num_params)++] = ctx->i32;
4138         }
4139         params[(*num_params)++] = ctx->i32; /* unused */
4140
4141         if (!shader->is_gs_copy_shader) {
4142                 /* Vertex load indices. */
4143                 ctx->param_vertex_index0 = (*num_params);
4144                 for (unsigned i = 0; i < shader->selector->info.num_inputs; i++)
4145                         params[(*num_params)++] = ctx->i32;
4146                 *num_prolog_vgprs += shader->selector->info.num_inputs;
4147         }
4148 }
4149
4150 static void declare_tes_input_vgprs(struct si_shader_context *ctx,
4151                                     LLVMTypeRef *params, unsigned *num_params)
4152 {
4153         params[ctx->param_tes_u = (*num_params)++] = ctx->f32;
4154         params[ctx->param_tes_v = (*num_params)++] = ctx->f32;
4155         params[ctx->param_tes_rel_patch_id = (*num_params)++] = ctx->i32;
4156         params[ctx->param_tes_patch_id = (*num_params)++] = ctx->i32;
4157 }
4158
4159 enum {
4160         /* Convenient merged shader definitions. */
4161         SI_SHADER_MERGED_VERTEX_TESSCTRL = PIPE_SHADER_TYPES,
4162         SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY,
4163 };
4164
/**
 * Declare the LLVM "main" function for the shader held in ctx.
 *
 * For the shader type at hand this builds the list of parameter types
 * (entries 0..last_sgpr are counted as SGPR inputs, the remaining entries as
 * VGPR inputs) and the list of return types, recording the index of each
 * named parameter in the corresponding ctx->param_* field. It then creates
 * the function through si_create_function(), fills
 * shader->info.num_input_sgprs / num_input_vgprs (in units of 4 bytes, with
 * num_prolog_vgprs subtracted from the VGPR count), and declares LDS when
 * the conditions at the end of the function are met.
 */
4165 static void create_function(struct si_shader_context *ctx)
4166 {
4167         struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
4168         struct gallivm_state *gallivm = &ctx->gallivm;
4169         struct si_shader *shader = ctx->shader;
4170         LLVMTypeRef params[100]; /* just make it large enough */
4171         LLVMTypeRef returns[16+32*4];
4172         unsigned i, last_sgpr, num_params = 0, num_return_sgprs;
4173         unsigned num_returns = 0;
4174         unsigned num_prolog_vgprs = 0;
             /* Local copy: it may be replaced by a merged type below, while
              * ctx->type keeps its value and is used inside the merged cases
              * to tell which stage is being compiled.
              */
4175         unsigned type = ctx->type;
4176
4177         /* Set MERGED shaders. */
4178         if (ctx->screen->b.chip_class >= GFX9) {
4179                 if (shader->key.as_ls || type == PIPE_SHADER_TESS_CTRL)
4180                         type = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
4181                 else if (shader->key.as_es || type == PIPE_SHADER_GEOMETRY)
4182                         type = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
4183         }
4184
4185         LLVMTypeRef v3i32 = LLVMVectorType(ctx->i32, 3);
4186
4187         switch (type) {
4188         case PIPE_SHADER_VERTEX:
4189                 declare_default_desc_pointers(ctx, params, &num_params);
4190                 declare_vs_specific_input_sgprs(ctx, params, &num_params);
4191
4192                 if (shader->key.as_es) {
4193                         params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
4194                 } else if (shader->key.as_ls) {
4195                         /* no extra parameters */
4196                 } else {
                             /* For the GS copy shader, rewind the counter to
                              * just past param_rw_buffers so that the entries
                              * declared after it are overwritten by what
                              * follows.
                              */
4197                         if (shader->is_gs_copy_shader)
4198                                 num_params = ctx->param_rw_buffers + 1;
4199
4200                         /* The locations of the other parameters are assigned dynamically. */
4201                         declare_streamout_params(ctx, &shader->selector->so,
4202                                                  params, ctx->i32, &num_params);
4203                 }
4204
4205                 last_sgpr = num_params-1;
4206
4207                 /* VGPRs */
4208                 declare_vs_input_vgprs(ctx, params, &num_params,
4209                                        &num_prolog_vgprs);
4210                 break;
4211
4212         case PIPE_SHADER_TESS_CTRL: /* SI-CI-VI */
4213                 declare_default_desc_pointers(ctx, params, &num_params);
4214                 params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
4215                 params[ctx->param_tcs_out_lds_offsets = num_params++] = ctx->i32;
4216                 params[ctx->param_tcs_out_lds_layout = num_params++] = ctx->i32;
4217                 params[ctx->param_vs_state_bits = num_params++] = ctx->i32;
4218                 params[ctx->param_tcs_offchip_addr_base64k = num_params++] = ctx->i32;
4219                 params[ctx->param_tcs_factor_addr_base64k = num_params++] = ctx->i32;
4220                 params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
4221                 params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
4222                 last_sgpr = num_params - 1;
4223
4224                 /* VGPRs */
4225                 params[ctx->param_tcs_patch_id = num_params++] = ctx->i32;
4226                 params[ctx->param_tcs_rel_ids = num_params++] = ctx->i32;
4227
4228                 /* param_tcs_offchip_offset and param_tcs_factor_offset are
4229                  * placed after the user SGPRs.
4230                  */
4231                 for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
4232                         returns[num_returns++] = ctx->i32; /* SGPRs */
4233                 for (i = 0; i < 3; i++)
4234                         returns[num_returns++] = ctx->f32; /* VGPRs */
4235                 break;
4236
4237         case SI_SHADER_MERGED_VERTEX_TESSCTRL:
4238                 /* Merged stages have 8 system SGPRs at the beginning. */
4239                 params[ctx->param_rw_buffers = num_params++] = /* SPI_SHADER_USER_DATA_ADDR_LO_HS */
4240                         si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS);
4241                 params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
4242                 params[ctx->param_merged_wave_info = num_params++] = ctx->i32;
4243                 params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
4244                 params[ctx->param_merged_scratch_offset = num_params++] = ctx->i32;
4245                 params[num_params++] = ctx->i32; /* unused */
4246                 params[num_params++] = ctx->i32; /* unused */
4247
4248                 params[num_params++] = ctx->i32; /* unused */
4249                 params[num_params++] = ctx->i32; /* unused */
                     /* The per-stage descriptor pointers are declared twice
                      * in this case; only the call whose condition matches
                      * ctx->type stores its indices in ctx.
                      */
4250                 declare_per_stage_desc_pointers(ctx, params, &num_params,
4251                                                 ctx->type == PIPE_SHADER_VERTEX);
4252                 declare_vs_specific_input_sgprs(ctx, params, &num_params);
4253
4254                 params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
4255                 params[ctx->param_tcs_out_lds_offsets = num_params++] = ctx->i32;
4256                 params[ctx->param_tcs_out_lds_layout = num_params++] = ctx->i32;
4257                 params[ctx->param_tcs_offchip_addr_base64k = num_params++] = ctx->i32;
4258                 params[ctx->param_tcs_factor_addr_base64k = num_params++] = ctx->i32;
4259                 params[num_params++] = ctx->i32; /* unused */
4260
4261                 declare_per_stage_desc_pointers(ctx, params, &num_params,
4262                                                 ctx->type == PIPE_SHADER_TESS_CTRL);
4263                 last_sgpr = num_params - 1;
4264
4265                 /* VGPRs (first TCS, then VS) */
4266                 params[ctx->param_tcs_patch_id = num_params++] = ctx->i32;
4267                 params[ctx->param_tcs_rel_ids = num_params++] = ctx->i32;
4268
4269                 if (ctx->type == PIPE_SHADER_VERTEX) {
4270                         declare_vs_input_vgprs(ctx, params, &num_params,
4271                                                &num_prolog_vgprs);
4272
4273                         /* LS return values are inputs to the TCS main shader part. */
4274                         for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
4275                                 returns[num_returns++] = ctx->i32; /* SGPRs */
4276                         for (i = 0; i < 2; i++)
4277                                 returns[num_returns++] = ctx->f32; /* VGPRs */
4278                 } else {
4279                         /* TCS return values are inputs to the TCS epilog.
4280                          *
4281                          * param_tcs_offchip_offset, param_tcs_factor_offset,
4282                          * param_tcs_offchip_layout, and param_rw_buffers
4283                          * should be passed to the epilog.
4284                          */
4285                         for (i = 0; i <= 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K; i++)
4286                                 returns[num_returns++] = ctx->i32; /* SGPRs */
4287                         for (i = 0; i < 3; i++)
4288                                 returns[num_returns++] = ctx->f32; /* VGPRs */
4289                 }
4290                 break;
4291
4292         case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
4293                 /* Merged stages have 8 system SGPRs at the beginning. */
4294                 params[ctx->param_rw_buffers = num_params++] = /* SPI_SHADER_USER_DATA_ADDR_LO_GS */
4295                         si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS);
4296                 params[ctx->param_gs2vs_offset = num_params++] = ctx->i32;
4297                 params[ctx->param_merged_wave_info = num_params++] = ctx->i32;
4298                 params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
4299                 params[ctx->param_merged_scratch_offset = num_params++] = ctx->i32;
4300                 params[num_params++] = ctx->i32; /* unused (SPI_SHADER_PGM_LO/HI_GS << 8) */
4301                 params[num_params++] = ctx->i32; /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */
4302
4303                 params[num_params++] = ctx->i32; /* unused */
4304                 params[num_params++] = ctx->i32; /* unused */
4305                 declare_per_stage_desc_pointers(ctx, params, &num_params,
4306                                                 (ctx->type == PIPE_SHADER_VERTEX ||
4307                                                  ctx->type == PIPE_SHADER_TESS_EVAL));
4308                 if (ctx->type == PIPE_SHADER_VERTEX) {
4309                         declare_vs_specific_input_sgprs(ctx, params, &num_params);
4310                 } else {
4311                         /* TESS_EVAL (and also GEOMETRY):
4312                          * Declare as many input SGPRs as the VS has. */
4313                         params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
4314                         params[ctx->param_tcs_offchip_addr_base64k = num_params++] = ctx->i32;
4315                         params[num_params++] = ctx->i32; /* unused */
4316                         params[num_params++] = ctx->i32; /* unused */
4317                         params[num_params++] = ctx->i32; /* unused */
4318                         params[ctx->param_vs_state_bits = num_params++] = ctx->i32; /* unused */
4319                 }
4320
4321                 declare_per_stage_desc_pointers(ctx, params, &num_params,
4322                                                 ctx->type == PIPE_SHADER_GEOMETRY);
4323                 last_sgpr = num_params - 1;
4324
4325                 /* VGPRs (first GS, then VS/TES) */
4326                 params[ctx->param_gs_vtx01_offset = num_params++] = ctx->i32;
4327                 params[ctx->param_gs_vtx23_offset = num_params++] = ctx->i32;
4328                 params[ctx->param_gs_prim_id = num_params++] = ctx->i32;
4329                 params[ctx->param_gs_instance_id = num_params++] = ctx->i32;
4330                 params[ctx->param_gs_vtx45_offset = num_params++] = ctx->i32;
4331
4332                 if (ctx->type == PIPE_SHADER_VERTEX) {
4333                         declare_vs_input_vgprs(ctx, params, &num_params,
4334                                                &num_prolog_vgprs);
4335                 } else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
4336                         declare_tes_input_vgprs(ctx, params, &num_params);
4337                 }
4338
4339                 if (ctx->type == PIPE_SHADER_VERTEX ||
4340                     ctx->type == PIPE_SHADER_TESS_EVAL) {
4341                         /* ES return values are inputs to GS. */
4342                         for (i = 0; i < 8 + GFX9_GS_NUM_USER_SGPR; i++)
4343                                 returns[num_returns++] = ctx->i32; /* SGPRs */
4344                         for (i = 0; i < 5; i++)
4345                                 returns[num_returns++] = ctx->f32; /* VGPRs */
4346                 }
4347                 break;
4348
4349         case PIPE_SHADER_TESS_EVAL:
4350                 declare_default_desc_pointers(ctx, params, &num_params);
4351                 params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
4352                 params[ctx->param_tcs_offchip_addr_base64k = num_params++] = ctx->i32;
4353
4354                 if (shader->key.as_es) {
4355                         params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
4356                         params[num_params++] = ctx->i32;
4357                         params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
4358                 } else {
                             /* declare_streamout_params() does not add a
                              * streamout config slot for TESS_EVAL; it points
                              * param_streamout_config at this entry instead.
                              */
4359                         params[num_params++] = ctx->i32;
4360                         declare_streamout_params(ctx, &shader->selector->so,
4361                                                  params, ctx->i32, &num_params);
4362                         params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
4363                 }
4364                 last_sgpr = num_params - 1;
4365
4366                 /* VGPRs */
4367                 declare_tes_input_vgprs(ctx, params, &num_params);
4368                 break;
4369
4370         case PIPE_SHADER_GEOMETRY:
4371                 declare_default_desc_pointers(ctx, params, &num_params);
4372                 params[ctx->param_gs2vs_offset = num_params++] = ctx->i32;
4373                 params[ctx->param_gs_wave_id = num_params++] = ctx->i32;
4374                 last_sgpr = num_params - 1;
4375
4376                 /* VGPRs */
4377                 params[ctx->param_gs_vtx0_offset = num_params++] = ctx->i32;
4378                 params[ctx->param_gs_vtx1_offset = num_params++] = ctx->i32;
4379                 params[ctx->param_gs_prim_id = num_params++] = ctx->i32;
4380                 params[ctx->param_gs_vtx2_offset = num_params++] = ctx->i32;
4381                 params[ctx->param_gs_vtx3_offset = num_params++] = ctx->i32;
4382                 params[ctx->param_gs_vtx4_offset = num_params++] = ctx->i32;
4383                 params[ctx->param_gs_vtx5_offset = num_params++] = ctx->i32;
4384                 params[ctx->param_gs_instance_id = num_params++] = ctx->i32;
4385                 break;
4386
4387         case PIPE_SHADER_FRAGMENT:
4388                 declare_default_desc_pointers(ctx, params, &num_params);
                     /* From here on the fragment shader inputs are stored at
                      * fixed SI_PARAM_* indices instead of using the running
                      * num_params counter; num_params is set explicitly
                      * after the last one.
                      */
4389                 params[SI_PARAM_ALPHA_REF] = ctx->f32;
4390                 params[SI_PARAM_PRIM_MASK] = ctx->i32;
4391                 last_sgpr = SI_PARAM_PRIM_MASK;
4392                 params[SI_PARAM_PERSP_SAMPLE] = ctx->v2i32;
4393                 params[SI_PARAM_PERSP_CENTER] = ctx->v2i32;
4394                 params[SI_PARAM_PERSP_CENTROID] = ctx->v2i32;
4395                 params[SI_PARAM_PERSP_PULL_MODEL] = v3i32;
4396                 params[SI_PARAM_LINEAR_SAMPLE] = ctx->v2i32;
4397                 params[SI_PARAM_LINEAR_CENTER] = ctx->v2i32;
4398                 params[SI_PARAM_LINEAR_CENTROID] = ctx->v2i32;
4399                 params[SI_PARAM_LINE_STIPPLE_TEX] = ctx->f32;
4400                 params[SI_PARAM_POS_X_FLOAT] = ctx->f32;
4401                 params[SI_PARAM_POS_Y_FLOAT] = ctx->f32;
4402                 params[SI_PARAM_POS_Z_FLOAT] = ctx->f32;
4403                 params[SI_PARAM_POS_W_FLOAT] = ctx->f32;
4404                 params[SI_PARAM_FRONT_FACE] = ctx->i32;
4405                 shader->info.face_vgpr_index = 20;
4406                 params[SI_PARAM_ANCILLARY] = ctx->i32;
4407                 params[SI_PARAM_SAMPLE_COVERAGE] = ctx->f32;
4408                 params[SI_PARAM_POS_FIXED_PT] = ctx->i32;
4409                 num_params = SI_PARAM_POS_FIXED_PT+1;
4410
4411                 /* Color inputs from the prolog. */
4412                 if (shader->selector->info.colors_read) {
4413                         unsigned num_color_elements =
4414                                 util_bitcount(shader->selector->info.colors_read);
4415
4416                         assert(num_params + num_color_elements <= ARRAY_SIZE(params));
4417                         for (i = 0; i < num_color_elements; i++)
4418                                 params[num_params++] = ctx->f32;
4419
4420                         num_prolog_vgprs += num_color_elements;
4421                 }
4422
4423                 /* Outputs for the epilog. */
4424                 num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
4425                 num_returns =
4426                         num_return_sgprs +
4427                         util_bitcount(shader->selector->info.colors_written) * 4 +
4428                         shader->selector->info.writes_z +
4429                         shader->selector->info.writes_stencil +
4430                         shader->selector->info.writes_samplemask +
4431                         1 /* SampleMaskIn */;
4432
4433                 num_returns = MAX2(num_returns,
4434                                    num_return_sgprs +
4435                                    PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
4436
                     /* The first num_return_sgprs return values are i32, all
                      * remaining ones are f32.
                      */
4437                 for (i = 0; i < num_return_sgprs; i++)
4438                         returns[i] = ctx->i32;
4439                 for (; i < num_returns; i++)
4440                         returns[i] = ctx->f32;
4441                 break;
4442
4443         case PIPE_SHADER_COMPUTE:
4444                 declare_default_desc_pointers(ctx, params, &num_params);
4445                 if (shader->selector->info.uses_grid_size)
4446                         params[ctx->param_grid_size = num_params++] = v3i32;
4447                 if (shader->selector->info.uses_block_size)
4448                         params[ctx->param_block_size = num_params++] = v3i32;
4449
                     /* A block id parameter is only declared for the
                      * components the shader uses; -1 marks the others.
                      */
4450                 for (i = 0; i < 3; i++) {
4451                         ctx->param_block_id[i] = -1;
4452                         if (shader->selector->info.uses_block_id[i])
4453                                 params[ctx->param_block_id[i] = num_params++] = ctx->i32;
4454                 }
4455                 last_sgpr = num_params - 1;
4456
4457                 params[ctx->param_thread_id = num_params++] = v3i32;
4458                 break;
4459         default:
4460                 assert(0 && "unimplemented shader");
4461                 return;
4462         }
4463
4464         assert(num_params <= ARRAY_SIZE(params));
4465
4466         si_create_function(ctx, "main", returns, num_returns, params,
4467                            num_params, last_sgpr,
4468                            si_get_max_workgroup_size(shader));
4469
4470         /* Reserve register locations for VGPR inputs the PS prolog may need. */
4471         if (ctx->type == PIPE_SHADER_FRAGMENT &&
4472             ctx->separate_prolog) {
4473                 si_llvm_add_attribute(ctx->main_fn,
4474                                       "InitialPSInputAddr",
4475                                       S_0286D0_PERSP_SAMPLE_ENA(1) |
4476                                       S_0286D0_PERSP_CENTER_ENA(1) |
4477                                       S_0286D0_PERSP_CENTROID_ENA(1) |
4478                                       S_0286D0_LINEAR_SAMPLE_ENA(1) |
4479                                       S_0286D0_LINEAR_CENTER_ENA(1) |
4480                                       S_0286D0_LINEAR_CENTROID_ENA(1) |
4481                                       S_0286D0_FRONT_FACE_ENA(1) |
4482                                       S_0286D0_POS_FIXED_PT_ENA(1));
4483         }
4484
4485         shader->info.num_input_sgprs = 0;
4486         shader->info.num_input_vgprs = 0;
4487
             /* Count inputs in 4-byte units: parameters 0..last_sgpr are
              * SGPRs; the second loop continues from the same index and
              * counts the rest as VGPRs.
              */
4488         for (i = 0; i <= last_sgpr; ++i)
4489                 shader->info.num_input_sgprs += llvm_get_type_size(params[i]) / 4;
4490
4491         for (; i < num_params; ++i)
4492                 shader->info.num_input_vgprs += llvm_get_type_size(params[i]) / 4;
4493
4494         assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
4495         shader->info.num_input_vgprs -= num_prolog_vgprs;
4496
             /* Without ds_bpermute, a shader that uses any of the opcodes
              * listed below gets a 64-element i32 array in LDS.
              */
4497         if (!ctx->screen->has_ds_bpermute &&
4498             bld_base->info &&
4499             (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 ||
4500              bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 ||
4501              bld_base->info->opcode_count[TGSI_OPCODE_DDX_FINE] > 0 ||
4502              bld_base->info->opcode_count[TGSI_OPCODE_DDY_FINE] > 0 ||
4503              bld_base->info->opcode_count[TGSI_OPCODE_INTERP_OFFSET] > 0 ||
4504              bld_base->info->opcode_count[TGSI_OPCODE_INTERP_SAMPLE] > 0))
4505                 ctx->lds =
4506                         LLVMAddGlobalInAddressSpace(gallivm->module,
4507                                                     LLVMArrayType(ctx->i32, 64),
4508                                                     "ddxy_lds",
4509                                                     LOCAL_ADDR_SPACE);
4510
4511         if (shader->key.as_ls ||
4512             ctx->type == PIPE_SHADER_TESS_CTRL ||
4513             /* GFX9 has the ESGS ring buffer in LDS. */
4514             (ctx->screen->b.chip_class >= GFX9 &&
4515              (shader->key.as_es ||
4516               ctx->type == PIPE_SHADER_GEOMETRY)))
4517                 declare_lds_as_pointer(ctx);
4518 }
4519
4520 /**
4521  * Load ESGS and GSVS ring buffer resource descriptors and save the variables
4522  * for later use.
4523  */
4524 static void preload_ring_buffers(struct si_shader_context *ctx)
4525 {
4526         struct gallivm_state *gallivm = &ctx->gallivm;
4527         LLVMBuilderRef builder = gallivm->builder;
4528
4529         LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
4530                                             ctx->param_rw_buffers);
4531
4532         if (ctx->screen->b.chip_class <= VI &&
4533             (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY)) {
4534                 unsigned ring =
4535                         ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS
4536                                                              : SI_ES_RING_ESGS;
4537                 LLVMValueRef offset = LLVMConstInt(ctx->i32, ring, 0);
4538
4539                 ctx->esgs_ring =
4540                         ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
4541         }
4542
4543         if (ctx->shader->is_gs_copy_shader) {
4544                 LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);
4545
4546                 ctx->gsvs_ring[0] =
4547                         ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
4548         } else if (ctx->type == PIPE_SHADER_GEOMETRY) {
4549                 const struct si_shader_selector *sel = ctx->shader->selector;
4550                 LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);
4551                 LLVMValueRef base_ring;
4552
4553                 base_ring = ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
4554
4555                 /* The conceptual layout of the GSVS ring is
4556                  *   v0c0 .. vLv0 v0c1 .. vLc1 ..
4557                  * but the real memory layout is swizzled across
4558                  * threads:
4559                  *   t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
4560                  *   t16v0c0 ..
4561                  * Override the buffer descriptor accordingly.
4562                  */
4563                 LLVMTypeRef v2i64 = LLVMVectorType(ctx->i64, 2);
4564                 uint64_t stream_offset = 0;
4565
4566                 for (unsigned stream = 0; stream < 4; ++stream) {
4567                         unsigned num_components;
4568                         unsigned stride;
4569                         unsigned num_records;
4570                         LLVMValueRef ring, tmp;
4571
4572                         num_components = sel->info.num_stream_output_components[stream];
4573                         if (!num_components)
4574                                 continue;
4575
4576                         stride = 4 * num_components * sel->gs_max_out_vertices;
4577
4578                         /* Limit on the stride field for <= CIK. */
4579                         assert(stride < (1 << 14));
4580
4581                         num_records = 64;
4582
4583                         ring = LLVMBuildBitCast(builder, base_ring, v2i64, "");
4584                         tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_0, "");
4585                         tmp = LLVMBuildAdd(builder, tmp,
4586                                            LLVMConstInt(ctx->i64,
4587                                                         stream_offset, 0), "");
4588                         stream_offset += stride * 64;
4589
4590                         ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_0, "");
4591                         ring = LLVMBuildBitCast(builder, ring, ctx->v4i32, "");
4592                         tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_1, "");
4593                         tmp = LLVMBuildOr(builder, tmp,
4594                                 LLVMConstInt(ctx->i32,
4595                                              S_008F04_STRIDE(stride) |
4596                                              S_008F04_SWIZZLE_ENABLE(1), 0), "");
4597                         ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_1, "");
4598                         ring = LLVMBuildInsertElement(builder, ring,
4599                                         LLVMConstInt(ctx->i32, num_records, 0),
4600                                         LLVMConstInt(ctx->i32, 2, 0), "");
4601                         ring = LLVMBuildInsertElement(builder, ring,
4602                                 LLVMConstInt(ctx->i32,
4603                                              S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
4604                                              S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
4605                                              S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
4606                                              S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
4607                                              S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
4608                                              S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
4609                                              S_008F0C_ELEMENT_SIZE(1) | /* element_size = 4 (bytes) */
4610                                              S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
4611                                              S_008F0C_ADD_TID_ENABLE(1),
4612                                              0),
4613                                 LLVMConstInt(ctx->i32, 3, 0), "");
4614
4615                         ctx->gsvs_ring[stream] = ring;
4616                 }
4617         }
4618 }
4619
4620 static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
4621                                          LLVMValueRef param_rw_buffers,
4622                                          unsigned param_pos_fixed_pt)
4623 {
4624         struct gallivm_state *gallivm = &ctx->gallivm;
4625         LLVMBuilderRef builder = gallivm->builder;
4626         LLVMValueRef slot, desc, offset, row, bit, address[2];
4627
4628         /* Use the fixed-point gl_FragCoord input.
4629          * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
4630          * per coordinate to get the repeating effect.
4631          */
4632         address[0] = unpack_param(ctx, param_pos_fixed_pt, 0, 5);
4633         address[1] = unpack_param(ctx, param_pos_fixed_pt, 16, 5);
4634
4635         /* Load the buffer descriptor. */
4636         slot = LLVMConstInt(ctx->i32, SI_PS_CONST_POLY_STIPPLE, 0);
4637         desc = ac_build_indexed_load_const(&ctx->ac, param_rw_buffers, slot);
4638
4639         /* The stipple pattern is 32x32, each row has 32 bits. */
4640         offset = LLVMBuildMul(builder, address[1],
4641                               LLVMConstInt(ctx->i32, 4, 0), "");
4642         row = buffer_load_const(ctx, desc, offset);
4643         row = LLVMBuildBitCast(builder, row, ctx->i32, "");
4644         bit = LLVMBuildLShr(builder, row, address[0], "");
4645         bit = LLVMBuildTrunc(builder, bit, ctx->i1, "");
4646
4647         /* The intrinsic kills the thread if arg < 0. */
4648         bit = LLVMBuildSelect(builder, bit, LLVMConstReal(ctx->f32, 0),
4649                               LLVMConstReal(ctx->f32, -1), "");
4650         ac_build_kill(&ctx->ac, bit);
4651 }
4652
4653 void si_shader_binary_read_config(struct ac_shader_binary *binary,
4654                                   struct si_shader_config *conf,
4655                                   unsigned symbol_offset)
4656 {
4657         unsigned i;
4658         const unsigned char *config =
4659                 ac_shader_binary_config_start(binary, symbol_offset);
4660         bool really_needs_scratch = false;
4661
4662         /* LLVM adds SGPR spills to the scratch size.
4663          * Find out if we really need the scratch buffer.
4664          */
4665         for (i = 0; i < binary->reloc_count; i++) {
4666                 const struct ac_shader_reloc *reloc = &binary->relocs[i];
4667
4668                 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
4669                     !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
4670                         really_needs_scratch = true;
4671                         break;
4672                 }
4673         }
4674
4675         /* XXX: We may be able to emit some of these values directly rather than
4676          * extracting fields to be emitted later.
4677          */
4678
4679         for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
4680                 unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
4681                 unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
4682                 switch (reg) {
4683                 case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
4684                 case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
4685                 case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
4686                 case R_00B428_SPI_SHADER_PGM_RSRC1_HS:
4687                 case R_00B848_COMPUTE_PGM_RSRC1:
4688                         conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
4689                         conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
4690                         conf->float_mode =  G_00B028_FLOAT_MODE(value);
4691                         conf->rsrc1 = value;
4692                         break;
4693                 case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
4694                         conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
4695                         break;
4696                 case R_00B84C_COMPUTE_PGM_RSRC2:
4697                         conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
4698                         conf->rsrc2 = value;
4699                         break;
4700                 case R_0286CC_SPI_PS_INPUT_ENA:
4701                         conf->spi_ps_input_ena = value;
4702                         break;
4703                 case R_0286D0_SPI_PS_INPUT_ADDR:
4704                         conf->spi_ps_input_addr = value;
4705                         break;
4706                 case R_0286E8_SPI_TMPRING_SIZE:
4707                 case R_00B860_COMPUTE_TMPRING_SIZE:
4708                         /* WAVESIZE is in units of 256 dwords. */
4709                         if (really_needs_scratch)
4710                                 conf->scratch_bytes_per_wave =
4711                                         G_00B860_WAVESIZE(value) * 256 * 4;
4712                         break;
4713                 case 0x4: /* SPILLED_SGPRS */
4714                         conf->spilled_sgprs = value;
4715                         break;
4716                 case 0x8: /* SPILLED_VGPRS */
4717                         conf->spilled_vgprs = value;
4718                         break;
4719                 default:
4720                         {
4721                                 static bool printed;
4722
4723                                 if (!printed) {
4724                                         fprintf(stderr, "Warning: LLVM emitted unknown "
4725                                                 "config register: 0x%x\n", reg);
4726                                         printed = true;
4727                                 }
4728                         }
4729                         break;
4730                 }
4731         }
4732
4733         if (!conf->spi_ps_input_addr)
4734                 conf->spi_ps_input_addr = conf->spi_ps_input_ena;
4735 }
4736
4737 void si_shader_apply_scratch_relocs(struct si_shader *shader,
4738                                     uint64_t scratch_va)
4739 {
4740         unsigned i;
4741         uint32_t scratch_rsrc_dword0 = scratch_va;
4742         uint32_t scratch_rsrc_dword1 =
4743                 S_008F04_BASE_ADDRESS_HI(scratch_va >> 32);
4744
4745         /* Enable scratch coalescing. */
4746         scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1);
4747
4748         for (i = 0 ; i < shader->binary.reloc_count; i++) {
4749                 const struct ac_shader_reloc *reloc =
4750                                         &shader->binary.relocs[i];
4751                 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
4752                         util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
4753                         &scratch_rsrc_dword0, 4);
4754                 } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
4755                         util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
4756                         &scratch_rsrc_dword1, 4);
4757                 }
4758         }
4759 }
4760
4761 static unsigned si_get_shader_binary_size(const struct si_shader *shader)
4762 {
4763         unsigned size = shader->binary.code_size;
4764
4765         if (shader->prolog)
4766                 size += shader->prolog->binary.code_size;
4767         if (shader->previous_stage)
4768                 size += shader->previous_stage->binary.code_size;
4769         if (shader->prolog2)
4770                 size += shader->prolog2->binary.code_size;
4771         if (shader->epilog)
4772                 size += shader->epilog->binary.code_size;
4773         return size;
4774 }
4775
4776 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
4777 {
4778         const struct ac_shader_binary *prolog =
4779                 shader->prolog ? &shader->prolog->binary : NULL;
4780         const struct ac_shader_binary *previous_stage =
4781                 shader->previous_stage ? &shader->previous_stage->binary : NULL;
4782         const struct ac_shader_binary *prolog2 =
4783                 shader->prolog2 ? &shader->prolog2->binary : NULL;
4784         const struct ac_shader_binary *epilog =
4785                 shader->epilog ? &shader->epilog->binary : NULL;
4786         const struct ac_shader_binary *mainb = &shader->binary;
4787         unsigned bo_size = si_get_shader_binary_size(shader) +
4788                            (!epilog ? mainb->rodata_size : 0);
4789         unsigned char *ptr;
4790
4791         assert(!prolog || !prolog->rodata_size);
4792         assert(!previous_stage || !previous_stage->rodata_size);
4793         assert(!prolog2 || !prolog2->rodata_size);
4794         assert((!prolog && !previous_stage && !prolog2 && !epilog) ||
4795                !mainb->rodata_size);
4796         assert(!epilog || !epilog->rodata_size);
4797
4798         r600_resource_reference(&shader->bo, NULL);
4799         shader->bo = (struct r600_resource*)
4800                      pipe_buffer_create(&sscreen->b.b, 0,
4801                                         PIPE_USAGE_IMMUTABLE,
4802                                         align(bo_size, SI_CPDMA_ALIGNMENT));
4803         if (!shader->bo)
4804                 return -ENOMEM;
4805
4806         /* Upload. */
4807         ptr = sscreen->b.ws->buffer_map(shader->bo->buf, NULL,
4808                                         PIPE_TRANSFER_READ_WRITE |
4809                                         PIPE_TRANSFER_UNSYNCHRONIZED);
4810
4811         /* Don't use util_memcpy_cpu_to_le32. LLVM binaries are
4812          * endian-independent. */
4813         if (prolog) {
4814                 memcpy(ptr, prolog->code, prolog->code_size);
4815                 ptr += prolog->code_size;
4816         }
4817         if (previous_stage) {
4818                 memcpy(ptr, previous_stage->code, previous_stage->code_size);
4819                 ptr += previous_stage->code_size;
4820         }
4821         if (prolog2) {
4822                 memcpy(ptr, prolog2->code, prolog2->code_size);
4823                 ptr += prolog2->code_size;
4824         }
4825
4826         memcpy(ptr, mainb->code, mainb->code_size);
4827         ptr += mainb->code_size;
4828
4829         if (epilog)
4830                 memcpy(ptr, epilog->code, epilog->code_size);
4831         else if (mainb->rodata_size > 0)
4832                 memcpy(ptr, mainb->rodata, mainb->rodata_size);
4833
4834         sscreen->b.ws->buffer_unmap(shader->bo->buf);
4835         return 0;
4836 }
4837
4838 static void si_shader_dump_disassembly(const struct ac_shader_binary *binary,
4839                                        struct pipe_debug_callback *debug,
4840                                        const char *name, FILE *file)
4841 {
4842         char *line, *p;
4843         unsigned i, count;
4844
4845         if (binary->disasm_string) {
4846                 fprintf(file, "Shader %s disassembly:\n", name);
4847                 fprintf(file, "%s", binary->disasm_string);
4848
4849                 if (debug && debug->debug_message) {
4850                         /* Very long debug messages are cut off, so send the
4851                          * disassembly one line at a time. This causes more
4852                          * overhead, but on the plus side it simplifies
4853                          * parsing of resulting logs.
4854                          */
4855                         pipe_debug_message(debug, SHADER_INFO,
4856                                            "Shader Disassembly Begin");
4857
4858                         line = binary->disasm_string;
4859                         while (*line) {
4860                                 p = util_strchrnul(line, '\n');
4861                                 count = p - line;
4862
4863                                 if (count) {
4864                                         pipe_debug_message(debug, SHADER_INFO,
4865                                                            "%.*s", count, line);
4866                                 }
4867
4868                                 if (!*p)
4869                                         break;
4870                                 line = p + 1;
4871                         }
4872
4873                         pipe_debug_message(debug, SHADER_INFO,
4874                                            "Shader Disassembly End");
4875                 }
4876         } else {
4877                 fprintf(file, "Shader %s binary:\n", name);
4878                 for (i = 0; i < binary->code_size; i += 4) {
4879                         fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i,
4880                                 binary->code[i + 3], binary->code[i + 2],
4881                                 binary->code[i + 1], binary->code[i]);
4882                 }
4883         }
4884 }
4885
4886 static void si_shader_dump_stats(struct si_screen *sscreen,
4887                                  const struct si_shader *shader,
4888                                  struct pipe_debug_callback *debug,
4889                                  unsigned processor,
4890                                  FILE *file,
4891                                  bool check_debug_option)
4892 {
4893         const struct si_shader_config *conf = &shader->config;
4894         unsigned num_inputs = shader->selector ? shader->selector->info.num_inputs : 0;
4895         unsigned code_size = si_get_shader_binary_size(shader);
4896         unsigned lds_increment = sscreen->b.chip_class >= CIK ? 512 : 256;
4897         unsigned lds_per_wave = 0;
4898         unsigned max_simd_waves = 10;
4899
4900         /* Compute LDS usage for PS. */
4901         switch (processor) {
4902         case PIPE_SHADER_FRAGMENT:
4903                 /* The minimum usage per wave is (num_inputs * 48). The maximum
4904                  * usage is (num_inputs * 48 * 16).
4905                  * We can get anything in between and it varies between waves.
4906                  *
4907                  * The 48 bytes per input for a single primitive is equal to
4908                  * 4 bytes/component * 4 components/input * 3 points.
4909                  *
4910                  * Other stages don't know the size at compile time or don't
4911                  * allocate LDS per wave, but instead they do it per thread group.
4912                  */
4913                 lds_per_wave = conf->lds_size * lds_increment +
4914                                align(num_inputs * 48, lds_increment);
4915                 break;
4916         case PIPE_SHADER_COMPUTE:
4917                 if (shader->selector) {
4918                         unsigned max_workgroup_size =
4919                                 si_get_max_workgroup_size(shader);
4920                         lds_per_wave = (conf->lds_size * lds_increment) /
4921                                        DIV_ROUND_UP(max_workgroup_size, 64);
4922                 }
4923                 break;
4924         }
4925
4926         /* Compute the per-SIMD wave counts. */
4927         if (conf->num_sgprs) {
4928                 if (sscreen->b.chip_class >= VI)
4929                         max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs);
4930                 else
4931                         max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs);
4932         }
4933
4934         if (conf->num_vgprs)
4935                 max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs);
4936
4937         /* LDS is 64KB per CU (4 SIMDs), which is 16KB per SIMD (usage above
4938          * 16KB makes some SIMDs unoccupied). */
4939         if (lds_per_wave)
4940                 max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);
4941
4942         if (!check_debug_option ||
4943             r600_can_dump_shader(&sscreen->b, processor)) {
4944                 if (processor == PIPE_SHADER_FRAGMENT) {
4945                         fprintf(file, "*** SHADER CONFIG ***\n"
4946                                 "SPI_PS_INPUT_ADDR = 0x%04x\n"
4947                                 "SPI_PS_INPUT_ENA  = 0x%04x\n",
4948                                 conf->spi_ps_input_addr, conf->spi_ps_input_ena);
4949                 }
4950
4951                 fprintf(file, "*** SHADER STATS ***\n"
4952                         "SGPRS: %d\n"
4953                         "VGPRS: %d\n"
4954                         "Spilled SGPRs: %d\n"
4955                         "Spilled VGPRs: %d\n"
4956                         "Private memory VGPRs: %d\n"
4957                         "Code Size: %d bytes\n"
4958                         "LDS: %d blocks\n"
4959                         "Scratch: %d bytes per wave\n"
4960                         "Max Waves: %d\n"
4961                         "********************\n\n\n",
4962                         conf->num_sgprs, conf->num_vgprs,
4963                         conf->spilled_sgprs, conf->spilled_vgprs,
4964                         conf->private_mem_vgprs, code_size,
4965                         conf->lds_size, conf->scratch_bytes_per_wave,
4966                         max_simd_waves);
4967         }
4968
4969         pipe_debug_message(debug, SHADER_INFO,
4970                            "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
4971                            "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
4972                            "Spilled VGPRs: %d PrivMem VGPRs: %d",
4973                            conf->num_sgprs, conf->num_vgprs, code_size,
4974                            conf->lds_size, conf->scratch_bytes_per_wave,
4975                            max_simd_waves, conf->spilled_sgprs,
4976                            conf->spilled_vgprs, conf->private_mem_vgprs);
4977 }
4978
4979 const char *si_get_shader_name(const struct si_shader *shader, unsigned processor)
4980 {
4981         switch (processor) {
4982         case PIPE_SHADER_VERTEX:
4983                 if (shader->key.as_es)
4984                         return "Vertex Shader as ES";
4985                 else if (shader->key.as_ls)
4986                         return "Vertex Shader as LS";
4987                 else
4988                         return "Vertex Shader as VS";
4989         case PIPE_SHADER_TESS_CTRL:
4990                 return "Tessellation Control Shader";
4991         case PIPE_SHADER_TESS_EVAL:
4992                 if (shader->key.as_es)
4993                         return "Tessellation Evaluation Shader as ES";
4994                 else
4995                         return "Tessellation Evaluation Shader as VS";
4996         case PIPE_SHADER_GEOMETRY:
4997                 if (shader->is_gs_copy_shader)
4998                         return "GS Copy Shader as VS";
4999                 else
5000                         return "Geometry Shader";
5001         case PIPE_SHADER_FRAGMENT:
5002                 return "Pixel Shader";
5003         case PIPE_SHADER_COMPUTE:
5004                 return "Compute Shader";
5005         default:
5006                 return "Unknown Shader";
5007         }
5008 }
5009
5010 void si_shader_dump(struct si_screen *sscreen, const struct si_shader *shader,
5011                     struct pipe_debug_callback *debug, unsigned processor,
5012                     FILE *file, bool check_debug_option)
5013 {
5014         if (!check_debug_option ||
5015             r600_can_dump_shader(&sscreen->b, processor))
5016                 si_dump_shader_key(processor, shader, file);
5017
5018         if (!check_debug_option && shader->binary.llvm_ir_string) {
5019                 fprintf(file, "\n%s - main shader part - LLVM IR:\n\n",
5020                         si_get_shader_name(shader, processor));
5021                 fprintf(file, "%s\n", shader->binary.llvm_ir_string);
5022         }
5023
5024         if (!check_debug_option ||
5025             (r600_can_dump_shader(&sscreen->b, processor) &&
5026              !(sscreen->b.debug_flags & DBG_NO_ASM))) {
5027                 fprintf(file, "\n%s:\n", si_get_shader_name(shader, processor));
5028
5029                 if (shader->prolog)
5030                         si_shader_dump_disassembly(&shader->prolog->binary,
5031                                                    debug, "prolog", file);
5032                 if (shader->previous_stage)
5033                         si_shader_dump_disassembly(&shader->previous_stage->binary,
5034                                                    debug, "previous stage", file);
5035                 if (shader->prolog2)
5036                         si_shader_dump_disassembly(&shader->prolog2->binary,
5037                                                    debug, "prolog2", file);
5038
5039                 si_shader_dump_disassembly(&shader->binary, debug, "main", file);
5040
5041                 if (shader->epilog)
5042                         si_shader_dump_disassembly(&shader->epilog->binary,
5043                                                    debug, "epilog", file);
5044                 fprintf(file, "\n");
5045         }
5046
5047         si_shader_dump_stats(sscreen, shader, debug, processor, file,
5048                              check_debug_option);
5049 }
5050
5051 static int si_compile_llvm(struct si_screen *sscreen,
5052                            struct ac_shader_binary *binary,
5053                            struct si_shader_config *conf,
5054                            LLVMTargetMachineRef tm,
5055                            LLVMModuleRef mod,
5056                            struct pipe_debug_callback *debug,
5057                            unsigned processor,
5058                            const char *name)
5059 {
5060         int r = 0;
5061         unsigned count = p_atomic_inc_return(&sscreen->b.num_compilations);
5062
5063         if (r600_can_dump_shader(&sscreen->b, processor)) {
5064                 fprintf(stderr, "radeonsi: Compiling shader %d\n", count);
5065
5066                 if (!(sscreen->b.debug_flags & (DBG_NO_IR | DBG_PREOPT_IR))) {
5067                         fprintf(stderr, "%s LLVM IR:\n\n", name);
5068                         ac_dump_module(mod);
5069                         fprintf(stderr, "\n");
5070                 }
5071         }
5072
5073         if (sscreen->record_llvm_ir) {
5074                 char *ir = LLVMPrintModuleToString(mod);
5075                 binary->llvm_ir_string = strdup(ir);
5076                 LLVMDisposeMessage(ir);
5077         }
5078
5079         if (!si_replace_shader(count, binary)) {
5080                 r = si_llvm_compile(mod, binary, tm, debug);
5081                 if (r)
5082                         return r;
5083         }
5084
5085         si_shader_binary_read_config(binary, conf, 0);
5086
5087         /* Enable 64-bit and 16-bit denormals, because there is no performance
5088          * cost.
5089          *
5090          * If denormals are enabled, all floating-point output modifiers are
5091          * ignored.
5092          *
5093          * Don't enable denormals for 32-bit floats, because:
5094          * - Floating-point output modifiers would be ignored by the hw.
5095          * - Some opcodes don't support denormals, such as v_mad_f32. We would
5096          *   have to stop using those.
5097          * - SI & CI would be very slow.
5098          */
5099         conf->float_mode |= V_00B028_FP_64_DENORMS;
5100
5101         FREE(binary->config);
5102         FREE(binary->global_symbol_offsets);
5103         binary->config = NULL;
5104         binary->global_symbol_offsets = NULL;
5105
5106         /* Some shaders can't have rodata because their binaries can be
5107          * concatenated.
5108          */
5109         if (binary->rodata_size &&
5110             (processor == PIPE_SHADER_VERTEX ||
5111              processor == PIPE_SHADER_TESS_CTRL ||
5112              processor == PIPE_SHADER_TESS_EVAL ||
5113              processor == PIPE_SHADER_FRAGMENT)) {
5114                 fprintf(stderr, "radeonsi: The shader can't have rodata.");
5115                 return -EINVAL;
5116         }
5117
5118         return r;
5119 }
5120
5121 static void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret)
5122 {
5123         if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
5124                 LLVMBuildRetVoid(ctx->gallivm.builder);
5125         else
5126                 LLVMBuildRet(ctx->gallivm.builder, ret);
5127 }
5128
5129 /* Generate code for the hardware VS shader stage to go with a geometry shader */
5130 struct si_shader *
5131 si_generate_gs_copy_shader(struct si_screen *sscreen,
5132                            LLVMTargetMachineRef tm,
5133                            struct si_shader_selector *gs_selector,
5134                            struct pipe_debug_callback *debug)
5135 {
5136         struct si_shader_context ctx;
5137         struct si_shader *shader;
5138         struct gallivm_state *gallivm = &ctx.gallivm;
5139         LLVMBuilderRef builder;
5140         struct lp_build_tgsi_context *bld_base = &ctx.bld_base;
5141         struct lp_build_context *uint = &bld_base->uint_bld;
5142         struct si_shader_output_values *outputs;
5143         struct tgsi_shader_info *gsinfo = &gs_selector->info;
5144         int i, r;
5145
5146         outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0]));
5147
5148         if (!outputs)
5149                 return NULL;
5150
5151         shader = CALLOC_STRUCT(si_shader);
5152         if (!shader) {
5153                 FREE(outputs);
5154                 return NULL;
5155         }
5156
5157
5158         shader->selector = gs_selector;
5159         shader->is_gs_copy_shader = true;
5160
5161         si_init_shader_ctx(&ctx, sscreen, tm);
5162         ctx.shader = shader;
5163         ctx.type = PIPE_SHADER_VERTEX;
5164
5165         builder = gallivm->builder;
5166
5167         create_function(&ctx);
5168         preload_ring_buffers(&ctx);
5169
5170         LLVMValueRef voffset =
5171                 lp_build_mul_imm(uint, LLVMGetParam(ctx.main_fn,
5172                                                     ctx.param_vertex_id), 4);
5173
5174         /* Fetch the vertex stream ID.*/
5175         LLVMValueRef stream_id;
5176
5177         if (gs_selector->so.num_outputs)
5178                 stream_id = unpack_param(&ctx, ctx.param_streamout_config, 24, 2);
5179         else
5180                 stream_id = ctx.i32_0;
5181
5182         /* Fill in output information. */
5183         for (i = 0; i < gsinfo->num_outputs; ++i) {
5184                 outputs[i].semantic_name = gsinfo->output_semantic_name[i];
5185                 outputs[i].semantic_index = gsinfo->output_semantic_index[i];
5186
5187                 for (int chan = 0; chan < 4; chan++) {
5188                         outputs[i].vertex_stream[chan] =
5189                                 (gsinfo->output_streams[i] >> (2 * chan)) & 3;
5190                 }
5191         }
5192
5193         LLVMBasicBlockRef end_bb;
5194         LLVMValueRef switch_inst;
5195
5196         end_bb = LLVMAppendBasicBlockInContext(gallivm->context, ctx.main_fn, "end");
5197         switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4);
5198
5199         for (int stream = 0; stream < 4; stream++) {
5200                 LLVMBasicBlockRef bb;
5201                 unsigned offset;
5202
5203                 if (!gsinfo->num_stream_output_components[stream])
5204                         continue;
5205
5206                 if (stream > 0 && !gs_selector->so.num_outputs)
5207                         continue;
5208
5209                 bb = LLVMInsertBasicBlockInContext(gallivm->context, end_bb, "out");
5210                 LLVMAddCase(switch_inst, LLVMConstInt(ctx.i32, stream, 0), bb);
5211                 LLVMPositionBuilderAtEnd(builder, bb);
5212
5213                 /* Fetch vertex data from GSVS ring */
5214                 offset = 0;
5215                 for (i = 0; i < gsinfo->num_outputs; ++i) {
5216                         for (unsigned chan = 0; chan < 4; chan++) {
5217                                 if (!(gsinfo->output_usagemask[i] & (1 << chan)) ||
5218                                     outputs[i].vertex_stream[chan] != stream) {
5219                                         outputs[i].values[chan] = ctx.bld_base.base.undef;
5220                                         continue;
5221                                 }
5222
5223                                 LLVMValueRef soffset = LLVMConstInt(ctx.i32,
5224                                         offset * gs_selector->gs_max_out_vertices * 16 * 4, 0);
5225                                 offset++;
5226
5227                                 outputs[i].values[chan] =
5228                                         ac_build_buffer_load(&ctx.ac,
5229                                                              ctx.gsvs_ring[0], 1,
5230                                                              ctx.i32_0, voffset,
5231                                                              soffset, 0, 1, 1,
5232                                                              true, false);
5233                         }
5234                 }
5235
5236                 /* Streamout and exports. */
5237                 if (gs_selector->so.num_outputs) {
5238                         si_llvm_emit_streamout(&ctx, outputs,
5239                                                gsinfo->num_outputs,
5240                                                stream);
5241                 }
5242
5243                 if (stream == 0)
5244                         si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs);
5245
5246                 LLVMBuildBr(builder, end_bb);
5247         }
5248
5249         LLVMPositionBuilderAtEnd(builder, end_bb);
5250
5251         LLVMBuildRetVoid(gallivm->builder);
5252
5253         ctx.type = PIPE_SHADER_GEOMETRY; /* override for shader dumping */
5254         si_llvm_optimize_module(&ctx);
5255
5256         r = si_compile_llvm(sscreen, &ctx.shader->binary,
5257                             &ctx.shader->config, ctx.tm,
5258                             ctx.gallivm.module,
5259                             debug, PIPE_SHADER_GEOMETRY,
5260                             "GS Copy Shader");
5261         if (!r) {
5262                 if (r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
5263                         fprintf(stderr, "GS Copy Shader:\n");
5264                 si_shader_dump(sscreen, ctx.shader, debug,
5265                                PIPE_SHADER_GEOMETRY, stderr, true);
5266                 r = si_shader_binary_upload(sscreen, ctx.shader);
5267         }
5268
5269         si_llvm_dispose(&ctx);
5270
5271         FREE(outputs);
5272
5273         if (r != 0) {
5274                 FREE(shader);
5275                 shader = NULL;
5276         }
5277         return shader;
5278 }
5279
5280 static void si_dump_shader_key_vs(const struct si_shader_key *key,
5281                                   const struct si_vs_prolog_bits *prolog,
5282                                   const char *prefix, FILE *f)
5283 {
5284         fprintf(f, "  %s.instance_divisor_is_one = %u\n",
5285                 prefix, prolog->instance_divisor_is_one);
5286         fprintf(f, "  %s.instance_divisor_is_fetched = %u\n",
5287                 prefix, prolog->instance_divisor_is_fetched);
5288
5289         fprintf(f, "  mono.vs.fix_fetch = {");
5290         for (int i = 0; i < SI_MAX_ATTRIBS; i++)
5291                 fprintf(f, !i ? "%u" : ", %u", key->mono.vs_fix_fetch[i]);
5292         fprintf(f, "}\n");
5293 }
5294
5295 static void si_dump_shader_key(unsigned processor, const struct si_shader *shader,
5296                                FILE *f)
5297 {
5298         const struct si_shader_key *key = &shader->key;
5299
5300         fprintf(f, "SHADER KEY\n");
5301
5302         switch (processor) {
5303         case PIPE_SHADER_VERTEX:
5304                 si_dump_shader_key_vs(key, &key->part.vs.prolog,
5305                                       "part.vs.prolog", f);
5306                 fprintf(f, "  as_es = %u\n", key->as_es);
5307                 fprintf(f, "  as_ls = %u\n", key->as_ls);
5308                 fprintf(f, "  mono.u.vs_export_prim_id = %u\n",
5309                         key->mono.u.vs_export_prim_id);
5310                 break;
5311
5312         case PIPE_SHADER_TESS_CTRL:
5313                 if (shader->selector->screen->b.chip_class >= GFX9) {
5314                         si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog,
5315                                               "part.tcs.ls_prolog", f);
5316                 }
5317                 fprintf(f, "  part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode);
5318                 fprintf(f, "  mono.u.ff_tcs_inputs_to_copy = 0x%"PRIx64"\n", key->mono.u.ff_tcs_inputs_to_copy);
5319                 break;
5320
5321         case PIPE_SHADER_TESS_EVAL:
5322                 fprintf(f, "  as_es = %u\n", key->as_es);
5323                 fprintf(f, "  mono.u.vs_export_prim_id = %u\n",
5324                         key->mono.u.vs_export_prim_id);
5325                 break;
5326
5327         case PIPE_SHADER_GEOMETRY:
5328                 if (shader->is_gs_copy_shader)
5329                         break;
5330
5331                 if (shader->selector->screen->b.chip_class >= GFX9 &&
5332                     key->part.gs.es->type == PIPE_SHADER_VERTEX) {
5333                         si_dump_shader_key_vs(key, &key->part.gs.vs_prolog,
5334                                               "part.gs.vs_prolog", f);
5335                 }
5336                 fprintf(f, "  part.gs.prolog.tri_strip_adj_fix = %u\n", key->part.gs.prolog.tri_strip_adj_fix);
5337                 break;
5338
5339         case PIPE_SHADER_COMPUTE:
5340                 break;
5341
5342         case PIPE_SHADER_FRAGMENT:
5343                 fprintf(f, "  part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side);
5344                 fprintf(f, "  part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors);
5345                 fprintf(f, "  part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple);
5346                 fprintf(f, "  part.ps.prolog.force_persp_sample_interp = %u\n", key->part.ps.prolog.force_persp_sample_interp);
5347                 fprintf(f, "  part.ps.prolog.force_linear_sample_interp = %u\n", key->part.ps.prolog.force_linear_sample_interp);
5348                 fprintf(f, "  part.ps.prolog.force_persp_center_interp = %u\n", key->part.ps.prolog.force_persp_center_interp);
5349                 fprintf(f, "  part.ps.prolog.force_linear_center_interp = %u\n", key->part.ps.prolog.force_linear_center_interp);
5350                 fprintf(f, "  part.ps.prolog.bc_optimize_for_persp = %u\n", key->part.ps.prolog.bc_optimize_for_persp);
5351                 fprintf(f, "  part.ps.prolog.bc_optimize_for_linear = %u\n", key->part.ps.prolog.bc_optimize_for_linear);
5352                 fprintf(f, "  part.ps.epilog.spi_shader_col_format = 0x%x\n", key->part.ps.epilog.spi_shader_col_format);
5353                 fprintf(f, "  part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8);
5354                 fprintf(f, "  part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10);
5355                 fprintf(f, "  part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf);
5356                 fprintf(f, "  part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func);
5357                 fprintf(f, "  part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one);
5358                 fprintf(f, "  part.ps.epilog.poly_line_smoothing = %u\n", key->part.ps.epilog.poly_line_smoothing);
5359                 fprintf(f, "  part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color);
5360                 break;
5361
5362         default:
5363                 assert(0);
5364         }
5365
5366         if ((processor == PIPE_SHADER_GEOMETRY ||
5367              processor == PIPE_SHADER_TESS_EVAL ||
5368              processor == PIPE_SHADER_VERTEX) &&
5369             !key->as_es && !key->as_ls) {
5370                 fprintf(f, "  opt.kill_outputs = 0x%"PRIx64"\n", key->opt.kill_outputs);
5371                 fprintf(f, "  opt.clip_disable = %u\n", key->opt.clip_disable);
5372         }
5373 }
5374
5375 static void si_init_shader_ctx(struct si_shader_context *ctx,
5376                                struct si_screen *sscreen,
5377                                LLVMTargetMachineRef tm)
5378 {
5379         struct lp_build_tgsi_context *bld_base;
5380
5381         si_llvm_context_init(ctx, sscreen, tm);
5382
5383         bld_base = &ctx->bld_base;
5384         bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;
5385
5386         bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action;
5387         bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE] = interp_action;
5388         bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET] = interp_action;
5389
5390         bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit;
5391
5392         bld_base->op_actions[TGSI_OPCODE_CLOCK].emit = clock_emit;
5393
5394         bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy;
5395         bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy;
5396         bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy;
5397         bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy;
5398
5399         bld_base->op_actions[TGSI_OPCODE_VOTE_ALL].emit = vote_all_emit;
5400         bld_base->op_actions[TGSI_OPCODE_VOTE_ANY].emit = vote_any_emit;
5401         bld_base->op_actions[TGSI_OPCODE_VOTE_EQ].emit = vote_eq_emit;
5402         bld_base->op_actions[TGSI_OPCODE_BALLOT].emit = ballot_emit;
5403         bld_base->op_actions[TGSI_OPCODE_READ_FIRST].intr_name = "llvm.amdgcn.readfirstlane";
5404         bld_base->op_actions[TGSI_OPCODE_READ_FIRST].emit = read_lane_emit;
5405         bld_base->op_actions[TGSI_OPCODE_READ_INVOC].intr_name = "llvm.amdgcn.readlane";
5406         bld_base->op_actions[TGSI_OPCODE_READ_INVOC].fetch_args = read_invoc_fetch_args;
5407         bld_base->op_actions[TGSI_OPCODE_READ_INVOC].emit = read_lane_emit;
5408
5409         bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_llvm_emit_vertex;
5410         bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive;
5411         bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier;
5412 }
5413
5414 static void si_optimize_vs_outputs(struct si_shader_context *ctx)
5415 {
5416         struct si_shader *shader = ctx->shader;
5417         struct tgsi_shader_info *info = &shader->selector->info;
5418
5419         if ((ctx->type != PIPE_SHADER_VERTEX &&
5420              ctx->type != PIPE_SHADER_TESS_EVAL) ||
5421             shader->key.as_ls ||
5422             shader->key.as_es)
5423                 return;
5424
5425         ac_optimize_vs_outputs(&ctx->ac,
5426                                ctx->main_fn,
5427                                shader->info.vs_output_param_offset,
5428                                info->num_outputs,
5429                                &shader->info.nr_param_exports);
5430 }
5431
5432 static void si_count_scratch_private_memory(struct si_shader_context *ctx)
5433 {
5434         ctx->shader->config.private_mem_vgprs = 0;
5435
5436         /* Process all LLVM instructions. */
5437         LLVMBasicBlockRef bb = LLVMGetFirstBasicBlock(ctx->main_fn);
5438         while (bb) {
5439                 LLVMValueRef next = LLVMGetFirstInstruction(bb);
5440
5441                 while (next) {
5442                         LLVMValueRef inst = next;
5443                         next = LLVMGetNextInstruction(next);
5444
5445                         if (LLVMGetInstructionOpcode(inst) != LLVMAlloca)
5446                                 continue;
5447
5448                         LLVMTypeRef type = LLVMGetElementType(LLVMTypeOf(inst));
5449                         /* No idea why LLVM aligns allocas to 4 elements. */
5450                         unsigned alignment = LLVMGetAlignment(inst);
5451                         unsigned dw_size = align(llvm_get_type_size(type) / 4, alignment);
5452                         ctx->shader->config.private_mem_vgprs += dw_size;
5453                 }
5454                 bb = LLVMGetNextBasicBlock(bb);
5455         }
5456 }
5457
5458 static void si_init_exec_full_mask(struct si_shader_context *ctx)
5459 {
5460         LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
5461         lp_build_intrinsic(ctx->gallivm.builder,
5462                            "llvm.amdgcn.init.exec", ctx->voidt,
5463                            &full_mask, 1, LP_FUNC_ATTR_CONVERGENT);
5464 }
5465
5466 static void si_init_exec_from_input(struct si_shader_context *ctx,
5467                                     unsigned param, unsigned bitoffset)
5468 {
5469         LLVMValueRef args[] = {
5470                 LLVMGetParam(ctx->main_fn, param),
5471                 LLVMConstInt(ctx->i32, bitoffset, 0),
5472         };
5473         lp_build_intrinsic(ctx->gallivm.builder,
5474                            "llvm.amdgcn.init.exec.from.input",
5475                            ctx->voidt, args, 2, LP_FUNC_ATTR_CONVERGENT);
5476 }
5477
5478 static bool si_compile_tgsi_main(struct si_shader_context *ctx,
5479                                  bool is_monolithic)
5480 {
5481         struct si_shader *shader = ctx->shader;
5482         struct si_shader_selector *sel = shader->selector;
5483         struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
5484
5485         switch (ctx->type) {
5486         case PIPE_SHADER_VERTEX:
5487                 ctx->load_input = declare_input_vs;
5488                 if (shader->key.as_ls)
5489                         bld_base->emit_epilogue = si_llvm_emit_ls_epilogue;
5490                 else if (shader->key.as_es)
5491                         bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
5492                 else
5493                         bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
5494                 break;
5495         case PIPE_SHADER_TESS_CTRL:
5496                 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs;
5497                 bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = fetch_output_tcs;
5498                 bld_base->emit_store = store_output_tcs;
5499                 bld_base->emit_epilogue = si_llvm_emit_tcs_epilogue;
5500                 break;
5501         case PIPE_SHADER_TESS_EVAL:
5502                 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes;
5503                 if (shader->key.as_es)
5504                         bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
5505                 else
5506                         bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
5507                 break;
5508         case PIPE_SHADER_GEOMETRY:
5509                 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs;
5510                 bld_base->emit_epilogue = si_llvm_emit_gs_epilogue;
5511                 break;
5512         case PIPE_SHADER_FRAGMENT:
5513                 ctx->load_input = declare_input_fs;
5514                 bld_base->emit_epilogue = si_llvm_return_fs_outputs;
5515                 break;
5516         case PIPE_SHADER_COMPUTE:
5517                 ctx->declare_memory_region = declare_compute_memory;
5518                 break;
5519         default:
5520                 assert(!"Unsupported shader type");
5521                 return false;
5522         }
5523
5524         create_function(ctx);
5525         preload_ring_buffers(ctx);
5526
5527         /* For GFX9 merged shaders:
5528          * - Set EXEC. If the prolog is present, set EXEC there instead.
5529          * - Add a barrier before the second shader.
5530          *
5531          * The same thing for monolithic shaders is done in
5532          * si_build_wrapper_function.
5533          */
5534         if (ctx->screen->b.chip_class >= GFX9 && !is_monolithic) {
5535                 if (sel->info.num_instructions > 1 && /* not empty shader */
5536                     (shader->key.as_es || shader->key.as_ls) &&
5537                     (ctx->type == PIPE_SHADER_TESS_EVAL ||
5538                      (ctx->type == PIPE_SHADER_VERTEX &&
5539                       !sel->vs_needs_prolog))) {
5540                         si_init_exec_from_input(ctx,
5541                                                 ctx->param_merged_wave_info, 0);
5542                 } else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
5543                            ctx->type == PIPE_SHADER_GEOMETRY) {
5544                         si_init_exec_from_input(ctx,
5545                                                 ctx->param_merged_wave_info, 8);
5546                         si_llvm_emit_barrier(NULL, bld_base, NULL);
5547                 }
5548         }
5549
5550         if (ctx->type == PIPE_SHADER_GEOMETRY) {
5551                 int i;
5552                 for (i = 0; i < 4; i++) {
5553                         ctx->gs_next_vertex[i] =
5554                                 lp_build_alloca(&ctx->gallivm,
5555                                                 ctx->i32, "");
5556                 }
5557         }
5558
5559         if (ctx->type == PIPE_SHADER_FRAGMENT && sel->info.uses_kill &&
5560             ctx->screen->b.debug_flags & DBG_FS_CORRECT_DERIVS_AFTER_KILL) {
5561                 /* This is initialized to 0.0 = not kill. */
5562                 ctx->postponed_kill = lp_build_alloca(&ctx->gallivm, ctx->f32, "");
5563         }
5564
5565         if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) {
5566                 fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
5567                 return false;
5568         }
5569
5570         si_llvm_build_ret(ctx, ctx->return_value);
5571         return true;
5572 }
5573
5574 /**
5575  * Compute the VS prolog key, which contains all the information needed to
5576  * build the VS prolog function, and set shader->info bits where needed.
5577  *
5578  * \param info             Shader info of the vertex shader.
5579  * \param num_input_sgprs  Number of input SGPRs for the vertex shader.
5580  * \param prolog_key       Key of the VS prolog
5581  * \param shader_out       The vertex shader, or the next shader if merging LS+HS or ES+GS.
5582  * \param key              Output shader part key.
5583  */
5584 static void si_get_vs_prolog_key(const struct tgsi_shader_info *info,
5585                                  unsigned num_input_sgprs,
5586                                  const struct si_vs_prolog_bits *prolog_key,
5587                                  struct si_shader *shader_out,
5588                                  union si_shader_part_key *key)
5589 {
5590         memset(key, 0, sizeof(*key));
5591         key->vs_prolog.states = *prolog_key;
5592         key->vs_prolog.num_input_sgprs = num_input_sgprs;
5593         key->vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
5594         key->vs_prolog.as_ls = shader_out->key.as_ls;
5595
5596         if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) {
5597                 key->vs_prolog.as_ls = 1;
5598                 key->vs_prolog.num_merged_next_stage_vgprs = 2;
5599         } else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY) {
5600                 key->vs_prolog.num_merged_next_stage_vgprs = 5;
5601         }
5602
5603         /* Enable loading the InstanceID VGPR. */
5604         uint16_t input_mask = u_bit_consecutive(0, info->num_inputs);
5605
5606         if ((key->vs_prolog.states.instance_divisor_is_one |
5607              key->vs_prolog.states.instance_divisor_is_fetched) & input_mask)
5608                 shader_out->info.uses_instanceid = true;
5609 }
5610
5611 /**
5612  * Compute the PS prolog key, which contains all the information needed to
5613  * build the PS prolog function, and set related bits in shader->config.
5614  */
5615 static void si_get_ps_prolog_key(struct si_shader *shader,
5616                                  union si_shader_part_key *key,
5617                                  bool separate_prolog)
5618 {
5619         struct tgsi_shader_info *info = &shader->selector->info;
5620
5621         memset(key, 0, sizeof(*key));
5622         key->ps_prolog.states = shader->key.part.ps.prolog;
5623         key->ps_prolog.colors_read = info->colors_read;
5624         key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
5625         key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
5626         key->ps_prolog.wqm = info->uses_derivatives &&
5627                 (key->ps_prolog.colors_read ||
5628                  key->ps_prolog.states.force_persp_sample_interp ||
5629                  key->ps_prolog.states.force_linear_sample_interp ||
5630                  key->ps_prolog.states.force_persp_center_interp ||
5631                  key->ps_prolog.states.force_linear_center_interp ||
5632                  key->ps_prolog.states.bc_optimize_for_persp ||
5633                  key->ps_prolog.states.bc_optimize_for_linear);
5634
5635         if (info->colors_read) {
5636                 unsigned *color = shader->selector->color_attr_index;
5637
5638                 if (shader->key.part.ps.prolog.color_two_side) {
5639                         /* BCOLORs are stored after the last input. */
5640                         key->ps_prolog.num_interp_inputs = info->num_inputs;
5641                         key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
5642                         shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
5643                 }
5644
5645                 for (unsigned i = 0; i < 2; i++) {
5646                         unsigned interp = info->input_interpolate[color[i]];
5647                         unsigned location = info->input_interpolate_loc[color[i]];
5648
5649                         if (!(info->colors_read & (0xf << i*4)))
5650                                 continue;
5651
5652                         key->ps_prolog.color_attr_index[i] = color[i];
5653
5654                         if (shader->key.part.ps.prolog.flatshade_colors &&
5655                             interp == TGSI_INTERPOLATE_COLOR)
5656                                 interp = TGSI_INTERPOLATE_CONSTANT;
5657
5658                         switch (interp) {
5659                         case TGSI_INTERPOLATE_CONSTANT:
5660                                 key->ps_prolog.color_interp_vgpr_index[i] = -1;
5661                                 break;
5662                         case TGSI_INTERPOLATE_PERSPECTIVE:
5663                         case TGSI_INTERPOLATE_COLOR:
5664                                 /* Force the interpolation location for colors here. */
5665                                 if (shader->key.part.ps.prolog.force_persp_sample_interp)
5666                                         location = TGSI_INTERPOLATE_LOC_SAMPLE;
5667                                 if (shader->key.part.ps.prolog.force_persp_center_interp)
5668                                         location = TGSI_INTERPOLATE_LOC_CENTER;
5669
5670                                 switch (location) {
5671                                 case TGSI_INTERPOLATE_LOC_SAMPLE:
5672                                         key->ps_prolog.color_interp_vgpr_index[i] = 0;
5673                                         shader->config.spi_ps_input_ena |=
5674                                                 S_0286CC_PERSP_SAMPLE_ENA(1);
5675                                         break;
5676                                 case TGSI_INTERPOLATE_LOC_CENTER:
5677                                         key->ps_prolog.color_interp_vgpr_index[i] = 2;
5678                                         shader->config.spi_ps_input_ena |=
5679                                                 S_0286CC_PERSP_CENTER_ENA(1);
5680                                         break;
5681                                 case TGSI_INTERPOLATE_LOC_CENTROID:
5682                                         key->ps_prolog.color_interp_vgpr_index[i] = 4;
5683                                         shader->config.spi_ps_input_ena |=
5684                                                 S_0286CC_PERSP_CENTROID_ENA(1);
5685                                         break;
5686                                 default:
5687                                         assert(0);
5688                                 }
5689                                 break;
5690                         case TGSI_INTERPOLATE_LINEAR:
5691                                 /* Force the interpolation location for colors here. */
5692                                 if (shader->key.part.ps.prolog.force_linear_sample_interp)
5693                                         location = TGSI_INTERPOLATE_LOC_SAMPLE;
5694                                 if (shader->key.part.ps.prolog.force_linear_center_interp)
5695                                         location = TGSI_INTERPOLATE_LOC_CENTER;
5696
5697                                 /* The VGPR assignment for non-monolithic shaders
5698                                  * works because InitialPSInputAddr is set on the
5699                                  * main shader and PERSP_PULL_MODEL is never used.
5700                                  */
5701                                 switch (location) {
5702                                 case TGSI_INTERPOLATE_LOC_SAMPLE:
5703                                         key->ps_prolog.color_interp_vgpr_index[i] =
5704                                                 separate_prolog ? 6 : 9;
5705                                         shader->config.spi_ps_input_ena |=
5706                                                 S_0286CC_LINEAR_SAMPLE_ENA(1);
5707                                         break;
5708                                 case TGSI_INTERPOLATE_LOC_CENTER:
5709                                         key->ps_prolog.color_interp_vgpr_index[i] =
5710                                                 separate_prolog ? 8 : 11;
5711                                         shader->config.spi_ps_input_ena |=
5712                                                 S_0286CC_LINEAR_CENTER_ENA(1);
5713                                         break;
5714                                 case TGSI_INTERPOLATE_LOC_CENTROID:
5715                                         key->ps_prolog.color_interp_vgpr_index[i] =
5716                                                 separate_prolog ? 10 : 13;
5717                                         shader->config.spi_ps_input_ena |=
5718                                                 S_0286CC_LINEAR_CENTROID_ENA(1);
5719                                         break;
5720                                 default:
5721                                         assert(0);
5722                                 }
5723                                 break;
5724                         default:
5725                                 assert(0);
5726                         }
5727                 }
5728         }
5729 }
5730
5731 /**
5732  * Check whether a PS prolog is required based on the key.
5733  */
5734 static bool si_need_ps_prolog(const union si_shader_part_key *key)
5735 {
5736         return key->ps_prolog.colors_read ||
5737                key->ps_prolog.states.force_persp_sample_interp ||
5738                key->ps_prolog.states.force_linear_sample_interp ||
5739                key->ps_prolog.states.force_persp_center_interp ||
5740                key->ps_prolog.states.force_linear_center_interp ||
5741                key->ps_prolog.states.bc_optimize_for_persp ||
5742                key->ps_prolog.states.bc_optimize_for_linear ||
5743                key->ps_prolog.states.poly_stipple;
5744 }
5745
5746 /**
5747  * Compute the PS epilog key, which contains all the information needed to
5748  * build the PS epilog function.
5749  */
5750 static void si_get_ps_epilog_key(struct si_shader *shader,
5751                                  union si_shader_part_key *key)
5752 {
5753         struct tgsi_shader_info *info = &shader->selector->info;
5754         memset(key, 0, sizeof(*key));
5755         key->ps_epilog.colors_written = info->colors_written;
5756         key->ps_epilog.writes_z = info->writes_z;
5757         key->ps_epilog.writes_stencil = info->writes_stencil;
5758         key->ps_epilog.writes_samplemask = info->writes_samplemask;
5759         key->ps_epilog.states = shader->key.part.ps.epilog;
5760 }
5761
5762 /**
5763  * Build the GS prolog function. Rotate the input vertices for triangle strips
5764  * with adjacency.
5765  */
5766 static void si_build_gs_prolog_function(struct si_shader_context *ctx,
5767                                         union si_shader_part_key *key)
5768 {
5769         unsigned num_sgprs, num_vgprs;
5770         struct gallivm_state *gallivm = &ctx->gallivm;
5771         LLVMBuilderRef builder = gallivm->builder;
5772         LLVMTypeRef params[48]; /* 40 SGPRs (maximum) + some VGPRs */
5773         LLVMTypeRef returns[48];
5774         LLVMValueRef func, ret;
5775
5776         if (ctx->screen->b.chip_class >= GFX9) {
5777                 num_sgprs = 8 + GFX9_GS_NUM_USER_SGPR;
5778                 num_vgprs = 5; /* ES inputs are not needed by GS */
5779         } else {
5780                 num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
5781                 num_vgprs = 8;
5782         }
5783
5784         for (unsigned i = 0; i < num_sgprs; ++i) {
5785                 params[i] = ctx->i32;
5786                 returns[i] = ctx->i32;
5787         }
5788
5789         for (unsigned i = 0; i < num_vgprs; ++i) {
5790                 params[num_sgprs + i] = ctx->i32;
5791                 returns[num_sgprs + i] = ctx->f32;
5792         }
5793
5794         /* Create the function. */
5795         si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs,
5796                            params, num_sgprs + num_vgprs, num_sgprs - 1, 0);
5797         func = ctx->main_fn;
5798
5799         /* Set the full EXEC mask for the prolog, because we are only fiddling
5800          * with registers here. The main shader part will set the correct EXEC
5801          * mask.
5802          */
5803         if (ctx->screen->b.chip_class >= GFX9 && !key->gs_prolog.is_monolithic)
5804                 si_init_exec_full_mask(ctx);
5805
5806         /* Copy inputs to outputs. This should be no-op, as the registers match,
5807          * but it will prevent the compiler from overwriting them unintentionally.
5808          */
5809         ret = ctx->return_value;
5810         for (unsigned i = 0; i < num_sgprs; i++) {
5811                 LLVMValueRef p = LLVMGetParam(func, i);
5812                 ret = LLVMBuildInsertValue(builder, ret, p, i, "");
5813         }
5814         for (unsigned i = 0; i < num_vgprs; i++) {
5815                 LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
5816                 p = LLVMBuildBitCast(builder, p, ctx->f32, "");
5817                 ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, "");
5818         }
5819
5820         if (key->gs_prolog.states.tri_strip_adj_fix) {
5821                 /* Remap the input vertices for every other primitive. */
5822                 const unsigned gfx6_vtx_params[6] = {
5823                         num_sgprs,
5824                         num_sgprs + 1,
5825                         num_sgprs + 3,
5826                         num_sgprs + 4,
5827                         num_sgprs + 5,
5828                         num_sgprs + 6
5829                 };
5830                 const unsigned gfx9_vtx_params[3] = {
5831                         num_sgprs,
5832                         num_sgprs + 1,
5833                         num_sgprs + 4,
5834                 };
5835                 LLVMValueRef vtx_in[6], vtx_out[6];
5836                 LLVMValueRef prim_id, rotate;
5837
5838                 if (ctx->screen->b.chip_class >= GFX9) {
5839                         for (unsigned i = 0; i < 3; i++) {
5840                                 vtx_in[i*2] = unpack_param(ctx, gfx9_vtx_params[i], 0, 16);
5841                                 vtx_in[i*2+1] = unpack_param(ctx, gfx9_vtx_params[i], 16, 16);
5842                         }
5843                 } else {
5844                         for (unsigned i = 0; i < 6; i++)
5845                                 vtx_in[i] = LLVMGetParam(func, gfx6_vtx_params[i]);
5846                 }
5847
5848                 prim_id = LLVMGetParam(func, num_sgprs + 2);
5849                 rotate = LLVMBuildTrunc(builder, prim_id, ctx->i1, "");
5850
5851                 for (unsigned i = 0; i < 6; ++i) {
5852                         LLVMValueRef base, rotated;
5853                         base = vtx_in[i];
5854                         rotated = vtx_in[(i + 4) % 6];
5855                         vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, "");
5856                 }
5857
5858                 if (ctx->screen->b.chip_class >= GFX9) {
5859                         for (unsigned i = 0; i < 3; i++) {
5860                                 LLVMValueRef hi, out;
5861
5862                                 hi = LLVMBuildShl(builder, vtx_out[i*2+1],
5863                                                   LLVMConstInt(ctx->i32, 16, 0), "");
5864                                 out = LLVMBuildOr(builder, vtx_out[i*2], hi, "");
5865                                 out = LLVMBuildBitCast(builder, out, ctx->f32, "");
5866                                 ret = LLVMBuildInsertValue(builder, ret, out,
5867                                                            gfx9_vtx_params[i], "");
5868                         }
5869                 } else {
5870                         for (unsigned i = 0; i < 6; i++) {
5871                                 LLVMValueRef out;
5872
5873                                 out = LLVMBuildBitCast(builder, vtx_out[i], ctx->f32, "");
5874                                 ret = LLVMBuildInsertValue(builder, ret, out,
5875                                                            gfx6_vtx_params[i], "");
5876                         }
5877                 }
5878         }
5879
5880         LLVMBuildRet(builder, ret);
5881 }
5882
5883 /**
5884  * Given a list of shader part functions, build a wrapper function that
5885  * runs them in sequence to form a monolithic shader.
5886  */
5887 static void si_build_wrapper_function(struct si_shader_context *ctx,
5888                                       LLVMValueRef *parts,
5889                                       unsigned num_parts,
5890                                       unsigned main_part,
5891                                       unsigned next_shader_first_part)
5892 {
5893         struct gallivm_state *gallivm = &ctx->gallivm;
5894         LLVMBuilderRef builder = ctx->gallivm.builder;
5895         /* PS epilog has one arg per color component */
5896         LLVMTypeRef param_types[48];
5897         LLVMValueRef initial[48], out[48];
5898         LLVMTypeRef function_type;
5899         unsigned num_params;
5900         unsigned num_out, initial_num_out;
5901         MAYBE_UNUSED unsigned num_out_sgpr; /* used in debug checks */
5902         MAYBE_UNUSED unsigned initial_num_out_sgpr; /* used in debug checks */
5903         unsigned num_sgprs, num_vgprs;
5904         unsigned last_sgpr_param;
5905         unsigned gprs;
5906         struct lp_build_if_state if_state;
5907
5908         for (unsigned i = 0; i < num_parts; ++i) {
5909                 lp_add_function_attr(parts[i], -1, LP_FUNC_ATTR_ALWAYSINLINE);
5910                 LLVMSetLinkage(parts[i], LLVMPrivateLinkage);
5911         }
5912
5913         /* The parameters of the wrapper function correspond to those of the
5914          * first part in terms of SGPRs and VGPRs, but we use the types of the
5915          * main part to get the right types. This is relevant for the
5916          * dereferenceable attribute on descriptor table pointers.
5917          */
5918         num_sgprs = 0;
5919         num_vgprs = 0;
5920
5921         function_type = LLVMGetElementType(LLVMTypeOf(parts[0]));
5922         num_params = LLVMCountParamTypes(function_type);
5923
5924         for (unsigned i = 0; i < num_params; ++i) {
5925                 LLVMValueRef param = LLVMGetParam(parts[0], i);
5926
5927                 if (ac_is_sgpr_param(param)) {
5928                         assert(num_vgprs == 0);
5929                         num_sgprs += llvm_get_type_size(LLVMTypeOf(param)) / 4;
5930                 } else {
5931                         num_vgprs += llvm_get_type_size(LLVMTypeOf(param)) / 4;
5932                 }
5933         }
5934         assert(num_vgprs + num_sgprs <= ARRAY_SIZE(param_types));
5935
5936         num_params = 0;
5937         last_sgpr_param = 0;
5938         gprs = 0;
5939         while (gprs < num_sgprs + num_vgprs) {
5940                 LLVMValueRef param = LLVMGetParam(parts[main_part], num_params);
5941                 unsigned size;
5942
5943                 param_types[num_params] = LLVMTypeOf(param);
5944                 if (gprs < num_sgprs)
5945                         last_sgpr_param = num_params;
5946                 size = llvm_get_type_size(param_types[num_params]) / 4;
5947                 num_params++;
5948
5949                 assert(ac_is_sgpr_param(param) == (gprs < num_sgprs));
5950                 assert(gprs + size <= num_sgprs + num_vgprs &&
5951                        (gprs >= num_sgprs || gprs + size <= num_sgprs));
5952
5953                 gprs += size;
5954         }
5955
5956         si_create_function(ctx, "wrapper", NULL, 0, param_types, num_params,
5957                            last_sgpr_param,
5958                            si_get_max_workgroup_size(ctx->shader));
5959
5960         if (is_merged_shader(ctx->shader))
5961                 si_init_exec_full_mask(ctx);
5962
5963         /* Record the arguments of the function as if they were an output of
5964          * a previous part.
5965          */
5966         num_out = 0;
5967         num_out_sgpr = 0;
5968
5969         for (unsigned i = 0; i < num_params; ++i) {
5970                 LLVMValueRef param = LLVMGetParam(ctx->main_fn, i);
5971                 LLVMTypeRef param_type = LLVMTypeOf(param);
5972                 LLVMTypeRef out_type = i <= last_sgpr_param ? ctx->i32 : ctx->f32;
5973                 unsigned size = llvm_get_type_size(param_type) / 4;
5974
5975                 if (size == 1) {
5976                         if (param_type != out_type)
5977                                 param = LLVMBuildBitCast(builder, param, out_type, "");
5978                         out[num_out++] = param;
5979                 } else {
5980                         LLVMTypeRef vector_type = LLVMVectorType(out_type, size);
5981
5982                         if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
5983                                 param = LLVMBuildPtrToInt(builder, param, ctx->i64, "");
5984                                 param_type = ctx->i64;
5985                         }
5986
5987                         if (param_type != vector_type)
5988                                 param = LLVMBuildBitCast(builder, param, vector_type, "");
5989
5990                         for (unsigned j = 0; j < size; ++j)
5991                                 out[num_out++] = LLVMBuildExtractElement(
5992                                         builder, param, LLVMConstInt(ctx->i32, j, 0), "");
5993                 }
5994
5995                 if (i <= last_sgpr_param)
5996                         num_out_sgpr = num_out;
5997         }
5998
5999         memcpy(initial, out, sizeof(out));
6000         initial_num_out = num_out;
6001         initial_num_out_sgpr = num_out_sgpr;
6002
6003         /* Now chain the parts. */
6004         for (unsigned part = 0; part < num_parts; ++part) {
6005                 LLVMValueRef in[48];
6006                 LLVMValueRef ret;
6007                 LLVMTypeRef ret_type;
6008                 unsigned out_idx = 0;
6009
6010                 num_params = LLVMCountParams(parts[part]);
6011                 assert(num_params <= ARRAY_SIZE(param_types));
6012
6013                 /* Merged shaders are executed conditionally depending
6014                  * on the number of enabled threads passed in the input SGPRs. */
6015                 if (is_merged_shader(ctx->shader) &&
6016                     (part == 0 || part == next_shader_first_part)) {
6017                         LLVMValueRef ena, count = initial[3];
6018
6019                         /* The thread count for the 2nd shader is at bit-offset 8. */
6020                         if (part == next_shader_first_part) {
6021                                 count = LLVMBuildLShr(builder, count,
6022                                                       LLVMConstInt(ctx->i32, 8, 0), "");
6023                         }
6024                         count = LLVMBuildAnd(builder, count,
6025                                              LLVMConstInt(ctx->i32, 0x7f, 0), "");
6026                         ena = LLVMBuildICmp(builder, LLVMIntULT,
6027                                             ac_get_thread_id(&ctx->ac), count, "");
6028                         lp_build_if(&if_state, &ctx->gallivm, ena);
6029                 }
6030
6031                 /* Derive arguments for the next part from outputs of the
6032                  * previous one.
6033                  */
6034                 for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) {
6035                         LLVMValueRef param;
6036                         LLVMTypeRef param_type;
6037                         bool is_sgpr;
6038                         unsigned param_size;
6039                         LLVMValueRef arg = NULL;
6040
6041                         param = LLVMGetParam(parts[part], param_idx);
6042                         param_type = LLVMTypeOf(param);
6043                         param_size = llvm_get_type_size(param_type) / 4;
6044                         is_sgpr = ac_is_sgpr_param(param);
6045
6046                         if (is_sgpr) {
6047 #if HAVE_LLVM < 0x0400
6048                                 LLVMRemoveAttribute(param, LLVMByValAttribute);
6049 #else
6050                                 unsigned kind_id = LLVMGetEnumAttributeKindForName("byval", 5);
6051                                 LLVMRemoveEnumAttributeAtIndex(parts[part], param_idx + 1, kind_id);
6052 #endif
6053                                 lp_add_function_attr(parts[part], param_idx + 1, LP_FUNC_ATTR_INREG);
6054                         }
6055
6056                         assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out));
6057                         assert(is_sgpr || out_idx >= num_out_sgpr);
6058
6059                         if (param_size == 1)
6060                                 arg = out[out_idx];
6061                         else
6062                                 arg = lp_build_gather_values(gallivm, &out[out_idx], param_size);
6063
6064                         if (LLVMTypeOf(arg) != param_type) {
6065                                 if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
6066                                         arg = LLVMBuildBitCast(builder, arg, ctx->i64, "");
6067                                         arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
6068                                 } else {
6069                                         arg = LLVMBuildBitCast(builder, arg, param_type, "");
6070                                 }
6071                         }
6072
6073                         in[param_idx] = arg;
6074                         out_idx += param_size;
6075                 }
6076
6077                 ret = LLVMBuildCall(builder, parts[part], in, num_params, "");
6078
6079                 if (is_merged_shader(ctx->shader) &&
6080                     (part + 1 == next_shader_first_part ||
6081                      part + 1 == num_parts)) {
6082                         lp_build_endif(&if_state);
6083
6084                         if (part + 1 == next_shader_first_part) {
6085                                 /* A barrier is required between 2 merged shaders. */
6086                                 si_llvm_emit_barrier(NULL, &ctx->bld_base, NULL);
6087
6088                                 /* The second half of the merged shader should use
6089                                  * the inputs from the toplevel (wrapper) function,
6090                                  * not the return value from the last call.
6091                                  *
6092                                  * That's because the last call was executed condi-
6093                                  * tionally, so we can't consume it in the main
6094                                  * block.
6095                                  */
6096                                 memcpy(out, initial, sizeof(initial));
6097                                 num_out = initial_num_out;
6098                                 num_out_sgpr = initial_num_out_sgpr;
6099                         }
6100                         continue;
6101                 }
6102
6103                 /* Extract the returned GPRs. */
6104                 ret_type = LLVMTypeOf(ret);
6105                 num_out = 0;
6106                 num_out_sgpr = 0;
6107
6108                 if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) {
6109                         assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind);
6110
6111                         unsigned ret_size = LLVMCountStructElementTypes(ret_type);
6112
6113                         for (unsigned i = 0; i < ret_size; ++i) {
6114                                 LLVMValueRef val =
6115                                         LLVMBuildExtractValue(builder, ret, i, "");
6116
6117                                 out[num_out++] = val;
6118
6119                                 if (LLVMTypeOf(val) == ctx->i32) {
6120                                         assert(num_out_sgpr + 1 == num_out);
6121                                         num_out_sgpr = num_out;
6122                                 }
6123                         }
6124                 }
6125         }
6126
6127         LLVMBuildRetVoid(builder);
6128 }
6129
6130 int si_compile_tgsi_shader(struct si_screen *sscreen,
6131                            LLVMTargetMachineRef tm,
6132                            struct si_shader *shader,
6133                            bool is_monolithic,
6134                            struct pipe_debug_callback *debug)
6135 {
6136         struct si_shader_selector *sel = shader->selector;
6137         struct si_shader_context ctx;
6138         int r = -1;
6139
6140         /* Dump TGSI code before doing TGSI->LLVM conversion in case the
6141          * conversion fails. */
6142         if (r600_can_dump_shader(&sscreen->b, sel->info.processor) &&
6143             !(sscreen->b.debug_flags & DBG_NO_TGSI)) {
6144                 tgsi_dump(sel->tokens, 0);
6145                 si_dump_streamout(&sel->so);
6146         }
6147
6148         si_init_shader_ctx(&ctx, sscreen, tm);
6149         si_llvm_context_set_tgsi(&ctx, shader);
6150         ctx.separate_prolog = !is_monolithic;
6151
6152         memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
6153                sizeof(shader->info.vs_output_param_offset));
6154
6155         shader->info.uses_instanceid = sel->info.uses_instanceid;
6156
6157         ctx.load_system_value = declare_system_value;
6158
6159         if (!si_compile_tgsi_main(&ctx, is_monolithic)) {
6160                 si_llvm_dispose(&ctx);
6161                 return -1;
6162         }
6163
6164         if (is_monolithic && ctx.type == PIPE_SHADER_VERTEX) {
6165                 LLVMValueRef parts[2];
6166                 bool need_prolog = sel->vs_needs_prolog;
6167
6168                 parts[1] = ctx.main_fn;
6169
6170                 if (need_prolog) {
6171                         union si_shader_part_key prolog_key;
6172                         si_get_vs_prolog_key(&sel->info,
6173                                              shader->info.num_input_sgprs,
6174                                              &shader->key.part.vs.prolog,
6175                                              shader, &prolog_key);
6176                         si_build_vs_prolog_function(&ctx, &prolog_key);
6177                         parts[0] = ctx.main_fn;
6178                 }
6179
6180                 si_build_wrapper_function(&ctx, parts + !need_prolog,
6181                                           1 + need_prolog, need_prolog, 0);
6182         } else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
6183                 if (sscreen->b.chip_class >= GFX9) {
6184                         struct si_shader_selector *ls = shader->key.part.tcs.ls;
6185                         LLVMValueRef parts[4];
6186
6187                         /* TCS main part */
6188                         parts[2] = ctx.main_fn;
6189
6190                         /* TCS epilog */
6191                         union si_shader_part_key tcs_epilog_key;
6192                         memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key));
6193                         tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
6194                         si_build_tcs_epilog_function(&ctx, &tcs_epilog_key);
6195                         parts[3] = ctx.main_fn;
6196
6197                         /* VS prolog */
6198                         if (ls->vs_needs_prolog) {
6199                                 union si_shader_part_key vs_prolog_key;
6200                                 si_get_vs_prolog_key(&ls->info,
6201                                                      shader->info.num_input_sgprs,
6202                                                      &shader->key.part.tcs.ls_prolog,
6203                                                      shader, &vs_prolog_key);
6204                                 vs_prolog_key.vs_prolog.is_monolithic = true;
6205                                 si_build_vs_prolog_function(&ctx, &vs_prolog_key);
6206                                 parts[0] = ctx.main_fn;
6207                         }
6208
6209                         /* VS as LS main part */
6210                         struct si_shader shader_ls = {};
6211                         shader_ls.selector = ls;
6212                         shader_ls.key.as_ls = 1;
6213                         shader_ls.key.mono = shader->key.mono;
6214                         shader_ls.key.opt = shader->key.opt;
6215                         si_llvm_context_set_tgsi(&ctx, &shader_ls);
6216
6217                         if (!si_compile_tgsi_main(&ctx, true)) {
6218                                 si_llvm_dispose(&ctx);
6219                                 return -1;
6220                         }
6221                         shader->info.uses_instanceid |= ls->info.uses_instanceid;
6222                         parts[1] = ctx.main_fn;
6223
6224                         /* Reset the shader context. */
6225                         ctx.shader = shader;
6226                         ctx.type = PIPE_SHADER_TESS_CTRL;
6227
6228                         si_build_wrapper_function(&ctx,
6229                                                   parts + !ls->vs_needs_prolog,
6230                                                   4 - !ls->vs_needs_prolog, 0,
6231                                                   ls->vs_needs_prolog ? 2 : 1);
6232                 } else {
6233                         LLVMValueRef parts[2];
6234                         union si_shader_part_key epilog_key;
6235
6236                         parts[0] = ctx.main_fn;
6237
6238                         memset(&epilog_key, 0, sizeof(epilog_key));
6239                         epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
6240                         si_build_tcs_epilog_function(&ctx, &epilog_key);
6241                         parts[1] = ctx.main_fn;
6242
6243                         si_build_wrapper_function(&ctx, parts, 2, 0, 0);
6244                 }
6245         } else if (is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) {
6246                 if (ctx.screen->b.chip_class >= GFX9) {
6247                         struct si_shader_selector *es = shader->key.part.gs.es;
6248                         LLVMValueRef es_prolog = NULL;
6249                         LLVMValueRef es_main = NULL;
6250                         LLVMValueRef gs_prolog = NULL;
6251                         LLVMValueRef gs_main = ctx.main_fn;
6252
6253                         /* GS prolog */
6254                         union si_shader_part_key gs_prolog_key;
6255                         memset(&gs_prolog_key, 0, sizeof(gs_prolog_key));
6256                         gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
6257                         gs_prolog_key.gs_prolog.is_monolithic = true;
6258                         si_build_gs_prolog_function(&ctx, &gs_prolog_key);
6259                         gs_prolog = ctx.main_fn;
6260
6261                         /* ES prolog */
6262                         if (es->vs_needs_prolog) {
6263                                 union si_shader_part_key vs_prolog_key;
6264                                 si_get_vs_prolog_key(&es->info,
6265                                                      shader->info.num_input_sgprs,
6266                                                      &shader->key.part.tcs.ls_prolog,
6267                                                      shader, &vs_prolog_key);
6268                                 vs_prolog_key.vs_prolog.is_monolithic = true;
6269                                 si_build_vs_prolog_function(&ctx, &vs_prolog_key);
6270                                 es_prolog = ctx.main_fn;
6271                         }
6272
6273                         /* ES main part */
6274                         struct si_shader shader_es = {};
6275                         shader_es.selector = es;
6276                         shader_es.key.as_es = 1;
6277                         shader_es.key.mono = shader->key.mono;
6278                         shader_es.key.opt = shader->key.opt;
6279                         si_llvm_context_set_tgsi(&ctx, &shader_es);
6280
6281                         if (!si_compile_tgsi_main(&ctx, true)) {
6282                                 si_llvm_dispose(&ctx);
6283                                 return -1;
6284                         }
6285                         shader->info.uses_instanceid |= es->info.uses_instanceid;
6286                         es_main = ctx.main_fn;
6287
6288                         /* Reset the shader context. */
6289                         ctx.shader = shader;
6290                         ctx.type = PIPE_SHADER_GEOMETRY;
6291
6292                         /* Prepare the array of shader parts. */
6293                         LLVMValueRef parts[4];
6294                         unsigned num_parts = 0, main_part, next_first_part;
6295
6296                         if (es_prolog)
6297                                 parts[num_parts++] = es_prolog;
6298
6299                         parts[main_part = num_parts++] = es_main;
6300                         parts[next_first_part = num_parts++] = gs_prolog;
6301                         parts[num_parts++] = gs_main;
6302
6303                         si_build_wrapper_function(&ctx, parts, num_parts,
6304                                                   main_part, next_first_part);
6305                 } else {
6306                         LLVMValueRef parts[2];
6307                         union si_shader_part_key prolog_key;
6308
6309                         parts[1] = ctx.main_fn;
6310
6311                         memset(&prolog_key, 0, sizeof(prolog_key));
6312                         prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
6313                         si_build_gs_prolog_function(&ctx, &prolog_key);
6314                         parts[0] = ctx.main_fn;
6315
6316                         si_build_wrapper_function(&ctx, parts, 2, 1, 0);
6317                 }
6318         } else if (is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) {
6319                 LLVMValueRef parts[3];
6320                 union si_shader_part_key prolog_key;
6321                 union si_shader_part_key epilog_key;
6322                 bool need_prolog;
6323
6324                 si_get_ps_prolog_key(shader, &prolog_key, false);
6325                 need_prolog = si_need_ps_prolog(&prolog_key);
6326
6327                 parts[need_prolog ? 1 : 0] = ctx.main_fn;
6328
6329                 if (need_prolog) {
6330                         si_build_ps_prolog_function(&ctx, &prolog_key);
6331                         parts[0] = ctx.main_fn;
6332                 }
6333
6334                 si_get_ps_epilog_key(shader, &epilog_key);
6335                 si_build_ps_epilog_function(&ctx, &epilog_key);
6336                 parts[need_prolog ? 2 : 1] = ctx.main_fn;
6337
6338                 si_build_wrapper_function(&ctx, parts, need_prolog ? 3 : 2,
6339                                           need_prolog ? 1 : 0, 0);
6340         }
6341
6342         si_llvm_optimize_module(&ctx);
6343
6344         /* Post-optimization transformations and analysis. */
6345         si_optimize_vs_outputs(&ctx);
6346
6347         if ((debug && debug->debug_message) ||
6348             r600_can_dump_shader(&sscreen->b, ctx.type))
6349                 si_count_scratch_private_memory(&ctx);
6350
6351         /* Compile to bytecode. */
6352         r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm,
6353                             ctx.gallivm.module, debug, ctx.type, "TGSI shader");
6354         si_llvm_dispose(&ctx);
6355         if (r) {
6356                 fprintf(stderr, "LLVM failed to compile shader\n");
6357                 return r;
6358         }
6359
6360         /* Validate SGPR and VGPR usage for compute to detect compiler bugs.
6361          * LLVM 3.9svn has this bug.
6362          */
6363         if (sel->type == PIPE_SHADER_COMPUTE) {
6364                 unsigned wave_size = 64;
6365                 unsigned max_vgprs = 256;
6366                 unsigned max_sgprs = sscreen->b.chip_class >= VI ? 800 : 512;
6367                 unsigned max_sgprs_per_wave = 128;
6368                 unsigned max_block_threads = si_get_max_workgroup_size(shader);
6369                 unsigned min_waves_per_cu = DIV_ROUND_UP(max_block_threads, wave_size);
6370                 unsigned min_waves_per_simd = DIV_ROUND_UP(min_waves_per_cu, 4);
6371
6372                 max_vgprs = max_vgprs / min_waves_per_simd;
6373                 max_sgprs = MIN2(max_sgprs / min_waves_per_simd, max_sgprs_per_wave);
6374
6375                 if (shader->config.num_sgprs > max_sgprs ||
6376                     shader->config.num_vgprs > max_vgprs) {
6377                         fprintf(stderr, "LLVM failed to compile a shader correctly: "
6378                                 "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
6379                                 shader->config.num_sgprs, shader->config.num_vgprs,
6380                                 max_sgprs, max_vgprs);
6381
6382                         /* Just terminate the process, because dependent
6383                          * shaders can hang due to bad input data, but use
6384                          * the env var to allow shader-db to work.
6385                          */
6386                         if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
6387                                 abort();
6388                 }
6389         }
6390
6391         /* Add the scratch offset to input SGPRs. */
6392         if (shader->config.scratch_bytes_per_wave && !is_merged_shader(shader))
6393                 shader->info.num_input_sgprs += 1; /* scratch byte offset */
6394
6395         /* Calculate the number of fragment input VGPRs. */
6396         if (ctx.type == PIPE_SHADER_FRAGMENT) {
6397                 shader->info.num_input_vgprs = 0;
6398                 shader->info.face_vgpr_index = -1;
6399
6400                 if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr))
6401                         shader->info.num_input_vgprs += 2;
6402                 if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr))
6403                         shader->info.num_input_vgprs += 2;
6404                 if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr))
6405                         shader->info.num_input_vgprs += 2;
6406                 if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr))
6407                         shader->info.num_input_vgprs += 3;
6408                 if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr))
6409                         shader->info.num_input_vgprs += 2;
6410                 if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr))
6411                         shader->info.num_input_vgprs += 2;
6412                 if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr))
6413                         shader->info.num_input_vgprs += 2;
6414                 if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr))
6415                         shader->info.num_input_vgprs += 1;
6416                 if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr))
6417                         shader->info.num_input_vgprs += 1;
6418                 if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr))
6419                         shader->info.num_input_vgprs += 1;
6420                 if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr))
6421                         shader->info.num_input_vgprs += 1;
6422                 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr))
6423                         shader->info.num_input_vgprs += 1;
6424                 if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) {
6425                         shader->info.face_vgpr_index = shader->info.num_input_vgprs;
6426                         shader->info.num_input_vgprs += 1;
6427                 }
6428                 if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr))
6429                         shader->info.num_input_vgprs += 1;
6430                 if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr))
6431                         shader->info.num_input_vgprs += 1;
6432                 if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr))
6433                         shader->info.num_input_vgprs += 1;
6434         }
6435
6436         return 0;
6437 }
6438
6439 /**
6440  * Create, compile and return a shader part (prolog or epilog).
6441  *
6442  * \param sscreen       screen
6443  * \param list          list of shader parts of the same category
6444  * \param type          shader type
6445  * \param key           shader part key
6446  * \param prolog        whether the part being requested is a prolog
6447  * \param tm            LLVM target machine
6448  * \param debug         debug callback
6449  * \param build         the callback responsible for building the main function
6450  * \return              non-NULL on success
6451  */
6452 static struct si_shader_part *
6453 si_get_shader_part(struct si_screen *sscreen,
6454                    struct si_shader_part **list,
6455                    enum pipe_shader_type type,
6456                    bool prolog,
6457                    union si_shader_part_key *key,
6458                    LLVMTargetMachineRef tm,
6459                    struct pipe_debug_callback *debug,
6460                    void (*build)(struct si_shader_context *,
6461                                  union si_shader_part_key *),
6462                    const char *name)
6463 {
6464         struct si_shader_part *result;
6465
6466         mtx_lock(&sscreen->shader_parts_mutex);
6467
6468         /* Find existing. */
6469         for (result = *list; result; result = result->next) {
6470                 if (memcmp(&result->key, key, sizeof(*key)) == 0) {
6471                         mtx_unlock(&sscreen->shader_parts_mutex);
6472                         return result;
6473                 }
6474         }
6475
6476         /* Compile a new one. */
6477         result = CALLOC_STRUCT(si_shader_part);
6478         result->key = *key;
6479
6480         struct si_shader shader = {};
6481         struct si_shader_context ctx;
6482         struct gallivm_state *gallivm = &ctx.gallivm;
6483
6484         si_init_shader_ctx(&ctx, sscreen, tm);
6485         ctx.shader = &shader;
6486         ctx.type = type;
6487
6488         switch (type) {
6489         case PIPE_SHADER_VERTEX:
6490                 break;
6491         case PIPE_SHADER_TESS_CTRL:
6492                 assert(!prolog);
6493                 shader.key.part.tcs.epilog = key->tcs_epilog.states;
6494                 break;
6495         case PIPE_SHADER_GEOMETRY:
6496                 assert(prolog);
6497                 break;
6498         case PIPE_SHADER_FRAGMENT:
6499                 if (prolog)
6500                         shader.key.part.ps.prolog = key->ps_prolog.states;
6501                 else
6502                         shader.key.part.ps.epilog = key->ps_epilog.states;
6503                 break;
6504         default:
6505                 unreachable("bad shader part");
6506         }
6507
6508         build(&ctx, key);
6509
6510         /* Compile. */
6511         si_llvm_optimize_module(&ctx);
6512
6513         if (si_compile_llvm(sscreen, &result->binary, &result->config, tm,
6514                             gallivm->module, debug, ctx.type, name)) {
6515                 FREE(result);
6516                 result = NULL;
6517                 goto out;
6518         }
6519
6520         result->next = *list;
6521         *list = result;
6522
6523 out:
6524         si_llvm_dispose(&ctx);
6525         mtx_unlock(&sscreen->shader_parts_mutex);
6526         return result;
6527 }
6528
6529 static LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx)
6530 {
6531         struct gallivm_state *gallivm = &ctx->gallivm;
6532         LLVMValueRef ptr[2], list;
6533
6534         /* Get the pointer to rw buffers. */
6535         ptr[0] = LLVMGetParam(ctx->main_fn, SI_SGPR_RW_BUFFERS);
6536         ptr[1] = LLVMGetParam(ctx->main_fn, SI_SGPR_RW_BUFFERS_HI);
6537         list = lp_build_gather_values(gallivm, ptr, 2);
6538         list = LLVMBuildBitCast(gallivm->builder, list, ctx->i64, "");
6539         list = LLVMBuildIntToPtr(gallivm->builder, list,
6540                                  si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS), "");
6541         return list;
6542 }
6543
6544 /**
6545  * Build the vertex shader prolog function.
6546  *
6547  * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
6548  * All inputs are returned unmodified. The vertex load indices are
6549  * stored after them, which will be used by the API VS for fetching inputs.
6550  *
6551  * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
6552  *   input_v0,
6553  *   input_v1,
6554  *   input_v2,
6555  *   input_v3,
6556  *   (VertexID + BaseVertex),
6557  *   (InstanceID + StartInstance),
6558  *   (InstanceID / 2 + StartInstance)
6559  */
6560 static void si_build_vs_prolog_function(struct si_shader_context *ctx,
6561                                         union si_shader_part_key *key)
6562 {
6563         struct gallivm_state *gallivm = &ctx->gallivm;
6564         LLVMTypeRef *params, *returns;
6565         LLVMValueRef ret, func;
6566         int last_sgpr, num_params, num_returns, i;
6567         unsigned first_vs_vgpr = key->vs_prolog.num_input_sgprs +
6568                                  key->vs_prolog.num_merged_next_stage_vgprs;
6569         unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4;
6570         unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs +
6571                                       num_input_vgprs;
6572         unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;
6573
6574         ctx->param_vertex_id = first_vs_vgpr;
6575         ctx->param_instance_id = first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1);
6576
6577         /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
6578         params = alloca(num_all_input_regs * sizeof(LLVMTypeRef));
6579         returns = alloca((num_all_input_regs + key->vs_prolog.last_input + 1) *
6580                          sizeof(LLVMTypeRef));
6581         num_params = 0;
6582         num_returns = 0;
6583
6584         /* Declare input and output SGPRs. */
6585         num_params = 0;
6586         for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
6587                 params[num_params++] = ctx->i32;
6588                 returns[num_returns++] = ctx->i32;
6589         }
6590         last_sgpr = num_params - 1;
6591
6592         /* Preloaded VGPRs (outputs must be floats) */
6593         for (i = 0; i < num_input_vgprs; i++) {
6594                 params[num_params++] = ctx->i32;
6595                 returns[num_returns++] = ctx->f32;
6596         }
6597
6598         /* Vertex load indices. */
6599         for (i = 0; i <= key->vs_prolog.last_input; i++)
6600                 returns[num_returns++] = ctx->f32;
6601
6602         /* Create the function. */
6603         si_create_function(ctx, "vs_prolog", returns, num_returns, params,
6604                            num_params, last_sgpr, 0);
6605         func = ctx->main_fn;
6606
6607         if (key->vs_prolog.num_merged_next_stage_vgprs &&
6608             !key->vs_prolog.is_monolithic)
6609                 si_init_exec_from_input(ctx, 3, 0);
6610
6611         /* Copy inputs to outputs. This should be no-op, as the registers match,
6612          * but it will prevent the compiler from overwriting them unintentionally.
6613          */
6614         ret = ctx->return_value;
6615         for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
6616                 LLVMValueRef p = LLVMGetParam(func, i);
6617                 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
6618         }
6619         for (; i < num_params; i++) {
6620                 LLVMValueRef p = LLVMGetParam(func, i);
6621                 p = LLVMBuildBitCast(gallivm->builder, p, ctx->f32, "");
6622                 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
6623         }
6624
6625         /* Compute vertex load indices from instance divisors. */
6626         LLVMValueRef instance_divisor_constbuf = NULL;
6627
6628         if (key->vs_prolog.states.instance_divisor_is_fetched) {
6629                 LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
6630                 LLVMValueRef buf_index =
6631                         LLVMConstInt(ctx->i32, SI_VS_CONST_INSTANCE_DIVISORS, 0);
6632                 instance_divisor_constbuf =
6633                         ac_build_indexed_load_const(&ctx->ac, list, buf_index);
6634         }
6635
6636         for (i = 0; i <= key->vs_prolog.last_input; i++) {
6637                 bool divisor_is_one =
6638                         key->vs_prolog.states.instance_divisor_is_one & (1u << i);
6639                 bool divisor_is_fetched =
6640                         key->vs_prolog.states.instance_divisor_is_fetched & (1u << i);
6641                 LLVMValueRef index;
6642
6643                 if (divisor_is_one || divisor_is_fetched) {
6644                         LLVMValueRef divisor = ctx->i32_1;
6645
6646                         if (divisor_is_fetched) {
6647                                 divisor = buffer_load_const(ctx, instance_divisor_constbuf,
6648                                                             LLVMConstInt(ctx->i32, i * 4, 0));
6649                                 divisor = LLVMBuildBitCast(gallivm->builder, divisor,
6650                                                            ctx->i32, "");
6651                         }
6652
6653                         /* InstanceID / Divisor + StartInstance */
6654                         index = get_instance_index_for_fetch(ctx,
6655                                                              user_sgpr_base +
6656                                                              SI_SGPR_START_INSTANCE,
6657                                                              divisor);
6658                 } else {
6659                         /* VertexID + BaseVertex */
6660                         index = LLVMBuildAdd(gallivm->builder,
6661                                              LLVMGetParam(func, ctx->param_vertex_id),
6662                                              LLVMGetParam(func, user_sgpr_base +
6663                                                                 SI_SGPR_BASE_VERTEX), "");
6664                 }
6665
6666                 index = LLVMBuildBitCast(gallivm->builder, index, ctx->f32, "");
6667                 ret = LLVMBuildInsertValue(gallivm->builder, ret, index,
6668                                            num_params++, "");
6669         }
6670
6671         si_llvm_build_ret(ctx, ret);
6672 }
6673
6674 static bool si_get_vs_prolog(struct si_screen *sscreen,
6675                              LLVMTargetMachineRef tm,
6676                              struct si_shader *shader,
6677                              struct pipe_debug_callback *debug,
6678                              struct si_shader *main_part,
6679                              const struct si_vs_prolog_bits *key)
6680 {
6681         struct si_shader_selector *vs = main_part->selector;
6682
6683         /* The prolog is a no-op if there are no inputs. */
6684         if (!vs->vs_needs_prolog)
6685                 return true;
6686
6687         /* Get the prolog. */
6688         union si_shader_part_key prolog_key;
6689         si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs,
6690                              key, shader, &prolog_key);
6691
6692         shader->prolog =
6693                 si_get_shader_part(sscreen, &sscreen->vs_prologs,
6694                                    PIPE_SHADER_VERTEX, true, &prolog_key, tm,
6695                                    debug, si_build_vs_prolog_function,
6696                                    "Vertex Shader Prolog");
6697         return shader->prolog != NULL;
6698 }
6699
6700 /**
6701  * Select and compile (or reuse) vertex shader parts (prolog & epilog).
6702  */
6703 static bool si_shader_select_vs_parts(struct si_screen *sscreen,
6704                                       LLVMTargetMachineRef tm,
6705                                       struct si_shader *shader,
6706                                       struct pipe_debug_callback *debug)
6707 {
6708         return si_get_vs_prolog(sscreen, tm, shader, debug, shader,
6709                                 &shader->key.part.vs.prolog);
6710 }
6711
6712 /**
6713  * Compile the TCS epilog function. This writes tesselation factors to memory
6714  * based on the output primitive type of the tesselator (determined by TES).
6715  */
6716 static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
6717                                          union si_shader_part_key *key)
6718 {
6719         struct gallivm_state *gallivm = &ctx->gallivm;
6720         struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
6721         LLVMTypeRef params[32];
6722         LLVMValueRef func;
6723         int last_sgpr, num_params = 0;
6724
6725         if (ctx->screen->b.chip_class >= GFX9) {
6726                 params[num_params++] = ctx->i64;
6727                 params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
6728                 params[num_params++] = ctx->i32; /* wave info */
6729                 params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
6730                 params[num_params++] = ctx->i32;
6731                 params[num_params++] = ctx->i32;
6732                 params[num_params++] = ctx->i32;
6733                 params[num_params++] = ctx->i64;
6734                 params[num_params++] = ctx->i64;
6735                 params[num_params++] = ctx->i64;
6736                 params[num_params++] = ctx->i64;
6737                 params[num_params++] = ctx->i32;
6738                 params[num_params++] = ctx->i32;
6739                 params[num_params++] = ctx->i32;
6740                 params[num_params++] = ctx->i32;
6741                 params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
6742                 params[num_params++] = ctx->i32;
6743                 params[num_params++] = ctx->i32;
6744                 params[ctx->param_tcs_offchip_addr_base64k = num_params++] = ctx->i32;
6745                 params[ctx->param_tcs_factor_addr_base64k = num_params++] = ctx->i32;
6746         } else {
6747                 params[num_params++] = ctx->i64;
6748                 params[num_params++] = ctx->i64;
6749                 params[num_params++] = ctx->i64;
6750                 params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
6751                 params[num_params++] = ctx->i32;
6752                 params[num_params++] = ctx->i32;
6753                 params[num_params++] = ctx->i32;
6754                 params[ctx->param_tcs_offchip_addr_base64k = num_params++] = ctx->i32;
6755                 params[ctx->param_tcs_factor_addr_base64k = num_params++] = ctx->i32;
6756                 params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
6757                 params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
6758         }
6759         last_sgpr = num_params - 1;
6760
6761         params[num_params++] = ctx->i32; /* patch index within the wave (REL_PATCH_ID) */
6762         params[num_params++] = ctx->i32; /* invocation ID within the patch */
6763         params[num_params++] = ctx->i32; /* LDS offset where tess factors should be loaded from */
6764
6765         /* Create the function. */
6766         si_create_function(ctx, "tcs_epilog", NULL, 0, params, num_params, last_sgpr,
6767                            ctx->screen->b.chip_class >= CIK ? 128 : 64);
6768         declare_lds_as_pointer(ctx);
6769         func = ctx->main_fn;
6770
6771         si_write_tess_factors(bld_base,
6772                               LLVMGetParam(func, last_sgpr + 1),
6773                               LLVMGetParam(func, last_sgpr + 2),
6774                               LLVMGetParam(func, last_sgpr + 3));
6775
6776         LLVMBuildRetVoid(gallivm->builder);
6777 }
6778
6779 /**
6780  * Select and compile (or reuse) TCS parts (epilog).
6781  */
6782 static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
6783                                        LLVMTargetMachineRef tm,
6784                                        struct si_shader *shader,
6785                                        struct pipe_debug_callback *debug)
6786 {
6787         if (sscreen->b.chip_class >= GFX9) {
6788                 struct si_shader *ls_main_part =
6789                         shader->key.part.tcs.ls->main_shader_part_ls;
6790
6791                 if (!si_get_vs_prolog(sscreen, tm, shader, debug, ls_main_part,
6792                                       &shader->key.part.tcs.ls_prolog))
6793                         return false;
6794
6795                 shader->previous_stage = ls_main_part;
6796         }
6797
6798         /* Get the epilog. */
6799         union si_shader_part_key epilog_key;
6800         memset(&epilog_key, 0, sizeof(epilog_key));
6801         epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
6802
6803         shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
6804                                             PIPE_SHADER_TESS_CTRL, false,
6805                                             &epilog_key, tm, debug,
6806                                             si_build_tcs_epilog_function,
6807                                             "Tessellation Control Shader Epilog");
6808         return shader->epilog != NULL;
6809 }
6810
6811 /**
6812  * Select and compile (or reuse) GS parts (prolog).
6813  */
6814 static bool si_shader_select_gs_parts(struct si_screen *sscreen,
6815                                       LLVMTargetMachineRef tm,
6816                                       struct si_shader *shader,
6817                                       struct pipe_debug_callback *debug)
6818 {
6819         if (sscreen->b.chip_class >= GFX9) {
6820                 struct si_shader *es_main_part =
6821                         shader->key.part.gs.es->main_shader_part_es;
6822
6823                 if (shader->key.part.gs.es->type == PIPE_SHADER_VERTEX &&
6824                     !si_get_vs_prolog(sscreen, tm, shader, debug, es_main_part,
6825                                       &shader->key.part.gs.vs_prolog))
6826                         return false;
6827
6828                 shader->previous_stage = es_main_part;
6829         }
6830
6831         if (!shader->key.part.gs.prolog.tri_strip_adj_fix)
6832                 return true;
6833
6834         union si_shader_part_key prolog_key;
6835         memset(&prolog_key, 0, sizeof(prolog_key));
6836         prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
6837
6838         shader->prolog2 = si_get_shader_part(sscreen, &sscreen->gs_prologs,
6839                                             PIPE_SHADER_GEOMETRY, true,
6840                                             &prolog_key, tm, debug,
6841                                             si_build_gs_prolog_function,
6842                                             "Geometry Shader Prolog");
6843         return shader->prolog2 != NULL;
6844 }
6845
6846 /**
6847  * Build the pixel shader prolog function. This handles:
6848  * - two-side color selection and interpolation
6849  * - overriding interpolation parameters for the API PS
6850  * - polygon stippling
6851  *
6852  * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
6853  * overriden by other states. (e.g. per-sample interpolation)
6854  * Interpolated colors are stored after the preloaded VGPRs.
6855  */
6856 static void si_build_ps_prolog_function(struct si_shader_context *ctx,
6857                                         union si_shader_part_key *key)
6858 {
6859         struct gallivm_state *gallivm = &ctx->gallivm;
6860         LLVMTypeRef *params;
6861         LLVMValueRef ret, func;
6862         int last_sgpr, num_params, num_returns, i, num_color_channels;
6863
6864         assert(si_need_ps_prolog(key));
6865
6866         /* Number of inputs + 8 color elements. */
6867         params = alloca((key->ps_prolog.num_input_sgprs +
6868                          key->ps_prolog.num_input_vgprs + 8) *
6869                         sizeof(LLVMTypeRef));
6870
6871         /* Declare inputs. */
6872         num_params = 0;
6873         for (i = 0; i < key->ps_prolog.num_input_sgprs; i++)
6874                 params[num_params++] = ctx->i32;
6875         last_sgpr = num_params - 1;
6876
6877         for (i = 0; i < key->ps_prolog.num_input_vgprs; i++)
6878                 params[num_params++] = ctx->f32;
6879
6880         /* Declare outputs (same as inputs + add colors if needed) */
6881         num_returns = num_params;
6882         num_color_channels = util_bitcount(key->ps_prolog.colors_read);
6883         for (i = 0; i < num_color_channels; i++)
6884                 params[num_returns++] = ctx->f32;
6885
6886         /* Create the function. */
6887         si_create_function(ctx, "ps_prolog", params, num_returns, params,
6888                            num_params, last_sgpr, 0);
6889         func = ctx->main_fn;
6890
6891         /* Copy inputs to outputs. This should be no-op, as the registers match,
6892          * but it will prevent the compiler from overwriting them unintentionally.
6893          */
6894         ret = ctx->return_value;
6895         for (i = 0; i < num_params; i++) {
6896                 LLVMValueRef p = LLVMGetParam(func, i);
6897                 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
6898         }
6899
6900         /* Polygon stippling. */
6901         if (key->ps_prolog.states.poly_stipple) {
6902                 /* POS_FIXED_PT is always last. */
6903                 unsigned pos = key->ps_prolog.num_input_sgprs +
6904                                key->ps_prolog.num_input_vgprs - 1;
6905                 LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
6906
6907                 si_llvm_emit_polygon_stipple(ctx, list, pos);
6908         }
6909
6910         if (key->ps_prolog.states.bc_optimize_for_persp ||
6911             key->ps_prolog.states.bc_optimize_for_linear) {
6912                 unsigned i, base = key->ps_prolog.num_input_sgprs;
6913                 LLVMValueRef center[2], centroid[2], tmp, bc_optimize;
6914
6915                 /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
6916                  * The hw doesn't compute CENTROID if the whole wave only
6917                  * contains fully-covered quads.
6918                  *
6919                  * PRIM_MASK is after user SGPRs.
6920                  */
6921                 bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
6922                 bc_optimize = LLVMBuildLShr(gallivm->builder, bc_optimize,
6923                                             LLVMConstInt(ctx->i32, 31, 0), "");
6924                 bc_optimize = LLVMBuildTrunc(gallivm->builder, bc_optimize,
6925                                              ctx->i1, "");
6926
6927                 if (key->ps_prolog.states.bc_optimize_for_persp) {
6928                         /* Read PERSP_CENTER. */
6929                         for (i = 0; i < 2; i++)
6930                                 center[i] = LLVMGetParam(func, base + 2 + i);
6931                         /* Read PERSP_CENTROID. */
6932                         for (i = 0; i < 2; i++)
6933                                 centroid[i] = LLVMGetParam(func, base + 4 + i);
6934                         /* Select PERSP_CENTROID. */
6935                         for (i = 0; i < 2; i++) {
6936                                 tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
6937                                                       center[i], centroid[i], "");
6938                                 ret = LLVMBuildInsertValue(gallivm->builder, ret,
6939                                                            tmp, base + 4 + i, "");
6940                         }
6941                 }
6942                 if (key->ps_prolog.states.bc_optimize_for_linear) {
6943                         /* Read LINEAR_CENTER. */
6944                         for (i = 0; i < 2; i++)
6945                                 center[i] = LLVMGetParam(func, base + 8 + i);
6946                         /* Read LINEAR_CENTROID. */
6947                         for (i = 0; i < 2; i++)
6948                                 centroid[i] = LLVMGetParam(func, base + 10 + i);
6949                         /* Select LINEAR_CENTROID. */
6950                         for (i = 0; i < 2; i++) {
6951                                 tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
6952                                                       center[i], centroid[i], "");
6953                                 ret = LLVMBuildInsertValue(gallivm->builder, ret,
6954                                                            tmp, base + 10 + i, "");
6955                         }
6956                 }
6957         }
6958
6959         /* Force per-sample interpolation. */
6960         if (key->ps_prolog.states.force_persp_sample_interp) {
6961                 unsigned i, base = key->ps_prolog.num_input_sgprs;
6962                 LLVMValueRef persp_sample[2];
6963
6964                 /* Read PERSP_SAMPLE. */
6965                 for (i = 0; i < 2; i++)
6966                         persp_sample[i] = LLVMGetParam(func, base + i);
6967                 /* Overwrite PERSP_CENTER. */
6968                 for (i = 0; i < 2; i++)
6969                         ret = LLVMBuildInsertValue(gallivm->builder, ret,
6970                                                    persp_sample[i], base + 2 + i, "");
6971                 /* Overwrite PERSP_CENTROID. */
6972                 for (i = 0; i < 2; i++)
6973                         ret = LLVMBuildInsertValue(gallivm->builder, ret,
6974                                                    persp_sample[i], base + 4 + i, "");
6975         }
6976         if (key->ps_prolog.states.force_linear_sample_interp) {
6977                 unsigned i, base = key->ps_prolog.num_input_sgprs;
6978                 LLVMValueRef linear_sample[2];
6979
6980                 /* Read LINEAR_SAMPLE. */
6981                 for (i = 0; i < 2; i++)
6982                         linear_sample[i] = LLVMGetParam(func, base + 6 + i);
6983                 /* Overwrite LINEAR_CENTER. */
6984                 for (i = 0; i < 2; i++)
6985                         ret = LLVMBuildInsertValue(gallivm->builder, ret,
6986                                                    linear_sample[i], base + 8 + i, "");
6987                 /* Overwrite LINEAR_CENTROID. */
6988                 for (i = 0; i < 2; i++)
6989                         ret = LLVMBuildInsertValue(gallivm->builder, ret,
6990                                                    linear_sample[i], base + 10 + i, "");
6991         }
6992
6993         /* Force center interpolation. */
6994         if (key->ps_prolog.states.force_persp_center_interp) {
6995                 unsigned i, base = key->ps_prolog.num_input_sgprs;
6996                 LLVMValueRef persp_center[2];
6997
6998                 /* Read PERSP_CENTER. */
6999                 for (i = 0; i < 2; i++)
7000                         persp_center[i] = LLVMGetParam(func, base + 2 + i);
7001                 /* Overwrite PERSP_SAMPLE. */
7002                 for (i = 0; i < 2; i++)
7003                         ret = LLVMBuildInsertValue(gallivm->builder, ret,
7004                                                    persp_center[i], base + i, "");
7005                 /* Overwrite PERSP_CENTROID. */
7006                 for (i = 0; i < 2; i++)
7007                         ret = LLVMBuildInsertValue(gallivm->builder, ret,
7008                                                    persp_center[i], base + 4 + i, "");
7009         }
7010         if (key->ps_prolog.states.force_linear_center_interp) {
7011                 unsigned i, base = key->ps_prolog.num_input_sgprs;
7012                 LLVMValueRef linear_center[2];
7013
7014                 /* Read LINEAR_CENTER. */
7015                 for (i = 0; i < 2; i++)
7016                         linear_center[i] = LLVMGetParam(func, base + 8 + i);
7017                 /* Overwrite LINEAR_SAMPLE. */
7018                 for (i = 0; i < 2; i++)
7019                         ret = LLVMBuildInsertValue(gallivm->builder, ret,
7020                                                    linear_center[i], base + 6 + i, "");
7021                 /* Overwrite LINEAR_CENTROID. */
7022                 for (i = 0; i < 2; i++)
7023                         ret = LLVMBuildInsertValue(gallivm->builder, ret,
7024                                                    linear_center[i], base + 10 + i, "");
7025         }
7026
7027         /* Interpolate colors. */
7028         for (i = 0; i < 2; i++) {
7029                 unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
7030                 unsigned face_vgpr = key->ps_prolog.num_input_sgprs +
7031                                      key->ps_prolog.face_vgpr_index;
7032                 LLVMValueRef interp[2], color[4];
7033                 LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL;
7034
7035                 if (!writemask)
7036                         continue;
7037
7038                 /* If the interpolation qualifier is not CONSTANT (-1). */
7039                 if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
7040                         unsigned interp_vgpr = key->ps_prolog.num_input_sgprs +
7041                                                key->ps_prolog.color_interp_vgpr_index[i];
7042
7043                         /* Get the (i,j) updated by bc_optimize handling. */
7044                         interp[0] = LLVMBuildExtractValue(gallivm->builder, ret,
7045                                                           interp_vgpr, "");
7046                         interp[1] = LLVMBuildExtractValue(gallivm->builder, ret,
7047                                                           interp_vgpr + 1, "");
7048                         interp_ij = lp_build_gather_values(gallivm, interp, 2);
7049                 }
7050
7051                 /* Use the absolute location of the input. */
7052                 prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
7053
7054                 if (key->ps_prolog.states.color_two_side) {
7055                         face = LLVMGetParam(func, face_vgpr);
7056                         face = LLVMBuildBitCast(gallivm->builder, face, ctx->i32, "");
7057                 }
7058
7059                 interp_fs_input(ctx,
7060                                 key->ps_prolog.color_attr_index[i],
7061                                 TGSI_SEMANTIC_COLOR, i,
7062                                 key->ps_prolog.num_interp_inputs,
7063                                 key->ps_prolog.colors_read, interp_ij,
7064                                 prim_mask, face, color);
7065
7066                 while (writemask) {
7067                         unsigned chan = u_bit_scan(&writemask);
7068                         ret = LLVMBuildInsertValue(gallivm->builder, ret, color[chan],
7069                                                    num_params++, "");
7070                 }
7071         }
7072
7073         /* Tell LLVM to insert WQM instruction sequence when needed. */
7074         if (key->ps_prolog.wqm) {
7075                 LLVMAddTargetDependentFunctionAttr(func,
7076                                                    "amdgpu-ps-wqm-outputs", "");
7077         }
7078
7079         si_llvm_build_ret(ctx, ret);
7080 }
7081
7082 /**
7083  * Build the pixel shader epilog function. This handles everything that must be
7084  * emulated for pixel shader exports. (alpha-test, format conversions, etc)
7085  */
7086 static void si_build_ps_epilog_function(struct si_shader_context *ctx,
7087                                         union si_shader_part_key *key)
7088 {
7089         struct gallivm_state *gallivm = &ctx->gallivm;
7090         struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
7091         LLVMTypeRef params[16+8*4+3];
7092         LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
7093         int last_sgpr, num_params = 0, i;
7094         struct si_ps_exports exp = {};
7095
7096         /* Declare input SGPRs. */
7097         params[ctx->param_rw_buffers = num_params++] = ctx->i64;
7098         params[ctx->param_const_and_shader_buffers = num_params++] = ctx->i64;
7099         params[ctx->param_samplers_and_images = num_params++] = ctx->i64;
7100         assert(num_params == SI_PARAM_ALPHA_REF);
7101         params[SI_PARAM_ALPHA_REF] = ctx->f32;
7102         last_sgpr = SI_PARAM_ALPHA_REF;
7103
7104         /* Declare input VGPRs. */
7105         num_params = (last_sgpr + 1) +
7106                      util_bitcount(key->ps_epilog.colors_written) * 4 +
7107                      key->ps_epilog.writes_z +
7108                      key->ps_epilog.writes_stencil +
7109                      key->ps_epilog.writes_samplemask;
7110
7111         num_params = MAX2(num_params,
7112                           last_sgpr + 1 + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
7113
7114         assert(num_params <= ARRAY_SIZE(params));
7115
7116         for (i = last_sgpr + 1; i < num_params; i++)
7117                 params[i] = ctx->f32;
7118
7119         /* Create the function. */
7120         si_create_function(ctx, "ps_epilog", NULL, 0, params, num_params,
7121                            last_sgpr, 0);
7122         /* Disable elimination of unused inputs. */
7123         si_llvm_add_attribute(ctx->main_fn,
7124                                   "InitialPSInputAddr", 0xffffff);
7125
7126         /* Process colors. */
7127         unsigned vgpr = last_sgpr + 1;
7128         unsigned colors_written = key->ps_epilog.colors_written;
7129         int last_color_export = -1;
7130
7131         /* Find the last color export. */
7132         if (!key->ps_epilog.writes_z &&
7133             !key->ps_epilog.writes_stencil &&
7134             !key->ps_epilog.writes_samplemask) {
7135                 unsigned spi_format = key->ps_epilog.states.spi_shader_col_format;
7136
7137                 /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
7138                 if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
7139                         /* Just set this if any of the colorbuffers are enabled. */
7140                         if (spi_format &
7141                             ((1ull << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
7142                                 last_color_export = 0;
7143                 } else {
7144                         for (i = 0; i < 8; i++)
7145                                 if (colors_written & (1 << i) &&
7146                                     (spi_format >> (i * 4)) & 0xf)
7147                                         last_color_export = i;
7148                 }
7149         }
7150
7151         while (colors_written) {
7152                 LLVMValueRef color[4];
7153                 int mrt = u_bit_scan(&colors_written);
7154
7155                 for (i = 0; i < 4; i++)
7156                         color[i] = LLVMGetParam(ctx->main_fn, vgpr++);
7157
7158                 si_export_mrt_color(bld_base, color, mrt,
7159                                     num_params - 1,
7160                                     mrt == last_color_export, &exp);
7161         }
7162
7163         /* Process depth, stencil, samplemask. */
7164         if (key->ps_epilog.writes_z)
7165                 depth = LLVMGetParam(ctx->main_fn, vgpr++);
7166         if (key->ps_epilog.writes_stencil)
7167                 stencil = LLVMGetParam(ctx->main_fn, vgpr++);
7168         if (key->ps_epilog.writes_samplemask)
7169                 samplemask = LLVMGetParam(ctx->main_fn, vgpr++);
7170
7171         if (depth || stencil || samplemask)
7172                 si_export_mrt_z(bld_base, depth, stencil, samplemask, &exp);
7173         else if (last_color_export == -1)
7174                 si_export_null(bld_base);
7175
7176         if (exp.num)
7177                 si_emit_ps_exports(ctx, &exp);
7178
7179         /* Compile. */
7180         LLVMBuildRetVoid(gallivm->builder);
7181 }
7182
7183 /**
7184  * Select and compile (or reuse) pixel shader parts (prolog & epilog).
7185  */
7186 static bool si_shader_select_ps_parts(struct si_screen *sscreen,
7187                                       LLVMTargetMachineRef tm,
7188                                       struct si_shader *shader,
7189                                       struct pipe_debug_callback *debug)
7190 {
7191         union si_shader_part_key prolog_key;
7192         union si_shader_part_key epilog_key;
7193
7194         /* Get the prolog. */
7195         si_get_ps_prolog_key(shader, &prolog_key, true);
7196
7197         /* The prolog is a no-op if these aren't set. */
7198         if (si_need_ps_prolog(&prolog_key)) {
7199                 shader->prolog =
7200                         si_get_shader_part(sscreen, &sscreen->ps_prologs,
7201                                            PIPE_SHADER_FRAGMENT, true,
7202                                            &prolog_key, tm, debug,
7203                                            si_build_ps_prolog_function,
7204                                            "Fragment Shader Prolog");
7205                 if (!shader->prolog)
7206                         return false;
7207         }
7208
7209         /* Get the epilog. */
7210         si_get_ps_epilog_key(shader, &epilog_key);
7211
7212         shader->epilog =
7213                 si_get_shader_part(sscreen, &sscreen->ps_epilogs,
7214                                    PIPE_SHADER_FRAGMENT, false,
7215                                    &epilog_key, tm, debug,
7216                                    si_build_ps_epilog_function,
7217                                    "Fragment Shader Epilog");
7218         if (!shader->epilog)
7219                 return false;
7220
7221         /* Enable POS_FIXED_PT if polygon stippling is enabled. */
7222         if (shader->key.part.ps.prolog.poly_stipple) {
7223                 shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
7224                 assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
7225         }
7226
7227         /* Set up the enable bits for per-sample shading if needed. */
7228         if (shader->key.part.ps.prolog.force_persp_sample_interp &&
7229             (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
7230              G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7231                 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
7232                 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
7233                 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
7234         }
7235         if (shader->key.part.ps.prolog.force_linear_sample_interp &&
7236             (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
7237              G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7238                 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
7239                 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
7240                 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
7241         }
7242         if (shader->key.part.ps.prolog.force_persp_center_interp &&
7243             (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
7244              G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7245                 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
7246                 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
7247                 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
7248         }
7249         if (shader->key.part.ps.prolog.force_linear_center_interp &&
7250             (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
7251              G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7252                 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
7253                 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
7254                 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
7255         }
7256
7257         /* POW_W_FLOAT requires that one of the perspective weights is enabled. */
7258         if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
7259             !(shader->config.spi_ps_input_ena & 0xf)) {
7260                 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
7261                 assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
7262         }
7263
7264         /* At least one pair of interpolation weights must be enabled. */
7265         if (!(shader->config.spi_ps_input_ena & 0x7f)) {
7266                 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
7267                 assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
7268         }
7269
7270         /* The sample mask input is always enabled, because the API shader always
7271          * passes it through to the epilog. Disable it here if it's unused.
7272          */
7273         if (!shader->key.part.ps.epilog.poly_line_smoothing &&
7274             !shader->selector->info.reads_samplemask)
7275                 shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;
7276
7277         return true;
7278 }
7279
7280 void si_multiwave_lds_size_workaround(struct si_screen *sscreen,
7281                                       unsigned *lds_size)
7282 {
7283         /* SPI barrier management bug:
7284          *   Make sure we have at least 4k of LDS in use to avoid the bug.
7285          *   It applies to workgroup sizes of more than one wavefront.
7286          */
7287         if (sscreen->b.family == CHIP_BONAIRE ||
7288             sscreen->b.family == CHIP_KABINI ||
7289             sscreen->b.family == CHIP_MULLINS)
7290                 *lds_size = MAX2(*lds_size, 8);
7291 }
7292
7293 static void si_fix_resource_usage(struct si_screen *sscreen,
7294                                   struct si_shader *shader)
7295 {
7296         unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
7297
7298         shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
7299
7300         if (shader->selector->type == PIPE_SHADER_COMPUTE &&
7301             si_get_max_workgroup_size(shader) > 64) {
7302                 si_multiwave_lds_size_workaround(sscreen,
7303                                                  &shader->config.lds_size);
7304         }
7305 }
7306
7307 int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
7308                      struct si_shader *shader,
7309                      struct pipe_debug_callback *debug)
7310 {
7311         struct si_shader_selector *sel = shader->selector;
7312         struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key);
7313         int r;
7314
7315         /* LS, ES, VS are compiled on demand if the main part hasn't been
7316          * compiled for that stage.
7317          *
7318          * Vertex shaders are compiled on demand when a vertex fetch
7319          * workaround must be applied.
7320          */
7321         if (shader->is_monolithic) {
7322                 /* Monolithic shader (compiled as a whole, has many variants,
7323                  * may take a long time to compile).
7324                  */
7325                 r = si_compile_tgsi_shader(sscreen, tm, shader, true, debug);
7326                 if (r)
7327                         return r;
7328         } else {
7329                 /* The shader consists of 2-3 parts:
7330                  *
7331                  * - the middle part is the user shader, it has 1 variant only
7332                  *   and it was compiled during the creation of the shader
7333                  *   selector
7334                  * - the prolog part is inserted at the beginning
7335                  * - the epilog part is inserted at the end
7336                  *
7337                  * The prolog and epilog have many (but simple) variants.
7338                  */
7339
7340                 /* Copy the compiled TGSI shader data over. */
7341                 shader->is_binary_shared = true;
7342                 shader->binary = mainp->binary;
7343                 shader->config = mainp->config;
7344                 shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
7345                 shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
7346                 shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
7347                 memcpy(shader->info.vs_output_param_offset,
7348                        mainp->info.vs_output_param_offset,
7349                        sizeof(mainp->info.vs_output_param_offset));
7350                 shader->info.uses_instanceid = mainp->info.uses_instanceid;
7351                 shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
7352                 shader->info.nr_param_exports = mainp->info.nr_param_exports;
7353
7354                 /* Select prologs and/or epilogs. */
7355                 switch (sel->type) {
7356                 case PIPE_SHADER_VERTEX:
7357                         if (!si_shader_select_vs_parts(sscreen, tm, shader, debug))
7358                                 return -1;
7359                         break;
7360                 case PIPE_SHADER_TESS_CTRL:
7361                         if (!si_shader_select_tcs_parts(sscreen, tm, shader, debug))
7362                                 return -1;
7363                         break;
7364                 case PIPE_SHADER_TESS_EVAL:
7365                         break;
7366                 case PIPE_SHADER_GEOMETRY:
7367                         if (!si_shader_select_gs_parts(sscreen, tm, shader, debug))
7368                                 return -1;
7369                         break;
7370                 case PIPE_SHADER_FRAGMENT:
7371                         if (!si_shader_select_ps_parts(sscreen, tm, shader, debug))
7372                                 return -1;
7373
7374                         /* Make sure we have at least as many VGPRs as there
7375                          * are allocated inputs.
7376                          */
7377                         shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
7378                                                         shader->info.num_input_vgprs);
7379                         break;
7380                 }
7381
7382                 /* Update SGPR and VGPR counts. */
7383                 if (shader->prolog) {
7384                         shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
7385                                                         shader->prolog->config.num_sgprs);
7386                         shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
7387                                                         shader->prolog->config.num_vgprs);
7388                 }
7389                 if (shader->previous_stage) {
7390                         shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
7391                                                         shader->previous_stage->config.num_sgprs);
7392                         shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
7393                                                         shader->previous_stage->config.num_vgprs);
7394                         shader->config.spilled_sgprs =
7395                                 MAX2(shader->config.spilled_sgprs,
7396                                      shader->previous_stage->config.spilled_sgprs);
7397                         shader->config.spilled_vgprs =
7398                                 MAX2(shader->config.spilled_vgprs,
7399                                      shader->previous_stage->config.spilled_vgprs);
7400                         shader->config.private_mem_vgprs =
7401                                 MAX2(shader->config.private_mem_vgprs,
7402                                      shader->previous_stage->config.private_mem_vgprs);
7403                         shader->config.scratch_bytes_per_wave =
7404                                 MAX2(shader->config.scratch_bytes_per_wave,
7405                                      shader->previous_stage->config.scratch_bytes_per_wave);
7406                         shader->info.uses_instanceid |=
7407                                 shader->previous_stage->info.uses_instanceid;
7408                 }
7409                 if (shader->prolog2) {
7410                         shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
7411                                                         shader->prolog2->config.num_sgprs);
7412                         shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
7413                                                         shader->prolog2->config.num_vgprs);
7414                 }
7415                 if (shader->epilog) {
7416                         shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
7417                                                         shader->epilog->config.num_sgprs);
7418                         shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
7419                                                         shader->epilog->config.num_vgprs);
7420                 }
7421         }
7422
7423         si_fix_resource_usage(sscreen, shader);
7424         si_shader_dump(sscreen, shader, debug, sel->info.processor,
7425                        stderr, true);
7426
7427         /* Upload. */
7428         r = si_shader_binary_upload(sscreen, shader);
7429         if (r) {
7430                 fprintf(stderr, "LLVM failed to upload shader\n");
7431                 return r;
7432         }
7433
7434         return 0;
7435 }
7436
7437 void si_shader_destroy(struct si_shader *shader)
7438 {
7439         if (shader->scratch_bo)
7440                 r600_resource_reference(&shader->scratch_bo, NULL);
7441
7442         r600_resource_reference(&shader->bo, NULL);
7443
7444         if (!shader->is_binary_shared)
7445                 radeon_shader_binary_clean(&shader->binary);
7446
7447         free(shader->shader_log);
7448 }