src/gallium/drivers/radeonsi/si_shader.c

   1 /*
   2  * Copyright 2012 Advanced Micro Devices, Inc.
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * on the rights to use, copy, modify, merge, publish, distribute, sub
   8  * license, and/or sell copies of the Software, and to permit persons to whom
   9  * the Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  18  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
  19  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  20  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  21  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  22  *
  23  * Authors:
  24  *      Tom Stellard <thomas.stellard@amd.com>
  25  *      Michel Dänzer <michel.daenzer@amd.com>
  26  *      Christian König <christian.koenig@amd.com>
  27  */
  28
  29 #include "gallivm/lp_bld_const.h"
  30 #include "gallivm/lp_bld_gather.h"
  31 #include "gallivm/lp_bld_intr.h"
  32 #include "gallivm/lp_bld_logic.h"
  33 #include "gallivm/lp_bld_arit.h"
  34 #include "gallivm/lp_bld_flow.h"
  35 #include "gallivm/lp_bld_misc.h"
  36 #include "util/u_memory.h"
  37 #include "util/u_string.h"
  38 #include "tgsi/tgsi_build.h"
  39 #include "tgsi/tgsi_util.h"
  40 #include "tgsi/tgsi_dump.h"
  41
  42 #include "ac_binary.h"
  43 #include "ac_llvm_util.h"
  44 #include "ac_exp_param.h"
  45 #include "si_shader_internal.h"
  46 #include "si_pipe.h"
  47 #include "sid.h"
  48
  49
  50 static const char *scratch_rsrc_dword0_symbol =
  51         "SCRATCH_RSRC_DWORD0";
  52
  53 static const char *scratch_rsrc_dword1_symbol =
  54         "SCRATCH_RSRC_DWORD1";
  55
  56 struct si_shader_output_values
  57 {
  58         LLVMValueRef values[4];
  59         unsigned semantic_name;
  60         unsigned semantic_index;
  61         ubyte vertex_stream[4];
  62 };
  63
  64 static void si_init_shader_ctx(struct si_shader_context *ctx,
  65                                struct si_screen *sscreen,
  66                                LLVMTargetMachineRef tm);
  67
  68 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
  69                                  struct lp_build_tgsi_context *bld_base,
  70                                  struct lp_build_emit_data *emit_data);
  71
  72 static void si_dump_shader_key(unsigned processor, struct si_shader *shader,
  73                                FILE *f);
  74
  75 static unsigned llvm_get_type_size(LLVMTypeRef type);
  76
  77 static void si_build_vs_prolog_function(struct si_shader_context *ctx,
  78                                         union si_shader_part_key *key);
  79 static void si_build_vs_epilog_function(struct si_shader_context *ctx,
  80                                         union si_shader_part_key *key);
  81 static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
  82                                          union si_shader_part_key *key);
  83 static void si_build_ps_prolog_function(struct si_shader_context *ctx,
  84                                         union si_shader_part_key *key);
  85 static void si_build_ps_epilog_function(struct si_shader_context *ctx,
  86                                         union si_shader_part_key *key);
  87
  88 /* Ideally pass the sample mask input to the PS epilog as v13, which
  89  * is its usual location, so that the shader doesn't have to add v_mov.
  90  */
  91 #define PS_EPILOG_SAMPLEMASK_MIN_LOC 13
  92
  93 /* The VS location of the PrimitiveID input is the same in the epilog,
  94  * so that the main shader part doesn't have to move it.
  95  */
  96 #define VS_EPILOG_PRIMID_LOC 2
  97
  98 enum {
  99         CONST_ADDR_SPACE = 2,
 100         LOCAL_ADDR_SPACE = 3,
 101 };
 102
 103 /**
 104  * Returns a unique index for a semantic name and index. The index must be
 105  * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
 106  * calculated.
 107  */
 108 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
 109 {
 110         switch (semantic_name) {
 111         case TGSI_SEMANTIC_POSITION:
 112                 return 0;
 113         case TGSI_SEMANTIC_PSIZE:
 114                 return 1;
 115         case TGSI_SEMANTIC_CLIPDIST:
 116                 assert(index <= 1);
 117                 return 2 + index;
 118         case TGSI_SEMANTIC_GENERIC:
 119                 if (index <= 63-4)
 120                         return 4 + index;
 121
 122                 assert(!"invalid generic index");
 123                 return 0;
 124
 125         /* patch indices are completely separate and thus start from 0 */
 126         case TGSI_SEMANTIC_TESSOUTER:
 127                 return 0;
 128         case TGSI_SEMANTIC_TESSINNER:
 129                 return 1;
 130         case TGSI_SEMANTIC_PATCH:
 131                 return 2 + index;
 132
 133         default:
 134                 assert(!"invalid semantic name");
 135                 return 0;
 136         }
 137 }
 138
 139 unsigned si_shader_io_get_unique_index2(unsigned name, unsigned index)
 140 {
 141         switch (name) {
 142         case TGSI_SEMANTIC_FOG:
 143                 return 0;
 144         case TGSI_SEMANTIC_LAYER:
 145                 return 1;
 146         case TGSI_SEMANTIC_VIEWPORT_INDEX:
 147                 return 2;
 148         case TGSI_SEMANTIC_PRIMID:
 149                 return 3;
 150         case TGSI_SEMANTIC_COLOR: /* these alias */
 151         case TGSI_SEMANTIC_BCOLOR:
 152                 return 4 + index;
 153         case TGSI_SEMANTIC_TEXCOORD:
 154                 return 6 + index;
 155         default:
 156                 assert(!"invalid semantic name");
 157                 return 0;
 158         }
 159 }
 160
 161 /**
 162  * Get the value of a shader input parameter and extract a bitfield.
 163  */
 164 static LLVMValueRef unpack_param(struct si_shader_context *ctx,
 165                                  unsigned param, unsigned rshift,
 166                                  unsigned bitwidth)
 167 {
 168         struct gallivm_state *gallivm = &ctx->gallivm;
 169         LLVMValueRef value = LLVMGetParam(ctx->main_fn,
 170                                           param);
 171
 172         if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
 173                 value = bitcast(&ctx->bld_base,
 174                                 TGSI_TYPE_UNSIGNED, value);
 175
 176         if (rshift)
 177                 value = LLVMBuildLShr(gallivm->builder, value,
 178                                       LLVMConstInt(ctx->i32, rshift, 0), "");
 179
 180         if (rshift + bitwidth < 32) {
 181                 unsigned mask = (1 << bitwidth) - 1;
 182                 value = LLVMBuildAnd(gallivm->builder, value,
 183                                      LLVMConstInt(ctx->i32, mask, 0), "");
 184         }
 185
 186         return value;
 187 }
 188
 189 static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx)
 190 {
 191         switch (ctx->type) {
 192         case PIPE_SHADER_TESS_CTRL:
 193                 return unpack_param(ctx, ctx->param_tcs_rel_ids, 0, 8);
 194
 195         case PIPE_SHADER_TESS_EVAL:
 196                 return LLVMGetParam(ctx->main_fn,
 197                                     ctx->param_tes_rel_patch_id);
 198
 199         default:
 200                 assert(0);
 201                 return NULL;
 202         }
 203 }
 204
 205 /* Tessellation shaders pass outputs to the next shader using LDS.
 206  *
 207  * LS outputs = TCS inputs
 208  * TCS outputs = TES inputs
 209  *
 210  * The LDS layout is:
 211  * - TCS inputs for patch 0
 212  * - TCS inputs for patch 1
 213  * - TCS inputs for patch 2             = get_tcs_in_current_patch_offset (if RelPatchID==2)
 214  * - ...
 215  * - TCS outputs for patch 0            = get_tcs_out_patch0_offset
 216  * - Per-patch TCS outputs for patch 0  = get_tcs_out_patch0_patch_data_offset
 217  * - TCS outputs for patch 1
 218  * - Per-patch TCS outputs for patch 1
 219  * - TCS outputs for patch 2            = get_tcs_out_current_patch_offset (if RelPatchID==2)
 220  * - Per-patch TCS outputs for patch 2  = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
 221  * - ...
 222  *
 223  * All three shaders VS(LS), TCS, TES share the same LDS space.
 224  */
 225
 226 static LLVMValueRef
 227 get_tcs_in_patch_stride(struct si_shader_context *ctx)
 228 {
 229         return unpack_param(ctx, ctx->param_vs_state_bits, 8, 13);
 230 }
 231
 232 static LLVMValueRef
 233 get_tcs_out_patch_stride(struct si_shader_context *ctx)
 234 {
 235         return unpack_param(ctx, ctx->param_tcs_out_lds_layout, 0, 13);
 236 }
 237
 238 static LLVMValueRef
 239 get_tcs_out_patch0_offset(struct si_shader_context *ctx)
 240 {
 241         return lp_build_mul_imm(&ctx->bld_base.uint_bld,
 242                                 unpack_param(ctx,
 243                                              ctx->param_tcs_out_lds_offsets,
 244                                              0, 16),
 245                                 4);
 246 }
 247
 248 static LLVMValueRef
 249 get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
 250 {
 251         return lp_build_mul_imm(&ctx->bld_base.uint_bld,
 252                                 unpack_param(ctx,
 253                                              ctx->param_tcs_out_lds_offsets,
 254                                              16, 16),
 255                                 4);
 256 }
 257
 258 static LLVMValueRef
 259 get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
 260 {
 261         struct gallivm_state *gallivm = &ctx->gallivm;
 262         LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
 263         LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
 264
 265         return LLVMBuildMul(gallivm->builder, patch_stride, rel_patch_id, "");
 266 }
 267
 268 static LLVMValueRef
 269 get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
 270 {
 271         struct gallivm_state *gallivm = &ctx->gallivm;
 272         LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
 273         LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
 274         LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
 275
 276         return LLVMBuildAdd(gallivm->builder, patch0_offset,
 277                             LLVMBuildMul(gallivm->builder, patch_stride,
 278                                          rel_patch_id, ""),
 279                             "");
 280 }
 281
 282 static LLVMValueRef
 283 get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
 284 {
 285         struct gallivm_state *gallivm = &ctx->gallivm;
 286         LLVMValueRef patch0_patch_data_offset =
 287                 get_tcs_out_patch0_patch_data_offset(ctx);
 288         LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
 289         LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
 290
 291         return LLVMBuildAdd(gallivm->builder, patch0_patch_data_offset,
 292                             LLVMBuildMul(gallivm->builder, patch_stride,
 293                                          rel_patch_id, ""),
 294                             "");
 295 }
 296
 297 static LLVMValueRef get_instance_index_for_fetch(
 298         struct si_shader_context *ctx,
 299         unsigned param_start_instance, unsigned divisor)
 300 {
 301         struct gallivm_state *gallivm = &ctx->gallivm;
 302
 303         LLVMValueRef result = LLVMGetParam(ctx->main_fn,
 304                                            ctx->param_instance_id);
 305
 306         /* The division must be done before START_INSTANCE is added. */
 307         if (divisor > 1)
 308                 result = LLVMBuildUDiv(gallivm->builder, result,
 309                                 LLVMConstInt(ctx->i32, divisor, 0), "");
 310
 311         return LLVMBuildAdd(gallivm->builder, result,
 312                             LLVMGetParam(ctx->main_fn, param_start_instance), "");
 313 }
 314
 315 /* Bitcast <4 x float> to <2 x double>, extract the component, and convert
 316  * to float. */
 317 static LLVMValueRef extract_double_to_float(struct si_shader_context *ctx,
 318                                             LLVMValueRef vec4,
 319                                             unsigned double_index)
 320 {
 321         LLVMBuilderRef builder = ctx->gallivm.builder;
 322         LLVMTypeRef f64 = LLVMDoubleTypeInContext(ctx->gallivm.context);
 323         LLVMValueRef dvec2 = LLVMBuildBitCast(builder, vec4,
 324                                               LLVMVectorType(f64, 2), "");
 325         LLVMValueRef index = LLVMConstInt(ctx->i32, double_index, 0);
 326         LLVMValueRef value = LLVMBuildExtractElement(builder, dvec2, index, "");
 327         return LLVMBuildFPTrunc(builder, value, ctx->f32, "");
 328 }
 329
 330 static void declare_input_vs(
 331         struct si_shader_context *ctx,
 332         unsigned input_index,
 333         const struct tgsi_full_declaration *decl,
 334         LLVMValueRef out[4])
 335 {
 336         struct gallivm_state *gallivm = &ctx->gallivm;
 337
 338         unsigned chan;
 339         unsigned fix_fetch;
 340         unsigned num_fetches;
 341         unsigned fetch_stride;
 342
 343         LLVMValueRef t_list_ptr;
 344         LLVMValueRef t_offset;
 345         LLVMValueRef t_list;
 346         LLVMValueRef vertex_index;
 347         LLVMValueRef input[3];
 348
 349         /* Load the T list */
 350         t_list_ptr = LLVMGetParam(ctx->main_fn, ctx->param_vertex_buffers);
 351
 352         t_offset = LLVMConstInt(ctx->i32, input_index, 0);
 353
 354         t_list = ac_build_indexed_load_const(&ctx->ac, t_list_ptr, t_offset);
 355
 356         vertex_index = LLVMGetParam(ctx->main_fn,
 357                                     ctx->param_vertex_index0 +
 358                                     input_index);
 359
 360         fix_fetch = ctx->shader->key.mono.vs_fix_fetch[input_index];
 361
 362         /* Do multiple loads for special formats. */
 363         switch (fix_fetch) {
 364         case SI_FIX_FETCH_RGB_64_FLOAT:
 365                 num_fetches = 3; /* 3 2-dword loads */
 366                 fetch_stride = 8;
 367                 break;
 368         case SI_FIX_FETCH_RGBA_64_FLOAT:
 369                 num_fetches = 2; /* 2 4-dword loads */
 370                 fetch_stride = 16;
 371                 break;
 372         case SI_FIX_FETCH_RGB_8:
 373         case SI_FIX_FETCH_RGB_8_INT:
 374                 num_fetches = 3;
 375                 fetch_stride = 1;
 376                 break;
 377         case SI_FIX_FETCH_RGB_16:
 378         case SI_FIX_FETCH_RGB_16_INT:
 379                 num_fetches = 3;
 380                 fetch_stride = 2;
 381                 break;
 382         default:
 383                 num_fetches = 1;
 384                 fetch_stride = 0;
 385         }
 386
 387         for (unsigned i = 0; i < num_fetches; i++) {
 388                 LLVMValueRef voffset = LLVMConstInt(ctx->i32, fetch_stride * i, 0);
 389
 390                 input[i] = ac_build_buffer_load_format(&ctx->ac, t_list,
 391                                                        vertex_index, voffset,
 392                                                        true);
 393         }
 394
 395         /* Break up the vec4 into individual components */
 396         for (chan = 0; chan < 4; chan++) {
 397                 LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);
 398                 out[chan] = LLVMBuildExtractElement(gallivm->builder,
 399                                                     input[0], llvm_chan, "");
 400         }
 401
 402         switch (fix_fetch) {
 403         case SI_FIX_FETCH_A2_SNORM:
 404         case SI_FIX_FETCH_A2_SSCALED:
 405         case SI_FIX_FETCH_A2_SINT: {
 406                 /* The hardware returns an unsigned value; convert it to a
 407                  * signed one.
 408                  */
 409                 LLVMValueRef tmp = out[3];
 410                 LLVMValueRef c30 = LLVMConstInt(ctx->i32, 30, 0);
 411
 412                 /* First, recover the sign-extended signed integer value. */
 413                 if (fix_fetch == SI_FIX_FETCH_A2_SSCALED)
 414                         tmp = LLVMBuildFPToUI(gallivm->builder, tmp, ctx->i32, "");
 415                 else
 416                         tmp = LLVMBuildBitCast(gallivm->builder, tmp, ctx->i32, "");
 417
 418                 /* For the integer-like cases, do a natural sign extension.
 419                  *
 420                  * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
 421                  * and happen to contain 0, 1, 2, 3 as the two LSBs of the
 422                  * exponent.
 423                  */
 424                 tmp = LLVMBuildShl(gallivm->builder, tmp,
 425                                    fix_fetch == SI_FIX_FETCH_A2_SNORM ?
 426                                    LLVMConstInt(ctx->i32, 7, 0) : c30, "");
 427                 tmp = LLVMBuildAShr(gallivm->builder, tmp, c30, "");
 428
 429                 /* Convert back to the right type. */
 430                 if (fix_fetch == SI_FIX_FETCH_A2_SNORM) {
 431                         LLVMValueRef clamp;
 432                         LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
 433                         tmp = LLVMBuildSIToFP(gallivm->builder, tmp, ctx->f32, "");
 434                         clamp = LLVMBuildFCmp(gallivm->builder, LLVMRealULT, tmp, neg_one, "");
 435                         tmp = LLVMBuildSelect(gallivm->builder, clamp, neg_one, tmp, "");
 436                 } else if (fix_fetch == SI_FIX_FETCH_A2_SSCALED) {
 437                         tmp = LLVMBuildSIToFP(gallivm->builder, tmp, ctx->f32, "");
 438                 }
 439
 440                 out[3] = tmp;
 441                 break;
 442         }
 443         case SI_FIX_FETCH_RGBA_32_UNORM:
 444         case SI_FIX_FETCH_RGBX_32_UNORM:
 445                 for (chan = 0; chan < 4; chan++) {
 446                         out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
 447                                                      ctx->i32, "");
 448                         out[chan] = LLVMBuildUIToFP(gallivm->builder,
 449                                                     out[chan], ctx->f32, "");
 450                         out[chan] = LLVMBuildFMul(gallivm->builder, out[chan],
 451                                                   LLVMConstReal(ctx->f32, 1.0 / UINT_MAX), "");
 452                 }
 453                 /* RGBX UINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
 454                 if (fix_fetch == SI_FIX_FETCH_RGBX_32_UNORM)
 455                         out[3] = LLVMConstReal(ctx->f32, 1);
 456                 break;
 457         case SI_FIX_FETCH_RGBA_32_SNORM:
 458         case SI_FIX_FETCH_RGBX_32_SNORM:
 459         case SI_FIX_FETCH_RGBA_32_FIXED:
 460         case SI_FIX_FETCH_RGBX_32_FIXED: {
 461                 double scale;
 462                 if (fix_fetch >= SI_FIX_FETCH_RGBA_32_FIXED)
 463                         scale = 1.0 / 0x10000;
 464                 else
 465                         scale = 1.0 / INT_MAX;
 466
 467                 for (chan = 0; chan < 4; chan++) {
 468                         out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
 469                                                      ctx->i32, "");
 470                         out[chan] = LLVMBuildSIToFP(gallivm->builder,
 471                                                     out[chan], ctx->f32, "");
 472                         out[chan] = LLVMBuildFMul(gallivm->builder, out[chan],
 473                                                   LLVMConstReal(ctx->f32, scale), "");
 474                 }
 475                 /* RGBX SINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
 476                 if (fix_fetch == SI_FIX_FETCH_RGBX_32_SNORM ||
 477                     fix_fetch == SI_FIX_FETCH_RGBX_32_FIXED)
 478                         out[3] = LLVMConstReal(ctx->f32, 1);
 479                 break;
 480         }
 481         case SI_FIX_FETCH_RGBA_32_USCALED:
 482                 for (chan = 0; chan < 4; chan++) {
 483                         out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
 484                                                      ctx->i32, "");
 485                         out[chan] = LLVMBuildUIToFP(gallivm->builder,
 486                                                     out[chan], ctx->f32, "");
 487                 }
 488                 break;
 489         case SI_FIX_FETCH_RGBA_32_SSCALED:
 490                 for (chan = 0; chan < 4; chan++) {
 491                         out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
 492                                                      ctx->i32, "");
 493                         out[chan] = LLVMBuildSIToFP(gallivm->builder,
 494                                                     out[chan], ctx->f32, "");
 495                 }
 496                 break;
 497         case SI_FIX_FETCH_RG_64_FLOAT:
 498                 for (chan = 0; chan < 2; chan++)
 499                         out[chan] = extract_double_to_float(ctx, input[0], chan);
 500
 501                 out[2] = LLVMConstReal(ctx->f32, 0);
 502                 out[3] = LLVMConstReal(ctx->f32, 1);
 503                 break;
 504         case SI_FIX_FETCH_RGB_64_FLOAT:
 505                 for (chan = 0; chan < 3; chan++)
 506                         out[chan] = extract_double_to_float(ctx, input[chan], 0);
 507
 508                 out[3] = LLVMConstReal(ctx->f32, 1);
 509                 break;
 510         case SI_FIX_FETCH_RGBA_64_FLOAT:
 511                 for (chan = 0; chan < 4; chan++) {
 512                         out[chan] = extract_double_to_float(ctx, input[chan / 2],
 513                                                             chan % 2);
 514                 }
 515                 break;
 516         case SI_FIX_FETCH_RGB_8:
 517         case SI_FIX_FETCH_RGB_8_INT:
 518         case SI_FIX_FETCH_RGB_16:
 519         case SI_FIX_FETCH_RGB_16_INT:
 520                 for (chan = 0; chan < 3; chan++) {
 521                         out[chan] = LLVMBuildExtractElement(gallivm->builder,
 522                                                             input[chan],
 523                                                             ctx->i32_0, "");
 524                 }
 525                 if (fix_fetch == SI_FIX_FETCH_RGB_8 ||
 526                     fix_fetch == SI_FIX_FETCH_RGB_16) {
 527                         out[3] = LLVMConstReal(ctx->f32, 1);
 528                 } else {
 529                         out[3] = LLVMBuildBitCast(gallivm->builder, ctx->i32_1,
 530                                                   ctx->f32, "");
 531                 }
 532                 break;
 533         }
 534 }
 535
 536 static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base,
 537                                      unsigned swizzle)
 538 {
 539         struct si_shader_context *ctx = si_shader_context(bld_base);
 540
 541         if (swizzle > 0)
 542                 return ctx->i32_0;
 543
 544         switch (ctx->type) {
 545         case PIPE_SHADER_VERTEX:
 546                 return LLVMGetParam(ctx->main_fn,
 547                                     ctx->param_vs_prim_id);
 548         case PIPE_SHADER_TESS_CTRL:
 549                 return LLVMGetParam(ctx->main_fn,
 550                                     ctx->param_tcs_patch_id);
 551         case PIPE_SHADER_TESS_EVAL:
 552                 return LLVMGetParam(ctx->main_fn,
 553                                     ctx->param_tes_patch_id);
 554         case PIPE_SHADER_GEOMETRY:
 555                 return LLVMGetParam(ctx->main_fn,
 556                                     ctx->param_gs_prim_id);
 557         default:
 558                 assert(0);
 559                 return ctx->i32_0;
 560         }
 561 }
 562
 563 /**
 564  * Return the value of tgsi_ind_register for indexing.
 565  * This is the indirect index with the constant offset added to it.
 566  */
 567 static LLVMValueRef get_indirect_index(struct si_shader_context *ctx,
 568                                        const struct tgsi_ind_register *ind,
 569                                        int rel_index)
 570 {
 571         struct gallivm_state *gallivm = &ctx->gallivm;
 572         LLVMValueRef result;
 573
 574         result = ctx->addrs[ind->Index][ind->Swizzle];
 575         result = LLVMBuildLoad(gallivm->builder, result, "");
 576         result = LLVMBuildAdd(gallivm->builder, result,
 577                               LLVMConstInt(ctx->i32, rel_index, 0), "");
 578         return result;
 579 }
 580
 581 /**
 582  * Like get_indirect_index, but restricts the return value to a (possibly
 583  * undefined) value inside [0..num).
 584  */
 585 static LLVMValueRef get_bounded_indirect_index(struct si_shader_context *ctx,
 586                                                const struct tgsi_ind_register *ind,
 587                                                int rel_index, unsigned num)
 588 {
 589         LLVMValueRef result = get_indirect_index(ctx, ind, rel_index);
 590
 591         /* LLVM 3.8: If indirect resource indexing is used:
 592          * - SI & CIK hang
 593          * - VI crashes
 594          */
 595         if (HAVE_LLVM == 0x0308)
 596                 return LLVMGetUndef(ctx->i32);
 597
 598         return si_llvm_bound_index(ctx, result, num);
 599 }
 600
 601
 602 /**
 603  * Calculate a dword address given an input or output register and a stride.
 604  */
 605 static LLVMValueRef get_dw_address(struct si_shader_context *ctx,
 606                                    const struct tgsi_full_dst_register *dst,
 607                                    const struct tgsi_full_src_register *src,
 608                                    LLVMValueRef vertex_dw_stride,
 609                                    LLVMValueRef base_addr)
 610 {
 611         struct gallivm_state *gallivm = &ctx->gallivm;
 612         struct tgsi_shader_info *info = &ctx->shader->selector->info;
 613         ubyte *name, *index, *array_first;
 614         int first, param;
 615         struct tgsi_full_dst_register reg;
 616
 617         /* Set the register description. The address computation is the same
 618          * for sources and destinations. */
 619         if (src) {
 620                 reg.Register.File = src->Register.File;
 621                 reg.Register.Index = src->Register.Index;
 622                 reg.Register.Indirect = src->Register.Indirect;
 623                 reg.Register.Dimension = src->Register.Dimension;
 624                 reg.Indirect = src->Indirect;
 625                 reg.Dimension = src->Dimension;
 626                 reg.DimIndirect = src->DimIndirect;
 627         } else
 628                 reg = *dst;
 629
 630         /* If the register is 2-dimensional (e.g. an array of vertices
 631          * in a primitive), calculate the base address of the vertex. */
 632         if (reg.Register.Dimension) {
 633                 LLVMValueRef index;
 634
 635                 if (reg.Dimension.Indirect)
 636                         index = get_indirect_index(ctx, &reg.DimIndirect,
 637                                                    reg.Dimension.Index);
 638                 else
 639                         index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0);
 640
 641                 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
 642                                          LLVMBuildMul(gallivm->builder, index,
 643                                                       vertex_dw_stride, ""), "");
 644         }
 645
 646         /* Get information about the register. */
 647         if (reg.Register.File == TGSI_FILE_INPUT) {
 648                 name = info->input_semantic_name;
 649                 index = info->input_semantic_index;
 650                 array_first = info->input_array_first;
 651         } else if (reg.Register.File == TGSI_FILE_OUTPUT) {
 652                 name = info->output_semantic_name;
 653                 index = info->output_semantic_index;
 654                 array_first = info->output_array_first;
 655         } else {
 656                 assert(0);
 657                 return NULL;
 658         }
 659
 660         if (reg.Register.Indirect) {
 661                 /* Add the relative address of the element. */
 662                 LLVMValueRef ind_index;
 663
 664                 if (reg.Indirect.ArrayID)
 665                         first = array_first[reg.Indirect.ArrayID];
 666                 else
 667                         first = reg.Register.Index;
 668
 669                 ind_index = get_indirect_index(ctx, &reg.Indirect,
 670                                            reg.Register.Index - first);
 671
 672                 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
 673                                     LLVMBuildMul(gallivm->builder, ind_index,
 674                                                  LLVMConstInt(ctx->i32, 4, 0), ""), "");
 675
 676                 param = si_shader_io_get_unique_index(name[first], index[first]);
 677         } else {
 678                 param = si_shader_io_get_unique_index(name[reg.Register.Index],
 679                                                       index[reg.Register.Index]);
 680         }
 681
 682         /* Add the base address of the element. */
 683         return LLVMBuildAdd(gallivm->builder, base_addr,
 684                             LLVMConstInt(ctx->i32, param * 4, 0), "");
 685 }
 686
 687 /* The offchip buffer layout for TCS->TES is
 688  *
 689  * - attribute 0 of patch 0 vertex 0
 690  * - attribute 0 of patch 0 vertex 1
 691  * - attribute 0 of patch 0 vertex 2
 692  *   ...
 693  * - attribute 0 of patch 1 vertex 0
 694  * - attribute 0 of patch 1 vertex 1
 695  *   ...
 696  * - attribute 1 of patch 0 vertex 0
 697  * - attribute 1 of patch 0 vertex 1
 698  *   ...
 699  * - per patch attribute 0 of patch 0
 700  * - per patch attribute 0 of patch 1
 701  *   ...
 702  *
 703  * Note that every attribute has 4 components.
 704  */
 705 static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
 706                                                LLVMValueRef rel_patch_id,
 707                                                LLVMValueRef vertex_index,
 708                                                LLVMValueRef param_index)
 709 {
 710         struct gallivm_state *gallivm = &ctx->gallivm;
 711         LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
 712         LLVMValueRef param_stride, constant16;
 713
 714         vertices_per_patch = unpack_param(ctx, ctx->param_tcs_offchip_layout, 9, 6);
 715         num_patches = unpack_param(ctx, ctx->param_tcs_offchip_layout, 0, 9);
 716         total_vertices = LLVMBuildMul(gallivm->builder, vertices_per_patch,
 717                                       num_patches, "");
 718
 719         constant16 = LLVMConstInt(ctx->i32, 16, 0);
 720         if (vertex_index) {
 721                 base_addr = LLVMBuildMul(gallivm->builder, rel_patch_id,
 722                                          vertices_per_patch, "");
 723
 724                 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
 725                                          vertex_index, "");
 726
 727                 param_stride = total_vertices;
 728         } else {
 729                 base_addr = rel_patch_id;
 730                 param_stride = num_patches;
 731         }
 732
 733         base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
 734                                  LLVMBuildMul(gallivm->builder, param_index,
 735                                               param_stride, ""), "");
 736
 737         base_addr = LLVMBuildMul(gallivm->builder, base_addr, constant16, "");
 738
 739         if (!vertex_index) {
 740                 LLVMValueRef patch_data_offset =
 741                            unpack_param(ctx, ctx->param_tcs_offchip_layout, 16, 16);
 742
 743                 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
 744                                          patch_data_offset, "");
 745         }
 746         return base_addr;
 747 }
 748
 749 static LLVMValueRef get_tcs_tes_buffer_address_from_reg(
 750                                        struct si_shader_context *ctx,
 751                                        const struct tgsi_full_dst_register *dst,
 752                                        const struct tgsi_full_src_register *src)
 753 {
 754         struct gallivm_state *gallivm = &ctx->gallivm;
 755         struct tgsi_shader_info *info = &ctx->shader->selector->info;
 756         ubyte *name, *index, *array_first;
 757         struct tgsi_full_src_register reg;
 758         LLVMValueRef vertex_index = NULL;
 759         LLVMValueRef param_index = NULL;
 760         unsigned param_index_base, param_base;
 761
 762         reg = src ? *src : tgsi_full_src_register_from_dst(dst);
 763
 764         if (reg.Register.Dimension) {
 765
 766                 if (reg.Dimension.Indirect)
 767                         vertex_index = get_indirect_index(ctx, &reg.DimIndirect,
 768                                                           reg.Dimension.Index);
 769                 else
 770                         vertex_index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0);
 771         }
 772
 773         /* Get information about the register. */
 774         if (reg.Register.File == TGSI_FILE_INPUT) {
 775                 name = info->input_semantic_name;
 776                 index = info->input_semantic_index;
 777                 array_first = info->input_array_first;
 778         } else if (reg.Register.File == TGSI_FILE_OUTPUT) {
 779                 name = info->output_semantic_name;
 780                 index = info->output_semantic_index;
 781                 array_first = info->output_array_first;
 782         } else {
 783                 assert(0);
 784                 return NULL;
 785         }
 786
 787         if (reg.Register.Indirect) {
 788                 if (reg.Indirect.ArrayID)
 789                         param_base = array_first[reg.Indirect.ArrayID];
 790                 else
 791                         param_base = reg.Register.Index;
 792
 793                 param_index = get_indirect_index(ctx, &reg.Indirect,
 794                                                  reg.Register.Index - param_base);
 795
 796         } else {
 797                 param_base = reg.Register.Index;
 798                 param_index = ctx->i32_0;
 799         }
 800
 801         param_index_base = si_shader_io_get_unique_index(name[param_base],
 802                                                          index[param_base]);
 803
 804         param_index = LLVMBuildAdd(gallivm->builder, param_index,
 805                                    LLVMConstInt(ctx->i32, param_index_base, 0),
 806                                    "");
 807
 808         return get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx),
 809                                           vertex_index, param_index);
 810 }
 811
 812 static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base,
 813                                 enum tgsi_opcode_type type, unsigned swizzle,
 814                                 LLVMValueRef buffer, LLVMValueRef offset,
 815                                 LLVMValueRef base, bool readonly_memory)
 816 {
 817         struct si_shader_context *ctx = si_shader_context(bld_base);
 818         struct gallivm_state *gallivm = &ctx->gallivm;
 819         LLVMValueRef value, value2;
 820         LLVMTypeRef llvm_type = tgsi2llvmtype(bld_base, type);
 821         LLVMTypeRef vec_type = LLVMVectorType(llvm_type, 4);
 822
 823         if (swizzle == ~0) {
 824                 value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
 825                                              0, 1, 0, readonly_memory);
 826
 827                 return LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
 828         }
 829
 830         if (!tgsi_type_is_64bit(type)) {
 831                 value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
 832                                              0, 1, 0, readonly_memory);
 833
 834                 value = LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
 835                 return LLVMBuildExtractElement(gallivm->builder, value,
 836                                     LLVMConstInt(ctx->i32, swizzle, 0), "");
 837         }
 838
 839         value = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
 840                                   swizzle * 4, 1, 0, readonly_memory);
 841
 842         value2 = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
 843                                    swizzle * 4 + 4, 1, 0, readonly_memory);
 844
 845         return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
 846 }
 847
 848 /**
 849  * Load from LDS.
 850  *
 851  * \param type          output value type
 852  * \param swizzle       offset (typically 0..3); it can be ~0, which loads a vec4
 853  * \param dw_addr       address in dwords
 854  */
 855 static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
 856                              enum tgsi_opcode_type type, unsigned swizzle,
 857                              LLVMValueRef dw_addr)
 858 {
 859         struct si_shader_context *ctx = si_shader_context(bld_base);
 860         struct gallivm_state *gallivm = &ctx->gallivm;
 861         LLVMValueRef value;
 862
 863         if (swizzle == ~0) {
 864                 LLVMValueRef values[TGSI_NUM_CHANNELS];
 865
 866                 for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++)
 867                         values[chan] = lds_load(bld_base, type, chan, dw_addr);
 868
 869                 return lp_build_gather_values(gallivm, values,
 870                                               TGSI_NUM_CHANNELS);
 871         }
 872
 873         dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
 874                             LLVMConstInt(ctx->i32, swizzle, 0));
 875
 876         value = ac_build_indexed_load(&ctx->ac, ctx->lds, dw_addr, false);
 877         if (tgsi_type_is_64bit(type)) {
 878                 LLVMValueRef value2;
 879                 dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
 880                                        ctx->i32_1);
 881                 value2 = ac_build_indexed_load(&ctx->ac, ctx->lds, dw_addr, false);
 882                 return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
 883         }
 884
 885         return LLVMBuildBitCast(gallivm->builder, value,
 886                                 tgsi2llvmtype(bld_base, type), "");
 887 }
 888
 889 /**
 890  * Store to LDS.
 891  *
 892  * \param swizzle       offset (typically 0..3)
 893  * \param dw_addr       address in dwords
 894  * \param value         value to store
 895  */
 896 static void lds_store(struct lp_build_tgsi_context *bld_base,
 897                       unsigned swizzle, LLVMValueRef dw_addr,
 898                       LLVMValueRef value)
 899 {
 900         struct si_shader_context *ctx = si_shader_context(bld_base);
 901         struct gallivm_state *gallivm = &ctx->gallivm;
 902
 903         dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
 904                             LLVMConstInt(ctx->i32, swizzle, 0));
 905
 906         value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
 907         ac_build_indexed_store(&ctx->ac, ctx->lds,
 908                                dw_addr, value);
 909 }
 910
 911 static LLVMValueRef fetch_input_tcs(
 912         struct lp_build_tgsi_context *bld_base,
 913         const struct tgsi_full_src_register *reg,
 914         enum tgsi_opcode_type type, unsigned swizzle)
 915 {
 916         struct si_shader_context *ctx = si_shader_context(bld_base);
 917         LLVMValueRef dw_addr, stride;
 918
 919         stride = unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
 920         dw_addr = get_tcs_in_current_patch_offset(ctx);
 921         dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
 922
 923         return lds_load(bld_base, type, swizzle, dw_addr);
 924 }
 925
 926 static LLVMValueRef fetch_output_tcs(
 927                 struct lp_build_tgsi_context *bld_base,
 928                 const struct tgsi_full_src_register *reg,
 929                 enum tgsi_opcode_type type, unsigned swizzle)
 930 {
 931         struct si_shader_context *ctx = si_shader_context(bld_base);
 932         LLVMValueRef dw_addr, stride;
 933
 934         if (reg->Register.Dimension) {
 935                 stride = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 13, 8);
 936                 dw_addr = get_tcs_out_current_patch_offset(ctx);
 937                 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
 938         } else {
 939                 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
 940                 dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr);
 941         }
 942
 943         return lds_load(bld_base, type, swizzle, dw_addr);
 944 }
 945
 946 static LLVMValueRef fetch_input_tes(
 947         struct lp_build_tgsi_context *bld_base,
 948         const struct tgsi_full_src_register *reg,
 949         enum tgsi_opcode_type type, unsigned swizzle)
 950 {
 951         struct si_shader_context *ctx = si_shader_context(bld_base);
 952         LLVMValueRef rw_buffers, buffer, base, addr;
 953
 954         rw_buffers = LLVMGetParam(ctx->main_fn,
 955                                   ctx->param_rw_buffers);
 956         buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
 957                         LLVMConstInt(ctx->i32, SI_HS_RING_TESS_OFFCHIP, 0));
 958
 959         base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
 960         addr = get_tcs_tes_buffer_address_from_reg(ctx, NULL, reg);
 961
 962         return buffer_load(bld_base, type, swizzle, buffer, base, addr, true);
 963 }
 964
 965 static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
 966                              const struct tgsi_full_instruction *inst,
 967                              const struct tgsi_opcode_info *info,
 968                              LLVMValueRef dst[4])
 969 {
 970         struct si_shader_context *ctx = si_shader_context(bld_base);
 971         struct gallivm_state *gallivm = &ctx->gallivm;
 972         const struct tgsi_full_dst_register *reg = &inst->Dst[0];
 973         const struct tgsi_shader_info *sh_info = &ctx->shader->selector->info;
 974         unsigned chan_index;
 975         LLVMValueRef dw_addr, stride;
 976         LLVMValueRef rw_buffers, buffer, base, buf_addr;
 977         LLVMValueRef values[4];
 978         bool skip_lds_store;
 979         bool is_tess_factor = false;
 980
 981         /* Only handle per-patch and per-vertex outputs here.
 982          * Vectors will be lowered to scalars and this function will be called again.
 983          */
 984         if (reg->Register.File != TGSI_FILE_OUTPUT ||
 985             (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) {
 986                 si_llvm_emit_store(bld_base, inst, info, dst);
 987                 return;
 988         }
 989
 990         if (reg->Register.Dimension) {
 991                 stride = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 13, 8);
 992                 dw_addr = get_tcs_out_current_patch_offset(ctx);
 993                 dw_addr = get_dw_address(ctx, reg, NULL, stride, dw_addr);
 994                 skip_lds_store = !sh_info->reads_pervertex_outputs;
 995         } else {
 996                 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
 997                 dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr);
 998                 skip_lds_store = !sh_info->reads_perpatch_outputs;
 999
1000                 if (!reg->Register.Indirect) {
1001                         int name = sh_info->output_semantic_name[reg->Register.Index];
1002
1003                         /* Always write tess factors into LDS for the TCS epilog. */
1004                         if (name == TGSI_SEMANTIC_TESSINNER ||
1005                             name == TGSI_SEMANTIC_TESSOUTER) {
1006                                 skip_lds_store = false;
1007                                 is_tess_factor = true;
1008                         }
1009                 }
1010         }
1011
1012         rw_buffers = LLVMGetParam(ctx->main_fn,
1013                                   ctx->param_rw_buffers);
1014         buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
1015                         LLVMConstInt(ctx->i32, SI_HS_RING_TESS_OFFCHIP, 0));
1016
1017         base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
1018         buf_addr = get_tcs_tes_buffer_address_from_reg(ctx, reg, NULL);
1019
1020
1021         TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan_index) {
1022                 LLVMValueRef value = dst[chan_index];
1023
1024                 if (inst->Instruction.Saturate)
1025                         value = ac_build_clamp(&ctx->ac, value);
1026
1027                 /* Skip LDS stores if there is no LDS read of this output. */
1028                 if (!skip_lds_store)
1029                         lds_store(bld_base, chan_index, dw_addr, value);
1030
1031                 value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
1032                 values[chan_index] = value;
1033
1034                 if (inst->Dst[0].Register.WriteMask != 0xF && !is_tess_factor) {
1035                         ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1,
1036                                                     buf_addr, base,
1037                                                     4 * chan_index, 1, 0, true, false);
1038                 }
1039         }
1040
1041         if (inst->Dst[0].Register.WriteMask == 0xF && !is_tess_factor) {
1042                 LLVMValueRef value = lp_build_gather_values(gallivm,
1043                                                             values, 4);
1044                 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buf_addr,
1045                                             base, 0, 1, 0, true, false);
1046         }
1047 }
1048
1049 static LLVMValueRef fetch_input_gs(
1050         struct lp_build_tgsi_context *bld_base,
1051         const struct tgsi_full_src_register *reg,
1052         enum tgsi_opcode_type type,
1053         unsigned swizzle)
1054 {
1055         struct si_shader_context *ctx = si_shader_context(bld_base);
1056         struct si_shader *shader = ctx->shader;
1057         struct lp_build_context *uint = &ctx->bld_base.uint_bld;
1058         struct gallivm_state *gallivm = &ctx->gallivm;
1059         LLVMValueRef vtx_offset, soffset;
1060         unsigned vtx_offset_param;
1061         struct tgsi_shader_info *info = &shader->selector->info;
1062         unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
1063         unsigned semantic_index = info->input_semantic_index[reg->Register.Index];
1064         unsigned param;
1065         LLVMValueRef value;
1066
1067         if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
1068                 return get_primitive_id(bld_base, swizzle);
1069
1070         if (!reg->Register.Dimension)
1071                 return NULL;
1072
1073         if (swizzle == ~0) {
1074                 LLVMValueRef values[TGSI_NUM_CHANNELS];
1075                 unsigned chan;
1076                 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1077                         values[chan] = fetch_input_gs(bld_base, reg, type, chan);
1078                 }
1079                 return lp_build_gather_values(gallivm, values,
1080                                               TGSI_NUM_CHANNELS);
1081         }
1082
1083         /* Get the vertex offset parameter */
1084         vtx_offset_param = reg->Dimension.Index;
1085         if (vtx_offset_param < 2) {
1086                 vtx_offset_param += ctx->param_gs_vtx0_offset;
1087         } else {
1088                 assert(vtx_offset_param < 6);
1089                 vtx_offset_param += ctx->param_gs_vtx2_offset - 2;
1090         }
1091         vtx_offset = lp_build_mul_imm(uint,
1092                                       LLVMGetParam(ctx->main_fn,
1093                                                    vtx_offset_param),
1094                                       4);
1095
1096         param = si_shader_io_get_unique_index(semantic_name, semantic_index);
1097         soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle) * 256, 0);
1098
1099         value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->i32_0,
1100                                      vtx_offset, soffset, 0, 1, 0, true);
1101         if (tgsi_type_is_64bit(type)) {
1102                 LLVMValueRef value2;
1103                 soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle + 1) * 256, 0);
1104
1105                 value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1,
1106                                               ctx->i32_0, vtx_offset, soffset,
1107                                               0, 1, 0, true);
1108                 return si_llvm_emit_fetch_64bit(bld_base, type,
1109                                                 value, value2);
1110         }
1111         return LLVMBuildBitCast(gallivm->builder,
1112                                 value,
1113                                 tgsi2llvmtype(bld_base, type), "");
1114 }
1115
1116 static int lookup_interp_param_index(unsigned interpolate, unsigned location)
1117 {
1118         switch (interpolate) {
1119         case TGSI_INTERPOLATE_CONSTANT:
1120                 return 0;
1121
1122         case TGSI_INTERPOLATE_LINEAR:
1123                 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1124                         return SI_PARAM_LINEAR_SAMPLE;
1125                 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1126                         return SI_PARAM_LINEAR_CENTROID;
1127                 else
1128                         return SI_PARAM_LINEAR_CENTER;
1129                 break;
1130         case TGSI_INTERPOLATE_COLOR:
1131         case TGSI_INTERPOLATE_PERSPECTIVE:
1132                 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1133                         return SI_PARAM_PERSP_SAMPLE;
1134                 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1135                         return SI_PARAM_PERSP_CENTROID;
1136                 else
1137                         return SI_PARAM_PERSP_CENTER;
1138                 break;
1139         default:
1140                 fprintf(stderr, "Warning: Unhandled interpolation mode.\n");
1141                 return -1;
1142         }
1143 }
1144
1145 /**
1146  * Interpolate a fragment shader input.
1147  *
1148  * @param ctx           context
1149  * @param input_index           index of the input in hardware
1150  * @param semantic_name         TGSI_SEMANTIC_*
1151  * @param semantic_index        semantic index
1152  * @param num_interp_inputs     number of all interpolated inputs (= BCOLOR offset)
1153  * @param colors_read_mask      color components read (4 bits for each color, 8 bits in total)
1154  * @param interp_param          interpolation weights (i,j)
1155  * @param prim_mask             SI_PARAM_PRIM_MASK
1156  * @param face                  SI_PARAM_FRONT_FACE
1157  * @param result                the return value (4 components)
1158  */
1159 static void interp_fs_input(struct si_shader_context *ctx,
1160                             unsigned input_index,
1161                             unsigned semantic_name,
1162                             unsigned semantic_index,
1163                             unsigned num_interp_inputs,
1164                             unsigned colors_read_mask,
1165                             LLVMValueRef interp_param,
1166                             LLVMValueRef prim_mask,
1167                             LLVMValueRef face,
1168                             LLVMValueRef result[4])
1169 {
1170         struct gallivm_state *gallivm = &ctx->gallivm;
1171         LLVMValueRef attr_number;
1172         LLVMValueRef i, j;
1173
1174         unsigned chan;
1175
1176         /* fs.constant returns the param from the middle vertex, so it's not
1177          * really useful for flat shading. It's meant to be used for custom
1178          * interpolation (but the intrinsic can't fetch from the other two
1179          * vertices).
1180          *
1181          * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
1182          * to do the right thing. The only reason we use fs.constant is that
1183          * fs.interp cannot be used on integers, because they can be equal
1184          * to NaN.
1185          *
1186          * When interp is false we will use fs.constant or for newer llvm,
1187          * amdgcn.interp.mov.
1188          */
1189         bool interp = interp_param != NULL;
1190
1191         attr_number = LLVMConstInt(ctx->i32, input_index, 0);
1192
1193         if (interp) {
1194                 interp_param = LLVMBuildBitCast(gallivm->builder, interp_param,
1195                                                 LLVMVectorType(ctx->f32, 2), "");
1196
1197                 i = LLVMBuildExtractElement(gallivm->builder, interp_param,
1198                                                 ctx->i32_0, "");
1199                 j = LLVMBuildExtractElement(gallivm->builder, interp_param,
1200                                                 ctx->i32_1, "");
1201         }
1202
1203         if (semantic_name == TGSI_SEMANTIC_COLOR &&
1204             ctx->shader->key.part.ps.prolog.color_two_side) {
1205                 LLVMValueRef is_face_positive;
1206                 LLVMValueRef back_attr_number;
1207
1208                 /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
1209                  * otherwise it's at offset "num_inputs".
1210                  */
1211                 unsigned back_attr_offset = num_interp_inputs;
1212                 if (semantic_index == 1 && colors_read_mask & 0xf)
1213                         back_attr_offset += 1;
1214
1215                 back_attr_number = LLVMConstInt(ctx->i32, back_attr_offset, 0);
1216
1217                 is_face_positive = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
1218                                                  face, ctx->i32_0, "");
1219
1220                 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1221                         LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);
1222                         LLVMValueRef front, back;
1223
1224                         if (interp) {
1225                                 front = ac_build_fs_interp(&ctx->ac, llvm_chan,
1226                                                         attr_number, prim_mask,
1227                                                         i, j);
1228                                 back = ac_build_fs_interp(&ctx->ac, llvm_chan,
1229                                                         back_attr_number, prim_mask,
1230                                                         i, j);
1231                         } else {
1232                                 front = ac_build_fs_interp_mov(&ctx->ac,
1233                                         LLVMConstInt(ctx->i32, 2, 0), /* P0 */
1234                                         llvm_chan, attr_number, prim_mask);
1235                                 back = ac_build_fs_interp_mov(&ctx->ac,
1236                                         LLVMConstInt(ctx->i32, 2, 0), /* P0 */
1237                                         llvm_chan, back_attr_number, prim_mask);
1238                         }
1239
1240                         result[chan] = LLVMBuildSelect(gallivm->builder,
1241                                                 is_face_positive,
1242                                                 front,
1243                                                 back,
1244                                                 "");
1245                 }
1246         } else if (semantic_name == TGSI_SEMANTIC_FOG) {
1247                 if (interp) {
1248                         result[0] = ac_build_fs_interp(&ctx->ac, ctx->i32_0,
1249                                                        attr_number, prim_mask, i, j);
1250                 } else {
1251                         result[0] = ac_build_fs_interp_mov(&ctx->ac, ctx->i32_0,
1252                                                            LLVMConstInt(ctx->i32, 2, 0), /* P0 */
1253                                                            attr_number, prim_mask);
1254                 }
1255                 result[1] =
1256                 result[2] = LLVMConstReal(ctx->f32, 0.0f);
1257                 result[3] = LLVMConstReal(ctx->f32, 1.0f);
1258         } else {
1259                 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1260                         LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);
1261
1262                         if (interp) {
1263                                 result[chan] = ac_build_fs_interp(&ctx->ac,
1264                                         llvm_chan, attr_number, prim_mask, i, j);
1265                         } else {
1266                                 result[chan] = ac_build_fs_interp_mov(&ctx->ac,
1267                                         LLVMConstInt(ctx->i32, 2, 0), /* P0 */
1268                                         llvm_chan, attr_number, prim_mask);
1269                         }
1270                 }
1271         }
1272 }
1273
1274 static void declare_input_fs(
1275         struct si_shader_context *ctx,
1276         unsigned input_index,
1277         const struct tgsi_full_declaration *decl,
1278         LLVMValueRef out[4])
1279 {
1280         struct lp_build_context *base = &ctx->bld_base.base;
1281         struct si_shader *shader = ctx->shader;
1282         LLVMValueRef main_fn = ctx->main_fn;
1283         LLVMValueRef interp_param = NULL;
1284         int interp_param_idx;
1285
1286         /* Get colors from input VGPRs (set by the prolog). */
1287         if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR) {
1288                 unsigned i = decl->Semantic.Index;
1289                 unsigned colors_read = shader->selector->info.colors_read;
1290                 unsigned mask = colors_read >> (i * 4);
1291                 unsigned offset = SI_PARAM_POS_FIXED_PT + 1 +
1292                                   (i ? util_bitcount(colors_read & 0xf) : 0);
1293
1294                 out[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef;
1295                 out[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef;
1296                 out[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef;
1297                 out[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef;
1298                 return;
1299         }
1300
1301         interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate,
1302                                                      decl->Interp.Location);
1303         if (interp_param_idx == -1)
1304                 return;
1305         else if (interp_param_idx) {
1306                 interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
1307         }
1308
1309         if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR &&
1310             decl->Interp.Interpolate == TGSI_INTERPOLATE_COLOR &&
1311             ctx->shader->key.part.ps.prolog.flatshade_colors)
1312                 interp_param = NULL; /* load the constant color */
1313
1314         interp_fs_input(ctx, input_index, decl->Semantic.Name,
1315                         decl->Semantic.Index, shader->selector->info.num_inputs,
1316                         shader->selector->info.colors_read, interp_param,
1317                         LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK),
1318                         LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE),
1319                         &out[0]);
1320 }
1321
1322 static LLVMValueRef get_sample_id(struct si_shader_context *ctx)
1323 {
1324         return unpack_param(ctx, SI_PARAM_ANCILLARY, 8, 4);
1325 }
1326
1327
1328 /**
1329  * Load a dword from a constant buffer.
1330  */
1331 static LLVMValueRef buffer_load_const(struct si_shader_context *ctx,
1332                                       LLVMValueRef resource,
1333                                       LLVMValueRef offset)
1334 {
1335         LLVMBuilderRef builder = ctx->gallivm.builder;
1336         LLVMValueRef args[2] = {resource, offset};
1337
1338         return lp_build_intrinsic(builder, "llvm.SI.load.const", ctx->f32, args, 2,
1339                                   LP_FUNC_ATTR_READNONE |
1340                                   LP_FUNC_ATTR_LEGACY);
1341 }
1342
1343 static LLVMValueRef load_sample_position(struct si_shader_context *ctx, LLVMValueRef sample_id)
1344 {
1345         struct lp_build_context *uint_bld = &ctx->bld_base.uint_bld;
1346         struct gallivm_state *gallivm = &ctx->gallivm;
1347         LLVMBuilderRef builder = gallivm->builder;
1348         LLVMValueRef desc = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
1349         LLVMValueRef buf_index = LLVMConstInt(ctx->i32, SI_PS_CONST_SAMPLE_POSITIONS, 0);
1350         LLVMValueRef resource = ac_build_indexed_load_const(&ctx->ac, desc, buf_index);
1351
1352         /* offset = sample_id * 8  (8 = 2 floats containing samplepos.xy) */
1353         LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, sample_id, 8);
1354         LLVMValueRef offset1 = LLVMBuildAdd(builder, offset0, LLVMConstInt(ctx->i32, 4, 0), "");
1355
1356         LLVMValueRef pos[4] = {
1357                 buffer_load_const(ctx, resource, offset0),
1358                 buffer_load_const(ctx, resource, offset1),
1359                 LLVMConstReal(ctx->f32, 0),
1360                 LLVMConstReal(ctx->f32, 0)
1361         };
1362
1363         return lp_build_gather_values(gallivm, pos, 4);
1364 }
1365
1366 static void declare_system_value(struct si_shader_context *ctx,
1367                                  unsigned index,
1368                                  const struct tgsi_full_declaration *decl)
1369 {
1370         struct lp_build_context *bld = &ctx->bld_base.base;
1371         struct gallivm_state *gallivm = &ctx->gallivm;
1372         LLVMValueRef value = 0;
1373
1374         assert(index < RADEON_LLVM_MAX_SYSTEM_VALUES);
1375
1376         switch (decl->Semantic.Name) {
1377         case TGSI_SEMANTIC_INSTANCEID:
1378                 value = LLVMGetParam(ctx->main_fn,
1379                                      ctx->param_instance_id);
1380                 break;
1381
1382         case TGSI_SEMANTIC_VERTEXID:
1383                 value = LLVMBuildAdd(gallivm->builder,
1384                                      LLVMGetParam(ctx->main_fn,
1385                                                   ctx->param_vertex_id),
1386                                      LLVMGetParam(ctx->main_fn,
1387                                                   ctx->param_base_vertex), "");
1388                 break;
1389
1390         case TGSI_SEMANTIC_VERTEXID_NOBASE:
1391                 /* Unused. Clarify the meaning in indexed vs. non-indexed
1392                  * draws if this is ever used again. */
1393                 assert(false);
1394                 break;
1395
1396         case TGSI_SEMANTIC_BASEVERTEX:
1397         {
1398                 /* For non-indexed draws, the base vertex set by the driver
1399                  * (for direct draws) or the CP (for indirect draws) is the
1400                  * first vertex ID, but GLSL expects 0 to be returned.
1401                  */
1402                 LLVMValueRef vs_state = LLVMGetParam(ctx->main_fn, ctx->param_vs_state_bits);
1403                 LLVMValueRef indexed;
1404
1405                 indexed = LLVMBuildLShr(gallivm->builder, vs_state, ctx->i32_1, "");
1406                 indexed = LLVMBuildTrunc(gallivm->builder, indexed, ctx->i1, "");
1407
1408                 value = LLVMBuildSelect(gallivm->builder, indexed,
1409                                         LLVMGetParam(ctx->main_fn, ctx->param_base_vertex),
1410                                         ctx->i32_0, "");
1411                 break;
1412         }
1413
1414         case TGSI_SEMANTIC_BASEINSTANCE:
1415                 value = LLVMGetParam(ctx->main_fn, ctx->param_start_instance);
1416                 break;
1417
1418         case TGSI_SEMANTIC_DRAWID:
1419                 value = LLVMGetParam(ctx->main_fn, ctx->param_draw_id);
1420                 break;
1421
1422         case TGSI_SEMANTIC_INVOCATIONID:
1423                 if (ctx->type == PIPE_SHADER_TESS_CTRL)
1424                         value = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
1425                 else if (ctx->type == PIPE_SHADER_GEOMETRY)
1426                         value = LLVMGetParam(ctx->main_fn,
1427                                              ctx->param_gs_instance_id);
1428                 else
1429                         assert(!"INVOCATIONID not implemented");
1430                 break;
1431
1432         case TGSI_SEMANTIC_POSITION:
1433         {
1434                 LLVMValueRef pos[4] = {
1435                         LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT),
1436                         LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT),
1437                         LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Z_FLOAT),
1438                         lp_build_emit_llvm_unary(&ctx->bld_base, TGSI_OPCODE_RCP,
1439                                                  LLVMGetParam(ctx->main_fn,
1440                                                               SI_PARAM_POS_W_FLOAT)),
1441                 };
1442                 value = lp_build_gather_values(gallivm, pos, 4);
1443                 break;
1444         }
1445
1446         case TGSI_SEMANTIC_FACE:
1447                 value = LLVMGetParam(ctx->main_fn, SI_PARAM_FRONT_FACE);
1448                 break;
1449
1450         case TGSI_SEMANTIC_SAMPLEID:
1451                 value = get_sample_id(ctx);
1452                 break;
1453
1454         case TGSI_SEMANTIC_SAMPLEPOS: {
1455                 LLVMValueRef pos[4] = {
1456                         LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT),
1457                         LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT),
1458                         LLVMConstReal(ctx->f32, 0),
1459                         LLVMConstReal(ctx->f32, 0)
1460                 };
1461                 pos[0] = lp_build_emit_llvm_unary(&ctx->bld_base,
1462                                                   TGSI_OPCODE_FRC, pos[0]);
1463                 pos[1] = lp_build_emit_llvm_unary(&ctx->bld_base,
1464                                                   TGSI_OPCODE_FRC, pos[1]);
1465                 value = lp_build_gather_values(gallivm, pos, 4);
1466                 break;
1467         }
1468
1469         case TGSI_SEMANTIC_SAMPLEMASK:
1470                 /* This can only occur with the OpenGL Core profile, which
1471                  * doesn't support smoothing.
1472                  */
1473                 value = LLVMGetParam(ctx->main_fn, SI_PARAM_SAMPLE_COVERAGE);
1474                 break;
1475
1476         case TGSI_SEMANTIC_TESSCOORD:
1477         {
1478                 LLVMValueRef coord[4] = {
1479                         LLVMGetParam(ctx->main_fn, ctx->param_tes_u),
1480                         LLVMGetParam(ctx->main_fn, ctx->param_tes_v),
1481                         bld->zero,
1482                         bld->zero
1483                 };
1484
1485                 /* For triangles, the vector should be (u, v, 1-u-v). */
1486                 if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
1487                     PIPE_PRIM_TRIANGLES)
1488                         coord[2] = lp_build_sub(bld, bld->one,
1489                                                 lp_build_add(bld, coord[0], coord[1]));
1490
1491                 value = lp_build_gather_values(gallivm, coord, 4);
1492                 break;
1493         }
1494
1495         case TGSI_SEMANTIC_VERTICESIN:
1496                 if (ctx->type == PIPE_SHADER_TESS_CTRL)
1497                         value = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 26, 6);
1498                 else if (ctx->type == PIPE_SHADER_TESS_EVAL)
1499                         value = unpack_param(ctx, ctx->param_tcs_offchip_layout, 9, 7);
1500                 else
1501                         assert(!"invalid shader stage for TGSI_SEMANTIC_VERTICESIN");
1502                 break;
1503
1504         case TGSI_SEMANTIC_TESSINNER:
1505         case TGSI_SEMANTIC_TESSOUTER:
1506         {
1507                 LLVMValueRef rw_buffers, buffer, base, addr;
1508                 int param = si_shader_io_get_unique_index(decl->Semantic.Name, 0);
1509
1510                 rw_buffers = LLVMGetParam(ctx->main_fn,
1511                                           ctx->param_rw_buffers);
1512                 buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
1513                         LLVMConstInt(ctx->i32, SI_HS_RING_TESS_OFFCHIP, 0));
1514
1515                 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
1516                 addr = get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), NULL,
1517                                           LLVMConstInt(ctx->i32, param, 0));
1518
1519                 value = buffer_load(&ctx->bld_base, TGSI_TYPE_FLOAT,
1520                                     ~0, buffer, base, addr, true);
1521
1522                 break;
1523         }
1524
1525         case TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI:
1526         case TGSI_SEMANTIC_DEFAULT_TESSINNER_SI:
1527         {
1528                 LLVMValueRef buf, slot, val[4];
1529                 int i, offset;
1530
1531                 slot = LLVMConstInt(ctx->i32, SI_HS_CONST_DEFAULT_TESS_LEVELS, 0);
1532                 buf = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
1533                 buf = ac_build_indexed_load_const(&ctx->ac, buf, slot);
1534                 offset = decl->Semantic.Name == TGSI_SEMANTIC_DEFAULT_TESSINNER_SI ? 4 : 0;
1535
1536                 for (i = 0; i < 4; i++)
1537                         val[i] = buffer_load_const(ctx, buf,
1538                                                    LLVMConstInt(ctx->i32, (offset + i) * 4, 0));
1539                 value = lp_build_gather_values(gallivm, val, 4);
1540                 break;
1541         }
1542
1543         case TGSI_SEMANTIC_PRIMID:
1544                 value = get_primitive_id(&ctx->bld_base, 0);
1545                 break;
1546
1547         case TGSI_SEMANTIC_GRID_SIZE:
1548                 value = LLVMGetParam(ctx->main_fn, SI_PARAM_GRID_SIZE);
1549                 break;
1550
1551         case TGSI_SEMANTIC_BLOCK_SIZE:
1552         {
1553                 LLVMValueRef values[3];
1554                 unsigned i;
1555                 unsigned *properties = ctx->shader->selector->info.properties;
1556
1557                 if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) {
1558                         unsigned sizes[3] = {
1559                                 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
1560                                 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
1561                                 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
1562                         };
1563
1564                         for (i = 0; i < 3; ++i)
1565                                 values[i] = LLVMConstInt(ctx->i32, sizes[i], 0);
1566
1567                         value = lp_build_gather_values(gallivm, values, 3);
1568                 } else {
1569                         value = LLVMGetParam(ctx->main_fn, SI_PARAM_BLOCK_SIZE);
1570                 }
1571                 break;
1572         }
1573
1574         case TGSI_SEMANTIC_BLOCK_ID:
1575                 value = LLVMGetParam(ctx->main_fn, SI_PARAM_BLOCK_ID);
1576                 break;
1577
1578         case TGSI_SEMANTIC_THREAD_ID:
1579                 value = LLVMGetParam(ctx->main_fn, SI_PARAM_THREAD_ID);
1580                 break;
1581
1582         case TGSI_SEMANTIC_HELPER_INVOCATION:
1583                 if (HAVE_LLVM >= 0x0309) {
1584                         value = lp_build_intrinsic(gallivm->builder,
1585                                                    "llvm.amdgcn.ps.live",
1586                                                    ctx->i1, NULL, 0,
1587                                                    LP_FUNC_ATTR_READNONE);
1588                         value = LLVMBuildNot(gallivm->builder, value, "");
1589                         value = LLVMBuildSExt(gallivm->builder, value, ctx->i32, "");
1590                 } else {
1591                         assert(!"TGSI_SEMANTIC_HELPER_INVOCATION unsupported");
1592                         return;
1593                 }
1594                 break;
1595
1596         case TGSI_SEMANTIC_SUBGROUP_SIZE:
1597                 value = LLVMConstInt(ctx->i32, 64, 0);
1598                 break;
1599
1600         case TGSI_SEMANTIC_SUBGROUP_INVOCATION:
1601                 value = ac_get_thread_id(&ctx->ac);
1602                 break;
1603
1604         case TGSI_SEMANTIC_SUBGROUP_EQ_MASK:
1605         {
1606                 LLVMValueRef id = ac_get_thread_id(&ctx->ac);
1607                 id = LLVMBuildZExt(gallivm->builder, id, ctx->i64, "");
1608                 value = LLVMBuildShl(gallivm->builder, LLVMConstInt(ctx->i64, 1, 0), id, "");
1609                 value = LLVMBuildBitCast(gallivm->builder, value, ctx->v2i32, "");
1610                 break;
1611         }
1612
1613         case TGSI_SEMANTIC_SUBGROUP_GE_MASK:
1614         case TGSI_SEMANTIC_SUBGROUP_GT_MASK:
1615         case TGSI_SEMANTIC_SUBGROUP_LE_MASK:
1616         case TGSI_SEMANTIC_SUBGROUP_LT_MASK:
1617         {
1618                 LLVMValueRef id = ac_get_thread_id(&ctx->ac);
1619                 if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_GT_MASK ||
1620                     decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK) {
1621                         /* All bits set except LSB */
1622                         value = LLVMConstInt(ctx->i64, -2, 0);
1623                 } else {
1624                         /* All bits set */
1625                         value = LLVMConstInt(ctx->i64, -1, 0);
1626                 }
1627                 id = LLVMBuildZExt(gallivm->builder, id, ctx->i64, "");
1628                 value = LLVMBuildShl(gallivm->builder, value, id, "");
1629                 if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK ||
1630                     decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LT_MASK)
1631                         value = LLVMBuildNot(gallivm->builder, value, "");
1632                 value = LLVMBuildBitCast(gallivm->builder, value, ctx->v2i32, "");
1633                 break;
1634         }
1635
1636         default:
1637                 assert(!"unknown system value");
1638                 return;
1639         }
1640
1641         ctx->system_values[index] = value;
1642 }
1643
1644 static void declare_compute_memory(struct si_shader_context *ctx,
1645                                    const struct tgsi_full_declaration *decl)
1646 {
1647         struct si_shader_selector *sel = ctx->shader->selector;
1648         struct gallivm_state *gallivm = &ctx->gallivm;
1649
1650         LLVMTypeRef i8p = LLVMPointerType(ctx->i8, LOCAL_ADDR_SPACE);
1651         LLVMValueRef var;
1652
1653         assert(decl->Declaration.MemType == TGSI_MEMORY_TYPE_SHARED);
1654         assert(decl->Range.First == decl->Range.Last);
1655         assert(!ctx->shared_memory);
1656
1657         var = LLVMAddGlobalInAddressSpace(gallivm->module,
1658                                           LLVMArrayType(ctx->i8, sel->local_size),
1659                                           "compute_lds",
1660                                           LOCAL_ADDR_SPACE);
1661         LLVMSetAlignment(var, 4);
1662
1663         ctx->shared_memory = LLVMBuildBitCast(gallivm->builder, var, i8p, "");
1664 }
1665
1666 static LLVMValueRef load_const_buffer_desc(struct si_shader_context *ctx, int i)
1667 {
1668         LLVMValueRef list_ptr = LLVMGetParam(ctx->main_fn,
1669                                              ctx->param_const_buffers);
1670
1671         return ac_build_indexed_load_const(&ctx->ac, list_ptr,
1672                                         LLVMConstInt(ctx->i32, i, 0));
1673 }
1674
1675 static LLVMValueRef fetch_constant(
1676         struct lp_build_tgsi_context *bld_base,
1677         const struct tgsi_full_src_register *reg,
1678         enum tgsi_opcode_type type,
1679         unsigned swizzle)
1680 {
1681         struct si_shader_context *ctx = si_shader_context(bld_base);
1682         struct lp_build_context *base = &bld_base->base;
1683         const struct tgsi_ind_register *ireg = &reg->Indirect;
1684         unsigned buf, idx;
1685
1686         LLVMValueRef addr, bufp;
1687         LLVMValueRef result;
1688
1689         if (swizzle == LP_CHAN_ALL) {
1690                 unsigned chan;
1691                 LLVMValueRef values[4];
1692                 for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
1693                         values[chan] = fetch_constant(bld_base, reg, type, chan);
1694
1695                 return lp_build_gather_values(&ctx->gallivm, values, 4);
1696         }
1697
1698         buf = reg->Register.Dimension ? reg->Dimension.Index : 0;
1699         idx = reg->Register.Index * 4 + swizzle;
1700
1701         if (reg->Register.Dimension && reg->Dimension.Indirect) {
1702                 LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_buffers);
1703                 LLVMValueRef index;
1704                 index = get_bounded_indirect_index(ctx, &reg->DimIndirect,
1705                                                    reg->Dimension.Index,
1706                                                    SI_NUM_CONST_BUFFERS);
1707                 bufp = ac_build_indexed_load_const(&ctx->ac, ptr, index);
1708         } else
1709                 bufp = load_const_buffer_desc(ctx, buf);
1710
1711         if (reg->Register.Indirect) {
1712                 addr = ctx->addrs[ireg->Index][ireg->Swizzle];
1713                 addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg");
1714                 addr = lp_build_mul_imm(&bld_base->uint_bld, addr, 16);
1715                 addr = lp_build_add(&bld_base->uint_bld, addr,
1716                                     LLVMConstInt(ctx->i32, idx * 4, 0));
1717         } else {
1718                 addr = LLVMConstInt(ctx->i32, idx * 4, 0);
1719         }
1720
1721         result = buffer_load_const(ctx, bufp, addr);
1722
1723         if (!tgsi_type_is_64bit(type))
1724                 result = bitcast(bld_base, type, result);
1725         else {
1726                 LLVMValueRef addr2, result2;
1727
1728                 addr2 = lp_build_add(&bld_base->uint_bld, addr,
1729                                      LLVMConstInt(ctx->i32, 4, 0));
1730                 result2 = buffer_load_const(ctx, bufp, addr2);
1731
1732                 result = si_llvm_emit_fetch_64bit(bld_base, type,
1733                                                   result, result2);
1734         }
1735         return result;
1736 }
1737
1738 /* Upper 16 bits must be zero. */
1739 static LLVMValueRef si_llvm_pack_two_int16(struct si_shader_context *ctx,
1740                                            LLVMValueRef val[2])
1741 {
1742         return LLVMBuildOr(ctx->gallivm.builder, val[0],
1743                            LLVMBuildShl(ctx->gallivm.builder, val[1],
1744                                         LLVMConstInt(ctx->i32, 16, 0),
1745                                         ""), "");
1746 }
1747
1748 /* Upper 16 bits are ignored and will be dropped. */
1749 static LLVMValueRef si_llvm_pack_two_int32_as_int16(struct si_shader_context *ctx,
1750                                                     LLVMValueRef val[2])
1751 {
1752         LLVMValueRef v[2] = {
1753                 LLVMBuildAnd(ctx->gallivm.builder, val[0],
1754                              LLVMConstInt(ctx->i32, 0xffff, 0), ""),
1755                 val[1],
1756         };
1757         return si_llvm_pack_two_int16(ctx, v);
1758 }
1759
1760 /* Initialize arguments for the shader export intrinsic */
1761 static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
1762                                      LLVMValueRef *values,
1763                                      unsigned target,
1764                                      struct ac_export_args *args)
1765 {
1766         struct si_shader_context *ctx = si_shader_context(bld_base);
1767         struct lp_build_context *base = &bld_base->base;
1768         LLVMBuilderRef builder = ctx->gallivm.builder;
1769         LLVMValueRef val[4];
1770         unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR;
1771         unsigned chan;
1772         bool is_int8, is_int10;
1773
1774         /* Default is 0xf. Adjusted below depending on the format. */
1775         args->enabled_channels = 0xf; /* writemask */
1776
1777         /* Specify whether the EXEC mask represents the valid mask */
1778         args->valid_mask = 0;
1779
1780         /* Specify whether this is the last export */
1781         args->done = 0;
1782
1783         /* Specify the target we are exporting */
1784         args->target = target;
1785
1786         if (ctx->type == PIPE_SHADER_FRAGMENT) {
1787                 const struct si_shader_key *key = &ctx->shader->key;
1788                 unsigned col_formats = key->part.ps.epilog.spi_shader_col_format;
1789                 int cbuf = target - V_008DFC_SQ_EXP_MRT;
1790
1791                 assert(cbuf >= 0 && cbuf < 8);
1792                 spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
1793                 is_int8 = (key->part.ps.epilog.color_is_int8 >> cbuf) & 0x1;
1794                 is_int10 = (key->part.ps.epilog.color_is_int10 >> cbuf) & 0x1;
1795         }
1796
1797         args->compr = false;
1798         args->out[0] = base->undef;
1799         args->out[1] = base->undef;
1800         args->out[2] = base->undef;
1801         args->out[3] = base->undef;
1802
1803         switch (spi_shader_col_format) {
1804         case V_028714_SPI_SHADER_ZERO:
1805                 args->enabled_channels = 0; /* writemask */
1806                 args->target = V_008DFC_SQ_EXP_NULL;
1807                 break;
1808
1809         case V_028714_SPI_SHADER_32_R:
1810                 args->enabled_channels = 1; /* writemask */
1811                 args->out[0] = values[0];
1812                 break;
1813
1814         case V_028714_SPI_SHADER_32_GR:
1815                 args->enabled_channels = 0x3; /* writemask */
1816                 args->out[0] = values[0];
1817                 args->out[1] = values[1];
1818                 break;
1819
1820         case V_028714_SPI_SHADER_32_AR:
1821                 args->enabled_channels = 0x9; /* writemask */
1822                 args->out[0] = values[0];
1823                 args->out[3] = values[3];
1824                 break;
1825
1826         case V_028714_SPI_SHADER_FP16_ABGR:
1827                 args->compr = 1; /* COMPR flag */
1828
1829                 for (chan = 0; chan < 2; chan++) {
1830                         LLVMValueRef pack_args[2] = {
1831                                 values[2 * chan],
1832                                 values[2 * chan + 1]
1833                         };
1834                         LLVMValueRef packed;
1835
1836                         packed = ac_build_cvt_pkrtz_f16(&ctx->ac, pack_args);
1837                         args->out[chan] =
1838                                 LLVMBuildBitCast(ctx->gallivm.builder,
1839                                                  packed, ctx->f32, "");
1840                 }
1841                 break;
1842
1843         case V_028714_SPI_SHADER_UNORM16_ABGR:
1844                 for (chan = 0; chan < 4; chan++) {
1845                         val[chan] = ac_build_clamp(&ctx->ac, values[chan]);
1846                         val[chan] = LLVMBuildFMul(builder, val[chan],
1847                                                   LLVMConstReal(ctx->f32, 65535), "");
1848                         val[chan] = LLVMBuildFAdd(builder, val[chan],
1849                                                   LLVMConstReal(ctx->f32, 0.5), "");
1850                         val[chan] = LLVMBuildFPToUI(builder, val[chan],
1851                                                     ctx->i32, "");
1852                 }
1853
1854                 args->compr = 1; /* COMPR flag */
1855                 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1856                                   si_llvm_pack_two_int16(ctx, val));
1857                 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1858                                   si_llvm_pack_two_int16(ctx, val+2));
1859                 break;
1860
1861         case V_028714_SPI_SHADER_SNORM16_ABGR:
1862                 for (chan = 0; chan < 4; chan++) {
1863                         /* Clamp between [-1, 1]. */
1864                         val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MIN,
1865                                                               values[chan],
1866                                                               LLVMConstReal(ctx->f32, 1));
1867                         val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MAX,
1868                                                               val[chan],
1869                                                               LLVMConstReal(ctx->f32, -1));
1870                         /* Convert to a signed integer in [-32767, 32767]. */
1871                         val[chan] = LLVMBuildFMul(builder, val[chan],
1872                                                   LLVMConstReal(ctx->f32, 32767), "");
1873                         /* If positive, add 0.5, else add -0.5. */
1874                         val[chan] = LLVMBuildFAdd(builder, val[chan],
1875                                         LLVMBuildSelect(builder,
1876                                                 LLVMBuildFCmp(builder, LLVMRealOGE,
1877                                                               val[chan], base->zero, ""),
1878                                                 LLVMConstReal(ctx->f32, 0.5),
1879                                                 LLVMConstReal(ctx->f32, -0.5), ""), "");
1880                         val[chan] = LLVMBuildFPToSI(builder, val[chan], ctx->i32, "");
1881                 }
1882
1883                 args->compr = 1; /* COMPR flag */
1884                 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1885                                   si_llvm_pack_two_int32_as_int16(ctx, val));
1886                 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1887                                   si_llvm_pack_two_int32_as_int16(ctx, val+2));
1888                 break;
1889
1890         case V_028714_SPI_SHADER_UINT16_ABGR: {
1891                 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
1892                         is_int8 ? 255 : is_int10 ? 1023 : 65535, 0);
1893                 LLVMValueRef max_alpha =
1894                         !is_int10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);
1895
1896                 /* Clamp. */
1897                 for (chan = 0; chan < 4; chan++) {
1898                         val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
1899                         val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_UMIN,
1900                                         val[chan],
1901                                         chan == 3 ? max_alpha : max_rgb);
1902                 }
1903
1904                 args->compr = 1; /* COMPR flag */
1905                 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1906                                   si_llvm_pack_two_int16(ctx, val));
1907                 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1908                                   si_llvm_pack_two_int16(ctx, val+2));
1909                 break;
1910         }
1911
1912         case V_028714_SPI_SHADER_SINT16_ABGR: {
1913                 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
1914                         is_int8 ? 127 : is_int10 ? 511 : 32767, 0);
1915                 LLVMValueRef min_rgb = LLVMConstInt(ctx->i32,
1916                         is_int8 ? -128 : is_int10 ? -512 : -32768, 0);
1917                 LLVMValueRef max_alpha =
1918                         !is_int10 ? max_rgb : ctx->i32_1;
1919                 LLVMValueRef min_alpha =
1920                         !is_int10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);
1921
1922                 /* Clamp. */
1923                 for (chan = 0; chan < 4; chan++) {
1924                         val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
1925                         val[chan] = lp_build_emit_llvm_binary(bld_base,
1926                                         TGSI_OPCODE_IMIN,
1927                                         val[chan], chan == 3 ? max_alpha : max_rgb);
1928                         val[chan] = lp_build_emit_llvm_binary(bld_base,
1929                                         TGSI_OPCODE_IMAX,
1930                                         val[chan], chan == 3 ? min_alpha : min_rgb);
1931                 }
1932
1933                 args->compr = 1; /* COMPR flag */
1934                 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1935                                   si_llvm_pack_two_int32_as_int16(ctx, val));
1936                 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1937                                   si_llvm_pack_two_int32_as_int16(ctx, val+2));
1938                 break;
1939         }
1940
1941         case V_028714_SPI_SHADER_32_ABGR:
1942                 memcpy(&args->out[0], values, sizeof(values[0]) * 4);
1943                 break;
1944         }
1945 }
1946
1947 static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
1948                           LLVMValueRef alpha)
1949 {
1950         struct si_shader_context *ctx = si_shader_context(bld_base);
1951
1952         if (ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_NEVER) {
1953                 LLVMValueRef alpha_ref = LLVMGetParam(ctx->main_fn,
1954                                 SI_PARAM_ALPHA_REF);
1955
1956                 LLVMValueRef alpha_pass =
1957                         lp_build_cmp(&bld_base->base,
1958                                      ctx->shader->key.part.ps.epilog.alpha_func,
1959                                      alpha, alpha_ref);
1960                 LLVMValueRef arg =
1961                         lp_build_select(&bld_base->base,
1962                                         alpha_pass,
1963                                         LLVMConstReal(ctx->f32, 1.0f),
1964                                         LLVMConstReal(ctx->f32, -1.0f));
1965
1966                 ac_build_kill(&ctx->ac, arg);
1967         } else {
1968                 ac_build_kill(&ctx->ac, NULL);
1969         }
1970 }
1971
1972 static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
1973                                                   LLVMValueRef alpha,
1974                                                   unsigned samplemask_param)
1975 {
1976         struct si_shader_context *ctx = si_shader_context(bld_base);
1977         struct gallivm_state *gallivm = &ctx->gallivm;
1978         LLVMValueRef coverage;
1979
1980         /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
1981         coverage = LLVMGetParam(ctx->main_fn,
1982                                 samplemask_param);
1983         coverage = bitcast(bld_base, TGSI_TYPE_SIGNED, coverage);
1984
1985         coverage = lp_build_intrinsic(gallivm->builder, "llvm.ctpop.i32",
1986                                    ctx->i32,
1987                                    &coverage, 1, LP_FUNC_ATTR_READNONE);
1988
1989         coverage = LLVMBuildUIToFP(gallivm->builder, coverage,
1990                                    ctx->f32, "");
1991
1992         coverage = LLVMBuildFMul(gallivm->builder, coverage,
1993                                  LLVMConstReal(ctx->f32,
1994                                         1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
1995
1996         return LLVMBuildFMul(gallivm->builder, alpha, coverage, "");
1997 }
1998
1999 static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context *bld_base,
2000                                     struct ac_export_args *pos, LLVMValueRef *out_elts)
2001 {
2002         struct si_shader_context *ctx = si_shader_context(bld_base);
2003         struct lp_build_context *base = &bld_base->base;
2004         unsigned reg_index;
2005         unsigned chan;
2006         unsigned const_chan;
2007         LLVMValueRef base_elt;
2008         LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
2009         LLVMValueRef constbuf_index = LLVMConstInt(ctx->i32,
2010                                                    SI_VS_CONST_CLIP_PLANES, 0);
2011         LLVMValueRef const_resource = ac_build_indexed_load_const(&ctx->ac, ptr, constbuf_index);
2012
2013         for (reg_index = 0; reg_index < 2; reg_index ++) {
2014                 struct ac_export_args *args = &pos[2 + reg_index];
2015
2016                 args->out[0] =
2017                 args->out[1] =
2018                 args->out[2] =
2019                 args->out[3] = LLVMConstReal(ctx->f32, 0.0f);
2020
2021                 /* Compute dot products of position and user clip plane vectors */
2022                 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2023                         for (const_chan = 0; const_chan < TGSI_NUM_CHANNELS; const_chan++) {
2024                                 LLVMValueRef addr =
2025                                         LLVMConstInt(ctx->i32, ((reg_index * 4 + chan) * 4 +
2026                                                                 const_chan) * 4, 0);
2027                                 base_elt = buffer_load_const(ctx, const_resource,
2028                                                              addr);
2029                                 args->out[chan] =
2030                                         lp_build_add(base, args->out[chan],
2031                                                      lp_build_mul(base, base_elt,
2032                                                                   out_elts[const_chan]));
2033                         }
2034                 }
2035
2036                 args->enabled_channels = 0xf;
2037                 args->valid_mask = 0;
2038                 args->done = 0;
2039                 args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index;
2040                 args->compr = 0;
2041         }
2042 }
2043
2044 static void si_dump_streamout(struct pipe_stream_output_info *so)
2045 {
2046         unsigned i;
2047
2048         if (so->num_outputs)
2049                 fprintf(stderr, "STREAMOUT\n");
2050
2051         for (i = 0; i < so->num_outputs; i++) {
2052                 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
2053                                 so->output[i].start_component;
2054                 fprintf(stderr, "  %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
2055                         i, so->output[i].output_buffer,
2056                         so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
2057                         so->output[i].register_index,
2058                         mask & 1 ? "x" : "",
2059                         mask & 2 ? "y" : "",
2060                         mask & 4 ? "z" : "",
2061                         mask & 8 ? "w" : "");
2062         }
2063 }
2064
2065 static void emit_streamout_output(struct si_shader_context *ctx,
2066                                   LLVMValueRef const *so_buffers,
2067                                   LLVMValueRef const *so_write_offsets,
2068                                   struct pipe_stream_output *stream_out,
2069                                   struct si_shader_output_values *shader_out)
2070 {
2071         struct gallivm_state *gallivm = &ctx->gallivm;
2072         LLVMBuilderRef builder = gallivm->builder;
2073         unsigned buf_idx = stream_out->output_buffer;
2074         unsigned start = stream_out->start_component;
2075         unsigned num_comps = stream_out->num_components;
2076         LLVMValueRef out[4];
2077
2078         assert(num_comps && num_comps <= 4);
2079         if (!num_comps || num_comps > 4)
2080                 return;
2081
2082         /* Load the output as int. */
2083         for (int j = 0; j < num_comps; j++) {
2084                 assert(stream_out->stream == shader_out->vertex_stream[start + j]);
2085
2086                 out[j] = LLVMBuildBitCast(builder,
2087                                           shader_out->values[start + j],
2088                                 ctx->i32, "");
2089         }
2090
2091         /* Pack the output. */
2092         LLVMValueRef vdata = NULL;
2093
2094         switch (num_comps) {
2095         case 1: /* as i32 */
2096                 vdata = out[0];
2097                 break;
2098         case 2: /* as v2i32 */
2099         case 3: /* as v4i32 (aligned to 4) */
2100         case 4: /* as v4i32 */
2101                 vdata = LLVMGetUndef(LLVMVectorType(ctx->i32, util_next_power_of_two(num_comps)));
2102                 for (int j = 0; j < num_comps; j++) {
2103                         vdata = LLVMBuildInsertElement(builder, vdata, out[j],
2104                                                        LLVMConstInt(ctx->i32, j, 0), "");
2105                 }
2106                 break;
2107         }
2108
2109         ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx],
2110                                     vdata, num_comps,
2111                                     so_write_offsets[buf_idx],
2112                                     ctx->i32_0,
2113                                     stream_out->dst_offset * 4, 1, 1, true, false);
2114 }
2115
2116 /**
2117  * Write streamout data to buffers for vertex stream @p stream (different
2118  * vertex streams can occur for GS copy shaders).
2119  */
2120 static void si_llvm_emit_streamout(struct si_shader_context *ctx,
2121                                    struct si_shader_output_values *outputs,
2122                                    unsigned noutput, unsigned stream)
2123 {
2124         struct si_shader_selector *sel = ctx->shader->selector;
2125         struct pipe_stream_output_info *so = &sel->so;
2126         struct gallivm_state *gallivm = &ctx->gallivm;
2127         LLVMBuilderRef builder = gallivm->builder;
2128         int i;
2129         struct lp_build_if_state if_ctx;
2130
2131         /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
2132         LLVMValueRef so_vtx_count =
2133                 unpack_param(ctx, ctx->param_streamout_config, 16, 7);
2134
2135         LLVMValueRef tid = ac_get_thread_id(&ctx->ac);
2136
2137         /* can_emit = tid < so_vtx_count; */
2138         LLVMValueRef can_emit =
2139                 LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");
2140
2141         /* Emit the streamout code conditionally. This actually avoids
2142          * out-of-bounds buffer access. The hw tells us via the SGPR
2143          * (so_vtx_count) which threads are allowed to emit streamout data. */
2144         lp_build_if(&if_ctx, gallivm, can_emit);
2145         {
2146                 /* The buffer offset is computed as follows:
2147                  *   ByteOffset = streamout_offset[buffer_id]*4 +
2148                  *                (streamout_write_index + thread_id)*stride[buffer_id] +
2149                  *                attrib_offset
2150                  */
2151
2152                 LLVMValueRef so_write_index =
2153                         LLVMGetParam(ctx->main_fn,
2154                                      ctx->param_streamout_write_index);
2155
2156                 /* Compute (streamout_write_index + thread_id). */
2157                 so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");
2158
2159                 /* Load the descriptor and compute the write offset for each
2160                  * enabled buffer. */
2161                 LLVMValueRef so_write_offset[4] = {};
2162                 LLVMValueRef so_buffers[4];
2163                 LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
2164                                                     ctx->param_rw_buffers);
2165
2166                 for (i = 0; i < 4; i++) {
2167                         if (!so->stride[i])
2168                                 continue;
2169
2170                         LLVMValueRef offset = LLVMConstInt(ctx->i32,
2171                                                            SI_VS_STREAMOUT_BUF0 + i, 0);
2172
2173                         so_buffers[i] = ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
2174
2175                         LLVMValueRef so_offset = LLVMGetParam(ctx->main_fn,
2176                                                               ctx->param_streamout_offset[i]);
2177                         so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), "");
2178
2179                         so_write_offset[i] = LLVMBuildMul(builder, so_write_index,
2180                                                           LLVMConstInt(ctx->i32, so->stride[i]*4, 0), "");
2181                         so_write_offset[i] = LLVMBuildAdd(builder, so_write_offset[i], so_offset, "");
2182                 }
2183
2184                 /* Write streamout data. */
2185                 for (i = 0; i < so->num_outputs; i++) {
2186                         unsigned reg = so->output[i].register_index;
2187
2188                         if (reg >= noutput)
2189                                 continue;
2190
2191                         if (stream != so->output[i].stream)
2192                                 continue;
2193
2194                         emit_streamout_output(ctx, so_buffers, so_write_offset,
2195                                               &so->output[i], &outputs[reg]);
2196                 }
2197         }
2198         lp_build_endif(&if_ctx);
2199 }
2200
2201
/* Generate export instructions for hardware VS shader stage
 *
 * Walks the @p noutput entries of @p outputs. Parameter exports are emitted
 * immediately with consecutive PARAM targets. Position-class exports
 * (position, the misc vector, clip distances / clip vertex) are first
 * collected in pos_args[] and emitted at the very end, the last one being
 * flagged "done". Along the way this records nr_param_exports,
 * nr_pos_exports and the per-output vs_output_param_offset[] in
 * shader->info.
 */
static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base,
                              struct si_shader_output_values *outputs,
                              unsigned noutput)
{
        struct si_shader_context *ctx = si_shader_context(bld_base);
        struct si_shader *shader = ctx->shader;
        struct lp_build_context *base = &bld_base->base;
        /* pos_args is zero-initialized; a NULL out[0] later means "slot unused". */
        struct ac_export_args args, pos_args[4] = {};
        LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
        unsigned semantic_name, semantic_index;
        unsigned target;
        unsigned param_count = 0;
        unsigned pos_idx;
        int i;

        for (i = 0; i < noutput; i++) {
                semantic_name = outputs[i].semantic_name;
                semantic_index = outputs[i].semantic_index;
                bool export_param = true;

                /* Check whether the shader key kills the parameter export
                 * of this output. */
                switch (semantic_name) {
                case TGSI_SEMANTIC_POSITION: /* ignore these */
                case TGSI_SEMANTIC_PSIZE:
                case TGSI_SEMANTIC_CLIPVERTEX:
                case TGSI_SEMANTIC_EDGEFLAG:
                        break;
                case TGSI_SEMANTIC_GENERIC:
                case TGSI_SEMANTIC_CLIPDIST:
                        if (shader->key.opt.hw_vs.kill_outputs &
                            (1ull << si_shader_io_get_unique_index(semantic_name, semantic_index)))
                                export_param = false;
                        break;
                default:
                        if (shader->key.opt.hw_vs.kill_outputs2 &
                            (1u << si_shader_io_get_unique_index2(semantic_name, semantic_index)))
                                export_param = false;
                        break;
                }

                /* None of the four components is assigned to vertex
                 * stream 0: do not export it as a parameter. */
                if (outputs[i].vertex_stream[0] != 0 &&
                    outputs[i].vertex_stream[1] != 0 &&
                    outputs[i].vertex_stream[2] != 0 &&
                    outputs[i].vertex_stream[3] != 0)
                        export_param = false;

/* Re-entered via goto once semantic_name has been rewritten to GENERIC, so
 * that the same output is additionally handled as a parameter. */
handle_semantic:
                /* Select the correct target */
                switch(semantic_name) {
                case TGSI_SEMANTIC_PSIZE:
                        /* Only remembered for the misc vector below. */
                        psize_value = outputs[i].values[0];
                        continue;
                case TGSI_SEMANTIC_EDGEFLAG:
                        /* Only remembered for the misc vector below. */
                        edgeflag_value = outputs[i].values[0];
                        continue;
                case TGSI_SEMANTIC_LAYER:
                        /* Remembered for the misc vector and also treated
                         * as a generic parameter. */
                        layer_value = outputs[i].values[0];
                        semantic_name = TGSI_SEMANTIC_GENERIC;
                        goto handle_semantic;
                case TGSI_SEMANTIC_VIEWPORT_INDEX:
                        /* Remembered for the misc vector and also treated
                         * as a generic parameter. */
                        viewport_index_value = outputs[i].values[0];
                        semantic_name = TGSI_SEMANTIC_GENERIC;
                        goto handle_semantic;
                case TGSI_SEMANTIC_POSITION:
                        target = V_008DFC_SQ_EXP_POS;
                        break;
                case TGSI_SEMANTIC_CLIPDIST:
                        /* With clipping disabled the clip distances are
                         * handled as a generic parameter only. */
                        if (shader->key.opt.hw_vs.clip_disable) {
                                semantic_name = TGSI_SEMANTIC_GENERIC;
                                goto handle_semantic;
                        }
                        target = V_008DFC_SQ_EXP_POS + 2 + semantic_index;
                        break;
                case TGSI_SEMANTIC_CLIPVERTEX:
                        if (shader->key.opt.hw_vs.clip_disable)
                                continue;
                        /* Fills the clip-distance slots of pos_args directly. */
                        si_llvm_emit_clipvertex(bld_base, pos_args, outputs[i].values);
                        continue;
                case TGSI_SEMANTIC_COLOR:
                case TGSI_SEMANTIC_BCOLOR:
                case TGSI_SEMANTIC_PRIMID:
                case TGSI_SEMANTIC_FOG:
                case TGSI_SEMANTIC_TEXCOORD:
                case TGSI_SEMANTIC_GENERIC:
                        if (!export_param)
                                continue;
                        /* Parameters get consecutive targets; remember
                         * which one output i ended up in. */
                        target = V_008DFC_SQ_EXP_PARAM + param_count;
                        assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
                        shader->info.vs_output_param_offset[i] = param_count;
                        param_count++;
                        break;
                default:
                        /* Unknown semantic: warn, then fall out of the
                         * switch and export with target 0. */
                        target = 0;
                        fprintf(stderr,
                                "Warning: SI unhandled vs output type:%d\n",
                                semantic_name);
                }

                si_llvm_init_export_args(bld_base, outputs[i].values, target, &args);

                /* Position-class exports are deferred into pos_args[];
                 * everything else is exported right away. */
                if (target >= V_008DFC_SQ_EXP_POS &&
                    target <= (V_008DFC_SQ_EXP_POS + 3)) {
                        memcpy(&pos_args[target - V_008DFC_SQ_EXP_POS],
                               &args, sizeof(args));
                } else {
                        ac_build_export(&ctx->ac, &args);
                }

                /* Clip distances were stored as a position export above;
                 * run the output once more as GENERIC so that it can also
                 * be exported as a parameter (subject to export_param). */
                if (semantic_name == TGSI_SEMANTIC_CLIPDIST) {
                        semantic_name = TGSI_SEMANTIC_GENERIC;
                        goto handle_semantic;
                }
        }

        shader->info.nr_param_exports = param_count;

        /* We need to add the position output manually if it's missing. */
        if (!pos_args[0].out[0]) {
                pos_args[0].enabled_channels = 0xf; /* writemask */
                pos_args[0].valid_mask = 0; /* EXEC mask */
                pos_args[0].done = 0; /* last export? */
                pos_args[0].target = V_008DFC_SQ_EXP_POS;
                pos_args[0].compr = 0; /* COMPR flag */
                pos_args[0].out[0] = base->zero; /* X */
                pos_args[0].out[1] = base->zero; /* Y */
                pos_args[0].out[2] = base->zero; /* Z */
                pos_args[0].out[3] = base->one;  /* W */
        }

        /* Write the misc vector (point size, edgeflag, layer, viewport). */
        if (shader->selector->info.writes_psize ||
            shader->selector->info.writes_edgeflag ||
            shader->selector->info.writes_viewport_index ||
            shader->selector->info.writes_layer) {
                /* One channel-enable bit per value: X = psize, Y = edgeflag,
                 * Z = layer, W = viewport index. */
                pos_args[1].enabled_channels = shader->selector->info.writes_psize |
                                               (shader->selector->info.writes_edgeflag << 1) |
                                               (shader->selector->info.writes_layer << 2) |
                                               (shader->selector->info.writes_viewport_index << 3);
                pos_args[1].valid_mask = 0; /* EXEC mask */
                pos_args[1].done = 0; /* last export? */
                pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
                pos_args[1].compr = 0; /* COMPR flag */
                pos_args[1].out[0] = base->zero; /* X */
                pos_args[1].out[1] = base->zero; /* Y */
                pos_args[1].out[2] = base->zero; /* Z */
                pos_args[1].out[3] = base->zero; /* W */

                if (shader->selector->info.writes_psize)
                        pos_args[1].out[0] = psize_value;

                if (shader->selector->info.writes_edgeflag) {
                        /* The output is a float, but the hw expects an integer
                         * with the first bit containing the edge flag. */
                        edgeflag_value = LLVMBuildFPToUI(ctx->gallivm.builder,
                                                         edgeflag_value,
                                                         ctx->i32, "");
                        /* Clamp to at most 1. */
                        edgeflag_value = lp_build_min(&bld_base->int_bld,
                                                      edgeflag_value,
                                                      ctx->i32_1);

                        /* The LLVM intrinsic expects a float. */
                        pos_args[1].out[1] = LLVMBuildBitCast(ctx->gallivm.builder,
                                                          edgeflag_value,
                                                          ctx->f32, "");
                }

                if (shader->selector->info.writes_layer)
                        pos_args[1].out[2] = layer_value;

                if (shader->selector->info.writes_viewport_index)
                        pos_args[1].out[3] = viewport_index_value;
        }

        /* Count the position slots that are actually in use.
         * NOTE(review): nr_pos_exports is only incremented here, never
         * reset - presumably it is 0 on entry; confirm against the caller. */
        for (i = 0; i < 4; i++)
                if (pos_args[i].out[0])
                        shader->info.nr_pos_exports++;

        /* Emit the used slots, renumbering them so that the position
         * targets are consecutive. */
        pos_idx = 0;
        for (i = 0; i < 4; i++) {
                if (!pos_args[i].out[0])
                        continue;

                /* Specify the target we are exporting */
                pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;

                if (pos_idx == shader->info.nr_pos_exports)
                        /* Specify that this is the last export */
                        pos_args[i].done = 1;

                ac_build_export(&ctx->ac, &pos_args[i]);
        }
}
2394
2395 /**
2396  * Forward all outputs from the vertex shader to the TES. This is only used
2397  * for the fixed function TCS.
2398  */
2399 static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
2400 {
2401         struct si_shader_context *ctx = si_shader_context(bld_base);
2402         struct gallivm_state *gallivm = &ctx->gallivm;
2403         LLVMValueRef invocation_id, rw_buffers, buffer, buffer_offset;
2404         LLVMValueRef lds_vertex_stride, lds_vertex_offset, lds_base;
2405         uint64_t inputs;
2406
2407         invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
2408
2409         rw_buffers = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
2410         buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
2411                         LLVMConstInt(ctx->i32, SI_HS_RING_TESS_OFFCHIP, 0));
2412
2413         buffer_offset = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
2414
2415         lds_vertex_stride = unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
2416         lds_vertex_offset = LLVMBuildMul(gallivm->builder, invocation_id,
2417                                          lds_vertex_stride, "");
2418         lds_base = get_tcs_in_current_patch_offset(ctx);
2419         lds_base = LLVMBuildAdd(gallivm->builder, lds_base, lds_vertex_offset, "");
2420
2421         inputs = ctx->shader->key.mono.ff_tcs_inputs_to_copy;
2422         while (inputs) {
2423                 unsigned i = u_bit_scan64(&inputs);
2424
2425                 LLVMValueRef lds_ptr = LLVMBuildAdd(gallivm->builder, lds_base,
2426                                             LLVMConstInt(ctx->i32, 4 * i, 0),
2427                                              "");
2428
2429                 LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx,
2430                                               get_rel_patch_id(ctx),
2431                                               invocation_id,
2432                                               LLVMConstInt(ctx->i32, i, 0));
2433
2434                 LLVMValueRef value = lds_load(bld_base, TGSI_TYPE_SIGNED, ~0,
2435                                               lds_ptr);
2436
2437                 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buffer_addr,
2438                                             buffer_offset, 0, 1, 0, true, false);
2439         }
2440 }
2441
2442 static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
2443                                   LLVMValueRef rel_patch_id,
2444                                   LLVMValueRef invocation_id,
2445                                   LLVMValueRef tcs_out_current_patch_data_offset)
2446 {
2447         struct si_shader_context *ctx = si_shader_context(bld_base);
2448         struct gallivm_state *gallivm = &ctx->gallivm;
2449         struct si_shader *shader = ctx->shader;
2450         unsigned tess_inner_index, tess_outer_index;
2451         LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
2452         LLVMValueRef out[6], vec0, vec1, rw_buffers, tf_base, inner[4], outer[4];
2453         unsigned stride, outer_comps, inner_comps, i;
2454         struct lp_build_if_state if_ctx, inner_if_ctx;
2455
2456         si_llvm_emit_barrier(NULL, bld_base, NULL);
2457
2458         /* Do this only for invocation 0, because the tess levels are per-patch,
2459          * not per-vertex.
2460          *
2461          * This can't jump, because invocation 0 executes this. It should
2462          * at least mask out the loads and stores for other invocations.
2463          */
2464         lp_build_if(&if_ctx, gallivm,
2465                     LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
2466                                   invocation_id, ctx->i32_0, ""));
2467
2468         /* Determine the layout of one tess factor element in the buffer. */
2469         switch (shader->key.part.tcs.epilog.prim_mode) {
2470         case PIPE_PRIM_LINES:
2471                 stride = 2; /* 2 dwords, 1 vec2 store */
2472                 outer_comps = 2;
2473                 inner_comps = 0;
2474                 break;
2475         case PIPE_PRIM_TRIANGLES:
2476                 stride = 4; /* 4 dwords, 1 vec4 store */
2477                 outer_comps = 3;
2478                 inner_comps = 1;
2479                 break;
2480         case PIPE_PRIM_QUADS:
2481                 stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
2482                 outer_comps = 4;
2483                 inner_comps = 2;
2484                 break;
2485         default:
2486                 assert(0);
2487                 return;
2488         }
2489
2490         /* Load tess_inner and tess_outer from LDS.
2491          * Any invocation can write them, so we can't get them from a temporary.
2492          */
2493         tess_inner_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSINNER, 0);
2494         tess_outer_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSOUTER, 0);
2495
2496         lds_base = tcs_out_current_patch_data_offset;
2497         lds_inner = LLVMBuildAdd(gallivm->builder, lds_base,
2498                                  LLVMConstInt(ctx->i32,
2499                                               tess_inner_index * 4, 0), "");
2500         lds_outer = LLVMBuildAdd(gallivm->builder, lds_base,
2501                                  LLVMConstInt(ctx->i32,
2502                                               tess_outer_index * 4, 0), "");
2503
2504         for (i = 0; i < 4; i++) {
2505                 inner[i] = LLVMGetUndef(ctx->i32);
2506                 outer[i] = LLVMGetUndef(ctx->i32);
2507         }
2508
2509         if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) {
2510                 /* For isolines, the hardware expects tess factors in the
2511                  * reverse order from what GLSL / TGSI specify.
2512                  */
2513                 outer[0] = out[1] = lds_load(bld_base, TGSI_TYPE_SIGNED, 0, lds_outer);
2514                 outer[1] = out[0] = lds_load(bld_base, TGSI_TYPE_SIGNED, 1, lds_outer);
2515         } else {
2516                 for (i = 0; i < outer_comps; i++) {
2517                         outer[i] = out[i] =
2518                                 lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer);
2519                 }
2520                 for (i = 0; i < inner_comps; i++) {
2521                         inner[i] = out[outer_comps+i] =
2522                                 lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_inner);
2523                 }
2524         }
2525
2526         /* Convert the outputs to vectors for stores. */
2527         vec0 = lp_build_gather_values(gallivm, out, MIN2(stride, 4));
2528         vec1 = NULL;
2529
2530         if (stride > 4)
2531                 vec1 = lp_build_gather_values(gallivm, out+4, stride - 4);
2532
2533         /* Get the buffer. */
2534         rw_buffers = LLVMGetParam(ctx->main_fn,
2535                                   ctx->param_rw_buffers);
2536         buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
2537                         LLVMConstInt(ctx->i32, SI_HS_RING_TESS_FACTOR, 0));
2538
2539         /* Get the offset. */
2540         tf_base = LLVMGetParam(ctx->main_fn,
2541                                ctx->param_tcs_factor_offset);
2542         byteoffset = LLVMBuildMul(gallivm->builder, rel_patch_id,
2543                                   LLVMConstInt(ctx->i32, 4 * stride, 0), "");
2544
2545         lp_build_if(&inner_if_ctx, gallivm,
2546                     LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
2547                                   rel_patch_id, ctx->i32_0, ""));
2548
2549         /* Store the dynamic HS control word. */
2550         ac_build_buffer_store_dword(&ctx->ac, buffer,
2551                                     LLVMConstInt(ctx->i32, 0x80000000, 0),
2552                                     1, ctx->i32_0, tf_base,
2553                                     0, 1, 0, true, false);
2554
2555         lp_build_endif(&inner_if_ctx);
2556
2557         /* Store the tessellation factors. */
2558         ac_build_buffer_store_dword(&ctx->ac, buffer, vec0,
2559                                     MIN2(stride, 4), byteoffset, tf_base,
2560                                     4, 1, 0, true, false);
2561         if (vec1)
2562                 ac_build_buffer_store_dword(&ctx->ac, buffer, vec1,
2563                                             stride - 4, byteoffset, tf_base,
2564                                             20, 1, 0, true, false);
2565
2566         /* Store the tess factors into the offchip buffer if TES reads them. */
2567         if (shader->key.part.tcs.epilog.tes_reads_tess_factors) {
2568                 LLVMValueRef buf, base, inner_vec, outer_vec, tf_outer_offset;
2569                 LLVMValueRef tf_inner_offset;
2570                 unsigned param_outer, param_inner;
2571
2572                 buf = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
2573                                 LLVMConstInt(ctx->i32, SI_HS_RING_TESS_OFFCHIP, 0));
2574                 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
2575
2576                 param_outer = si_shader_io_get_unique_index(
2577                                       TGSI_SEMANTIC_TESSOUTER, 0);
2578                 tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
2579                                         LLVMConstInt(ctx->i32, param_outer, 0));
2580
2581                 outer_vec = lp_build_gather_values(gallivm, outer,
2582                                                    util_next_power_of_two(outer_comps));
2583
2584                 ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec,
2585                                             outer_comps, tf_outer_offset,
2586                                             base, 0, 1, 0, true, false);
2587                 if (inner_comps) {
2588                         param_inner = si_shader_io_get_unique_index(
2589                                               TGSI_SEMANTIC_TESSINNER, 0);
2590                         tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
2591                                         LLVMConstInt(ctx->i32, param_inner, 0));
2592
2593                         inner_vec = inner_comps == 1 ? inner[0] :
2594                                     lp_build_gather_values(gallivm, inner, inner_comps);
2595                         ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec,
2596                                                     inner_comps, tf_inner_offset,
2597                                                     base, 0, 1, 0, true, false);
2598                 }
2599         }
2600
2601         lp_build_endif(&if_ctx);
2602 }
2603
2604 /* This only writes the tessellation factor levels. */
2605 static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
2606 {
2607         struct si_shader_context *ctx = si_shader_context(bld_base);
2608         LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;
2609         LLVMValueRef offchip_soffset, offchip_layout;
2610
2611         si_copy_tcs_inputs(bld_base);
2612
2613         rel_patch_id = get_rel_patch_id(ctx);
2614         invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
2615         tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);
2616
2617         /* Return epilog parameters from this function. */
2618         LLVMBuilderRef builder = ctx->gallivm.builder;
2619         LLVMValueRef ret = ctx->return_value;
2620         LLVMValueRef rw_buffers, rw0, rw1, tf_soffset;
2621         unsigned vgpr;
2622
2623         /* RW_BUFFERS pointer */
2624         rw_buffers = LLVMGetParam(ctx->main_fn,
2625                                   ctx->param_rw_buffers);
2626         rw_buffers = LLVMBuildPtrToInt(builder, rw_buffers, ctx->i64, "");
2627         rw_buffers = LLVMBuildBitCast(builder, rw_buffers, ctx->v2i32, "");
2628         rw0 = LLVMBuildExtractElement(builder, rw_buffers,
2629                                       ctx->i32_0, "");
2630         rw1 = LLVMBuildExtractElement(builder, rw_buffers,
2631                                       ctx->i32_1, "");
2632         ret = LLVMBuildInsertValue(builder, ret, rw0, 0, "");
2633         ret = LLVMBuildInsertValue(builder, ret, rw1, 1, "");
2634
2635         /* Tess offchip and factor buffer soffset are after user SGPRs. */
2636         offchip_layout = LLVMGetParam(ctx->main_fn,
2637                                       ctx->param_tcs_offchip_layout);
2638         offchip_soffset = LLVMGetParam(ctx->main_fn,
2639                                        ctx->param_tcs_offchip_offset);
2640         tf_soffset = LLVMGetParam(ctx->main_fn,
2641                                   ctx->param_tcs_factor_offset);
2642         ret = LLVMBuildInsertValue(builder, ret, offchip_layout,
2643                                    GFX6_SGPR_TCS_OFFCHIP_LAYOUT, "");
2644         ret = LLVMBuildInsertValue(builder, ret, offchip_soffset,
2645                                    GFX6_TCS_NUM_USER_SGPR, "");
2646         ret = LLVMBuildInsertValue(builder, ret, tf_soffset,
2647                                    GFX6_TCS_NUM_USER_SGPR + 1, "");
2648
2649         /* VGPRs */
2650         rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id);
2651         invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id);
2652         tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset);
2653
2654         vgpr = GFX6_TCS_NUM_USER_SGPR + 2;
2655         ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
2656         ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
2657         ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
2658         ctx->return_value = ret;
2659 }
2660
2661 static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base)
2662 {
2663         struct si_shader_context *ctx = si_shader_context(bld_base);
2664         struct si_shader *shader = ctx->shader;
2665         struct tgsi_shader_info *info = &shader->selector->info;
2666         struct gallivm_state *gallivm = &ctx->gallivm;
2667         unsigned i, chan;
2668         LLVMValueRef vertex_id = LLVMGetParam(ctx->main_fn,
2669                                               ctx->param_rel_auto_id);
2670         LLVMValueRef vertex_dw_stride =
2671                 unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
2672         LLVMValueRef base_dw_addr = LLVMBuildMul(gallivm->builder, vertex_id,
2673                                                  vertex_dw_stride, "");
2674
2675         /* Write outputs to LDS. The next shader (TCS aka HS) will read
2676          * its inputs from it. */
2677         for (i = 0; i < info->num_outputs; i++) {
2678                 LLVMValueRef *out_ptr = ctx->outputs[i];
2679                 unsigned name = info->output_semantic_name[i];
2680                 unsigned index = info->output_semantic_index[i];
2681
2682                 /* The ARB_shader_viewport_layer_array spec contains the
2683                  * following issue:
2684                  *
2685                  *    2) What happens if gl_ViewportIndex or gl_Layer is
2686                  *    written in the vertex shader and a geometry shader is
2687                  *    present?
2688                  *
2689                  *    RESOLVED: The value written by the last vertex processing
2690                  *    stage is used. If the last vertex processing stage
2691                  *    (vertex, tessellation evaluation or geometry) does not
2692                  *    statically assign to gl_ViewportIndex or gl_Layer, index
2693                  *    or layer zero is assumed.
2694                  *
2695                  * So writes to those outputs in VS-as-LS are simply ignored.
2696                  */
2697                 if (name == TGSI_SEMANTIC_LAYER ||
2698                     name == TGSI_SEMANTIC_VIEWPORT_INDEX)
2699                         continue;
2700
2701                 int param = si_shader_io_get_unique_index(name, index);
2702                 LLVMValueRef dw_addr = LLVMBuildAdd(gallivm->builder, base_dw_addr,
2703                                         LLVMConstInt(ctx->i32, param * 4, 0), "");
2704
2705                 for (chan = 0; chan < 4; chan++) {
2706                         lds_store(bld_base, chan, dw_addr,
2707                                   LLVMBuildLoad(gallivm->builder, out_ptr[chan], ""));
2708                 }
2709         }
2710 }
2711
2712 static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context *bld_base)
2713 {
2714         struct si_shader_context *ctx = si_shader_context(bld_base);
2715         struct gallivm_state *gallivm = &ctx->gallivm;
2716         struct si_shader *es = ctx->shader;
2717         struct tgsi_shader_info *info = &es->selector->info;
2718         LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
2719                                             ctx->param_es2gs_offset);
2720         unsigned chan;
2721         int i;
2722
2723         for (i = 0; i < info->num_outputs; i++) {
2724                 LLVMValueRef *out_ptr = ctx->outputs[i];
2725                 int param_index;
2726
2727                 if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
2728                     info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
2729                         continue;
2730
2731                 param_index = si_shader_io_get_unique_index(info->output_semantic_name[i],
2732                                                             info->output_semantic_index[i]);
2733
2734                 for (chan = 0; chan < 4; chan++) {
2735                         LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
2736                         out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");
2737
2738                         ac_build_buffer_store_dword(&ctx->ac,
2739                                                     ctx->esgs_ring,
2740                                                     out_val, 1, NULL, soffset,
2741                                                     (4 * param_index + chan) * 4,
2742                                                     1, 1, true, true);
2743                 }
2744         }
2745 }
2746
2747 static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
2748 {
2749         struct si_shader_context *ctx = si_shader_context(bld_base);
2750
2751         ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE,
2752                          LLVMGetParam(ctx->main_fn, ctx->param_gs_wave_id));
2753 }
2754
2755 static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base)
2756 {
2757         struct si_shader_context *ctx = si_shader_context(bld_base);
2758         struct gallivm_state *gallivm = &ctx->gallivm;
2759         struct tgsi_shader_info *info = &ctx->shader->selector->info;
2760         struct si_shader_output_values *outputs = NULL;
2761         int i,j;
2762
2763         assert(!ctx->shader->is_gs_copy_shader);
2764
2765         outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));
2766
2767         /* Vertex color clamping.
2768          *
2769          * This uses a state constant loaded in a user data SGPR and
2770          * an IF statement is added that clamps all colors if the constant
2771          * is true.
2772          */
2773         if (ctx->type == PIPE_SHADER_VERTEX) {
2774                 struct lp_build_if_state if_ctx;
2775                 LLVMValueRef cond = NULL;
2776                 LLVMValueRef addr, val;
2777
2778                 for (i = 0; i < info->num_outputs; i++) {
2779                         if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR &&
2780                             info->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR)
2781                                 continue;
2782
2783                         /* We've found a color. */
2784                         if (!cond) {
2785                                 /* The state is in the first bit of the user SGPR. */
2786                                 cond = LLVMGetParam(ctx->main_fn,
2787                                                     ctx->param_vs_state_bits);
2788                                 cond = LLVMBuildTrunc(gallivm->builder, cond,
2789                                                       ctx->i1, "");
2790                                 lp_build_if(&if_ctx, gallivm, cond);
2791                         }
2792
2793                         for (j = 0; j < 4; j++) {
2794                                 addr = ctx->outputs[i][j];
2795                                 val = LLVMBuildLoad(gallivm->builder, addr, "");
2796                                 val = ac_build_clamp(&ctx->ac, val);
2797                                 LLVMBuildStore(gallivm->builder, val, addr);
2798                         }
2799                 }
2800
2801                 if (cond)
2802                         lp_build_endif(&if_ctx);
2803         }
2804
2805         for (i = 0; i < info->num_outputs; i++) {
2806                 outputs[i].semantic_name = info->output_semantic_name[i];
2807                 outputs[i].semantic_index = info->output_semantic_index[i];
2808
2809                 for (j = 0; j < 4; j++) {
2810                         outputs[i].values[j] =
2811                                 LLVMBuildLoad(gallivm->builder,
2812                                               ctx->outputs[i][j],
2813                                               "");
2814                         outputs[i].vertex_stream[j] =
2815                                 (info->output_streams[i] >> (2 * j)) & 3;
2816                 }
2817
2818         }
2819
2820         /* Return the primitive ID from the LLVM function. */
2821         ctx->return_value =
2822                 LLVMBuildInsertValue(gallivm->builder,
2823                                      ctx->return_value,
2824                                      bitcast(bld_base, TGSI_TYPE_FLOAT,
2825                                              get_primitive_id(bld_base, 0)),
2826                                      VS_EPILOG_PRIMID_LOC, "");
2827
2828         if (ctx->shader->selector->so.num_outputs)
2829                 si_llvm_emit_streamout(ctx, outputs, i, 0);
2830         si_llvm_export_vs(bld_base, outputs, i);
2831         FREE(outputs);
2832 }
2833
/* List of pixel shader exports that have been prepared but not yet emitted.
 * si_export_mrt_z() and si_export_mrt_color() append entries at args[num++];
 * si_emit_ps_exports() emits args[0..num-1] in order.
 *
 * NOTE(review): nothing in the appending code checks num against the array
 * size; the capacity of 10 is presumably enough for all color exports plus
 * the MRTZ export -- confirm against the callers.
 */
struct si_ps_exports {
	unsigned num; /* number of valid entries in args[] */
	struct ac_export_args args[10];
};
2838
2839 unsigned si_get_spi_shader_z_format(bool writes_z, bool writes_stencil,
2840                                     bool writes_samplemask)
2841 {
2842         if (writes_z) {
2843                 /* Z needs 32 bits. */
2844                 if (writes_samplemask)
2845                         return V_028710_SPI_SHADER_32_ABGR;
2846                 else if (writes_stencil)
2847                         return V_028710_SPI_SHADER_32_GR;
2848                 else
2849                         return V_028710_SPI_SHADER_32_R;
2850         } else if (writes_stencil || writes_samplemask) {
2851                 /* Both stencil and sample mask need only 16 bits. */
2852                 return V_028710_SPI_SHADER_UINT16_ABGR;
2853         } else {
2854                 return V_028710_SPI_SHADER_ZERO;
2855         }
2856 }
2857
2858 static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
2859                             LLVMValueRef depth, LLVMValueRef stencil,
2860                             LLVMValueRef samplemask, struct si_ps_exports *exp)
2861 {
2862         struct si_shader_context *ctx = si_shader_context(bld_base);
2863         struct lp_build_context *base = &bld_base->base;
2864         struct ac_export_args args;
2865         unsigned mask = 0;
2866         unsigned format = si_get_spi_shader_z_format(depth != NULL,
2867                                                      stencil != NULL,
2868                                                      samplemask != NULL);
2869
2870         assert(depth || stencil || samplemask);
2871
2872         args.valid_mask = 1; /* whether the EXEC mask is valid */
2873         args.done = 1; /* DONE bit */
2874
2875         /* Specify the target we are exporting */
2876         args.target = V_008DFC_SQ_EXP_MRTZ;
2877
2878         args.compr = 0; /* COMP flag */
2879         args.out[0] = base->undef; /* R, depth */
2880         args.out[1] = base->undef; /* G, stencil test value[0:7], stencil op value[8:15] */
2881         args.out[2] = base->undef; /* B, sample mask */
2882         args.out[3] = base->undef; /* A, alpha to mask */
2883
2884         if (format == V_028710_SPI_SHADER_UINT16_ABGR) {
2885                 assert(!depth);
2886                 args.compr = 1; /* COMPR flag */
2887
2888                 if (stencil) {
2889                         /* Stencil should be in X[23:16]. */
2890                         stencil = bitcast(bld_base, TGSI_TYPE_UNSIGNED, stencil);
2891                         stencil = LLVMBuildShl(ctx->gallivm.builder, stencil,
2892                                                LLVMConstInt(ctx->i32, 16, 0), "");
2893                         args.out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT, stencil);
2894                         mask |= 0x3;
2895                 }
2896                 if (samplemask) {
2897                         /* SampleMask should be in Y[15:0]. */
2898                         args.out[1] = samplemask;
2899                         mask |= 0xc;
2900                 }
2901         } else {
2902                 if (depth) {
2903                         args.out[0] = depth;
2904                         mask |= 0x1;
2905                 }
2906                 if (stencil) {
2907                         args.out[1] = stencil;
2908                         mask |= 0x2;
2909                 }
2910                 if (samplemask) {
2911                         args.out[2] = samplemask;
2912                         mask |= 0x4;
2913                 }
2914         }
2915
2916         /* SI (except OLAND and HAINAN) has a bug that it only looks
2917          * at the X writemask component. */
2918         if (ctx->screen->b.chip_class == SI &&
2919             ctx->screen->b.family != CHIP_OLAND &&
2920             ctx->screen->b.family != CHIP_HAINAN)
2921                 mask |= 0x1;
2922
2923         /* Specify which components to enable */
2924         args.enabled_channels = mask;
2925
2926         memcpy(&exp->args[exp->num++], &args, sizeof(args));
2927 }
2928
2929 static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
2930                                 LLVMValueRef *color, unsigned index,
2931                                 unsigned samplemask_param,
2932                                 bool is_last, struct si_ps_exports *exp)
2933 {
2934         struct si_shader_context *ctx = si_shader_context(bld_base);
2935         struct lp_build_context *base = &bld_base->base;
2936         int i;
2937
2938         /* Clamp color */
2939         if (ctx->shader->key.part.ps.epilog.clamp_color)
2940                 for (i = 0; i < 4; i++)
2941                         color[i] = ac_build_clamp(&ctx->ac, color[i]);
2942
2943         /* Alpha to one */
2944         if (ctx->shader->key.part.ps.epilog.alpha_to_one)
2945                 color[3] = base->one;
2946
2947         /* Alpha test */
2948         if (index == 0 &&
2949             ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
2950                 si_alpha_test(bld_base, color[3]);
2951
2952         /* Line & polygon smoothing */
2953         if (ctx->shader->key.part.ps.epilog.poly_line_smoothing)
2954                 color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3],
2955                                                          samplemask_param);
2956
2957         /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
2958         if (ctx->shader->key.part.ps.epilog.last_cbuf > 0) {
2959                 struct ac_export_args args[8];
2960                 int c, last = -1;
2961
2962                 /* Get the export arguments, also find out what the last one is. */
2963                 for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
2964                         si_llvm_init_export_args(bld_base, color,
2965                                                  V_008DFC_SQ_EXP_MRT + c, &args[c]);
2966                         if (args[c].enabled_channels)
2967                                 last = c;
2968                 }
2969
2970                 /* Emit all exports. */
2971                 for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
2972                         if (is_last && last == c) {
2973                                 args[c].valid_mask = 1; /* whether the EXEC mask is valid */
2974                                 args[c].done = 1; /* DONE bit */
2975                         } else if (!args[c].enabled_channels)
2976                                 continue; /* unnecessary NULL export */
2977
2978                         memcpy(&exp->args[exp->num++], &args[c], sizeof(args[c]));
2979                 }
2980         } else {
2981                 struct ac_export_args args;
2982
2983                 /* Export */
2984                 si_llvm_init_export_args(bld_base, color, V_008DFC_SQ_EXP_MRT + index,
2985                                          &args);
2986                 if (is_last) {
2987                         args.valid_mask = 1; /* whether the EXEC mask is valid */
2988                         args.done = 1; /* DONE bit */
2989                 } else if (!args.enabled_channels)
2990                         return; /* unnecessary NULL export */
2991
2992                 memcpy(&exp->args[exp->num++], &args, sizeof(args));
2993         }
2994 }
2995
2996 static void si_emit_ps_exports(struct si_shader_context *ctx,
2997                                struct si_ps_exports *exp)
2998 {
2999         for (unsigned i = 0; i < exp->num; i++)
3000                 ac_build_export(&ctx->ac, &exp->args[i]);
3001 }
3002
3003 static void si_export_null(struct lp_build_tgsi_context *bld_base)
3004 {
3005         struct si_shader_context *ctx = si_shader_context(bld_base);
3006         struct lp_build_context *base = &bld_base->base;
3007         struct ac_export_args args;
3008
3009         args.enabled_channels = 0x0; /* enabled channels */
3010         args.valid_mask = 1; /* whether the EXEC mask is valid */
3011         args.done = 1; /* DONE bit */
3012         args.target = V_008DFC_SQ_EXP_NULL;
3013         args.compr = 0; /* COMPR flag (0 = 32-bit export) */
3014         args.out[0] = base->undef; /* R */
3015         args.out[1] = base->undef; /* G */
3016         args.out[2] = base->undef; /* B */
3017         args.out[3] = base->undef; /* A */
3018
3019         ac_build_export(&ctx->ac, &args);
3020 }
3021
3022 /**
3023  * Return PS outputs in this order:
3024  *
3025  * v[0:3] = color0.xyzw
3026  * v[4:7] = color1.xyzw
3027  * ...
3028  * vN+0 = Depth
3029  * vN+1 = Stencil
3030  * vN+2 = SampleMask
3031  * vN+3 = SampleMaskIn (used for OpenGL smoothing)
3032  *
3033  * The alpha-ref SGPR is returned via its original location.
3034  */
3035 static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base)
3036 {
3037         struct si_shader_context *ctx = si_shader_context(bld_base);
3038         struct si_shader *shader = ctx->shader;
3039         struct tgsi_shader_info *info = &shader->selector->info;
3040         LLVMBuilderRef builder = ctx->gallivm.builder;
3041         unsigned i, j, first_vgpr, vgpr;
3042
3043         LLVMValueRef color[8][4] = {};
3044         LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
3045         LLVMValueRef ret;
3046
3047         /* Read the output values. */
3048         for (i = 0; i < info->num_outputs; i++) {
3049                 unsigned semantic_name = info->output_semantic_name[i];
3050                 unsigned semantic_index = info->output_semantic_index[i];
3051
3052                 switch (semantic_name) {
3053                 case TGSI_SEMANTIC_COLOR:
3054                         assert(semantic_index < 8);
3055                         for (j = 0; j < 4; j++) {
3056                                 LLVMValueRef ptr = ctx->outputs[i][j];
3057                                 LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
3058                                 color[semantic_index][j] = result;
3059                         }
3060                         break;
3061                 case TGSI_SEMANTIC_POSITION:
3062                         depth = LLVMBuildLoad(builder,
3063                                               ctx->outputs[i][2], "");
3064                         break;
3065                 case TGSI_SEMANTIC_STENCIL:
3066                         stencil = LLVMBuildLoad(builder,
3067                                                 ctx->outputs[i][1], "");
3068                         break;
3069                 case TGSI_SEMANTIC_SAMPLEMASK:
3070                         samplemask = LLVMBuildLoad(builder,
3071                                                    ctx->outputs[i][0], "");
3072                         break;
3073                 default:
3074                         fprintf(stderr, "Warning: SI unhandled fs output type:%d\n",
3075                                 semantic_name);
3076                 }
3077         }
3078
3079         /* Fill the return structure. */
3080         ret = ctx->return_value;
3081
3082         /* Set SGPRs. */
3083         ret = LLVMBuildInsertValue(builder, ret,
3084                                    bitcast(bld_base, TGSI_TYPE_SIGNED,
3085                                            LLVMGetParam(ctx->main_fn,
3086                                                         SI_PARAM_ALPHA_REF)),
3087                                    SI_SGPR_ALPHA_REF, "");
3088
3089         /* Set VGPRs */
3090         first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
3091         for (i = 0; i < ARRAY_SIZE(color); i++) {
3092                 if (!color[i][0])
3093                         continue;
3094
3095                 for (j = 0; j < 4; j++)
3096                         ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
3097         }
3098         if (depth)
3099                 ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
3100         if (stencil)
3101                 ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
3102         if (samplemask)
3103                 ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");
3104
3105         /* Add the input sample mask for smoothing at the end. */
3106         if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
3107                 vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
3108         ret = LLVMBuildInsertValue(builder, ret,
3109                                    LLVMGetParam(ctx->main_fn,
3110                                                 SI_PARAM_SAMPLE_COVERAGE), vgpr++, "");
3111
3112         ctx->return_value = ret;
3113 }
3114
3115 /**
3116  * Given a v8i32 resource descriptor for a buffer, extract the size of the
3117  * buffer in number of elements and return it as an i32.
3118  */
3119 static LLVMValueRef get_buffer_size(
3120         struct lp_build_tgsi_context *bld_base,
3121         LLVMValueRef descriptor)
3122 {
3123         struct si_shader_context *ctx = si_shader_context(bld_base);
3124         struct gallivm_state *gallivm = &ctx->gallivm;
3125         LLVMBuilderRef builder = gallivm->builder;
3126         LLVMValueRef size =
3127                 LLVMBuildExtractElement(builder, descriptor,
3128                                         LLVMConstInt(ctx->i32, 2, 0), "");
3129
3130         if (ctx->screen->b.chip_class == VI) {
3131                 /* On VI, the descriptor contains the size in bytes,
3132                  * but TXQ must return the size in elements.
3133                  * The stride is always non-zero for resources using TXQ.
3134                  */
3135                 LLVMValueRef stride =
3136                         LLVMBuildExtractElement(builder, descriptor,
3137                                                 ctx->i32_1, "");
3138                 stride = LLVMBuildLShr(builder, stride,
3139                                        LLVMConstInt(ctx->i32, 16, 0), "");
3140                 stride = LLVMBuildAnd(builder, stride,
3141                                       LLVMConstInt(ctx->i32, 0x3FFF, 0), "");
3142
3143                 size = LLVMBuildUDiv(builder, size, stride, "");
3144         }
3145
3146         return size;
3147 }
3148
/* Forward declaration, so that code placed before the definition can refer
 * to build_tex_intrinsic. It has the same (action, bld_base, emit_data)
 * signature as the other opcode emit callbacks in this file. */
static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
				struct lp_build_tgsi_context *bld_base,
				struct lp_build_emit_data *emit_data);
3152
3153 /* Prevent optimizations (at least of memory accesses) across the current
3154  * point in the program by emitting empty inline assembly that is marked as
3155  * having side effects.
3156  *
3157  * Optionally, a value can be passed through the inline assembly to prevent
3158  * LLVM from hoisting calls to ReadNone functions.
3159  */
3160 static void emit_optimization_barrier(struct si_shader_context *ctx,
3161                                       LLVMValueRef *pvgpr)
3162 {
3163         static int counter = 0;
3164
3165         LLVMBuilderRef builder = ctx->gallivm.builder;
3166         char code[16];
3167
3168         snprintf(code, sizeof(code), "; %d", p_atomic_inc_return(&counter));
3169
3170         if (!pvgpr) {
3171                 LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
3172                 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);
3173                 LLVMBuildCall(builder, inlineasm, NULL, 0, "");
3174         } else {
3175                 LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
3176                 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "=v,0", true, false);
3177                 LLVMValueRef vgpr = *pvgpr;
3178                 LLVMTypeRef vgpr_type = LLVMTypeOf(vgpr);
3179                 unsigned vgpr_size = llvm_get_type_size(vgpr_type);
3180                 LLVMValueRef vgpr0;
3181
3182                 assert(vgpr_size % 4 == 0);
3183
3184                 vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), "");
3185                 vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, "");
3186                 vgpr0 = LLVMBuildCall(builder, inlineasm, &vgpr0, 1, "");
3187                 vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, "");
3188                 vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, "");
3189
3190                 *pvgpr = vgpr;
3191         }
3192 }
3193
/* Immediate values for emit_waitcnt().
 *
 * NOOP_WAITCNT is the value that requests no waiting at all (membar_emit
 * skips the instruction when the result still equals it). Each *_CNT mask
 * is NOOP_WAITCNT with one field cleared: LGKM_CNT clears bits [8..11],
 * VM_CNT clears bits [0..3]. Since waiting is expressed by cleared bits:
 *
 * Combine these with & instead of |.
 *
 * NOTE(review): the field positions presumably match the s_waitcnt
 * encoding of the targeted chips -- confirm against the ISA documentation.
 */
#define NOOP_WAITCNT 0xf7f
#define LGKM_CNT 0x07f
#define VM_CNT 0xf70
3198
3199 static void emit_waitcnt(struct si_shader_context *ctx, unsigned simm16)
3200 {
3201         struct gallivm_state *gallivm = &ctx->gallivm;
3202         LLVMBuilderRef builder = gallivm->builder;
3203         LLVMValueRef args[1] = {
3204                 LLVMConstInt(ctx->i32, simm16, 0)
3205         };
3206         lp_build_intrinsic(builder, "llvm.amdgcn.s.waitcnt",
3207                            ctx->voidt, args, 1, 0);
3208 }
3209
3210 static void membar_emit(
3211                 const struct lp_build_tgsi_action *action,
3212                 struct lp_build_tgsi_context *bld_base,
3213                 struct lp_build_emit_data *emit_data)
3214 {
3215         struct si_shader_context *ctx = si_shader_context(bld_base);
3216         LLVMValueRef src0 = lp_build_emit_fetch(bld_base, emit_data->inst, 0, 0);
3217         unsigned flags = LLVMConstIntGetZExtValue(src0);
3218         unsigned waitcnt = NOOP_WAITCNT;
3219
3220         if (flags & TGSI_MEMBAR_THREAD_GROUP)
3221                 waitcnt &= VM_CNT & LGKM_CNT;
3222
3223         if (flags & (TGSI_MEMBAR_ATOMIC_BUFFER |
3224                      TGSI_MEMBAR_SHADER_BUFFER |
3225                      TGSI_MEMBAR_SHADER_IMAGE))
3226                 waitcnt &= VM_CNT;
3227
3228         if (flags & TGSI_MEMBAR_SHARED)
3229                 waitcnt &= LGKM_CNT;
3230
3231         if (waitcnt != NOOP_WAITCNT)
3232                 emit_waitcnt(ctx, waitcnt);
3233 }
3234
3235 static void clock_emit(
3236                 const struct lp_build_tgsi_action *action,
3237                 struct lp_build_tgsi_context *bld_base,
3238                 struct lp_build_emit_data *emit_data)
3239 {
3240         struct si_shader_context *ctx = si_shader_context(bld_base);
3241         struct gallivm_state *gallivm = &ctx->gallivm;
3242         LLVMValueRef tmp;
3243
3244         tmp = lp_build_intrinsic(gallivm->builder, "llvm.readcyclecounter",
3245                                  ctx->i64, NULL, 0, 0);
3246         tmp = LLVMBuildBitCast(gallivm->builder, tmp, ctx->v2i32, "");
3247
3248         emit_data->output[0] =
3249                 LLVMBuildExtractElement(gallivm->builder, tmp, ctx->i32_0, "");
3250         emit_data->output[1] =
3251                 LLVMBuildExtractElement(gallivm->builder, tmp, ctx->i32_1, "");
3252 }
3253
3254 static LLVMValueRef
3255 shader_buffer_fetch_rsrc(struct si_shader_context *ctx,
3256                          const struct tgsi_full_src_register *reg)
3257 {
3258         LLVMValueRef index;
3259         LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn,
3260                                              ctx->param_shader_buffers);
3261
3262         if (!reg->Register.Indirect)
3263                 index = LLVMConstInt(ctx->i32, reg->Register.Index, 0);
3264         else
3265                 index = get_bounded_indirect_index(ctx, &reg->Indirect,
3266                                                    reg->Register.Index,
3267                                                    SI_NUM_SHADER_BUFFERS);
3268
3269         return ac_build_indexed_load_const(&ctx->ac, rsrc_ptr, index);
3270 }
3271
3272 static bool tgsi_is_array_sampler(unsigned target)
3273 {
3274         return target == TGSI_TEXTURE_1D_ARRAY ||
3275                target == TGSI_TEXTURE_SHADOW1D_ARRAY ||
3276                target == TGSI_TEXTURE_2D_ARRAY ||
3277                target == TGSI_TEXTURE_SHADOW2D_ARRAY ||
3278                target == TGSI_TEXTURE_CUBE_ARRAY ||
3279                target == TGSI_TEXTURE_SHADOWCUBE_ARRAY ||
3280                target == TGSI_TEXTURE_2D_ARRAY_MSAA;
3281 }
3282
3283 static bool tgsi_is_array_image(unsigned target)
3284 {
3285         return target == TGSI_TEXTURE_3D ||
3286                target == TGSI_TEXTURE_CUBE ||
3287                target == TGSI_TEXTURE_1D_ARRAY ||
3288                target == TGSI_TEXTURE_2D_ARRAY ||
3289                target == TGSI_TEXTURE_CUBE_ARRAY ||
3290                target == TGSI_TEXTURE_2D_ARRAY_MSAA;
3291 }
3292
3293 /**
3294  * Given a 256-bit resource descriptor, force the DCC enable bit to off.
3295  *
3296  * At least on Tonga, executing image stores on images with DCC enabled and
3297  * non-trivial can eventually lead to lockups. This can occur when an
3298  * application binds an image as read-only but then uses a shader that writes
3299  * to it. The OpenGL spec allows almost arbitrarily bad behavior (including
3300  * program termination) in this case, but it doesn't cost much to be a bit
3301  * nicer: disabling DCC in the shader still leads to undefined results but
3302  * avoids the lockup.
3303  */
3304 static LLVMValueRef force_dcc_off(struct si_shader_context *ctx,
3305                                   LLVMValueRef rsrc)
3306 {
3307         if (ctx->screen->b.chip_class <= CIK) {
3308                 return rsrc;
3309         } else {
3310                 LLVMBuilderRef builder = ctx->gallivm.builder;
3311                 LLVMValueRef i32_6 = LLVMConstInt(ctx->i32, 6, 0);
3312                 LLVMValueRef i32_C = LLVMConstInt(ctx->i32, C_008F28_COMPRESSION_EN, 0);
3313                 LLVMValueRef tmp;
3314
3315                 tmp = LLVMBuildExtractElement(builder, rsrc, i32_6, "");
3316                 tmp = LLVMBuildAnd(builder, tmp, i32_C, "");
3317                 return LLVMBuildInsertElement(builder, rsrc, tmp, i32_6, "");
3318         }
3319 }
3320
3321 static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements)
3322 {
3323         return LLVMPointerType(LLVMArrayType(elem_type, num_elements),
3324                                CONST_ADDR_SPACE);
3325 }
3326
3327 static LLVMValueRef load_image_desc(struct si_shader_context *ctx,
3328                                     LLVMValueRef list, LLVMValueRef index,
3329                                     unsigned target)
3330 {
3331         LLVMBuilderRef builder = ctx->gallivm.builder;
3332
3333         if (target == TGSI_TEXTURE_BUFFER) {
3334                 index = LLVMBuildMul(builder, index,
3335                                      LLVMConstInt(ctx->i32, 2, 0), "");
3336                 index = LLVMBuildAdd(builder, index,
3337                                      ctx->i32_1, "");
3338                 list = LLVMBuildPointerCast(builder, list,
3339                                             const_array(ctx->v4i32, 0), "");
3340         }
3341
3342         return ac_build_indexed_load_const(&ctx->ac, list, index);
3343 }
3344
3345 /**
3346  * Load the resource descriptor for \p image.
3347  */
3348 static void
3349 image_fetch_rsrc(
3350         struct lp_build_tgsi_context *bld_base,
3351         const struct tgsi_full_src_register *image,
3352         bool is_store, unsigned target,
3353         LLVMValueRef *rsrc)
3354 {
3355         struct si_shader_context *ctx = si_shader_context(bld_base);
3356         LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn,
3357                                              ctx->param_images);
3358         LLVMValueRef index;
3359         bool dcc_off = is_store;
3360
3361         assert(image->Register.File == TGSI_FILE_IMAGE);
3362
3363         if (!image->Register.Indirect) {
3364                 const struct tgsi_shader_info *info = bld_base->info;
3365                 unsigned images_writemask = info->images_store |
3366                                             info->images_atomic;
3367
3368                 index = LLVMConstInt(ctx->i32, image->Register.Index, 0);
3369
3370                 if (images_writemask & (1 << image->Register.Index))
3371                         dcc_off = true;
3372         } else {
3373                 /* From the GL_ARB_shader_image_load_store extension spec:
3374                  *
3375                  *    If a shader performs an image load, store, or atomic
3376                  *    operation using an image variable declared as an array,
3377                  *    and if the index used to select an individual element is
3378                  *    negative or greater than or equal to the size of the
3379                  *    array, the results of the operation are undefined but may
3380                  *    not lead to termination.
3381                  */
3382                 index = get_bounded_indirect_index(ctx, &image->Indirect,
3383                                                    image->Register.Index,
3384                                                    SI_NUM_IMAGES);
3385         }
3386
3387         *rsrc = load_image_desc(ctx, rsrc_ptr, index, target);
3388         if (dcc_off && target != TGSI_TEXTURE_BUFFER)
3389                 *rsrc = force_dcc_off(ctx, *rsrc);
3390 }
3391
3392 static LLVMValueRef image_fetch_coords(
3393                 struct lp_build_tgsi_context *bld_base,
3394                 const struct tgsi_full_instruction *inst,
3395                 unsigned src, LLVMValueRef desc)
3396 {
3397         struct si_shader_context *ctx = si_shader_context(bld_base);
3398         struct gallivm_state *gallivm = &ctx->gallivm;
3399         LLVMBuilderRef builder = gallivm->builder;
3400         unsigned target = inst->Memory.Texture;
3401         unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
3402         LLVMValueRef coords[4];
3403         LLVMValueRef tmp;
3404         int chan;
3405
3406         for (chan = 0; chan < num_coords; ++chan) {
3407                 tmp = lp_build_emit_fetch(bld_base, inst, src, chan);
3408                 tmp = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
3409                 coords[chan] = tmp;
3410         }
3411
3412         if (ctx->screen->b.chip_class >= GFX9) {
3413                 /* 1D textures are allocated and used as 2D on GFX9. */
3414                 if (target == TGSI_TEXTURE_1D) {
3415                         coords[1] = ctx->i32_0;
3416                         num_coords++;
3417                 } else if (target == TGSI_TEXTURE_1D_ARRAY) {
3418                         coords[2] = coords[1];
3419                         coords[1] = ctx->i32_0;
3420                         num_coords++;
3421                 } else if (target == TGSI_TEXTURE_2D) {
3422                         /* The hw can't bind a slice of a 3D image as a 2D
3423                          * image, because it ignores BASE_ARRAY if the target
3424                          * is 3D. The workaround is to read BASE_ARRAY and set
3425                          * it as the 3rd address operand for all 2D images.
3426                          */
3427                         LLVMValueRef first_layer, const5, mask;
3428
3429                         const5 = LLVMConstInt(ctx->i32, 5, 0);
3430                         mask = LLVMConstInt(ctx->i32, S_008F24_BASE_ARRAY(~0), 0);
3431                         first_layer = LLVMBuildExtractElement(builder, desc, const5, "");
3432                         first_layer = LLVMBuildAnd(builder, first_layer, mask, "");
3433
3434                         coords[2] = first_layer;
3435                         num_coords++;
3436                 }
3437         }
3438
3439         if (num_coords == 1)
3440                 return coords[0];
3441
3442         if (num_coords == 3) {
3443                 /* LLVM has difficulties lowering 3-element vectors. */
3444                 coords[3] = bld_base->uint_bld.undef;
3445                 num_coords = 4;
3446         }
3447
3448         return lp_build_gather_values(gallivm, coords, num_coords);
3449 }
3450
3451 /**
3452  * Append the extra mode bits that are used by image load and store.
3453  */
3454 static void image_append_args(
3455                 struct si_shader_context *ctx,
3456                 struct lp_build_emit_data * emit_data,
3457                 unsigned target,
3458                 bool atomic,
3459                 bool force_glc)
3460 {
3461         const struct tgsi_full_instruction *inst = emit_data->inst;
3462         LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
3463         LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
3464         LLVMValueRef r128 = i1false;
3465         LLVMValueRef da = tgsi_is_array_image(target) ? i1true : i1false;
3466         LLVMValueRef glc =
3467                 force_glc ||
3468                 inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
3469                 i1true : i1false;
3470         LLVMValueRef slc = i1false;
3471         LLVMValueRef lwe = i1false;
3472
3473         if (atomic || (HAVE_LLVM <= 0x0309)) {
3474                 emit_data->args[emit_data->arg_count++] = r128;
3475                 emit_data->args[emit_data->arg_count++] = da;
3476                 if (!atomic) {
3477                         emit_data->args[emit_data->arg_count++] = glc;
3478                 }
3479                 emit_data->args[emit_data->arg_count++] = slc;
3480                 return;
3481         }
3482
3483         /* HAVE_LLVM >= 0x0400 */
3484         emit_data->args[emit_data->arg_count++] = glc;
3485         emit_data->args[emit_data->arg_count++] = slc;
3486         emit_data->args[emit_data->arg_count++] = lwe;
3487         emit_data->args[emit_data->arg_count++] = da;
3488 }
3489
3490 /**
3491  * Append the resource and indexing arguments for buffer intrinsics.
3492  *
3493  * \param rsrc the v4i32 buffer resource
3494  * \param index index into the buffer (stride-based)
3495  * \param offset byte offset into the buffer
3496  */
3497 static void buffer_append_args(
3498                 struct si_shader_context *ctx,
3499                 struct lp_build_emit_data *emit_data,
3500                 LLVMValueRef rsrc,
3501                 LLVMValueRef index,
3502                 LLVMValueRef offset,
3503                 bool atomic,
3504                 bool force_glc)
3505 {
3506         const struct tgsi_full_instruction *inst = emit_data->inst;
3507         LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
3508         LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
3509
3510         emit_data->args[emit_data->arg_count++] = rsrc;
3511         emit_data->args[emit_data->arg_count++] = index; /* vindex */
3512         emit_data->args[emit_data->arg_count++] = offset; /* voffset */
3513         if (!atomic) {
3514                 emit_data->args[emit_data->arg_count++] =
3515                         force_glc ||
3516                         inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
3517                         i1true : i1false; /* glc */
3518         }
3519         emit_data->args[emit_data->arg_count++] = i1false; /* slc */
3520 }
3521
3522 static void load_fetch_args(
3523                 struct lp_build_tgsi_context * bld_base,
3524                 struct lp_build_emit_data * emit_data)
3525 {
3526         struct si_shader_context *ctx = si_shader_context(bld_base);
3527         struct gallivm_state *gallivm = &ctx->gallivm;
3528         const struct tgsi_full_instruction * inst = emit_data->inst;
3529         unsigned target = inst->Memory.Texture;
3530         LLVMValueRef rsrc;
3531
3532         emit_data->dst_type = ctx->v4f32;
3533
3534         if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
3535                 LLVMBuilderRef builder = gallivm->builder;
3536                 LLVMValueRef offset;
3537                 LLVMValueRef tmp;
3538
3539                 rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);
3540
3541                 tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
3542                 offset = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
3543
3544                 buffer_append_args(ctx, emit_data, rsrc, ctx->i32_0,
3545                                    offset, false, false);
3546         } else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
3547                 LLVMValueRef coords;
3548
3549                 image_fetch_rsrc(bld_base, &inst->Src[0], false, target, &rsrc);
3550                 coords = image_fetch_coords(bld_base, inst, 1, rsrc);
3551
3552                 if (target == TGSI_TEXTURE_BUFFER) {
3553                         buffer_append_args(ctx, emit_data, rsrc, coords,
3554                                            ctx->i32_0, false, false);
3555                 } else {
3556                         emit_data->args[0] = coords;
3557                         emit_data->args[1] = rsrc;
3558                         emit_data->args[2] = LLVMConstInt(ctx->i32, 15, 0); /* dmask */
3559                         emit_data->arg_count = 3;
3560
3561                         image_append_args(ctx, emit_data, target, false, false);
3562                 }
3563         }
3564 }
3565
3566 static unsigned get_load_intr_attribs(bool readonly_memory)
3567 {
3568         /* READNONE means writes can't affect it, while READONLY means that
3569          * writes can affect it. */
3570         return readonly_memory && HAVE_LLVM >= 0x0400 ?
3571                                  LP_FUNC_ATTR_READNONE :
3572                                  LP_FUNC_ATTR_READONLY;
3573 }
3574
3575 static unsigned get_store_intr_attribs(bool writeonly_memory)
3576 {
3577         return writeonly_memory && HAVE_LLVM >= 0x0400 ?
3578                                   LP_FUNC_ATTR_INACCESSIBLE_MEM_ONLY :
3579                                   LP_FUNC_ATTR_WRITEONLY;
3580 }
3581
3582 static void load_emit_buffer(struct si_shader_context *ctx,
3583                              struct lp_build_emit_data *emit_data,
3584                              bool readonly_memory)
3585 {
3586         const struct tgsi_full_instruction *inst = emit_data->inst;
3587         struct gallivm_state *gallivm = &ctx->gallivm;
3588         LLVMBuilderRef builder = gallivm->builder;
3589         uint writemask = inst->Dst[0].Register.WriteMask;
3590         uint count = util_last_bit(writemask);
3591         const char *intrinsic_name;
3592         LLVMTypeRef dst_type;
3593
3594         switch (count) {
3595         case 1:
3596                 intrinsic_name = "llvm.amdgcn.buffer.load.f32";
3597                 dst_type = ctx->f32;
3598                 break;
3599         case 2:
3600                 intrinsic_name = "llvm.amdgcn.buffer.load.v2f32";
3601                 dst_type = LLVMVectorType(ctx->f32, 2);
3602                 break;
3603         default: // 3 & 4
3604                 intrinsic_name = "llvm.amdgcn.buffer.load.v4f32";
3605                 dst_type = ctx->v4f32;
3606                 count = 4;
3607         }
3608
3609         emit_data->output[emit_data->chan] = lp_build_intrinsic(
3610                         builder, intrinsic_name, dst_type,
3611                         emit_data->args, emit_data->arg_count,
3612                         get_load_intr_attribs(readonly_memory));
3613 }
3614
3615 static LLVMValueRef get_memory_ptr(struct si_shader_context *ctx,
3616                                    const struct tgsi_full_instruction *inst,
3617                                    LLVMTypeRef type, int arg)
3618 {
3619         struct gallivm_state *gallivm = &ctx->gallivm;
3620         LLVMBuilderRef builder = gallivm->builder;
3621         LLVMValueRef offset, ptr;
3622         int addr_space;
3623
3624         offset = lp_build_emit_fetch(&ctx->bld_base, inst, arg, 0);
3625         offset = LLVMBuildBitCast(builder, offset, ctx->i32, "");
3626
3627         ptr = ctx->shared_memory;
3628         ptr = LLVMBuildGEP(builder, ptr, &offset, 1, "");
3629         addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
3630         ptr = LLVMBuildBitCast(builder, ptr, LLVMPointerType(type, addr_space), "");
3631
3632         return ptr;
3633 }
3634
3635 static void load_emit_memory(
3636                 struct si_shader_context *ctx,
3637                 struct lp_build_emit_data *emit_data)
3638 {
3639         const struct tgsi_full_instruction *inst = emit_data->inst;
3640         struct gallivm_state *gallivm = &ctx->gallivm;
3641         LLVMBuilderRef builder = gallivm->builder;
3642         unsigned writemask = inst->Dst[0].Register.WriteMask;
3643         LLVMValueRef channels[4], ptr, derived_ptr, index;
3644         int chan;
3645
3646         ptr = get_memory_ptr(ctx, inst, ctx->f32, 1);
3647
3648         for (chan = 0; chan < 4; ++chan) {
3649                 if (!(writemask & (1 << chan))) {
3650                         channels[chan] = LLVMGetUndef(ctx->f32);
3651                         continue;
3652                 }
3653
3654                 index = LLVMConstInt(ctx->i32, chan, 0);
3655                 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
3656                 channels[chan] = LLVMBuildLoad(builder, derived_ptr, "");
3657         }
3658         emit_data->output[emit_data->chan] = lp_build_gather_values(gallivm, channels, 4);
3659 }
3660
3661 /**
3662  * Return true if the memory accessed by a LOAD or STORE instruction is
3663  * read-only or write-only, respectively.
3664  *
3665  * \param shader_buffers_reverse_access_mask
3666  *      For LOAD, set this to (store | atomic) slot usage in the shader.
3667  *      For STORE, set this to (load | atomic) slot usage in the shader.
3668  * \param images_reverse_access_mask  Same as above, but for images.
3669  */
3670 static bool is_oneway_access_only(const struct tgsi_full_instruction *inst,
3671                                   const struct tgsi_shader_info *info,
3672                                   unsigned shader_buffers_reverse_access_mask,
3673                                   unsigned images_reverse_access_mask)
3674 {
3675         /* RESTRICT means NOALIAS.
3676          * If there are no writes, we can assume the accessed memory is read-only.
3677          * If there are no reads, we can assume the accessed memory is write-only.
3678          */
3679         if (inst->Memory.Qualifier & TGSI_MEMORY_RESTRICT) {
3680                 unsigned reverse_access_mask;
3681
3682                 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
3683                         reverse_access_mask = shader_buffers_reverse_access_mask;
3684                 } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
3685                         reverse_access_mask = info->images_buffers &
3686                                               images_reverse_access_mask;
3687                 } else {
3688                         reverse_access_mask = ~info->images_buffers &
3689                                               images_reverse_access_mask;
3690                 }
3691
3692                 if (inst->Src[0].Register.Indirect) {
3693                         if (!reverse_access_mask)
3694                                 return true;
3695                 } else {
3696                         if (!(reverse_access_mask &
3697                               (1u << inst->Src[0].Register.Index)))
3698                                 return true;
3699                 }
3700         }
3701
3702         /* If there are no buffer writes (for both shader buffers & image
3703          * buffers), it implies that buffer memory is read-only.
3704          * If there are no buffer reads (for both shader buffers & image
3705          * buffers), it implies that buffer memory is write-only.
3706          *
3707          * Same for the case when there are no writes/reads for non-buffer
3708          * images.
3709          */
3710         if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
3711             (inst->Src[0].Register.File == TGSI_FILE_IMAGE &&
3712              inst->Memory.Texture == TGSI_TEXTURE_BUFFER)) {
3713                 if (!shader_buffers_reverse_access_mask &&
3714                     !(info->images_buffers & images_reverse_access_mask))
3715                         return true;
3716         } else {
3717                 if (!(~info->images_buffers & images_reverse_access_mask))
3718                         return true;
3719         }
3720         return false;
3721 }
3722
3723 static void load_emit(
3724                 const struct lp_build_tgsi_action *action,
3725                 struct lp_build_tgsi_context *bld_base,
3726                 struct lp_build_emit_data *emit_data)
3727 {
3728         struct si_shader_context *ctx = si_shader_context(bld_base);
3729         struct gallivm_state *gallivm = &ctx->gallivm;
3730         LLVMBuilderRef builder = gallivm->builder;
3731         const struct tgsi_full_instruction * inst = emit_data->inst;
3732         const struct tgsi_shader_info *info = &ctx->shader->selector->info;
3733         char intrinsic_name[64];
3734         bool readonly_memory = false;
3735
3736         if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
3737                 load_emit_memory(ctx, emit_data);
3738                 return;
3739         }
3740
3741         if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
3742                 emit_waitcnt(ctx, VM_CNT);
3743
3744         readonly_memory = !(inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE) &&
3745                           is_oneway_access_only(inst, info,
3746                                                 info->shader_buffers_store |
3747                                                 info->shader_buffers_atomic,
3748                                                 info->images_store |
3749                                                 info->images_atomic);
3750
3751         if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
3752                 load_emit_buffer(ctx, emit_data, readonly_memory);
3753                 return;
3754         }
3755
3756         if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
3757                 emit_data->output[emit_data->chan] =
3758                         lp_build_intrinsic(
3759                                 builder, "llvm.amdgcn.buffer.load.format.v4f32", emit_data->dst_type,
3760                                 emit_data->args, emit_data->arg_count,
3761                                 get_load_intr_attribs(readonly_memory));
3762         } else {
3763                 ac_get_image_intr_name("llvm.amdgcn.image.load",
3764                                        emit_data->dst_type,             /* vdata */
3765                                        LLVMTypeOf(emit_data->args[0]), /* coords */
3766                                        LLVMTypeOf(emit_data->args[1]), /* rsrc */
3767                                        intrinsic_name, sizeof(intrinsic_name));
3768
3769                 emit_data->output[emit_data->chan] =
3770                         lp_build_intrinsic(
3771                                 builder, intrinsic_name, emit_data->dst_type,
3772                                 emit_data->args, emit_data->arg_count,
3773                                 get_load_intr_attribs(readonly_memory));
3774         }
3775 }
3776
3777 static void store_fetch_args(
3778                 struct lp_build_tgsi_context * bld_base,
3779                 struct lp_build_emit_data * emit_data)
3780 {
3781         struct si_shader_context *ctx = si_shader_context(bld_base);
3782         struct gallivm_state *gallivm = &ctx->gallivm;
3783         LLVMBuilderRef builder = gallivm->builder;
3784         const struct tgsi_full_instruction * inst = emit_data->inst;
3785         struct tgsi_full_src_register memory;
3786         LLVMValueRef chans[4];
3787         LLVMValueRef data;
3788         LLVMValueRef rsrc;
3789         unsigned chan;
3790
3791         emit_data->dst_type = LLVMVoidTypeInContext(gallivm->context);
3792
3793         for (chan = 0; chan < 4; ++chan) {
3794                 chans[chan] = lp_build_emit_fetch(bld_base, inst, 1, chan);
3795         }
3796         data = lp_build_gather_values(gallivm, chans, 4);
3797
3798         emit_data->args[emit_data->arg_count++] = data;
3799
3800         memory = tgsi_full_src_register_from_dst(&inst->Dst[0]);
3801
3802         if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
3803                 LLVMValueRef offset;
3804                 LLVMValueRef tmp;
3805
3806                 rsrc = shader_buffer_fetch_rsrc(ctx, &memory);
3807
3808                 tmp = lp_build_emit_fetch(bld_base, inst, 0, 0);
3809                 offset = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
3810
3811                 buffer_append_args(ctx, emit_data, rsrc, ctx->i32_0,
3812                                    offset, false, false);
3813         } else if (inst->Dst[0].Register.File == TGSI_FILE_IMAGE) {
3814                 unsigned target = inst->Memory.Texture;
3815                 LLVMValueRef coords;
3816
3817                 /* 8bit/16bit TC L1 write corruption bug on SI.
3818                  * All store opcodes not aligned to a dword are affected.
3819                  *
3820                  * The only way to get unaligned stores in radeonsi is through
3821                  * shader images.
3822                  */
3823                 bool force_glc = ctx->screen->b.chip_class == SI;
3824
3825                 image_fetch_rsrc(bld_base, &memory, true, target, &rsrc);
3826                 coords = image_fetch_coords(bld_base, inst, 0, rsrc);
3827
3828                 if (target == TGSI_TEXTURE_BUFFER) {
3829                         buffer_append_args(ctx, emit_data, rsrc, coords,
3830                                            ctx->i32_0, false, force_glc);
3831                 } else {
3832                         emit_data->args[1] = coords;
3833                         emit_data->args[2] = rsrc;
3834                         emit_data->args[3] = LLVMConstInt(ctx->i32, 15, 0); /* dmask */
3835                         emit_data->arg_count = 4;
3836
3837                         image_append_args(ctx, emit_data, target, false, force_glc);
3838                 }
3839         }
3840 }
3841
3842 static void store_emit_buffer(
3843                 struct si_shader_context *ctx,
3844                 struct lp_build_emit_data *emit_data,
3845                 bool writeonly_memory)
3846 {
3847         const struct tgsi_full_instruction *inst = emit_data->inst;
3848         struct gallivm_state *gallivm = &ctx->gallivm;
3849         LLVMBuilderRef builder = gallivm->builder;
3850         LLVMValueRef base_data = emit_data->args[0];
3851         LLVMValueRef base_offset = emit_data->args[3];
3852         unsigned writemask = inst->Dst[0].Register.WriteMask;
3853
3854         while (writemask) {
3855                 int start, count;
3856                 const char *intrinsic_name;
3857                 LLVMValueRef data;
3858                 LLVMValueRef offset;
3859                 LLVMValueRef tmp;
3860
3861                 u_bit_scan_consecutive_range(&writemask, &start, &count);
3862
3863                 /* Due to an LLVM limitation, split 3-element writes
3864                  * into a 2-element and a 1-element write. */
3865                 if (count == 3) {
3866                         writemask |= 1 << (start + 2);
3867                         count = 2;
3868                 }
3869
3870                 if (count == 4) {
3871                         data = base_data;
3872                         intrinsic_name = "llvm.amdgcn.buffer.store.v4f32";
3873                 } else if (count == 2) {
3874                         LLVMTypeRef v2f32 = LLVMVectorType(ctx->f32, 2);
3875
3876                         tmp = LLVMBuildExtractElement(
3877                                 builder, base_data,
3878                                 LLVMConstInt(ctx->i32, start, 0), "");
3879                         data = LLVMBuildInsertElement(
3880                                 builder, LLVMGetUndef(v2f32), tmp,
3881                                 ctx->i32_0, "");
3882
3883                         tmp = LLVMBuildExtractElement(
3884                                 builder, base_data,
3885                                 LLVMConstInt(ctx->i32, start + 1, 0), "");
3886                         data = LLVMBuildInsertElement(
3887                                 builder, data, tmp, ctx->i32_1, "");
3888
3889                         intrinsic_name = "llvm.amdgcn.buffer.store.v2f32";
3890                 } else {
3891                         assert(count == 1);
3892                         data = LLVMBuildExtractElement(
3893                                 builder, base_data,
3894                                 LLVMConstInt(ctx->i32, start, 0), "");
3895                         intrinsic_name = "llvm.amdgcn.buffer.store.f32";
3896                 }
3897
3898                 offset = base_offset;
3899                 if (start != 0) {
3900                         offset = LLVMBuildAdd(
3901                                 builder, offset,
3902                                 LLVMConstInt(ctx->i32, start * 4, 0), "");
3903                 }
3904
3905                 emit_data->args[0] = data;
3906                 emit_data->args[3] = offset;
3907
3908                 lp_build_intrinsic(
3909                         builder, intrinsic_name, emit_data->dst_type,
3910                         emit_data->args, emit_data->arg_count,
3911                         get_store_intr_attribs(writeonly_memory));
3912         }
3913 }
3914
3915 static void store_emit_memory(
3916                 struct si_shader_context *ctx,
3917                 struct lp_build_emit_data *emit_data)
3918 {
3919         const struct tgsi_full_instruction *inst = emit_data->inst;
3920         struct gallivm_state *gallivm = &ctx->gallivm;
3921         LLVMBuilderRef builder = gallivm->builder;
3922         unsigned writemask = inst->Dst[0].Register.WriteMask;
3923         LLVMValueRef ptr, derived_ptr, data, index;
3924         int chan;
3925
3926         ptr = get_memory_ptr(ctx, inst, ctx->f32, 0);
3927
3928         for (chan = 0; chan < 4; ++chan) {
3929                 if (!(writemask & (1 << chan))) {
3930                         continue;
3931                 }
3932                 data = lp_build_emit_fetch(&ctx->bld_base, inst, 1, chan);
3933                 index = LLVMConstInt(ctx->i32, chan, 0);
3934                 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
3935                 LLVMBuildStore(builder, data, derived_ptr);
3936         }
3937 }
3938
3939 static void store_emit(
3940                 const struct lp_build_tgsi_action *action,
3941                 struct lp_build_tgsi_context *bld_base,
3942                 struct lp_build_emit_data *emit_data)
3943 {
3944         struct si_shader_context *ctx = si_shader_context(bld_base);
3945         struct gallivm_state *gallivm = &ctx->gallivm;
3946         LLVMBuilderRef builder = gallivm->builder;
3947         const struct tgsi_full_instruction * inst = emit_data->inst;
3948         const struct tgsi_shader_info *info = &ctx->shader->selector->info;
3949         unsigned target = inst->Memory.Texture;
3950         char intrinsic_name[64];
3951         bool writeonly_memory = false;
3952
3953         if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY) {
3954                 store_emit_memory(ctx, emit_data);
3955                 return;
3956         }
3957
3958         if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
3959                 emit_waitcnt(ctx, VM_CNT);
3960
3961         writeonly_memory = is_oneway_access_only(inst, info,
3962                                                  info->shader_buffers_load |
3963                                                  info->shader_buffers_atomic,
3964                                                  info->images_load |
3965                                                  info->images_atomic);
3966
3967         if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
3968                 store_emit_buffer(ctx, emit_data, writeonly_memory);
3969                 return;
3970         }
3971
3972         if (target == TGSI_TEXTURE_BUFFER) {
3973                 emit_data->output[emit_data->chan] = lp_build_intrinsic(
3974                         builder, "llvm.amdgcn.buffer.store.format.v4f32",
3975                         emit_data->dst_type, emit_data->args,
3976                         emit_data->arg_count,
3977                         get_store_intr_attribs(writeonly_memory));
3978         } else {
3979                 ac_get_image_intr_name("llvm.amdgcn.image.store",
3980                                        LLVMTypeOf(emit_data->args[0]), /* vdata */
3981                                        LLVMTypeOf(emit_data->args[1]), /* coords */
3982                                        LLVMTypeOf(emit_data->args[2]), /* rsrc */
3983                                        intrinsic_name, sizeof(intrinsic_name));
3984
3985                 emit_data->output[emit_data->chan] =
3986                         lp_build_intrinsic(
3987                                 builder, intrinsic_name, emit_data->dst_type,
3988                                 emit_data->args, emit_data->arg_count,
3989                                 get_store_intr_attribs(writeonly_memory));
3990         }
3991 }
3992
3993 static void atomic_fetch_args(
3994                 struct lp_build_tgsi_context * bld_base,
3995                 struct lp_build_emit_data * emit_data)
3996 {
3997         struct si_shader_context *ctx = si_shader_context(bld_base);
3998         struct gallivm_state *gallivm = &ctx->gallivm;
3999         LLVMBuilderRef builder = gallivm->builder;
4000         const struct tgsi_full_instruction * inst = emit_data->inst;
4001         LLVMValueRef data1, data2;
4002         LLVMValueRef rsrc;
4003         LLVMValueRef tmp;
4004
4005         emit_data->dst_type = ctx->f32;
4006
4007         tmp = lp_build_emit_fetch(bld_base, inst, 2, 0);
4008         data1 = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
4009
4010         if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
4011                 tmp = lp_build_emit_fetch(bld_base, inst, 3, 0);
4012                 data2 = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
4013         }
4014
4015         /* llvm.amdgcn.image/buffer.atomic.cmpswap reflect the hardware order
4016          * of arguments, which is reversed relative to TGSI (and GLSL)
4017          */
4018         if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
4019                 emit_data->args[emit_data->arg_count++] = data2;
4020         emit_data->args[emit_data->arg_count++] = data1;
4021
4022         if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
4023                 LLVMValueRef offset;
4024
4025                 rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);
4026
4027                 tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
4028                 offset = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
4029
4030                 buffer_append_args(ctx, emit_data, rsrc, ctx->i32_0,
4031                                    offset, true, false);
4032         } else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
4033                 unsigned target = inst->Memory.Texture;
4034                 LLVMValueRef coords;
4035
4036                 image_fetch_rsrc(bld_base, &inst->Src[0], true, target, &rsrc);
4037                 coords = image_fetch_coords(bld_base, inst, 1, rsrc);
4038
4039                 if (target == TGSI_TEXTURE_BUFFER) {
4040                         buffer_append_args(ctx, emit_data, rsrc, coords,
4041                                            ctx->i32_0, true, false);
4042                 } else {
4043                         emit_data->args[emit_data->arg_count++] = coords;
4044                         emit_data->args[emit_data->arg_count++] = rsrc;
4045
4046                         image_append_args(ctx, emit_data, target, true, false);
4047                 }
4048         }
4049 }
4050
4051 static void atomic_emit_memory(struct si_shader_context *ctx,
4052                                struct lp_build_emit_data *emit_data) {
4053         struct gallivm_state *gallivm = &ctx->gallivm;
4054         LLVMBuilderRef builder = gallivm->builder;
4055         const struct tgsi_full_instruction * inst = emit_data->inst;
4056         LLVMValueRef ptr, result, arg;
4057
4058         ptr = get_memory_ptr(ctx, inst, ctx->i32, 1);
4059
4060         arg = lp_build_emit_fetch(&ctx->bld_base, inst, 2, 0);
4061         arg = LLVMBuildBitCast(builder, arg, ctx->i32, "");
4062
4063         if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
4064                 LLVMValueRef new_data;
4065                 new_data = lp_build_emit_fetch(&ctx->bld_base,
4066                                                inst, 3, 0);
4067
4068                 new_data = LLVMBuildBitCast(builder, new_data, ctx->i32, "");
4069
4070 #if HAVE_LLVM >= 0x309
4071                 result = LLVMBuildAtomicCmpXchg(builder, ptr, arg, new_data,
4072                                        LLVMAtomicOrderingSequentiallyConsistent,
4073                                        LLVMAtomicOrderingSequentiallyConsistent,
4074                                        false);
4075 #endif
4076
4077                 result = LLVMBuildExtractValue(builder, result, 0, "");
4078         } else {
4079                 LLVMAtomicRMWBinOp op;
4080
4081                 switch(inst->Instruction.Opcode) {
4082                         case TGSI_OPCODE_ATOMUADD:
4083                                 op = LLVMAtomicRMWBinOpAdd;
4084                                 break;
4085                         case TGSI_OPCODE_ATOMXCHG:
4086                                 op = LLVMAtomicRMWBinOpXchg;
4087                                 break;
4088                         case TGSI_OPCODE_ATOMAND:
4089                                 op = LLVMAtomicRMWBinOpAnd;
4090                                 break;
4091                         case TGSI_OPCODE_ATOMOR:
4092                                 op = LLVMAtomicRMWBinOpOr;
4093                                 break;
4094                         case TGSI_OPCODE_ATOMXOR:
4095                                 op = LLVMAtomicRMWBinOpXor;
4096                                 break;
4097                         case TGSI_OPCODE_ATOMUMIN:
4098                                 op = LLVMAtomicRMWBinOpUMin;
4099                                 break;
4100                         case TGSI_OPCODE_ATOMUMAX:
4101                                 op = LLVMAtomicRMWBinOpUMax;
4102                                 break;
4103                         case TGSI_OPCODE_ATOMIMIN:
4104                                 op = LLVMAtomicRMWBinOpMin;
4105                                 break;
4106                         case TGSI_OPCODE_ATOMIMAX:
4107                                 op = LLVMAtomicRMWBinOpMax;
4108                                 break;
4109                         default:
4110                                 unreachable("unknown atomic opcode");
4111                 }
4112
4113                 result = LLVMBuildAtomicRMW(builder, op, ptr, arg,
4114                                        LLVMAtomicOrderingSequentiallyConsistent,
4115                                        false);
4116         }
4117         emit_data->output[emit_data->chan] = LLVMBuildBitCast(builder, result, emit_data->dst_type, "");
4118 }
4119
4120 static void atomic_emit(
4121                 const struct lp_build_tgsi_action *action,
4122                 struct lp_build_tgsi_context *bld_base,
4123                 struct lp_build_emit_data *emit_data)
4124 {
4125         struct si_shader_context *ctx = si_shader_context(bld_base);
4126         struct gallivm_state *gallivm = &ctx->gallivm;
4127         LLVMBuilderRef builder = gallivm->builder;
4128         const struct tgsi_full_instruction * inst = emit_data->inst;
4129         char intrinsic_name[40];
4130         LLVMValueRef tmp;
4131
4132         if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
4133                 atomic_emit_memory(ctx, emit_data);
4134                 return;
4135         }
4136
4137         if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
4138             inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
4139                 snprintf(intrinsic_name, sizeof(intrinsic_name),
4140                          "llvm.amdgcn.buffer.atomic.%s", action->intr_name);
4141         } else {
4142                 LLVMValueRef coords;
4143                 char coords_type[8];
4144
4145                 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
4146                         coords = emit_data->args[2];
4147                 else
4148                         coords = emit_data->args[1];
4149
4150                 ac_build_type_name_for_intr(LLVMTypeOf(coords), coords_type, sizeof(coords_type));
4151                 snprintf(intrinsic_name, sizeof(intrinsic_name),
4152                          "llvm.amdgcn.image.atomic.%s.%s",
4153                          action->intr_name, coords_type);
4154         }
4155
4156         tmp = lp_build_intrinsic(
4157                 builder, intrinsic_name, ctx->i32,
4158                 emit_data->args, emit_data->arg_count, 0);
4159         emit_data->output[emit_data->chan] =
4160                 LLVMBuildBitCast(builder, tmp, ctx->f32, "");
4161 }
4162
4163 static void set_tex_fetch_args(struct si_shader_context *ctx,
4164                                struct lp_build_emit_data *emit_data,
4165                                unsigned target,
4166                                LLVMValueRef res_ptr, LLVMValueRef samp_ptr,
4167                                LLVMValueRef *param, unsigned count,
4168                                unsigned dmask)
4169 {
4170         struct gallivm_state *gallivm = &ctx->gallivm;
4171         struct ac_image_args args = {};
4172
4173         /* Pad to power of two vector */
4174         while (count < util_next_power_of_two(count))
4175                 param[count++] = LLVMGetUndef(ctx->i32);
4176
4177         if (count > 1)
4178                 args.addr = lp_build_gather_values(gallivm, param, count);
4179         else
4180                 args.addr = param[0];
4181
4182         args.resource = res_ptr;
4183         args.sampler = samp_ptr;
4184         args.dmask = dmask;
4185         args.unorm = target == TGSI_TEXTURE_RECT ||
4186                      target == TGSI_TEXTURE_SHADOWRECT;
4187         args.da = tgsi_is_array_sampler(target);
4188
4189         /* Ugly, but we seem to have no other choice right now. */
4190         STATIC_ASSERT(sizeof(args) <= sizeof(emit_data->args));
4191         memcpy(emit_data->args, &args, sizeof(args));
4192 }
4193
4194 static LLVMValueRef fix_resinfo(struct si_shader_context *ctx,
4195                                 unsigned target, LLVMValueRef out)
4196 {
4197         LLVMBuilderRef builder = ctx->gallivm.builder;
4198
4199         /* 1D textures are allocated and used as 2D on GFX9. */
4200         if (ctx->screen->b.chip_class >= GFX9 &&
4201             (target == TGSI_TEXTURE_1D_ARRAY ||
4202              target == TGSI_TEXTURE_SHADOW1D_ARRAY)) {
4203                 LLVMValueRef layers =
4204                         LLVMBuildExtractElement(builder, out,
4205                                                 LLVMConstInt(ctx->i32, 2, 0), "");
4206                 out = LLVMBuildInsertElement(builder, out, layers,
4207                                              ctx->i32_1, "");
4208         }
4209
4210         /* Divide the number of layers by 6 to get the number of cubes. */
4211         if (target == TGSI_TEXTURE_CUBE_ARRAY ||
4212             target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
4213                 LLVMValueRef imm2 = LLVMConstInt(ctx->i32, 2, 0);
4214
4215                 LLVMValueRef z = LLVMBuildExtractElement(builder, out, imm2, "");
4216                 z = LLVMBuildSDiv(builder, z, LLVMConstInt(ctx->i32, 6, 0), "");
4217
4218                 out = LLVMBuildInsertElement(builder, out, z, imm2, "");
4219         }
4220         return out;
4221 }
4222
4223 static void resq_fetch_args(
4224                 struct lp_build_tgsi_context * bld_base,
4225                 struct lp_build_emit_data * emit_data)
4226 {
4227         struct si_shader_context *ctx = si_shader_context(bld_base);
4228         const struct tgsi_full_instruction *inst = emit_data->inst;
4229         const struct tgsi_full_src_register *reg = &inst->Src[0];
4230
4231         emit_data->dst_type = ctx->v4i32;
4232
4233         if (reg->Register.File == TGSI_FILE_BUFFER) {
4234                 emit_data->args[0] = shader_buffer_fetch_rsrc(ctx, reg);
4235                 emit_data->arg_count = 1;
4236         } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
4237                 image_fetch_rsrc(bld_base, reg, false, inst->Memory.Texture,
4238                                  &emit_data->args[0]);
4239                 emit_data->arg_count = 1;
4240         } else {
4241                 LLVMValueRef res_ptr;
4242                 unsigned image_target;
4243
4244                 if (inst->Memory.Texture == TGSI_TEXTURE_3D)
4245                         image_target = TGSI_TEXTURE_2D_ARRAY;
4246                 else
4247                         image_target = inst->Memory.Texture;
4248
4249                 image_fetch_rsrc(bld_base, reg, false, inst->Memory.Texture,
4250                                  &res_ptr);
4251                 set_tex_fetch_args(ctx, emit_data, image_target,
4252                                    res_ptr, NULL, &ctx->i32_0, 1,
4253                                    0xf);
4254         }
4255 }
4256
4257 static void resq_emit(
4258                 const struct lp_build_tgsi_action *action,
4259                 struct lp_build_tgsi_context *bld_base,
4260                 struct lp_build_emit_data *emit_data)
4261 {
4262         struct si_shader_context *ctx = si_shader_context(bld_base);
4263         struct gallivm_state *gallivm = &ctx->gallivm;
4264         LLVMBuilderRef builder = gallivm->builder;
4265         const struct tgsi_full_instruction *inst = emit_data->inst;
4266         LLVMValueRef out;
4267
4268         if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
4269                 out = LLVMBuildExtractElement(builder, emit_data->args[0],
4270                                               LLVMConstInt(ctx->i32, 2, 0), "");
4271         } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
4272                 out = get_buffer_size(bld_base, emit_data->args[0]);
4273         } else {
4274                 struct ac_image_args args;
4275
4276                 memcpy(&args, emit_data->args, sizeof(args)); /* ugly */
4277                 args.opcode = ac_image_get_resinfo;
4278                 out = ac_build_image_opcode(&ctx->ac, &args);
4279
4280                 out = fix_resinfo(ctx, inst->Memory.Texture, out);
4281         }
4282
4283         emit_data->output[emit_data->chan] = out;
4284 }
4285
/* Forward declaration: tex_fetch_args() passes this action to
 * build_tex_intrinsic() when it reads FMASK.
 */
static const struct lp_build_tgsi_action tex_action;
4287
/* Selects which descriptor load_sampler_desc() loads for a given index. */
enum desc_type {
        DESC_IMAGE,   /* image view, at [0:7] */
        DESC_BUFFER,  /* buffer view, in [4:7] */
        DESC_FMASK,   /* FMASK view, at [8:15] */
        DESC_SAMPLER, /* sampler state, at [12:15] */
};
4294
4295 /**
4296  * Load an image view, fmask view. or sampler state descriptor.
4297  */
4298 static LLVMValueRef load_sampler_desc(struct si_shader_context *ctx,
4299                                       LLVMValueRef list, LLVMValueRef index,
4300                                       enum desc_type type)
4301 {
4302         struct gallivm_state *gallivm = &ctx->gallivm;
4303         LLVMBuilderRef builder = gallivm->builder;
4304
4305         switch (type) {
4306         case DESC_IMAGE:
4307                 /* The image is at [0:7]. */
4308                 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
4309                 break;
4310         case DESC_BUFFER:
4311                 /* The buffer is in [4:7]. */
4312                 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
4313                 index = LLVMBuildAdd(builder, index, ctx->i32_1, "");
4314                 list = LLVMBuildPointerCast(builder, list,
4315                                             const_array(ctx->v4i32, 0), "");
4316                 break;
4317         case DESC_FMASK:
4318                 /* The FMASK is at [8:15]. */
4319                 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
4320                 index = LLVMBuildAdd(builder, index, ctx->i32_1, "");
4321                 break;
4322         case DESC_SAMPLER:
4323                 /* The sampler state is at [12:15]. */
4324                 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
4325                 index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 3, 0), "");
4326                 list = LLVMBuildPointerCast(builder, list,
4327                                             const_array(ctx->v4i32, 0), "");
4328                 break;
4329         }
4330
4331         return ac_build_indexed_load_const(&ctx->ac, list, index);
4332 }
4333
4334 /* Disable anisotropic filtering if BASE_LEVEL == LAST_LEVEL.
4335  *
4336  * SI-CI:
4337  *   If BASE_LEVEL == LAST_LEVEL, the shader must disable anisotropic
4338  *   filtering manually. The driver sets img7 to a mask clearing
4339  *   MAX_ANISO_RATIO if BASE_LEVEL == LAST_LEVEL. The shader must do:
4340  *     s_and_b32 samp0, samp0, img7
4341  *
4342  * VI:
4343  *   The ANISO_OVERRIDE sampler field enables this fix in TA.
4344  */
4345 static LLVMValueRef sici_fix_sampler_aniso(struct si_shader_context *ctx,
4346                                            LLVMValueRef res, LLVMValueRef samp)
4347 {
4348         LLVMBuilderRef builder = ctx->gallivm.builder;
4349         LLVMValueRef img7, samp0;
4350
4351         if (ctx->screen->b.chip_class >= VI)
4352                 return samp;
4353
4354         img7 = LLVMBuildExtractElement(builder, res,
4355                                        LLVMConstInt(ctx->i32, 7, 0), "");
4356         samp0 = LLVMBuildExtractElement(builder, samp,
4357                                         ctx->i32_0, "");
4358         samp0 = LLVMBuildAnd(builder, samp0, img7, "");
4359         return LLVMBuildInsertElement(builder, samp, samp0,
4360                                       ctx->i32_0, "");
4361 }
4362
4363 static void tex_fetch_ptrs(
4364         struct lp_build_tgsi_context *bld_base,
4365         struct lp_build_emit_data *emit_data,
4366         LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr, LLVMValueRef *fmask_ptr)
4367 {
4368         struct si_shader_context *ctx = si_shader_context(bld_base);
4369         LLVMValueRef list = LLVMGetParam(ctx->main_fn, ctx->param_samplers);
4370         const struct tgsi_full_instruction *inst = emit_data->inst;
4371         const struct tgsi_full_src_register *reg;
4372         unsigned target = inst->Texture.Texture;
4373         unsigned sampler_src;
4374         LLVMValueRef index;
4375
4376         sampler_src = emit_data->inst->Instruction.NumSrcRegs - 1;
4377         reg = &emit_data->inst->Src[sampler_src];
4378
4379         if (reg->Register.Indirect) {
4380                 index = get_bounded_indirect_index(ctx,
4381                                                    &reg->Indirect,
4382                                                    reg->Register.Index,
4383                                                    SI_NUM_SAMPLERS);
4384         } else {
4385                 index = LLVMConstInt(ctx->i32, reg->Register.Index, 0);
4386         }
4387
4388         if (target == TGSI_TEXTURE_BUFFER)
4389                 *res_ptr = load_sampler_desc(ctx, list, index, DESC_BUFFER);
4390         else
4391                 *res_ptr = load_sampler_desc(ctx, list, index, DESC_IMAGE);
4392
4393         if (samp_ptr)
4394                 *samp_ptr = NULL;
4395         if (fmask_ptr)
4396                 *fmask_ptr = NULL;
4397
4398         if (target == TGSI_TEXTURE_2D_MSAA ||
4399             target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
4400                 if (fmask_ptr)
4401                         *fmask_ptr = load_sampler_desc(ctx, list, index,
4402                                                        DESC_FMASK);
4403         } else if (target != TGSI_TEXTURE_BUFFER) {
4404                 if (samp_ptr) {
4405                         *samp_ptr = load_sampler_desc(ctx, list, index,
4406                                                       DESC_SAMPLER);
4407                         *samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr);
4408                 }
4409         }
4410 }
4411
4412 static void txq_fetch_args(
4413         struct lp_build_tgsi_context *bld_base,
4414         struct lp_build_emit_data *emit_data)
4415 {
4416         struct si_shader_context *ctx = si_shader_context(bld_base);
4417         const struct tgsi_full_instruction *inst = emit_data->inst;
4418         unsigned target = inst->Texture.Texture;
4419         LLVMValueRef res_ptr;
4420         LLVMValueRef address;
4421
4422         tex_fetch_ptrs(bld_base, emit_data, &res_ptr, NULL, NULL);
4423
4424         if (target == TGSI_TEXTURE_BUFFER) {
4425                 /* Read the size from the buffer descriptor directly. */
4426                 emit_data->args[0] = get_buffer_size(bld_base, res_ptr);
4427                 return;
4428         }
4429
4430         /* Textures - set the mip level. */
4431         address = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_X);
4432
4433         set_tex_fetch_args(ctx, emit_data, target, res_ptr,
4434                            NULL, &address, 1, 0xf);
4435 }
4436
4437 static void txq_emit(const struct lp_build_tgsi_action *action,
4438                      struct lp_build_tgsi_context *bld_base,
4439                      struct lp_build_emit_data *emit_data)
4440 {
4441         struct si_shader_context *ctx = si_shader_context(bld_base);
4442         struct ac_image_args args;
4443         unsigned target = emit_data->inst->Texture.Texture;
4444
4445         if (target == TGSI_TEXTURE_BUFFER) {
4446                 /* Just return the buffer size. */
4447                 emit_data->output[emit_data->chan] = emit_data->args[0];
4448                 return;
4449         }
4450
4451         memcpy(&args, emit_data->args, sizeof(args)); /* ugly */
4452
4453         args.opcode = ac_image_get_resinfo;
4454         LLVMValueRef result = ac_build_image_opcode(&ctx->ac, &args);
4455
4456         emit_data->output[emit_data->chan] = fix_resinfo(ctx, target, result);
4457 }
4458
4459 static void tex_fetch_args(
4460         struct lp_build_tgsi_context *bld_base,
4461         struct lp_build_emit_data *emit_data)
4462 {
4463         struct si_shader_context *ctx = si_shader_context(bld_base);
4464         struct gallivm_state *gallivm = &ctx->gallivm;
4465         const struct tgsi_full_instruction *inst = emit_data->inst;
4466         unsigned opcode = inst->Instruction.Opcode;
4467         unsigned target = inst->Texture.Texture;
4468         LLVMValueRef coords[5], derivs[6];
4469         LLVMValueRef address[16];
4470         unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
4471         int ref_pos = tgsi_util_get_shadow_ref_src_index(target);
4472         unsigned count = 0;
4473         unsigned chan;
4474         unsigned num_deriv_channels = 0;
4475         bool has_offset = inst->Texture.NumOffsets > 0;
4476         LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
4477         unsigned dmask = 0xf;
4478
4479         tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
4480
4481         if (target == TGSI_TEXTURE_BUFFER) {
4482                 emit_data->dst_type = ctx->v4f32;
4483                 emit_data->args[0] = LLVMBuildBitCast(gallivm->builder, res_ptr,
4484                                                       ctx->v16i8, "");
4485                 emit_data->args[1] = ctx->i32_0;
4486                 emit_data->args[2] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
4487                 emit_data->arg_count = 3;
4488                 return;
4489         }
4490
4491         /* Fetch and project texture coordinates */
4492         coords[3] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_W);
4493         for (chan = 0; chan < 3; chan++ ) {
4494                 coords[chan] = lp_build_emit_fetch(bld_base,
4495                                                    emit_data->inst, 0,
4496                                                    chan);
4497                 if (opcode == TGSI_OPCODE_TXP)
4498                         coords[chan] = lp_build_emit_llvm_binary(bld_base,
4499                                                                  TGSI_OPCODE_DIV,
4500                                                                  coords[chan],
4501                                                                  coords[3]);
4502         }
4503
4504         if (opcode == TGSI_OPCODE_TXP)
4505                 coords[3] = bld_base->base.one;
4506
4507         /* Pack offsets. */
4508         if (has_offset &&
4509             opcode != TGSI_OPCODE_TXF &&
4510             opcode != TGSI_OPCODE_TXF_LZ) {
4511                 /* The offsets are six-bit signed integers packed like this:
4512                  *   X=[5:0], Y=[13:8], and Z=[21:16].
4513                  */
4514                 LLVMValueRef offset[3], pack;
4515
4516                 assert(inst->Texture.NumOffsets == 1);
4517
4518                 for (chan = 0; chan < 3; chan++) {
4519                         offset[chan] = lp_build_emit_fetch_texoffset(bld_base,
4520                                                                      emit_data->inst, 0, chan);
4521                         offset[chan] = LLVMBuildAnd(gallivm->builder, offset[chan],
4522                                                     LLVMConstInt(ctx->i32, 0x3f, 0), "");
4523                         if (chan)
4524                                 offset[chan] = LLVMBuildShl(gallivm->builder, offset[chan],
4525                                                             LLVMConstInt(ctx->i32, chan*8, 0), "");
4526                 }
4527
4528                 pack = LLVMBuildOr(gallivm->builder, offset[0], offset[1], "");
4529                 pack = LLVMBuildOr(gallivm->builder, pack, offset[2], "");
4530                 address[count++] = pack;
4531         }
4532
4533         /* Pack LOD bias value */
4534         if (opcode == TGSI_OPCODE_TXB)
4535                 address[count++] = coords[3];
4536         if (opcode == TGSI_OPCODE_TXB2)
4537                 address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4538
4539         /* Pack depth comparison value */
4540         if (tgsi_is_shadow_target(target) && opcode != TGSI_OPCODE_LODQ) {
4541                 LLVMValueRef z;
4542
4543                 if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
4544                         z = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4545                 } else {
4546                         assert(ref_pos >= 0);
4547                         z = coords[ref_pos];
4548                 }
4549
4550                 /* TC-compatible HTILE promotes Z16 and Z24 to Z32_FLOAT,
4551                  * so the depth comparison value isn't clamped for Z16 and
4552                  * Z24 anymore. Do it manually here.
4553                  *
4554                  * It's unnecessary if the original texture format was
4555                  * Z32_FLOAT, but we don't know that here.
4556                  */
4557                 if (ctx->screen->b.chip_class == VI)
4558                         z = ac_build_clamp(&ctx->ac, z);
4559
4560                 address[count++] = z;
4561         }
4562
4563         /* Pack user derivatives */
4564         if (opcode == TGSI_OPCODE_TXD) {
4565                 int param, num_src_deriv_channels, num_dst_deriv_channels;
4566
4567                 switch (target) {
4568                 case TGSI_TEXTURE_3D:
4569                         num_src_deriv_channels = 3;
4570                         num_dst_deriv_channels = 3;
4571                         num_deriv_channels = 3;
4572                         break;
4573                 case TGSI_TEXTURE_2D:
4574                 case TGSI_TEXTURE_SHADOW2D:
4575                 case TGSI_TEXTURE_RECT:
4576                 case TGSI_TEXTURE_SHADOWRECT:
4577                 case TGSI_TEXTURE_2D_ARRAY:
4578                 case TGSI_TEXTURE_SHADOW2D_ARRAY:
4579                         num_src_deriv_channels = 2;
4580                         num_dst_deriv_channels = 2;
4581                         num_deriv_channels = 2;
4582                         break;
4583                 case TGSI_TEXTURE_CUBE:
4584                 case TGSI_TEXTURE_SHADOWCUBE:
4585                 case TGSI_TEXTURE_CUBE_ARRAY:
4586                 case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
4587                         /* Cube derivatives will be converted to 2D. */
4588                         num_src_deriv_channels = 3;
4589                         num_dst_deriv_channels = 3;
4590                         num_deriv_channels = 2;
4591                         break;
4592                 case TGSI_TEXTURE_1D:
4593                 case TGSI_TEXTURE_SHADOW1D:
4594                 case TGSI_TEXTURE_1D_ARRAY:
4595                 case TGSI_TEXTURE_SHADOW1D_ARRAY:
4596                         num_src_deriv_channels = 1;
4597
4598                         /* 1D textures are allocated and used as 2D on GFX9. */
4599                         if (ctx->screen->b.chip_class >= GFX9) {
4600                                 num_dst_deriv_channels = 2;
4601                                 num_deriv_channels = 2;
4602                         } else {
4603                                 num_dst_deriv_channels = 1;
4604                                 num_deriv_channels = 1;
4605                         }
4606                         break;
4607                 default:
4608                         unreachable("invalid target");
4609                 }
4610
4611                 for (param = 0; param < 2; param++) {
4612                         for (chan = 0; chan < num_src_deriv_channels; chan++)
4613                                 derivs[param * num_dst_deriv_channels + chan] =
4614                                         lp_build_emit_fetch(bld_base, inst, param+1, chan);
4615
4616                         /* Fill in the rest with zeros. */
4617                         for (chan = num_src_deriv_channels;
4618                              chan < num_dst_deriv_channels; chan++)
4619                                 derivs[param * num_dst_deriv_channels + chan] =
4620                                         bld_base->base.zero;
4621                 }
4622         }
4623
4624         if (target == TGSI_TEXTURE_CUBE ||
4625             target == TGSI_TEXTURE_CUBE_ARRAY ||
4626             target == TGSI_TEXTURE_SHADOWCUBE ||
4627             target == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
4628                 ac_prepare_cube_coords(&ctx->ac,
4629                                        opcode == TGSI_OPCODE_TXD,
4630                                        target == TGSI_TEXTURE_CUBE_ARRAY ||
4631                                        target == TGSI_TEXTURE_SHADOWCUBE_ARRAY,
4632                                        coords, derivs);
4633
4634         if (opcode == TGSI_OPCODE_TXD)
4635                 for (int i = 0; i < num_deriv_channels * 2; i++)
4636                         address[count++] = derivs[i];
4637
4638         /* Pack texture coordinates */
4639         address[count++] = coords[0];
4640         if (num_coords > 1)
4641                 address[count++] = coords[1];
4642         if (num_coords > 2)
4643                 address[count++] = coords[2];
4644
4645         /* 1D textures are allocated and used as 2D on GFX9. */
4646         if (ctx->screen->b.chip_class >= GFX9) {
4647                 LLVMValueRef filler;
4648
4649                 /* Use 0.5, so that we don't sample the border color. */
4650                 if (opcode == TGSI_OPCODE_TXF)
4651                         filler = ctx->i32_0;
4652                 else
4653                         filler = LLVMConstReal(ctx->f32, 0.5);
4654
4655                 if (target == TGSI_TEXTURE_1D ||
4656                     target == TGSI_TEXTURE_SHADOW1D) {
4657                         address[count++] = filler;
4658                 } else if (target == TGSI_TEXTURE_1D_ARRAY ||
4659                            target == TGSI_TEXTURE_SHADOW1D_ARRAY) {
4660                         address[count] = address[count - 1];
4661                         address[count - 1] = filler;
4662                         count++;
4663                 }
4664         }
4665
4666         /* Pack LOD or sample index */
4667         if (opcode == TGSI_OPCODE_TXL || opcode == TGSI_OPCODE_TXF)
4668                 address[count++] = coords[3];
4669         else if (opcode == TGSI_OPCODE_TXL2)
4670                 address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4671
4672         if (count > 16) {
4673                 assert(!"Cannot handle more than 16 texture address parameters");
4674                 count = 16;
4675         }
4676
4677         for (chan = 0; chan < count; chan++ ) {
4678                 address[chan] = LLVMBuildBitCast(gallivm->builder,
4679                                                  address[chan], ctx->i32, "");
4680         }
4681
4682         /* Adjust the sample index according to FMASK.
4683          *
4684          * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
4685          * which is the identity mapping. Each nibble says which physical sample
4686          * should be fetched to get that sample.
4687          *
4688          * For example, 0x11111100 means there are only 2 samples stored and
4689          * the second sample covers 3/4 of the pixel. When reading samples 0
4690          * and 1, return physical sample 0 (determined by the first two 0s
4691          * in FMASK), otherwise return physical sample 1.
4692          *
4693          * The sample index should be adjusted as follows:
4694          *   sample_index = (fmask >> (sample_index * 4)) & 0xF;
4695          */
4696         if (target == TGSI_TEXTURE_2D_MSAA ||
4697             target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
4698                 struct lp_build_emit_data txf_emit_data = *emit_data;
4699                 LLVMValueRef txf_address[4];
4700                 /* We only need .xy for non-arrays, and .xyz for arrays. */
4701                 unsigned txf_count = target == TGSI_TEXTURE_2D_MSAA ? 2 : 3;
4702                 struct tgsi_full_instruction inst = {};
4703
4704                 memcpy(txf_address, address, sizeof(txf_address));
4705
4706                 /* Read FMASK using TXF_LZ. */
4707                 inst.Instruction.Opcode = TGSI_OPCODE_TXF_LZ;
4708                 inst.Texture.Texture = target;
4709                 txf_emit_data.inst = &inst;
4710                 txf_emit_data.chan = 0;
4711                 set_tex_fetch_args(ctx, &txf_emit_data,
4712                                    target, fmask_ptr, NULL,
4713                                    txf_address, txf_count, 0xf);
4714                 build_tex_intrinsic(&tex_action, bld_base, &txf_emit_data);
4715
4716                 /* Initialize some constants. */
4717                 LLVMValueRef four = LLVMConstInt(ctx->i32, 4, 0);
4718                 LLVMValueRef F = LLVMConstInt(ctx->i32, 0xF, 0);
4719
4720                 /* Apply the formula. */
4721                 LLVMValueRef fmask =
4722                         LLVMBuildExtractElement(gallivm->builder,
4723                                                 txf_emit_data.output[0],
4724                                                 ctx->i32_0, "");
4725
4726                 unsigned sample_chan = txf_count; /* the sample index is last */
4727
4728                 LLVMValueRef sample_index4 =
4729                         LLVMBuildMul(gallivm->builder, address[sample_chan], four, "");
4730
4731                 LLVMValueRef shifted_fmask =
4732                         LLVMBuildLShr(gallivm->builder, fmask, sample_index4, "");
4733
4734                 LLVMValueRef final_sample =
4735                         LLVMBuildAnd(gallivm->builder, shifted_fmask, F, "");
4736
4737                 /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
4738                  * resource descriptor is 0 (invalid),
4739                  */
4740                 LLVMValueRef fmask_desc =
4741                         LLVMBuildBitCast(gallivm->builder, fmask_ptr,
4742                                          ctx->v8i32, "");
4743
4744                 LLVMValueRef fmask_word1 =
4745                         LLVMBuildExtractElement(gallivm->builder, fmask_desc,
4746                                                 ctx->i32_1, "");
4747
4748                 LLVMValueRef word1_is_nonzero =
4749                         LLVMBuildICmp(gallivm->builder, LLVMIntNE,
4750                                       fmask_word1, ctx->i32_0, "");
4751
4752                 /* Replace the MSAA sample index. */
4753                 address[sample_chan] =
4754                         LLVMBuildSelect(gallivm->builder, word1_is_nonzero,
4755                                         final_sample, address[sample_chan], "");
4756         }
4757
4758         if (opcode == TGSI_OPCODE_TXF ||
4759             opcode == TGSI_OPCODE_TXF_LZ) {
4760                 /* add tex offsets */
4761                 if (inst->Texture.NumOffsets) {
4762                         struct lp_build_context *uint_bld = &bld_base->uint_bld;
4763                         const struct tgsi_texture_offset *off = inst->TexOffsets;
4764
4765                         assert(inst->Texture.NumOffsets == 1);
4766
4767                         switch (target) {
4768                         case TGSI_TEXTURE_3D:
4769                                 address[2] = lp_build_add(uint_bld, address[2],
4770                                                 ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleZ]);
4771                                 /* fall through */
4772                         case TGSI_TEXTURE_2D:
4773                         case TGSI_TEXTURE_SHADOW2D:
4774                         case TGSI_TEXTURE_RECT:
4775                         case TGSI_TEXTURE_SHADOWRECT:
4776                         case TGSI_TEXTURE_2D_ARRAY:
4777                         case TGSI_TEXTURE_SHADOW2D_ARRAY:
4778                                 address[1] =
4779                                         lp_build_add(uint_bld, address[1],
4780                                                 ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleY]);
4781                                 /* fall through */
4782                         case TGSI_TEXTURE_1D:
4783                         case TGSI_TEXTURE_SHADOW1D:
4784                         case TGSI_TEXTURE_1D_ARRAY:
4785                         case TGSI_TEXTURE_SHADOW1D_ARRAY:
4786                                 address[0] =
4787                                         lp_build_add(uint_bld, address[0],
4788                                                 ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleX]);
4789                                 break;
4790                                 /* texture offsets do not apply to other texture targets */
4791                         }
4792                 }
4793         }
4794
4795         if (opcode == TGSI_OPCODE_TG4) {
4796                 unsigned gather_comp = 0;
4797
4798                 /* DMASK was repurposed for GATHER4. 4 components are always
4799                  * returned and DMASK works like a swizzle - it selects
4800                  * the component to fetch. The only valid DMASK values are
4801                  * 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
4802                  * (red,red,red,red) etc.) The ISA document doesn't mention
4803                  * this.
4804                  */
4805
4806                 /* Get the component index from src1.x for Gather4. */
4807                 if (!tgsi_is_shadow_target(target)) {
4808                         LLVMValueRef comp_imm;
4809                         struct tgsi_src_register src1 = inst->Src[1].Register;
4810
4811                         assert(src1.File == TGSI_FILE_IMMEDIATE);
4812
4813                         comp_imm = ctx->imms[src1.Index * TGSI_NUM_CHANNELS + src1.SwizzleX];
4814                         gather_comp = LLVMConstIntGetZExtValue(comp_imm);
4815                         gather_comp = CLAMP(gather_comp, 0, 3);
4816                 }
4817
4818                 dmask = 1 << gather_comp;
4819         }
4820
4821         set_tex_fetch_args(ctx, emit_data, target, res_ptr,
4822                            samp_ptr, address, count, dmask);
4823 }
4824
4825 /* Gather4 should follow the same rules as bilinear filtering, but the hardware
4826  * incorrectly forces nearest filtering if the texture format is integer.
4827  * The only effect it has on Gather4, which always returns 4 texels for
4828  * bilinear filtering, is that the final coordinates are off by 0.5 of
4829  * the texel size.
4830  *
4831  * The workaround is to subtract 0.5 from the unnormalized coordinates,
4832  * or (0.5 / size) from the normalized coordinates.
4833  */
4834 static void si_lower_gather4_integer(struct si_shader_context *ctx,
4835                                      struct ac_image_args *args,
4836                                      unsigned target)
4837 {
4838         LLVMBuilderRef builder = ctx->gallivm.builder;
4839         LLVMValueRef coord = args->addr;
4840         LLVMValueRef half_texel[2];
4841         /* Texture coordinates start after:
4842          *   {offset, bias, z-compare, derivatives}
4843          * Only the offset and z-compare can occur here.
4844          */
4845         unsigned coord_vgpr_index = (int)args->offset + (int)args->compare;
4846         int c;
4847
4848         if (target == TGSI_TEXTURE_RECT ||
4849             target == TGSI_TEXTURE_SHADOWRECT) {
4850                 half_texel[0] = half_texel[1] = LLVMConstReal(ctx->f32, -0.5);
4851         } else {
4852                 struct tgsi_full_instruction txq_inst = {};
4853                 struct lp_build_emit_data txq_emit_data = {};
4854
4855                 /* Query the texture size. */
4856                 txq_inst.Texture.Texture = target;
4857                 txq_emit_data.inst = &txq_inst;
4858                 txq_emit_data.dst_type = ctx->v4i32;
4859                 set_tex_fetch_args(ctx, &txq_emit_data, target,
4860                                    args->resource, NULL, &ctx->i32_0,
4861                                    1, 0xf);
4862                 txq_emit(NULL, &ctx->bld_base, &txq_emit_data);
4863
4864                 /* Compute -0.5 / size. */
4865                 for (c = 0; c < 2; c++) {
4866                         half_texel[c] =
4867                                 LLVMBuildExtractElement(builder, txq_emit_data.output[0],
4868                                                         LLVMConstInt(ctx->i32, c, 0), "");
4869                         half_texel[c] = LLVMBuildUIToFP(builder, half_texel[c], ctx->f32, "");
4870                         half_texel[c] =
4871                                 lp_build_emit_llvm_unary(&ctx->bld_base,
4872                                                          TGSI_OPCODE_RCP, half_texel[c]);
4873                         half_texel[c] = LLVMBuildFMul(builder, half_texel[c],
4874                                                       LLVMConstReal(ctx->f32, -0.5), "");
4875                 }
4876         }
4877
4878         for (c = 0; c < 2; c++) {
4879                 LLVMValueRef tmp;
4880                 LLVMValueRef index = LLVMConstInt(ctx->i32, coord_vgpr_index + c, 0);
4881
4882                 tmp = LLVMBuildExtractElement(builder, coord, index, "");
4883                 tmp = LLVMBuildBitCast(builder, tmp, ctx->f32, "");
4884                 tmp = LLVMBuildFAdd(builder, tmp, half_texel[c], "");
4885                 tmp = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
4886                 coord = LLVMBuildInsertElement(builder, coord, tmp, index, "");
4887         }
4888
4889         args->addr = coord;
4890 }
4891
4892 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
4893                                 struct lp_build_tgsi_context *bld_base,
4894                                 struct lp_build_emit_data *emit_data)
4895 {
4896         struct si_shader_context *ctx = si_shader_context(bld_base);
4897         const struct tgsi_full_instruction *inst = emit_data->inst;
4898         struct ac_image_args args;
4899         unsigned opcode = inst->Instruction.Opcode;
4900         unsigned target = inst->Texture.Texture;
4901
4902         if (target == TGSI_TEXTURE_BUFFER) {
4903                 emit_data->output[emit_data->chan] =
4904                         ac_build_buffer_load_format(&ctx->ac,
4905                                                     emit_data->args[0],
4906                                                     emit_data->args[2],
4907                                                     emit_data->args[1],
4908                                                     true);
4909                 return;
4910         }
4911
4912         memcpy(&args, emit_data->args, sizeof(args)); /* ugly */
4913
4914         args.opcode = ac_image_sample;
4915         args.compare = tgsi_is_shadow_target(target);
4916         args.offset = inst->Texture.NumOffsets > 0;
4917
4918         switch (opcode) {
4919         case TGSI_OPCODE_TXF:
4920         case TGSI_OPCODE_TXF_LZ:
4921                 args.opcode = opcode == TGSI_OPCODE_TXF_LZ ||
4922                               target == TGSI_TEXTURE_2D_MSAA ||
4923                               target == TGSI_TEXTURE_2D_ARRAY_MSAA ?
4924                                       ac_image_load : ac_image_load_mip;
4925                 args.compare = false;
4926                 args.offset = false;
4927                 break;
4928         case TGSI_OPCODE_LODQ:
4929                 args.opcode = ac_image_get_lod;
4930                 args.compare = false;
4931                 args.offset = false;
4932                 break;
4933         case TGSI_OPCODE_TEX:
4934         case TGSI_OPCODE_TEX2:
4935         case TGSI_OPCODE_TXP:
4936                 if (ctx->type != PIPE_SHADER_FRAGMENT)
4937                         args.level_zero = true;
4938                 break;
4939         case TGSI_OPCODE_TEX_LZ:
4940                 args.level_zero = true;
4941                 break;
4942         case TGSI_OPCODE_TXB:
4943         case TGSI_OPCODE_TXB2:
4944                 assert(ctx->type == PIPE_SHADER_FRAGMENT);
4945                 args.bias = true;
4946                 break;
4947         case TGSI_OPCODE_TXL:
4948         case TGSI_OPCODE_TXL2:
4949                 args.lod = true;
4950                 break;
4951         case TGSI_OPCODE_TXD:
4952                 args.deriv = true;
4953                 break;
4954         case TGSI_OPCODE_TG4:
4955                 args.opcode = ac_image_gather4;
4956                 args.level_zero = true;
4957                 break;
4958         default:
4959                 assert(0);
4960                 return;
4961         }
4962
4963         /* The hardware needs special lowering for Gather4 with integer formats. */
4964         if (ctx->screen->b.chip_class <= VI &&
4965             opcode == TGSI_OPCODE_TG4) {
4966                 struct tgsi_shader_info *info = &ctx->shader->selector->info;
4967                 /* This will also work with non-constant indexing because of how
4968                  * glsl_to_tgsi works and we intent to preserve that behavior.
4969                  */
4970                 const unsigned src_idx = 2;
4971                 unsigned sampler = inst->Src[src_idx].Register.Index;
4972
4973                 assert(inst->Src[src_idx].Register.File == TGSI_FILE_SAMPLER);
4974
4975                 if (info->sampler_type[sampler] == TGSI_RETURN_TYPE_SINT ||
4976                     info->sampler_type[sampler] == TGSI_RETURN_TYPE_UINT)
4977                         si_lower_gather4_integer(ctx, &args, target);
4978         }
4979
4980         emit_data->output[emit_data->chan] =
4981                 ac_build_image_opcode(&ctx->ac, &args);
4982 }
4983
4984 static void si_llvm_emit_txqs(
4985         const struct lp_build_tgsi_action *action,
4986         struct lp_build_tgsi_context *bld_base,
4987         struct lp_build_emit_data *emit_data)
4988 {
4989         struct si_shader_context *ctx = si_shader_context(bld_base);
4990         struct gallivm_state *gallivm = &ctx->gallivm;
4991         LLVMBuilderRef builder = gallivm->builder;
4992         LLVMValueRef res, samples;
4993         LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
4994
4995         tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
4996
4997
4998         /* Read the samples from the descriptor directly. */
4999         res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, "");
5000         samples = LLVMBuildExtractElement(
5001                 builder, res,
5002                 LLVMConstInt(ctx->i32, 3, 0), "");
5003         samples = LLVMBuildLShr(builder, samples,
5004                                 LLVMConstInt(ctx->i32, 16, 0), "");
5005         samples = LLVMBuildAnd(builder, samples,
5006                                LLVMConstInt(ctx->i32, 0xf, 0), "");
5007         samples = LLVMBuildShl(builder, ctx->i32_1,
5008                                samples, "");
5009
5010         emit_data->output[emit_data->chan] = samples;
5011 }
5012
5013 static void si_llvm_emit_ddxy(
5014         const struct lp_build_tgsi_action *action,
5015         struct lp_build_tgsi_context *bld_base,
5016         struct lp_build_emit_data *emit_data)
5017 {
5018         struct si_shader_context *ctx = si_shader_context(bld_base);
5019         struct gallivm_state *gallivm = &ctx->gallivm;
5020         unsigned opcode = emit_data->info->opcode;
5021         LLVMValueRef val;
5022         int idx;
5023         unsigned mask;
5024
5025         if (opcode == TGSI_OPCODE_DDX_FINE)
5026                 mask = AC_TID_MASK_LEFT;
5027         else if (opcode == TGSI_OPCODE_DDY_FINE)
5028                 mask = AC_TID_MASK_TOP;
5029         else
5030                 mask = AC_TID_MASK_TOP_LEFT;
5031
5032         /* for DDX we want to next X pixel, DDY next Y pixel. */
5033         idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;
5034
5035         val = LLVMBuildBitCast(gallivm->builder, emit_data->args[0], ctx->i32, "");
5036         val = ac_build_ddxy(&ctx->ac, ctx->screen->has_ds_bpermute,
5037                             mask, idx, ctx->lds, val);
5038         emit_data->output[emit_data->chan] = val;
5039 }
5040
5041 /*
5042  * this takes an I,J coordinate pair,
5043  * and works out the X and Y derivatives.
5044  * it returns DDX(I), DDX(J), DDY(I), DDY(J).
5045  */
5046 static LLVMValueRef si_llvm_emit_ddxy_interp(
5047         struct lp_build_tgsi_context *bld_base,
5048         LLVMValueRef interp_ij)
5049 {
5050         struct si_shader_context *ctx = si_shader_context(bld_base);
5051         struct gallivm_state *gallivm = &ctx->gallivm;
5052         LLVMValueRef result[4], a;
5053         unsigned i;
5054
5055         for (i = 0; i < 2; i++) {
5056                 a = LLVMBuildExtractElement(gallivm->builder, interp_ij,
5057                                             LLVMConstInt(ctx->i32, i, 0), "");
5058                 result[i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDX, a);
5059                 result[2+i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDY, a);
5060         }
5061
5062         return lp_build_gather_values(gallivm, result, 4);
5063 }
5064
5065 static void interp_fetch_args(
5066         struct lp_build_tgsi_context *bld_base,
5067         struct lp_build_emit_data *emit_data)
5068 {
5069         struct si_shader_context *ctx = si_shader_context(bld_base);
5070         struct gallivm_state *gallivm = &ctx->gallivm;
5071         const struct tgsi_full_instruction *inst = emit_data->inst;
5072
5073         if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
5074                 /* offset is in second src, first two channels */
5075                 emit_data->args[0] = lp_build_emit_fetch(bld_base,
5076                                                          emit_data->inst, 1,
5077                                                          TGSI_CHAN_X);
5078                 emit_data->args[1] = lp_build_emit_fetch(bld_base,
5079                                                          emit_data->inst, 1,
5080                                                          TGSI_CHAN_Y);
5081                 emit_data->arg_count = 2;
5082         } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
5083                 LLVMValueRef sample_position;
5084                 LLVMValueRef sample_id;
5085                 LLVMValueRef halfval = LLVMConstReal(ctx->f32, 0.5f);
5086
5087                 /* fetch sample ID, then fetch its sample position,
5088                  * and place into first two channels.
5089                  */
5090                 sample_id = lp_build_emit_fetch(bld_base,
5091                                                 emit_data->inst, 1, TGSI_CHAN_X);
5092                 sample_id = LLVMBuildBitCast(gallivm->builder, sample_id,
5093                                              ctx->i32, "");
5094                 sample_position = load_sample_position(ctx, sample_id);
5095
5096                 emit_data->args[0] = LLVMBuildExtractElement(gallivm->builder,
5097                                                              sample_position,
5098                                                              ctx->i32_0, "");
5099
5100                 emit_data->args[0] = LLVMBuildFSub(gallivm->builder, emit_data->args[0], halfval, "");
5101                 emit_data->args[1] = LLVMBuildExtractElement(gallivm->builder,
5102                                                              sample_position,
5103                                                              ctx->i32_1, "");
5104                 emit_data->args[1] = LLVMBuildFSub(gallivm->builder, emit_data->args[1], halfval, "");
5105                 emit_data->arg_count = 2;
5106         }
5107 }
5108
5109 static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
5110                                 struct lp_build_tgsi_context *bld_base,
5111                                 struct lp_build_emit_data *emit_data)
5112 {
5113         struct si_shader_context *ctx = si_shader_context(bld_base);
5114         struct si_shader *shader = ctx->shader;
5115         struct gallivm_state *gallivm = &ctx->gallivm;
5116         LLVMValueRef interp_param;
5117         const struct tgsi_full_instruction *inst = emit_data->inst;
5118         int input_index = inst->Src[0].Register.Index;
5119         int chan;
5120         int i;
5121         LLVMValueRef attr_number;
5122         LLVMValueRef params = LLVMGetParam(ctx->main_fn, SI_PARAM_PRIM_MASK);
5123         int interp_param_idx;
5124         unsigned interp = shader->selector->info.input_interpolate[input_index];
5125         unsigned location;
5126
5127         assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);
5128
5129         if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
5130             inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE)
5131                 location = TGSI_INTERPOLATE_LOC_CENTER;
5132         else
5133                 location = TGSI_INTERPOLATE_LOC_CENTROID;
5134
5135         interp_param_idx = lookup_interp_param_index(interp, location);
5136         if (interp_param_idx == -1)
5137                 return;
5138         else if (interp_param_idx)
5139                 interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
5140         else
5141                 interp_param = NULL;
5142
5143         attr_number = LLVMConstInt(ctx->i32, input_index, 0);
5144
5145         if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
5146             inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
5147                 LLVMValueRef ij_out[2];
5148                 LLVMValueRef ddxy_out = si_llvm_emit_ddxy_interp(bld_base, interp_param);
5149
5150                 /*
5151                  * take the I then J parameters, and the DDX/Y for it, and
5152                  * calculate the IJ inputs for the interpolator.
5153                  * temp1 = ddx * offset/sample.x + I;
5154                  * interp_param.I = ddy * offset/sample.y + temp1;
5155                  * temp1 = ddx * offset/sample.x + J;
5156                  * interp_param.J = ddy * offset/sample.y + temp1;
5157                  */
5158                 for (i = 0; i < 2; i++) {
5159                         LLVMValueRef ix_ll = LLVMConstInt(ctx->i32, i, 0);
5160                         LLVMValueRef iy_ll = LLVMConstInt(ctx->i32, i + 2, 0);
5161                         LLVMValueRef ddx_el = LLVMBuildExtractElement(gallivm->builder,
5162                                                                       ddxy_out, ix_ll, "");
5163                         LLVMValueRef ddy_el = LLVMBuildExtractElement(gallivm->builder,
5164                                                                       ddxy_out, iy_ll, "");
5165                         LLVMValueRef interp_el = LLVMBuildExtractElement(gallivm->builder,
5166                                                                          interp_param, ix_ll, "");
5167                         LLVMValueRef temp1, temp2;
5168
5169                         interp_el = LLVMBuildBitCast(gallivm->builder, interp_el,
5170                                                      ctx->f32, "");
5171
5172                         temp1 = LLVMBuildFMul(gallivm->builder, ddx_el, emit_data->args[0], "");
5173
5174                         temp1 = LLVMBuildFAdd(gallivm->builder, temp1, interp_el, "");
5175
5176                         temp2 = LLVMBuildFMul(gallivm->builder, ddy_el, emit_data->args[1], "");
5177
5178                         ij_out[i] = LLVMBuildFAdd(gallivm->builder, temp2, temp1, "");
5179                 }
5180                 interp_param = lp_build_gather_values(gallivm, ij_out, 2);
5181         }
5182
5183         for (chan = 0; chan < 4; chan++) {
5184                 LLVMValueRef llvm_chan;
5185                 unsigned schan;
5186
5187                 schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan);
5188                 llvm_chan = LLVMConstInt(ctx->i32, schan, 0);
5189
5190                 if (interp_param) {
5191                         interp_param = LLVMBuildBitCast(gallivm->builder,
5192                                 interp_param, LLVMVectorType(ctx->f32, 2), "");
5193                         LLVMValueRef i = LLVMBuildExtractElement(
5194                                 gallivm->builder, interp_param, ctx->i32_0, "");
5195                         LLVMValueRef j = LLVMBuildExtractElement(
5196                                 gallivm->builder, interp_param, ctx->i32_1, "");
5197                         emit_data->output[chan] = ac_build_fs_interp(&ctx->ac,
5198                                 llvm_chan, attr_number, params,
5199                                 i, j);
5200                 } else {
5201                         emit_data->output[chan] = ac_build_fs_interp_mov(&ctx->ac,
5202                                 LLVMConstInt(ctx->i32, 2, 0), /* P0 */
5203                                 llvm_chan, attr_number, params);
5204                 }
5205         }
5206 }
5207
5208 static LLVMValueRef si_emit_ballot(struct si_shader_context *ctx,
5209                                    LLVMValueRef value)
5210 {
5211         struct gallivm_state *gallivm = &ctx->gallivm;
5212         LLVMValueRef args[3] = {
5213                 value,
5214                 ctx->i32_0,
5215                 LLVMConstInt(ctx->i32, LLVMIntNE, 0)
5216         };
5217
5218         /* We currently have no other way to prevent LLVM from lifting the icmp
5219          * calls to a dominating basic block.
5220          */
5221         emit_optimization_barrier(ctx, &args[0]);
5222
5223         if (LLVMTypeOf(args[0]) != ctx->i32)
5224                 args[0] = LLVMBuildBitCast(gallivm->builder, args[0], ctx->i32, "");
5225
5226         return lp_build_intrinsic(gallivm->builder,
5227                                   "llvm.amdgcn.icmp.i32",
5228                                   ctx->i64, args, 3,
5229                                   LP_FUNC_ATTR_NOUNWIND |
5230                                   LP_FUNC_ATTR_READNONE |
5231                                   LP_FUNC_ATTR_CONVERGENT);
5232 }
5233
5234 static void vote_all_emit(
5235         const struct lp_build_tgsi_action *action,
5236         struct lp_build_tgsi_context *bld_base,
5237         struct lp_build_emit_data *emit_data)
5238 {
5239         struct si_shader_context *ctx = si_shader_context(bld_base);
5240         struct gallivm_state *gallivm = &ctx->gallivm;
5241         LLVMValueRef active_set, vote_set;
5242         LLVMValueRef tmp;
5243
5244         active_set = si_emit_ballot(ctx, ctx->i32_1);
5245         vote_set = si_emit_ballot(ctx, emit_data->args[0]);
5246
5247         tmp = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, vote_set, active_set, "");
5248         emit_data->output[emit_data->chan] =
5249                 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
5250 }
5251
5252 static void vote_any_emit(
5253         const struct lp_build_tgsi_action *action,
5254         struct lp_build_tgsi_context *bld_base,
5255         struct lp_build_emit_data *emit_data)
5256 {
5257         struct si_shader_context *ctx = si_shader_context(bld_base);
5258         struct gallivm_state *gallivm = &ctx->gallivm;
5259         LLVMValueRef vote_set;
5260         LLVMValueRef tmp;
5261
5262         vote_set = si_emit_ballot(ctx, emit_data->args[0]);
5263
5264         tmp = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
5265                             vote_set, LLVMConstInt(ctx->i64, 0, 0), "");
5266         emit_data->output[emit_data->chan] =
5267                 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
5268 }
5269
5270 static void vote_eq_emit(
5271         const struct lp_build_tgsi_action *action,
5272         struct lp_build_tgsi_context *bld_base,
5273         struct lp_build_emit_data *emit_data)
5274 {
5275         struct si_shader_context *ctx = si_shader_context(bld_base);
5276         struct gallivm_state *gallivm = &ctx->gallivm;
5277         LLVMValueRef active_set, vote_set;
5278         LLVMValueRef all, none, tmp;
5279
5280         active_set = si_emit_ballot(ctx, ctx->i32_1);
5281         vote_set = si_emit_ballot(ctx, emit_data->args[0]);
5282
5283         all = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, vote_set, active_set, "");
5284         none = LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
5285                              vote_set, LLVMConstInt(ctx->i64, 0, 0), "");
5286         tmp = LLVMBuildOr(gallivm->builder, all, none, "");
5287         emit_data->output[emit_data->chan] =
5288                 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
5289 }
5290
5291 static void ballot_emit(
5292         const struct lp_build_tgsi_action *action,
5293         struct lp_build_tgsi_context *bld_base,
5294         struct lp_build_emit_data *emit_data)
5295 {
5296         struct si_shader_context *ctx = si_shader_context(bld_base);
5297         LLVMBuilderRef builder = ctx->gallivm.builder;
5298         LLVMValueRef tmp;
5299
5300         tmp = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
5301         tmp = si_emit_ballot(ctx, tmp);
5302         tmp = LLVMBuildBitCast(builder, tmp, ctx->v2i32, "");
5303
5304         emit_data->output[0] = LLVMBuildExtractElement(builder, tmp, ctx->i32_0, "");
5305         emit_data->output[1] = LLVMBuildExtractElement(builder, tmp, ctx->i32_1, "");
5306 }
5307
5308 static void read_invoc_fetch_args(
5309         struct lp_build_tgsi_context *bld_base,
5310         struct lp_build_emit_data *emit_data)
5311 {
5312         emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst,
5313                                                  0, emit_data->src_chan);
5314
5315         /* Always read the source invocation (= lane) from the X channel. */
5316         emit_data->args[1] = lp_build_emit_fetch(bld_base, emit_data->inst,
5317                                                  1, TGSI_CHAN_X);
5318         emit_data->arg_count = 2;
5319 }
5320
5321 static void read_lane_emit(
5322         const struct lp_build_tgsi_action *action,
5323         struct lp_build_tgsi_context *bld_base,
5324         struct lp_build_emit_data *emit_data)
5325 {
5326         struct si_shader_context *ctx = si_shader_context(bld_base);
5327         LLVMBuilderRef builder = ctx->gallivm.builder;
5328
5329         /* We currently have no other way to prevent LLVM from lifting the icmp
5330          * calls to a dominating basic block.
5331          */
5332         emit_optimization_barrier(ctx, &emit_data->args[0]);
5333
5334         for (unsigned i = 0; i < emit_data->arg_count; ++i) {
5335                 emit_data->args[i] = LLVMBuildBitCast(builder, emit_data->args[i],
5336                                                       ctx->i32, "");
5337         }
5338
5339         emit_data->output[emit_data->chan] =
5340                 ac_build_intrinsic(&ctx->ac, action->intr_name,
5341                                    ctx->i32, emit_data->args, emit_data->arg_count,
5342                                    AC_FUNC_ATTR_READNONE |
5343                                    AC_FUNC_ATTR_CONVERGENT);
5344 }
5345
5346 static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base,
5347                                        struct lp_build_emit_data *emit_data)
5348 {
5349         struct si_shader_context *ctx = si_shader_context(bld_base);
5350         struct tgsi_src_register src0 = emit_data->inst->Src[0].Register;
5351         LLVMValueRef imm;
5352         unsigned stream;
5353
5354         assert(src0.File == TGSI_FILE_IMMEDIATE);
5355
5356         imm = ctx->imms[src0.Index * TGSI_NUM_CHANNELS + src0.SwizzleX];
5357         stream = LLVMConstIntGetZExtValue(imm) & 0x3;
5358         return stream;
5359 }
5360
5361 /* Emit one vertex from the geometry shader */
5362 static void si_llvm_emit_vertex(
5363         const struct lp_build_tgsi_action *action,
5364         struct lp_build_tgsi_context *bld_base,
5365         struct lp_build_emit_data *emit_data)
5366 {
5367         struct si_shader_context *ctx = si_shader_context(bld_base);
5368         struct lp_build_context *uint = &bld_base->uint_bld;
5369         struct si_shader *shader = ctx->shader;
5370         struct tgsi_shader_info *info = &shader->selector->info;
5371         struct gallivm_state *gallivm = &ctx->gallivm;
5372         struct lp_build_if_state if_state;
5373         LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
5374                                             ctx->param_gs2vs_offset);
5375         LLVMValueRef gs_next_vertex;
5376         LLVMValueRef can_emit, kill;
5377         unsigned chan, offset;
5378         int i;
5379         unsigned stream;
5380
5381         stream = si_llvm_get_stream(bld_base, emit_data);
5382
5383         /* Write vertex attribute values to GSVS ring */
5384         gs_next_vertex = LLVMBuildLoad(gallivm->builder,
5385                                        ctx->gs_next_vertex[stream],
5386                                        "");
5387
5388         /* If this thread has already emitted the declared maximum number of
5389          * vertices, skip the write: excessive vertex emissions are not
5390          * supposed to have any effect.
5391          *
5392          * If the shader has no writes to memory, kill it instead. This skips
5393          * further memory loads and may allow LLVM to skip to the end
5394          * altogether.
5395          */
5396         can_emit = LLVMBuildICmp(gallivm->builder, LLVMIntULT, gs_next_vertex,
5397                                  LLVMConstInt(ctx->i32,
5398                                               shader->selector->gs_max_out_vertices, 0), "");
5399
5400         bool use_kill = !info->writes_memory;
5401         if (use_kill) {
5402                 kill = lp_build_select(&bld_base->base, can_emit,
5403                                        LLVMConstReal(ctx->f32, 1.0f),
5404                                        LLVMConstReal(ctx->f32, -1.0f));
5405
5406                 ac_build_kill(&ctx->ac, kill);
5407         } else {
5408                 lp_build_if(&if_state, gallivm, can_emit);
5409         }
5410
5411         offset = 0;
5412         for (i = 0; i < info->num_outputs; i++) {
5413                 LLVMValueRef *out_ptr = ctx->outputs[i];
5414
5415                 for (chan = 0; chan < 4; chan++) {
5416                         if (!(info->output_usagemask[i] & (1 << chan)) ||
5417                             ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
5418                                 continue;
5419
5420                         LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
5421                         LLVMValueRef voffset =
5422                                 LLVMConstInt(ctx->i32, offset *
5423                                              shader->selector->gs_max_out_vertices, 0);
5424                         offset++;
5425
5426                         voffset = lp_build_add(uint, voffset, gs_next_vertex);
5427                         voffset = lp_build_mul_imm(uint, voffset, 4);
5428
5429                         out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");
5430
5431                         ac_build_buffer_store_dword(&ctx->ac,
5432                                                     ctx->gsvs_ring[stream],
5433                                                     out_val, 1,
5434                                                     voffset, soffset, 0,
5435                                                     1, 1, true, true);
5436                 }
5437         }
5438
5439         gs_next_vertex = lp_build_add(uint, gs_next_vertex,
5440                                       ctx->i32_1);
5441
5442         LLVMBuildStore(gallivm->builder, gs_next_vertex, ctx->gs_next_vertex[stream]);
5443
5444         /* Signal vertex emission */
5445         ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
5446                          LLVMGetParam(ctx->main_fn, ctx->param_gs_wave_id));
5447         if (!use_kill)
5448                 lp_build_endif(&if_state);
5449 }
5450
5451 /* Cut one primitive from the geometry shader */
5452 static void si_llvm_emit_primitive(
5453         const struct lp_build_tgsi_action *action,
5454         struct lp_build_tgsi_context *bld_base,
5455         struct lp_build_emit_data *emit_data)
5456 {
5457         struct si_shader_context *ctx = si_shader_context(bld_base);
5458         unsigned stream;
5459
5460         /* Signal primitive cut */
5461         stream = si_llvm_get_stream(bld_base, emit_data);
5462         ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
5463                          LLVMGetParam(ctx->main_fn, ctx->param_gs_wave_id));
5464 }
5465
5466 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
5467                                  struct lp_build_tgsi_context *bld_base,
5468                                  struct lp_build_emit_data *emit_data)
5469 {
5470         struct si_shader_context *ctx = si_shader_context(bld_base);
5471         struct gallivm_state *gallivm = &ctx->gallivm;
5472
5473         /* SI only (thanks to a hw bug workaround):
5474          * The real barrier instruction isn’t needed, because an entire patch
5475          * always fits into a single wave.
5476          */
5477         if (HAVE_LLVM >= 0x0309 &&
5478             ctx->screen->b.chip_class == SI &&
5479             ctx->type == PIPE_SHADER_TESS_CTRL) {
5480                 emit_waitcnt(ctx, LGKM_CNT & VM_CNT);
5481                 return;
5482         }
5483
5484         lp_build_intrinsic(gallivm->builder,
5485                            HAVE_LLVM >= 0x0309 ? "llvm.amdgcn.s.barrier"
5486                                                : "llvm.AMDGPU.barrier.local",
5487                            ctx->voidt, NULL, 0, LP_FUNC_ATTR_CONVERGENT);
5488 }
5489
/* Action descriptor that fetches its arguments with tex_fetch_args and
 * emits code with build_tex_intrinsic.
 */
static const struct lp_build_tgsi_action tex_action = {
        .fetch_args = tex_fetch_args,
        .emit = build_tex_intrinsic,
};

/* Action descriptor that fetches its arguments with interp_fetch_args and
 * emits code with build_interp_intrinsic.
 */
static const struct lp_build_tgsi_action interp_action = {
        .fetch_args = interp_fetch_args,
        .emit = build_interp_intrinsic,
};
5499
5500 static void si_create_function(struct si_shader_context *ctx,
5501                                const char *name,
5502                                LLVMTypeRef *returns, unsigned num_returns,
5503                                LLVMTypeRef *params, unsigned num_params,
5504                                int last_sgpr)
5505 {
5506         int i;
5507
5508         si_llvm_create_func(ctx, name, returns, num_returns,
5509                             params, num_params);
5510         si_llvm_shader_type(ctx->main_fn, ctx->type);
5511         ctx->return_value = LLVMGetUndef(ctx->return_type);
5512
5513         for (i = 0; i <= last_sgpr; ++i) {
5514                 LLVMValueRef P = LLVMGetParam(ctx->main_fn, i);
5515
5516                 /* The combination of:
5517                  * - ByVal
5518                  * - dereferenceable
5519                  * - invariant.load
5520                  * allows the optimization passes to move loads and reduces
5521                  * SGPR spilling significantly.
5522                  */
5523                 if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
5524                         lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_BYVAL);
5525                         lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_NOALIAS);
5526                         ac_add_attr_dereferenceable(P, UINT64_MAX);
5527                 } else
5528                         lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_INREG);
5529         }
5530
5531         LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
5532                                            "no-signed-zeros-fp-math",
5533                                            "true");
5534
5535         if (ctx->screen->b.debug_flags & DBG_UNSAFE_MATH) {
5536                 /* These were copied from some LLVM test. */
5537                 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
5538                                                    "less-precise-fpmad",
5539                                                    "true");
5540                 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
5541                                                    "no-infs-fp-math",
5542                                                    "true");
5543                 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
5544                                                    "no-nans-fp-math",
5545                                                    "true");
5546                 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
5547                                                    "unsafe-fp-math",
5548                                                    "true");
5549         }
5550 }
5551
5552 static void declare_streamout_params(struct si_shader_context *ctx,
5553                                      struct pipe_stream_output_info *so,
5554                                      LLVMTypeRef *params, LLVMTypeRef i32,
5555                                      unsigned *num_params)
5556 {
5557         int i;
5558
5559         /* Streamout SGPRs. */
5560         if (so->num_outputs) {
5561                 if (ctx->type != PIPE_SHADER_TESS_EVAL)
5562                         params[ctx->param_streamout_config = (*num_params)++] = i32;
5563                 else
5564                         ctx->param_streamout_config = *num_params - 1;
5565
5566                 params[ctx->param_streamout_write_index = (*num_params)++] = i32;
5567         }
5568         /* A streamout buffer offset is loaded if the stride is non-zero. */
5569         for (i = 0; i < 4; i++) {
5570                 if (!so->stride[i])
5571                         continue;
5572
5573                 params[ctx->param_streamout_offset[i] = (*num_params)++] = i32;
5574         }
5575 }
5576
5577 static unsigned llvm_get_type_size(LLVMTypeRef type)
5578 {
5579         LLVMTypeKind kind = LLVMGetTypeKind(type);
5580
5581         switch (kind) {
5582         case LLVMIntegerTypeKind:
5583                 return LLVMGetIntTypeWidth(type) / 8;
5584         case LLVMFloatTypeKind:
5585                 return 4;
5586         case LLVMPointerTypeKind:
5587                 return 8;
5588         case LLVMVectorTypeKind:
5589                 return LLVMGetVectorSize(type) *
5590                        llvm_get_type_size(LLVMGetElementType(type));
5591         case LLVMArrayTypeKind:
5592                 return LLVMGetArrayLength(type) *
5593                        llvm_get_type_size(LLVMGetElementType(type));
5594         default:
5595                 assert(0);
5596                 return 0;
5597         }
5598 }
5599
5600 static void declare_tess_lds(struct si_shader_context *ctx)
5601 {
5602         struct gallivm_state *gallivm = &ctx->gallivm;
5603
5604         unsigned lds_size = ctx->screen->b.chip_class >= CIK ? 65536 : 32768;
5605         ctx->lds = LLVMBuildIntToPtr(gallivm->builder, ctx->i32_0,
5606                 LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), LOCAL_ADDR_SPACE),
5607                 "tess_lds");
5608 }
5609
5610 static unsigned si_get_max_workgroup_size(struct si_shader *shader)
5611 {
5612         const unsigned *properties = shader->selector->info.properties;
5613         unsigned max_work_group_size =
5614                        properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
5615                        properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
5616                        properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
5617
5618         if (!max_work_group_size) {
5619                 /* This is a variable group size compute shader,
5620                  * compile it for the maximum possible group size.
5621                  */
5622                 max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
5623         }
5624         return max_work_group_size;
5625 }
5626
5627 static void declare_per_stage_desc_pointers(struct si_shader_context *ctx,
5628                                             LLVMTypeRef *params,
5629                                             unsigned *num_params,
5630                                             bool assign_params)
5631 {
5632         params[(*num_params)++] = const_array(ctx->v16i8, SI_NUM_CONST_BUFFERS);
5633         params[(*num_params)++] = const_array(ctx->v8i32, SI_NUM_SAMPLERS);
5634         params[(*num_params)++] = const_array(ctx->v8i32, SI_NUM_IMAGES);
5635         params[(*num_params)++] = const_array(ctx->v4i32, SI_NUM_SHADER_BUFFERS);
5636
5637         if (assign_params) {
5638                 ctx->param_const_buffers  = *num_params - 4;
5639                 ctx->param_samplers       = *num_params - 3;
5640                 ctx->param_images         = *num_params - 2;
5641                 ctx->param_shader_buffers = *num_params - 1;
5642         }
5643 }
5644
5645 static void declare_default_desc_pointers(struct si_shader_context *ctx,
5646                                           LLVMTypeRef *params,
5647                                           unsigned *num_params)
5648 {
5649         params[ctx->param_rw_buffers = (*num_params)++] =
5650                 const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
5651         declare_per_stage_desc_pointers(ctx, params, num_params, true);
5652 }
5653
5654 static void declare_vs_specific_input_sgprs(struct si_shader_context *ctx,
5655                                             LLVMTypeRef *params,
5656                                             unsigned *num_params)
5657 {
5658         params[ctx->param_vertex_buffers = (*num_params)++] =
5659                 const_array(ctx->v16i8, SI_NUM_VERTEX_BUFFERS);
5660         params[ctx->param_base_vertex = (*num_params)++] = ctx->i32;
5661         params[ctx->param_start_instance = (*num_params)++] = ctx->i32;
5662         params[ctx->param_draw_id = (*num_params)++] = ctx->i32;
5663         params[ctx->param_vs_state_bits = (*num_params)++] = ctx->i32;
5664 }
5665
5666 static void declare_vs_input_vgprs(struct si_shader_context *ctx,
5667                                    LLVMTypeRef *params, unsigned *num_params,
5668                                    unsigned *num_prolog_vgprs)
5669 {
5670         struct si_shader *shader = ctx->shader;
5671
5672         params[ctx->param_vertex_id = (*num_params)++] = ctx->i32;
5673         params[ctx->param_rel_auto_id = (*num_params)++] = ctx->i32;
5674         params[ctx->param_vs_prim_id = (*num_params)++] = ctx->i32;
5675         params[ctx->param_instance_id = (*num_params)++] = ctx->i32;
5676
5677         if (!shader->is_gs_copy_shader) {
5678                 /* Vertex load indices. */
5679                 ctx->param_vertex_index0 = (*num_params);
5680                 for (unsigned i = 0; i < shader->selector->info.num_inputs; i++)
5681                         params[(*num_params)++] = ctx->i32;
5682                 *num_prolog_vgprs += shader->selector->info.num_inputs;
5683         }
5684 }
5685
/* Extra shader type values for merged shaders, numbered from
 * PIPE_SHADER_TYPES upwards.
 */
enum {
        /* Convenient merged shader definitions. */
        SI_SHADER_MERGED_VERTEX_TESSCTRL = PIPE_SHADER_TYPES,
        SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY,
};
5691
5692 static void create_function(struct si_shader_context *ctx)
5693 {
5694         struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
5695         struct gallivm_state *gallivm = &ctx->gallivm;
5696         struct si_shader *shader = ctx->shader;
5697         LLVMTypeRef params[100]; /* just make it large enough */
5698         LLVMTypeRef returns[16+32*4];
5699         unsigned i, last_sgpr, num_params = 0, num_return_sgprs;
5700         unsigned num_returns = 0;
5701         unsigned num_prolog_vgprs = 0;
5702         unsigned type = ctx->type;
5703
5704         /* Set MERGED shaders. */
5705         if (ctx->screen->b.chip_class >= GFX9) {
5706                 if (shader->key.as_ls || type == PIPE_SHADER_TESS_CTRL)
5707                         type = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
5708                 else if (shader->key.as_es || type == PIPE_SHADER_GEOMETRY)
5709                         type = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
5710         }
5711
5712         LLVMTypeRef v3i32 = LLVMVectorType(ctx->i32, 3);
5713
5714         switch (type) {
5715         case PIPE_SHADER_VERTEX:
5716                 declare_default_desc_pointers(ctx, params, &num_params);
5717                 declare_vs_specific_input_sgprs(ctx, params, &num_params);
5718
5719                 if (shader->key.as_es) {
5720                         params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
5721                 } else if (shader->key.as_ls) {
5722                         /* no extra parameters */
5723                 } else {
5724                         if (shader->is_gs_copy_shader)
5725                                 num_params = ctx->param_rw_buffers + 1;
5726
5727                         /* The locations of the other parameters are assigned dynamically. */
5728                         declare_streamout_params(ctx, &shader->selector->so,
5729                                                  params, ctx->i32, &num_params);
5730                 }
5731
5732                 last_sgpr = num_params-1;
5733
5734                 /* VGPRs */
5735                 declare_vs_input_vgprs(ctx, params, &num_params,
5736                                        &num_prolog_vgprs);
5737
5738                 /* PrimitiveID output. */
5739                 if (!shader->is_gs_copy_shader &&
5740                     !shader->key.as_es && !shader->key.as_ls) {
5741                         for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
5742                                 returns[num_returns++] = ctx->f32;
5743                 }
5744                 break;
5745
5746         case PIPE_SHADER_TESS_CTRL: /* SI-CI-VI */
5747                 declare_default_desc_pointers(ctx, params, &num_params);
5748                 params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
5749                 params[ctx->param_tcs_out_lds_offsets = num_params++] = ctx->i32;
5750                 params[ctx->param_tcs_out_lds_layout = num_params++] = ctx->i32;
5751                 params[ctx->param_vs_state_bits = num_params++] = ctx->i32;
5752                 params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
5753                 params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
5754                 last_sgpr = num_params - 1;
5755
5756                 /* VGPRs */
5757                 params[ctx->param_tcs_patch_id = num_params++] = ctx->i32;
5758                 params[ctx->param_tcs_rel_ids = num_params++] = ctx->i32;
5759
5760                 /* param_tcs_offchip_offset and param_tcs_factor_offset are
5761                  * placed after the user SGPRs.
5762                  */
5763                 for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
5764                         returns[num_returns++] = ctx->i32; /* SGPRs */
5765                 for (i = 0; i < 3; i++)
5766                         returns[num_returns++] = ctx->f32; /* VGPRs */
5767                 break;
5768
5769         case SI_SHADER_MERGED_VERTEX_TESSCTRL:
5770                 /* Merged stages have 8 system SGPRs at the beginning. */
5771                 params[num_params++] = ctx->i32; /* unused */
5772                 params[num_params++] = ctx->i32; /* unused */
5773                 params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
5774                 params[num_params++] = ctx->i32; /* wave thread counts for LS and HS */
5775                 params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
5776                 params[num_params++] = ctx->i32; /* scratch wave offset */
5777                 params[num_params++] = ctx->i32; /* unused */
5778                 params[num_params++] = ctx->i32; /* unused */
5779
5780                 params[ctx->param_rw_buffers = num_params++] =
5781                         const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
5782                 declare_per_stage_desc_pointers(ctx, params, &num_params,
5783                                                 ctx->type == PIPE_SHADER_VERTEX);
5784                 declare_vs_specific_input_sgprs(ctx, params, &num_params);
5785
5786                 params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
5787                 params[ctx->param_tcs_out_lds_offsets = num_params++] = ctx->i32;
5788                 params[ctx->param_tcs_out_lds_layout = num_params++] = ctx->i32;
5789                 params[num_params++] = ctx->i32; /* unused */
5790
5791                 declare_per_stage_desc_pointers(ctx, params, &num_params,
5792                                                 ctx->type == PIPE_SHADER_TESS_CTRL);
5793                 last_sgpr = num_params - 1;
5794
5795                 /* VGPRs (first TCS, then VS) */
5796                 params[ctx->param_tcs_patch_id = num_params++] = ctx->i32;
5797                 params[ctx->param_tcs_rel_ids = num_params++] = ctx->i32;
5798
5799                 if (ctx->type == PIPE_SHADER_VERTEX) {
5800                         declare_vs_input_vgprs(ctx, params, &num_params,
5801                                                &num_prolog_vgprs);
5802
5803                         /* LS return values are inputs to the TCS main shader part. */
5804                         for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
5805                                 returns[num_returns++] = ctx->i32; /* SGPRs */
5806                         for (i = 0; i < 2; i++)
5807                                 returns[num_returns++] = ctx->f32; /* VGPRs */
5808                 } else {
5809                         /* TCS return values are inputs to the TCS epilog.
5810                          *
5811                          * param_tcs_offchip_offset and param_tcs_factor_offset
5812                          * should be passed to the epilog.
5813                          */
5814                         for (i = 0; i <= ctx->param_tcs_factor_offset; i++)
5815                                 returns[num_returns++] = ctx->i32; /* SGPRs */
5816                         for (i = 0; i < 3; i++)
5817                                 returns[num_returns++] = ctx->f32; /* VGPRs */
5818                 }
5819                 break;
5820
5821         case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
5822                 assert(!"unimplemented merged ES-GS shader");
5823                 break;
5824
5825         case PIPE_SHADER_TESS_EVAL:
5826                 declare_default_desc_pointers(ctx, params, &num_params);
5827                 params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
5828
5829                 if (shader->key.as_es) {
5830                         params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
5831                         params[num_params++] = ctx->i32;
5832                         params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
5833                 } else {
5834                         params[num_params++] = ctx->i32;
5835                         declare_streamout_params(ctx, &shader->selector->so,
5836                                                  params, ctx->i32, &num_params);
5837                         params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
5838                 }
5839                 last_sgpr = num_params - 1;
5840
5841                 /* VGPRs */
5842                 params[ctx->param_tes_u = num_params++] = ctx->f32;
5843                 params[ctx->param_tes_v = num_params++] = ctx->f32;
5844                 params[ctx->param_tes_rel_patch_id = num_params++] = ctx->i32;
5845                 params[ctx->param_tes_patch_id = num_params++] = ctx->i32;
5846
5847                 /* PrimitiveID output. */
5848                 if (!shader->key.as_es)
5849                         for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
5850                                 returns[num_returns++] = ctx->f32;
5851                 break;
5852
5853         case PIPE_SHADER_GEOMETRY:
5854                 declare_default_desc_pointers(ctx, params, &num_params);
5855                 params[ctx->param_gs2vs_offset = num_params++] = ctx->i32;
5856                 params[ctx->param_gs_wave_id = num_params++] = ctx->i32;
5857                 last_sgpr = num_params - 1;
5858
5859                 /* VGPRs */
5860                 params[ctx->param_gs_vtx0_offset = num_params++] = ctx->i32;
5861                 params[ctx->param_gs_vtx1_offset = num_params++] = ctx->i32;
5862                 params[ctx->param_gs_prim_id = num_params++] = ctx->i32;
5863                 params[ctx->param_gs_vtx2_offset = num_params++] = ctx->i32;
5864                 params[ctx->param_gs_vtx3_offset = num_params++] = ctx->i32;
5865                 params[ctx->param_gs_vtx4_offset = num_params++] = ctx->i32;
5866                 params[ctx->param_gs_vtx5_offset = num_params++] = ctx->i32;
5867                 params[ctx->param_gs_instance_id = num_params++] = ctx->i32;
5868                 break;
5869
5870         case PIPE_SHADER_FRAGMENT:
5871                 declare_default_desc_pointers(ctx, params, &num_params);
5872                 params[SI_PARAM_ALPHA_REF] = ctx->f32;
5873                 params[SI_PARAM_PRIM_MASK] = ctx->i32;
5874                 last_sgpr = SI_PARAM_PRIM_MASK;
5875                 params[SI_PARAM_PERSP_SAMPLE] = ctx->v2i32;
5876                 params[SI_PARAM_PERSP_CENTER] = ctx->v2i32;
5877                 params[SI_PARAM_PERSP_CENTROID] = ctx->v2i32;
5878                 params[SI_PARAM_PERSP_PULL_MODEL] = v3i32;
5879                 params[SI_PARAM_LINEAR_SAMPLE] = ctx->v2i32;
5880                 params[SI_PARAM_LINEAR_CENTER] = ctx->v2i32;
5881                 params[SI_PARAM_LINEAR_CENTROID] = ctx->v2i32;
5882                 params[SI_PARAM_LINE_STIPPLE_TEX] = ctx->f32;
5883                 params[SI_PARAM_POS_X_FLOAT] = ctx->f32;
5884                 params[SI_PARAM_POS_Y_FLOAT] = ctx->f32;
5885                 params[SI_PARAM_POS_Z_FLOAT] = ctx->f32;
5886                 params[SI_PARAM_POS_W_FLOAT] = ctx->f32;
5887                 params[SI_PARAM_FRONT_FACE] = ctx->i32;
5888                 shader->info.face_vgpr_index = 20;
5889                 params[SI_PARAM_ANCILLARY] = ctx->i32;
5890                 params[SI_PARAM_SAMPLE_COVERAGE] = ctx->f32;
5891                 params[SI_PARAM_POS_FIXED_PT] = ctx->i32;
5892                 num_params = SI_PARAM_POS_FIXED_PT+1;
5893
5894                 /* Color inputs from the prolog. */
5895                 if (shader->selector->info.colors_read) {
5896                         unsigned num_color_elements =
5897                                 util_bitcount(shader->selector->info.colors_read);
5898
5899                         assert(num_params + num_color_elements <= ARRAY_SIZE(params));
5900                         for (i = 0; i < num_color_elements; i++)
5901                                 params[num_params++] = ctx->f32;
5902
5903                         num_prolog_vgprs += num_color_elements;
5904                 }
5905
5906                 /* Outputs for the epilog. */
5907                 num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
5908                 num_returns =
5909                         num_return_sgprs +
5910                         util_bitcount(shader->selector->info.colors_written) * 4 +
5911                         shader->selector->info.writes_z +
5912                         shader->selector->info.writes_stencil +
5913                         shader->selector->info.writes_samplemask +
5914                         1 /* SampleMaskIn */;
5915
5916                 num_returns = MAX2(num_returns,
5917                                    num_return_sgprs +
5918                                    PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
5919
5920                 for (i = 0; i < num_return_sgprs; i++)
5921                         returns[i] = ctx->i32;
5922                 for (; i < num_returns; i++)
5923                         returns[i] = ctx->f32;
5924                 break;
5925
5926         case PIPE_SHADER_COMPUTE:
5927                 declare_default_desc_pointers(ctx, params, &num_params);
5928                 params[SI_PARAM_GRID_SIZE] = v3i32;
5929                 params[SI_PARAM_BLOCK_SIZE] = v3i32;
5930                 params[SI_PARAM_BLOCK_ID] = v3i32;
5931                 last_sgpr = SI_PARAM_BLOCK_ID;
5932
5933                 params[SI_PARAM_THREAD_ID] = v3i32;
5934                 num_params = SI_PARAM_THREAD_ID + 1;
5935                 break;
5936         default:
5937                 assert(0 && "unimplemented shader");
5938                 return;
5939         }
5940
5941         assert(num_params <= ARRAY_SIZE(params));
5942
5943         si_create_function(ctx, "main", returns, num_returns, params,
5944                            num_params, last_sgpr);
5945
5946         /* Reserve register locations for VGPR inputs the PS prolog may need. */
5947         if (ctx->type == PIPE_SHADER_FRAGMENT &&
5948             ctx->separate_prolog) {
5949                 si_llvm_add_attribute(ctx->main_fn,
5950                                       "InitialPSInputAddr",
5951                                       S_0286D0_PERSP_SAMPLE_ENA(1) |
5952                                       S_0286D0_PERSP_CENTER_ENA(1) |
5953                                       S_0286D0_PERSP_CENTROID_ENA(1) |
5954                                       S_0286D0_LINEAR_SAMPLE_ENA(1) |
5955                                       S_0286D0_LINEAR_CENTER_ENA(1) |
5956                                       S_0286D0_LINEAR_CENTROID_ENA(1) |
5957                                       S_0286D0_FRONT_FACE_ENA(1) |
5958                                       S_0286D0_POS_FIXED_PT_ENA(1));
5959         } else if (ctx->type == PIPE_SHADER_COMPUTE) {
5960                 si_llvm_add_attribute(ctx->main_fn,
5961                                       "amdgpu-max-work-group-size",
5962                                       si_get_max_workgroup_size(shader));
5963         }
5964
5965         shader->info.num_input_sgprs = 0;
5966         shader->info.num_input_vgprs = 0;
5967
5968         for (i = 0; i <= last_sgpr; ++i)
5969                 shader->info.num_input_sgprs += llvm_get_type_size(params[i]) / 4;
5970
5971         for (; i < num_params; ++i)
5972                 shader->info.num_input_vgprs += llvm_get_type_size(params[i]) / 4;
5973
5974         assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
5975         shader->info.num_input_vgprs -= num_prolog_vgprs;
5976
5977         if (!ctx->screen->has_ds_bpermute &&
5978             bld_base->info &&
5979             (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 ||
5980              bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 ||
5981              bld_base->info->opcode_count[TGSI_OPCODE_DDX_FINE] > 0 ||
5982              bld_base->info->opcode_count[TGSI_OPCODE_DDY_FINE] > 0 ||
5983              bld_base->info->opcode_count[TGSI_OPCODE_INTERP_OFFSET] > 0 ||
5984              bld_base->info->opcode_count[TGSI_OPCODE_INTERP_SAMPLE] > 0))
5985                 ctx->lds =
5986                         LLVMAddGlobalInAddressSpace(gallivm->module,
5987                                                     LLVMArrayType(ctx->i32, 64),
5988                                                     "ddxy_lds",
5989                                                     LOCAL_ADDR_SPACE);
5990
5991         if ((ctx->type == PIPE_SHADER_VERTEX && shader->key.as_ls) ||
5992             ctx->type == PIPE_SHADER_TESS_CTRL)
5993                 declare_tess_lds(ctx);
5994 }
5995
5996 /**
5997  * Load ESGS and GSVS ring buffer resource descriptors and save the variables
5998  * for later use.
5999  */
6000 static void preload_ring_buffers(struct si_shader_context *ctx)
6001 {
6002         struct gallivm_state *gallivm = &ctx->gallivm;
6003         LLVMBuilderRef builder = gallivm->builder;
6004
6005         LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
6006                                             ctx->param_rw_buffers);
6007
6008         if ((ctx->type == PIPE_SHADER_VERTEX &&
6009              ctx->shader->key.as_es) ||
6010             (ctx->type == PIPE_SHADER_TESS_EVAL &&
6011              ctx->shader->key.as_es) ||
6012             ctx->type == PIPE_SHADER_GEOMETRY) {
6013                 unsigned ring =
6014                         ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS
6015                                                              : SI_ES_RING_ESGS;
6016                 LLVMValueRef offset = LLVMConstInt(ctx->i32, ring, 0);
6017
6018                 ctx->esgs_ring =
6019                         ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
6020         }
6021
6022         if (ctx->shader->is_gs_copy_shader) {
6023                 LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);
6024
6025                 ctx->gsvs_ring[0] =
6026                         ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
6027         } else if (ctx->type == PIPE_SHADER_GEOMETRY) {
6028                 const struct si_shader_selector *sel = ctx->shader->selector;
6029                 LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);
6030                 LLVMValueRef base_ring;
6031
6032                 base_ring = ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
6033
6034                 /* The conceptual layout of the GSVS ring is
6035                  *   v0c0 .. vLv0 v0c1 .. vLc1 ..
6036                  * but the real memory layout is swizzled across
6037                  * threads:
6038                  *   t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
6039                  *   t16v0c0 ..
6040                  * Override the buffer descriptor accordingly.
6041                  */
6042                 LLVMTypeRef v2i64 = LLVMVectorType(ctx->i64, 2);
6043                 uint64_t stream_offset = 0;
6044
6045                 for (unsigned stream = 0; stream < 4; ++stream) {
6046                         unsigned num_components;
6047                         unsigned stride;
6048                         unsigned num_records;
6049                         LLVMValueRef ring, tmp;
6050
6051                         num_components = sel->info.num_stream_output_components[stream];
6052                         if (!num_components)
6053                                 continue;
6054
6055                         stride = 4 * num_components * sel->gs_max_out_vertices;
6056
6057                         /* Limit on the stride field for <= CIK. */
6058                         assert(stride < (1 << 14));
6059
6060                         num_records = 64;
6061
6062                         ring = LLVMBuildBitCast(builder, base_ring, v2i64, "");
6063                         tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_0, "");
6064                         tmp = LLVMBuildAdd(builder, tmp,
6065                                            LLVMConstInt(ctx->i64,
6066                                                         stream_offset, 0), "");
6067                         stream_offset += stride * 64;
6068
6069                         ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_0, "");
6070                         ring = LLVMBuildBitCast(builder, ring, ctx->v4i32, "");
6071                         tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_1, "");
6072                         tmp = LLVMBuildOr(builder, tmp,
6073                                 LLVMConstInt(ctx->i32,
6074                                              S_008F04_STRIDE(stride) |
6075                                              S_008F04_SWIZZLE_ENABLE(1), 0), "");
6076                         ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_1, "");
6077                         ring = LLVMBuildInsertElement(builder, ring,
6078                                         LLVMConstInt(ctx->i32, num_records, 0),
6079                                         LLVMConstInt(ctx->i32, 2, 0), "");
6080                         ring = LLVMBuildInsertElement(builder, ring,
6081                                 LLVMConstInt(ctx->i32,
6082                                              S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
6083                                              S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
6084                                              S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
6085                                              S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
6086                                              S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
6087                                              S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
6088                                              S_008F0C_ELEMENT_SIZE(1) | /* element_size = 4 (bytes) */
6089                                              S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
6090                                              S_008F0C_ADD_TID_ENABLE(1),
6091                                              0),
6092                                 LLVMConstInt(ctx->i32, 3, 0), "");
6093                         ring = LLVMBuildBitCast(builder, ring, ctx->v16i8, "");
6094
6095                         ctx->gsvs_ring[stream] = ring;
6096                 }
6097         }
6098 }
6099
6100 static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
6101                                          LLVMValueRef param_rw_buffers,
6102                                          unsigned param_pos_fixed_pt)
6103 {
6104         struct gallivm_state *gallivm = &ctx->gallivm;
6105         LLVMBuilderRef builder = gallivm->builder;
6106         LLVMValueRef slot, desc, offset, row, bit, address[2];
6107
6108         /* Use the fixed-point gl_FragCoord input.
6109          * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
6110          * per coordinate to get the repeating effect.
6111          */
6112         address[0] = unpack_param(ctx, param_pos_fixed_pt, 0, 5);
6113         address[1] = unpack_param(ctx, param_pos_fixed_pt, 16, 5);
6114
6115         /* Load the buffer descriptor. */
6116         slot = LLVMConstInt(ctx->i32, SI_PS_CONST_POLY_STIPPLE, 0);
6117         desc = ac_build_indexed_load_const(&ctx->ac, param_rw_buffers, slot);
6118
6119         /* The stipple pattern is 32x32, each row has 32 bits. */
6120         offset = LLVMBuildMul(builder, address[1],
6121                               LLVMConstInt(ctx->i32, 4, 0), "");
6122         row = buffer_load_const(ctx, desc, offset);
6123         row = LLVMBuildBitCast(builder, row, ctx->i32, "");
6124         bit = LLVMBuildLShr(builder, row, address[0], "");
6125         bit = LLVMBuildTrunc(builder, bit, ctx->i1, "");
6126
6127         /* The intrinsic kills the thread if arg < 0. */
6128         bit = LLVMBuildSelect(builder, bit, LLVMConstReal(ctx->f32, 0),
6129                               LLVMConstReal(ctx->f32, -1), "");
6130         ac_build_kill(&ctx->ac, bit);
6131 }
6132
6133 void si_shader_binary_read_config(struct ac_shader_binary *binary,
6134                                   struct si_shader_config *conf,
6135                                   unsigned symbol_offset)
6136 {
6137         unsigned i;
6138         const unsigned char *config =
6139                 ac_shader_binary_config_start(binary, symbol_offset);
6140         bool really_needs_scratch = false;
6141
6142         /* LLVM adds SGPR spills to the scratch size.
6143          * Find out if we really need the scratch buffer.
6144          */
6145         for (i = 0; i < binary->reloc_count; i++) {
6146                 const struct ac_shader_reloc *reloc = &binary->relocs[i];
6147
6148                 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
6149                     !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
6150                         really_needs_scratch = true;
6151                         break;
6152                 }
6153         }
6154
6155         /* XXX: We may be able to emit some of these values directly rather than
6156          * extracting fields to be emitted later.
6157          */
6158
6159         for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
6160                 unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
6161                 unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
6162                 switch (reg) {
6163                 case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
6164                 case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
6165                 case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
6166                 case R_00B848_COMPUTE_PGM_RSRC1:
6167                         conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
6168                         conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
6169                         conf->float_mode =  G_00B028_FLOAT_MODE(value);
6170                         conf->rsrc1 = value;
6171                         break;
6172                 case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
6173                         conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
6174                         break;
6175                 case R_00B84C_COMPUTE_PGM_RSRC2:
6176                         conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
6177                         conf->rsrc2 = value;
6178                         break;
6179                 case R_0286CC_SPI_PS_INPUT_ENA:
6180                         conf->spi_ps_input_ena = value;
6181                         break;
6182                 case R_0286D0_SPI_PS_INPUT_ADDR:
6183                         conf->spi_ps_input_addr = value;
6184                         break;
6185                 case R_0286E8_SPI_TMPRING_SIZE:
6186                 case R_00B860_COMPUTE_TMPRING_SIZE:
6187                         /* WAVESIZE is in units of 256 dwords. */
6188                         if (really_needs_scratch)
6189                                 conf->scratch_bytes_per_wave =
6190                                         G_00B860_WAVESIZE(value) * 256 * 4;
6191                         break;
6192                 case 0x4: /* SPILLED_SGPRS */
6193                         conf->spilled_sgprs = value;
6194                         break;
6195                 case 0x8: /* SPILLED_VGPRS */
6196                         conf->spilled_vgprs = value;
6197                         break;
6198                 default:
6199                         {
6200                                 static bool printed;
6201
6202                                 if (!printed) {
6203                                         fprintf(stderr, "Warning: LLVM emitted unknown "
6204                                                 "config register: 0x%x\n", reg);
6205                                         printed = true;
6206                                 }
6207                         }
6208                         break;
6209                 }
6210         }
6211
6212         if (!conf->spi_ps_input_addr)
6213                 conf->spi_ps_input_addr = conf->spi_ps_input_ena;
6214 }
6215
6216 void si_shader_apply_scratch_relocs(struct si_context *sctx,
6217                         struct si_shader *shader,
6218                         struct si_shader_config *config,
6219                         uint64_t scratch_va)
6220 {
6221         unsigned i;
6222         uint32_t scratch_rsrc_dword0 = scratch_va;
6223         uint32_t scratch_rsrc_dword1 =
6224                 S_008F04_BASE_ADDRESS_HI(scratch_va >> 32);
6225
6226         /* Enable scratch coalescing if LLVM sets ELEMENT_SIZE & INDEX_STRIDE
6227          * correctly.
6228          */
6229         if (HAVE_LLVM >= 0x0309)
6230                 scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1);
6231         else
6232                 scratch_rsrc_dword1 |=
6233                         S_008F04_STRIDE(config->scratch_bytes_per_wave / 64);
6234
6235         for (i = 0 ; i < shader->binary.reloc_count; i++) {
6236                 const struct ac_shader_reloc *reloc =
6237                                         &shader->binary.relocs[i];
6238                 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
6239                         util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
6240                         &scratch_rsrc_dword0, 4);
6241                 } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
6242                         util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
6243                         &scratch_rsrc_dword1, 4);
6244                 }
6245         }
6246 }
6247
6248 static unsigned si_get_shader_binary_size(struct si_shader *shader)
6249 {
6250         unsigned size = shader->binary.code_size;
6251
6252         if (shader->prolog)
6253                 size += shader->prolog->binary.code_size;
6254         if (shader->previous_stage)
6255                 size += shader->previous_stage->binary.code_size;
6256         if (shader->epilog)
6257                 size += shader->epilog->binary.code_size;
6258         return size;
6259 }
6260
6261 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
6262 {
6263         const struct ac_shader_binary *prolog =
6264                 shader->prolog ? &shader->prolog->binary : NULL;
6265         const struct ac_shader_binary *previous_stage =
6266                 shader->previous_stage ? &shader->previous_stage->binary : NULL;
6267         const struct ac_shader_binary *epilog =
6268                 shader->epilog ? &shader->epilog->binary : NULL;
6269         const struct ac_shader_binary *mainb = &shader->binary;
6270         unsigned bo_size = si_get_shader_binary_size(shader) +
6271                            (!epilog ? mainb->rodata_size : 0);
6272         unsigned char *ptr;
6273
6274         assert(!prolog || !prolog->rodata_size);
6275         assert(!previous_stage || !previous_stage->rodata_size);
6276         assert((!prolog && !previous_stage && !epilog) || !mainb->rodata_size);
6277         assert(!epilog || !epilog->rodata_size);
6278
6279         /* GFX9 can fetch at most 128 bytes past the end of the shader.
6280          * Prevent VM faults.
6281          */
6282         if (sscreen->b.chip_class >= GFX9)
6283                 bo_size += 128;
6284
6285         r600_resource_reference(&shader->bo, NULL);
6286         shader->bo = (struct r600_resource*)
6287                      pipe_buffer_create(&sscreen->b.b, 0,
6288                                         PIPE_USAGE_IMMUTABLE,
6289                                         align(bo_size, SI_CPDMA_ALIGNMENT));
6290         if (!shader->bo)
6291                 return -ENOMEM;
6292
6293         /* Upload. */
6294         ptr = sscreen->b.ws->buffer_map(shader->bo->buf, NULL,
6295                                         PIPE_TRANSFER_READ_WRITE |
6296                                         PIPE_TRANSFER_UNSYNCHRONIZED);
6297
6298         if (prolog) {
6299                 util_memcpy_cpu_to_le32(ptr, prolog->code, prolog->code_size);
6300                 ptr += prolog->code_size;
6301         }
6302         if (previous_stage) {
6303                 util_memcpy_cpu_to_le32(ptr, previous_stage->code,
6304                                         previous_stage->code_size);
6305                 ptr += previous_stage->code_size;
6306         }
6307
6308         util_memcpy_cpu_to_le32(ptr, mainb->code, mainb->code_size);
6309         ptr += mainb->code_size;
6310
6311         if (epilog)
6312                 util_memcpy_cpu_to_le32(ptr, epilog->code, epilog->code_size);
6313         else if (mainb->rodata_size > 0)
6314                 util_memcpy_cpu_to_le32(ptr, mainb->rodata, mainb->rodata_size);
6315
6316         sscreen->b.ws->buffer_unmap(shader->bo->buf);
6317         return 0;
6318 }
6319
6320 static void si_shader_dump_disassembly(const struct ac_shader_binary *binary,
6321                                        struct pipe_debug_callback *debug,
6322                                        const char *name, FILE *file)
6323 {
6324         char *line, *p;
6325         unsigned i, count;
6326
6327         if (binary->disasm_string) {
6328                 fprintf(file, "Shader %s disassembly:\n", name);
6329                 fprintf(file, "%s", binary->disasm_string);
6330
6331                 if (debug && debug->debug_message) {
6332                         /* Very long debug messages are cut off, so send the
6333                          * disassembly one line at a time. This causes more
6334                          * overhead, but on the plus side it simplifies
6335                          * parsing of resulting logs.
6336                          */
6337                         pipe_debug_message(debug, SHADER_INFO,
6338                                            "Shader Disassembly Begin");
6339
6340                         line = binary->disasm_string;
6341                         while (*line) {
6342                                 p = util_strchrnul(line, '\n');
6343                                 count = p - line;
6344
6345                                 if (count) {
6346                                         pipe_debug_message(debug, SHADER_INFO,
6347                                                            "%.*s", count, line);
6348                                 }
6349
6350                                 if (!*p)
6351                                         break;
6352                                 line = p + 1;
6353                         }
6354
6355                         pipe_debug_message(debug, SHADER_INFO,
6356                                            "Shader Disassembly End");
6357                 }
6358         } else {
6359                 fprintf(file, "Shader %s binary:\n", name);
6360                 for (i = 0; i < binary->code_size; i += 4) {
6361                         fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i,
6362                                 binary->code[i + 3], binary->code[i + 2],
6363                                 binary->code[i + 1], binary->code[i]);
6364                 }
6365         }
6366 }
6367
6368 static void si_shader_dump_stats(struct si_screen *sscreen,
6369                                  struct si_shader *shader,
6370                                  struct pipe_debug_callback *debug,
6371                                  unsigned processor,
6372                                  FILE *file,
6373                                  bool check_debug_option)
6374 {
6375         struct si_shader_config *conf = &shader->config;
6376         unsigned num_inputs = shader->selector ? shader->selector->info.num_inputs : 0;
6377         unsigned code_size = si_get_shader_binary_size(shader);
6378         unsigned lds_increment = sscreen->b.chip_class >= CIK ? 512 : 256;
6379         unsigned lds_per_wave = 0;
6380         unsigned max_simd_waves = 10;
6381
6382         /* Compute LDS usage for PS. */
6383         switch (processor) {
6384         case PIPE_SHADER_FRAGMENT:
6385                 /* The minimum usage per wave is (num_inputs * 48). The maximum
6386                  * usage is (num_inputs * 48 * 16).
6387                  * We can get anything in between and it varies between waves.
6388                  *
6389                  * The 48 bytes per input for a single primitive is equal to
6390                  * 4 bytes/component * 4 components/input * 3 points.
6391                  *
6392                  * Other stages don't know the size at compile time or don't
6393                  * allocate LDS per wave, but instead they do it per thread group.
6394                  */
6395                 lds_per_wave = conf->lds_size * lds_increment +
6396                                align(num_inputs * 48, lds_increment);
6397                 break;
6398         case PIPE_SHADER_COMPUTE:
6399                 if (shader->selector) {
6400                         unsigned max_workgroup_size =
6401                                 si_get_max_workgroup_size(shader);
6402                         lds_per_wave = (conf->lds_size * lds_increment) /
6403                                        DIV_ROUND_UP(max_workgroup_size, 64);
6404                 }
6405                 break;
6406         }
6407
6408         /* Compute the per-SIMD wave counts. */
6409         if (conf->num_sgprs) {
6410                 if (sscreen->b.chip_class >= VI)
6411                         max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs);
6412                 else
6413                         max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs);
6414         }
6415
6416         if (conf->num_vgprs)
6417                 max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs);
6418
6419         /* LDS is 64KB per CU (4 SIMDs), which is 16KB per SIMD (usage above
6420          * 16KB makes some SIMDs unoccupied). */
6421         if (lds_per_wave)
6422                 max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);
6423
6424         if (!check_debug_option ||
6425             r600_can_dump_shader(&sscreen->b, processor)) {
6426                 if (processor == PIPE_SHADER_FRAGMENT) {
6427                         fprintf(file, "*** SHADER CONFIG ***\n"
6428                                 "SPI_PS_INPUT_ADDR = 0x%04x\n"
6429                                 "SPI_PS_INPUT_ENA  = 0x%04x\n",
6430                                 conf->spi_ps_input_addr, conf->spi_ps_input_ena);
6431                 }
6432
6433                 fprintf(file, "*** SHADER STATS ***\n"
6434                         "SGPRS: %d\n"
6435                         "VGPRS: %d\n"
6436                         "Spilled SGPRs: %d\n"
6437                         "Spilled VGPRs: %d\n"
6438                         "Private memory VGPRs: %d\n"
6439                         "Code Size: %d bytes\n"
6440                         "LDS: %d blocks\n"
6441                         "Scratch: %d bytes per wave\n"
6442                         "Max Waves: %d\n"
6443                         "********************\n\n\n",
6444                         conf->num_sgprs, conf->num_vgprs,
6445                         conf->spilled_sgprs, conf->spilled_vgprs,
6446                         conf->private_mem_vgprs, code_size,
6447                         conf->lds_size, conf->scratch_bytes_per_wave,
6448                         max_simd_waves);
6449         }
6450
6451         pipe_debug_message(debug, SHADER_INFO,
6452                            "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
6453                            "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
6454                            "Spilled VGPRs: %d PrivMem VGPRs: %d",
6455                            conf->num_sgprs, conf->num_vgprs, code_size,
6456                            conf->lds_size, conf->scratch_bytes_per_wave,
6457                            max_simd_waves, conf->spilled_sgprs,
6458                            conf->spilled_vgprs, conf->private_mem_vgprs);
6459 }
6460
6461 const char *si_get_shader_name(struct si_shader *shader, unsigned processor)
6462 {
6463         switch (processor) {
6464         case PIPE_SHADER_VERTEX:
6465                 if (shader->key.as_es)
6466                         return "Vertex Shader as ES";
6467                 else if (shader->key.as_ls)
6468                         return "Vertex Shader as LS";
6469                 else
6470                         return "Vertex Shader as VS";
6471         case PIPE_SHADER_TESS_CTRL:
6472                 return "Tessellation Control Shader";
6473         case PIPE_SHADER_TESS_EVAL:
6474                 if (shader->key.as_es)
6475                         return "Tessellation Evaluation Shader as ES";
6476                 else
6477                         return "Tessellation Evaluation Shader as VS";
6478         case PIPE_SHADER_GEOMETRY:
6479                 if (shader->is_gs_copy_shader)
6480                         return "GS Copy Shader as VS";
6481                 else
6482                         return "Geometry Shader";
6483         case PIPE_SHADER_FRAGMENT:
6484                 return "Pixel Shader";
6485         case PIPE_SHADER_COMPUTE:
6486                 return "Compute Shader";
6487         default:
6488                 return "Unknown Shader";
6489         }
6490 }
6491
6492 void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
6493                     struct pipe_debug_callback *debug, unsigned processor,
6494                     FILE *file, bool check_debug_option)
6495 {
6496         if (!check_debug_option ||
6497             r600_can_dump_shader(&sscreen->b, processor))
6498                 si_dump_shader_key(processor, shader, file);
6499
6500         if (!check_debug_option && shader->binary.llvm_ir_string) {
6501                 fprintf(file, "\n%s - main shader part - LLVM IR:\n\n",
6502                         si_get_shader_name(shader, processor));
6503                 fprintf(file, "%s\n", shader->binary.llvm_ir_string);
6504         }
6505
6506         if (!check_debug_option ||
6507             (r600_can_dump_shader(&sscreen->b, processor) &&
6508              !(sscreen->b.debug_flags & DBG_NO_ASM))) {
6509                 fprintf(file, "\n%s:\n", si_get_shader_name(shader, processor));
6510
6511                 if (shader->prolog)
6512                         si_shader_dump_disassembly(&shader->prolog->binary,
6513                                                    debug, "prolog", file);
6514                 if (shader->previous_stage)
6515                         si_shader_dump_disassembly(&shader->previous_stage->binary,
6516                                                    debug, "previous stage", file);
6517
6518                 si_shader_dump_disassembly(&shader->binary, debug, "main", file);
6519
6520                 if (shader->epilog)
6521                         si_shader_dump_disassembly(&shader->epilog->binary,
6522                                                    debug, "epilog", file);
6523                 fprintf(file, "\n");
6524         }
6525
6526         si_shader_dump_stats(sscreen, shader, debug, processor, file,
6527                              check_debug_option);
6528 }
6529
6530 int si_compile_llvm(struct si_screen *sscreen,
6531                     struct ac_shader_binary *binary,
6532                     struct si_shader_config *conf,
6533                     LLVMTargetMachineRef tm,
6534                     LLVMModuleRef mod,
6535                     struct pipe_debug_callback *debug,
6536                     unsigned processor,
6537                     const char *name)
6538 {
6539         int r = 0;
6540         unsigned count = p_atomic_inc_return(&sscreen->b.num_compilations);
6541
6542         if (r600_can_dump_shader(&sscreen->b, processor)) {
6543                 fprintf(stderr, "radeonsi: Compiling shader %d\n", count);
6544
6545                 if (!(sscreen->b.debug_flags & (DBG_NO_IR | DBG_PREOPT_IR))) {
6546                         fprintf(stderr, "%s LLVM IR:\n\n", name);
6547                         ac_dump_module(mod);
6548                         fprintf(stderr, "\n");
6549                 }
6550         }
6551
6552         if (sscreen->record_llvm_ir) {
6553                 char *ir = LLVMPrintModuleToString(mod);
6554                 binary->llvm_ir_string = strdup(ir);
6555                 LLVMDisposeMessage(ir);
6556         }
6557
6558         if (!si_replace_shader(count, binary)) {
6559                 r = si_llvm_compile(mod, binary, tm, debug);
6560                 if (r)
6561                         return r;
6562         }
6563
6564         si_shader_binary_read_config(binary, conf, 0);
6565
6566         /* Enable 64-bit and 16-bit denormals, because there is no performance
6567          * cost.
6568          *
6569          * If denormals are enabled, all floating-point output modifiers are
6570          * ignored.
6571          *
6572          * Don't enable denormals for 32-bit floats, because:
6573          * - Floating-point output modifiers would be ignored by the hw.
6574          * - Some opcodes don't support denormals, such as v_mad_f32. We would
6575          *   have to stop using those.
6576          * - SI & CI would be very slow.
6577          */
6578         conf->float_mode |= V_00B028_FP_64_DENORMS;
6579
6580         FREE(binary->config);
6581         FREE(binary->global_symbol_offsets);
6582         binary->config = NULL;
6583         binary->global_symbol_offsets = NULL;
6584
6585         /* Some shaders can't have rodata because their binaries can be
6586          * concatenated.
6587          */
6588         if (binary->rodata_size &&
6589             (processor == PIPE_SHADER_VERTEX ||
6590              processor == PIPE_SHADER_TESS_CTRL ||
6591              processor == PIPE_SHADER_TESS_EVAL ||
6592              processor == PIPE_SHADER_FRAGMENT)) {
6593                 fprintf(stderr, "radeonsi: The shader can't have rodata.");
6594                 return -EINVAL;
6595         }
6596
6597         return r;
6598 }
6599
6600 static void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret)
6601 {
6602         if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
6603                 LLVMBuildRetVoid(ctx->gallivm.builder);
6604         else
6605                 LLVMBuildRet(ctx->gallivm.builder, ret);
6606 }
6607
6608 /* Generate code for the hardware VS shader stage to go with a geometry shader */
6609 struct si_shader *
6610 si_generate_gs_copy_shader(struct si_screen *sscreen,
6611                            LLVMTargetMachineRef tm,
6612                            struct si_shader_selector *gs_selector,
6613                            struct pipe_debug_callback *debug)
6614 {
6615         struct si_shader_context ctx;
6616         struct si_shader *shader;
6617         struct gallivm_state *gallivm = &ctx.gallivm;
6618         LLVMBuilderRef builder;
6619         struct lp_build_tgsi_context *bld_base = &ctx.bld_base;
6620         struct lp_build_context *uint = &bld_base->uint_bld;
6621         struct si_shader_output_values *outputs;
6622         struct tgsi_shader_info *gsinfo = &gs_selector->info;
6623         int i, r;
6624
6625         outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0]));
6626
6627         if (!outputs)
6628                 return NULL;
6629
6630         shader = CALLOC_STRUCT(si_shader);
6631         if (!shader) {
6632                 FREE(outputs);
6633                 return NULL;
6634         }
6635
6636
6637         shader->selector = gs_selector;
6638         shader->is_gs_copy_shader = true;
6639
6640         si_init_shader_ctx(&ctx, sscreen, tm);
6641         ctx.shader = shader;
6642         ctx.type = PIPE_SHADER_VERTEX;
6643
6644         builder = gallivm->builder;
6645
6646         create_function(&ctx);
6647         preload_ring_buffers(&ctx);
6648
6649         LLVMValueRef voffset =
6650                 lp_build_mul_imm(uint, LLVMGetParam(ctx.main_fn,
6651                                                     ctx.param_vertex_id), 4);
6652
6653         /* Fetch the vertex stream ID.*/
6654         LLVMValueRef stream_id;
6655
6656         if (gs_selector->so.num_outputs)
6657                 stream_id = unpack_param(&ctx, ctx.param_streamout_config, 24, 2);
6658         else
6659                 stream_id = ctx.i32_0;
6660
6661         /* Fill in output information. */
6662         for (i = 0; i < gsinfo->num_outputs; ++i) {
6663                 outputs[i].semantic_name = gsinfo->output_semantic_name[i];
6664                 outputs[i].semantic_index = gsinfo->output_semantic_index[i];
6665
6666                 for (int chan = 0; chan < 4; chan++) {
6667                         outputs[i].vertex_stream[chan] =
6668                                 (gsinfo->output_streams[i] >> (2 * chan)) & 3;
6669                 }
6670         }
6671
6672         LLVMBasicBlockRef end_bb;
6673         LLVMValueRef switch_inst;
6674
6675         end_bb = LLVMAppendBasicBlockInContext(gallivm->context, ctx.main_fn, "end");
6676         switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4);
6677
6678         for (int stream = 0; stream < 4; stream++) {
6679                 LLVMBasicBlockRef bb;
6680                 unsigned offset;
6681
6682                 if (!gsinfo->num_stream_output_components[stream])
6683                         continue;
6684
6685                 if (stream > 0 && !gs_selector->so.num_outputs)
6686                         continue;
6687
6688                 bb = LLVMInsertBasicBlockInContext(gallivm->context, end_bb, "out");
6689                 LLVMAddCase(switch_inst, LLVMConstInt(ctx.i32, stream, 0), bb);
6690                 LLVMPositionBuilderAtEnd(builder, bb);
6691
6692                 /* Fetch vertex data from GSVS ring */
6693                 offset = 0;
6694                 for (i = 0; i < gsinfo->num_outputs; ++i) {
6695                         for (unsigned chan = 0; chan < 4; chan++) {
6696                                 if (!(gsinfo->output_usagemask[i] & (1 << chan)) ||
6697                                     outputs[i].vertex_stream[chan] != stream) {
6698                                         outputs[i].values[chan] = ctx.bld_base.base.undef;
6699                                         continue;
6700                                 }
6701
6702                                 LLVMValueRef soffset = LLVMConstInt(ctx.i32,
6703                                         offset * gs_selector->gs_max_out_vertices * 16 * 4, 0);
6704                                 offset++;
6705
6706                                 outputs[i].values[chan] =
6707                                         ac_build_buffer_load(&ctx.ac,
6708                                                              ctx.gsvs_ring[0], 1,
6709                                                              ctx.i32_0, voffset,
6710                                                              soffset, 0, 1, 1, true);
6711                         }
6712                 }
6713
6714                 /* Streamout and exports. */
6715                 if (gs_selector->so.num_outputs) {
6716                         si_llvm_emit_streamout(&ctx, outputs,
6717                                                gsinfo->num_outputs,
6718                                                stream);
6719                 }
6720
6721                 if (stream == 0)
6722                         si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs);
6723
6724                 LLVMBuildBr(builder, end_bb);
6725         }
6726
6727         LLVMPositionBuilderAtEnd(builder, end_bb);
6728
6729         LLVMBuildRetVoid(gallivm->builder);
6730
6731         /* Dump LLVM IR before any optimization passes */
6732         if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
6733             r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
6734                 ac_dump_module(ctx.gallivm.module);
6735
6736         si_llvm_finalize_module(&ctx,
6737                 r600_extra_shader_checks(&sscreen->b, PIPE_SHADER_GEOMETRY));
6738
6739         r = si_compile_llvm(sscreen, &ctx.shader->binary,
6740                             &ctx.shader->config, ctx.tm,
6741                             ctx.gallivm.module,
6742                             debug, PIPE_SHADER_GEOMETRY,
6743                             "GS Copy Shader");
6744         if (!r) {
6745                 if (r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
6746                         fprintf(stderr, "GS Copy Shader:\n");
6747                 si_shader_dump(sscreen, ctx.shader, debug,
6748                                PIPE_SHADER_GEOMETRY, stderr, true);
6749                 r = si_shader_binary_upload(sscreen, ctx.shader);
6750         }
6751
6752         si_llvm_dispose(&ctx);
6753
6754         FREE(outputs);
6755
6756         if (r != 0) {
6757                 FREE(shader);
6758                 shader = NULL;
6759         }
6760         return shader;
6761 }
6762
6763 static void si_dump_shader_key_vs(struct si_shader_key *key,
6764                                   struct si_vs_prolog_bits *prolog,
6765                                   const char *prefix, FILE *f)
6766 {
6767         fprintf(f, "  %s.instance_divisors = {", prefix);
6768         for (int i = 0; i < ARRAY_SIZE(prolog->instance_divisors); i++) {
6769                 fprintf(f, !i ? "%u" : ", %u",
6770                         prolog->instance_divisors[i]);
6771         }
6772         fprintf(f, "}\n");
6773
6774         fprintf(f, "  mono.vs.fix_fetch = {");
6775         for (int i = 0; i < SI_MAX_ATTRIBS; i++)
6776                 fprintf(f, !i ? "%u" : ", %u", key->mono.vs_fix_fetch[i]);
6777         fprintf(f, "}\n");
6778 }
6779
6780 static void si_dump_shader_key(unsigned processor, struct si_shader *shader,
6781                                FILE *f)
6782 {
6783         struct si_shader_key *key = &shader->key;
6784
6785         fprintf(f, "SHADER KEY\n");
6786
6787         switch (processor) {
6788         case PIPE_SHADER_VERTEX:
6789                 si_dump_shader_key_vs(key, &key->part.vs.prolog,
6790                                       "part.vs.prolog", f);
6791                 fprintf(f, "  as_es = %u\n", key->as_es);
6792                 fprintf(f, "  as_ls = %u\n", key->as_ls);
6793                 fprintf(f, "  part.vs.epilog.export_prim_id = %u\n",
6794                         key->part.vs.epilog.export_prim_id);
6795                 break;
6796
6797         case PIPE_SHADER_TESS_CTRL:
6798                 if (shader->selector->screen->b.chip_class >= GFX9) {
6799                         si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog,
6800                                               "part.tcs.ls_prolog", f);
6801                 }
6802                 fprintf(f, "  part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode);
6803                 fprintf(f, "  mono.ff_tcs_inputs_to_copy = 0x%"PRIx64"\n", key->mono.ff_tcs_inputs_to_copy);
6804                 break;
6805
6806         case PIPE_SHADER_TESS_EVAL:
6807                 fprintf(f, "  part.tes.epilog.export_prim_id = %u\n", key->part.tes.epilog.export_prim_id);
6808                 fprintf(f, "  as_es = %u\n", key->as_es);
6809                 break;
6810
6811         case PIPE_SHADER_GEOMETRY:
6812                 fprintf(f, "  part.gs.prolog.tri_strip_adj_fix = %u\n", key->part.gs.prolog.tri_strip_adj_fix);
6813                 break;
6814
6815         case PIPE_SHADER_COMPUTE:
6816                 break;
6817
6818         case PIPE_SHADER_FRAGMENT:
6819                 fprintf(f, "  part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side);
6820                 fprintf(f, "  part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors);
6821                 fprintf(f, "  part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple);
6822                 fprintf(f, "  part.ps.prolog.force_persp_sample_interp = %u\n", key->part.ps.prolog.force_persp_sample_interp);
6823                 fprintf(f, "  part.ps.prolog.force_linear_sample_interp = %u\n", key->part.ps.prolog.force_linear_sample_interp);
6824                 fprintf(f, "  part.ps.prolog.force_persp_center_interp = %u\n", key->part.ps.prolog.force_persp_center_interp);
6825                 fprintf(f, "  part.ps.prolog.force_linear_center_interp = %u\n", key->part.ps.prolog.force_linear_center_interp);
6826                 fprintf(f, "  part.ps.prolog.bc_optimize_for_persp = %u\n", key->part.ps.prolog.bc_optimize_for_persp);
6827                 fprintf(f, "  part.ps.prolog.bc_optimize_for_linear = %u\n", key->part.ps.prolog.bc_optimize_for_linear);
6828                 fprintf(f, "  part.ps.epilog.spi_shader_col_format = 0x%x\n", key->part.ps.epilog.spi_shader_col_format);
6829                 fprintf(f, "  part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8);
6830                 fprintf(f, "  part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10);
6831                 fprintf(f, "  part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf);
6832                 fprintf(f, "  part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func);
6833                 fprintf(f, "  part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one);
6834                 fprintf(f, "  part.ps.epilog.poly_line_smoothing = %u\n", key->part.ps.epilog.poly_line_smoothing);
6835                 fprintf(f, "  part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color);
6836                 break;
6837
6838         default:
6839                 assert(0);
6840         }
6841
6842         if ((processor == PIPE_SHADER_GEOMETRY ||
6843              processor == PIPE_SHADER_TESS_EVAL ||
6844              processor == PIPE_SHADER_VERTEX) &&
6845             !key->as_es && !key->as_ls) {
6846                 fprintf(f, "  opt.hw_vs.kill_outputs = 0x%"PRIx64"\n", key->opt.hw_vs.kill_outputs);
6847                 fprintf(f, "  opt.hw_vs.kill_outputs2 = 0x%x\n", key->opt.hw_vs.kill_outputs2);
6848                 fprintf(f, "  opt.hw_vs.clip_disable = %u\n", key->opt.hw_vs.clip_disable);
6849         }
6850 }
6851
6852 static void si_init_shader_ctx(struct si_shader_context *ctx,
6853                                struct si_screen *sscreen,
6854                                LLVMTargetMachineRef tm)
6855 {
6856         struct lp_build_tgsi_context *bld_base;
6857         struct lp_build_tgsi_action tmpl = {};
6858
6859         si_llvm_context_init(ctx, sscreen, tm);
6860
6861         bld_base = &ctx->bld_base;
6862         bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;
6863
6864         bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action;
6865         bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE] = interp_action;
6866         bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET] = interp_action;
6867
6868         bld_base->op_actions[TGSI_OPCODE_TEX] = tex_action;
6869         bld_base->op_actions[TGSI_OPCODE_TEX_LZ] = tex_action;
6870         bld_base->op_actions[TGSI_OPCODE_TEX2] = tex_action;
6871         bld_base->op_actions[TGSI_OPCODE_TXB] = tex_action;
6872         bld_base->op_actions[TGSI_OPCODE_TXB2] = tex_action;
6873         bld_base->op_actions[TGSI_OPCODE_TXD] = tex_action;
6874         bld_base->op_actions[TGSI_OPCODE_TXF] = tex_action;
6875         bld_base->op_actions[TGSI_OPCODE_TXF_LZ] = tex_action;
6876         bld_base->op_actions[TGSI_OPCODE_TXL] = tex_action;
6877         bld_base->op_actions[TGSI_OPCODE_TXL2] = tex_action;
6878         bld_base->op_actions[TGSI_OPCODE_TXP] = tex_action;
6879         bld_base->op_actions[TGSI_OPCODE_TXQ].fetch_args = txq_fetch_args;
6880         bld_base->op_actions[TGSI_OPCODE_TXQ].emit = txq_emit;
6881         bld_base->op_actions[TGSI_OPCODE_TG4] = tex_action;
6882         bld_base->op_actions[TGSI_OPCODE_LODQ] = tex_action;
6883         bld_base->op_actions[TGSI_OPCODE_TXQS].emit = si_llvm_emit_txqs;
6884
6885         bld_base->op_actions[TGSI_OPCODE_LOAD].fetch_args = load_fetch_args;
6886         bld_base->op_actions[TGSI_OPCODE_LOAD].emit = load_emit;
6887         bld_base->op_actions[TGSI_OPCODE_STORE].fetch_args = store_fetch_args;
6888         bld_base->op_actions[TGSI_OPCODE_STORE].emit = store_emit;
6889         bld_base->op_actions[TGSI_OPCODE_RESQ].fetch_args = resq_fetch_args;
6890         bld_base->op_actions[TGSI_OPCODE_RESQ].emit = resq_emit;
6891
6892         tmpl.fetch_args = atomic_fetch_args;
6893         tmpl.emit = atomic_emit;
6894         bld_base->op_actions[TGSI_OPCODE_ATOMUADD] = tmpl;
6895         bld_base->op_actions[TGSI_OPCODE_ATOMUADD].intr_name = "add";
6896         bld_base->op_actions[TGSI_OPCODE_ATOMXCHG] = tmpl;
6897         bld_base->op_actions[TGSI_OPCODE_ATOMXCHG].intr_name = "swap";
6898         bld_base->op_actions[TGSI_OPCODE_ATOMCAS] = tmpl;
6899         bld_base->op_actions[TGSI_OPCODE_ATOMCAS].intr_name = "cmpswap";
6900         bld_base->op_actions[TGSI_OPCODE_ATOMAND] = tmpl;
6901         bld_base->op_actions[TGSI_OPCODE_ATOMAND].intr_name = "and";
6902         bld_base->op_actions[TGSI_OPCODE_ATOMOR] = tmpl;
6903         bld_base->op_actions[TGSI_OPCODE_ATOMOR].intr_name = "or";
6904         bld_base->op_actions[TGSI_OPCODE_ATOMXOR] = tmpl;
6905         bld_base->op_actions[TGSI_OPCODE_ATOMXOR].intr_name = "xor";
6906         bld_base->op_actions[TGSI_OPCODE_ATOMUMIN] = tmpl;
6907         bld_base->op_actions[TGSI_OPCODE_ATOMUMIN].intr_name = "umin";
6908         bld_base->op_actions[TGSI_OPCODE_ATOMUMAX] = tmpl;
6909         bld_base->op_actions[TGSI_OPCODE_ATOMUMAX].intr_name = "umax";
6910         bld_base->op_actions[TGSI_OPCODE_ATOMIMIN] = tmpl;
6911         bld_base->op_actions[TGSI_OPCODE_ATOMIMIN].intr_name = "smin";
6912         bld_base->op_actions[TGSI_OPCODE_ATOMIMAX] = tmpl;
6913         bld_base->op_actions[TGSI_OPCODE_ATOMIMAX].intr_name = "smax";
6914
6915         bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit;
6916
6917         bld_base->op_actions[TGSI_OPCODE_CLOCK].emit = clock_emit;
6918
6919         bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy;
6920         bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy;
6921         bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy;
6922         bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy;
6923
6924         bld_base->op_actions[TGSI_OPCODE_VOTE_ALL].emit = vote_all_emit;
6925         bld_base->op_actions[TGSI_OPCODE_VOTE_ANY].emit = vote_any_emit;
6926         bld_base->op_actions[TGSI_OPCODE_VOTE_EQ].emit = vote_eq_emit;
6927         bld_base->op_actions[TGSI_OPCODE_BALLOT].emit = ballot_emit;
6928         bld_base->op_actions[TGSI_OPCODE_READ_FIRST].intr_name = "llvm.amdgcn.readfirstlane";
6929         bld_base->op_actions[TGSI_OPCODE_READ_FIRST].emit = read_lane_emit;
6930         bld_base->op_actions[TGSI_OPCODE_READ_INVOC].intr_name = "llvm.amdgcn.readlane";
6931         bld_base->op_actions[TGSI_OPCODE_READ_INVOC].fetch_args = read_invoc_fetch_args;
6932         bld_base->op_actions[TGSI_OPCODE_READ_INVOC].emit = read_lane_emit;
6933
6934         bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_llvm_emit_vertex;
6935         bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive;
6936         bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier;
6937 }
6938
6939 static void si_eliminate_const_vs_outputs(struct si_shader_context *ctx)
6940 {
6941         struct si_shader *shader = ctx->shader;
6942         struct tgsi_shader_info *info = &shader->selector->info;
6943
6944         if (ctx->type == PIPE_SHADER_FRAGMENT ||
6945             ctx->type == PIPE_SHADER_COMPUTE ||
6946             shader->key.as_es ||
6947             shader->key.as_ls)
6948                 return;
6949
6950         ac_eliminate_const_vs_outputs(&ctx->ac,
6951                                       ctx->main_fn,
6952                                       shader->info.vs_output_param_offset,
6953                                       info->num_outputs,
6954                                       &shader->info.nr_param_exports);
6955 }
6956
6957 static void si_count_scratch_private_memory(struct si_shader_context *ctx)
6958 {
6959         ctx->shader->config.private_mem_vgprs = 0;
6960
6961         /* Process all LLVM instructions. */
6962         LLVMBasicBlockRef bb = LLVMGetFirstBasicBlock(ctx->main_fn);
6963         while (bb) {
6964                 LLVMValueRef next = LLVMGetFirstInstruction(bb);
6965
6966                 while (next) {
6967                         LLVMValueRef inst = next;
6968                         next = LLVMGetNextInstruction(next);
6969
6970                         if (LLVMGetInstructionOpcode(inst) != LLVMAlloca)
6971                                 continue;
6972
6973                         LLVMTypeRef type = LLVMGetElementType(LLVMTypeOf(inst));
6974                         /* No idea why LLVM aligns allocas to 4 elements. */
6975                         unsigned alignment = LLVMGetAlignment(inst);
6976                         unsigned dw_size = align(llvm_get_type_size(type) / 4, alignment);
6977                         ctx->shader->config.private_mem_vgprs += dw_size;
6978                 }
6979                 bb = LLVMGetNextBasicBlock(bb);
6980         }
6981 }
6982
6983 static bool si_compile_tgsi_main(struct si_shader_context *ctx,
6984                                  struct si_shader *shader)
6985 {
6986         struct si_shader_selector *sel = shader->selector;
6987         struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
6988
6989         switch (ctx->type) {
6990         case PIPE_SHADER_VERTEX:
6991                 ctx->load_input = declare_input_vs;
6992                 if (shader->key.as_ls)
6993                         bld_base->emit_epilogue = si_llvm_emit_ls_epilogue;
6994                 else if (shader->key.as_es)
6995                         bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
6996                 else
6997                         bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
6998                 break;
6999         case PIPE_SHADER_TESS_CTRL:
7000                 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs;
7001                 bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = fetch_output_tcs;
7002                 bld_base->emit_store = store_output_tcs;
7003                 bld_base->emit_epilogue = si_llvm_emit_tcs_epilogue;
7004                 break;
7005         case PIPE_SHADER_TESS_EVAL:
7006                 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes;
7007                 if (shader->key.as_es)
7008                         bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
7009                 else
7010                         bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
7011                 break;
7012         case PIPE_SHADER_GEOMETRY:
7013                 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs;
7014                 bld_base->emit_epilogue = si_llvm_emit_gs_epilogue;
7015                 break;
7016         case PIPE_SHADER_FRAGMENT:
7017                 ctx->load_input = declare_input_fs;
7018                 bld_base->emit_epilogue = si_llvm_return_fs_outputs;
7019                 break;
7020         case PIPE_SHADER_COMPUTE:
7021                 ctx->declare_memory_region = declare_compute_memory;
7022                 break;
7023         default:
7024                 assert(!"Unsupported shader type");
7025                 return false;
7026         }
7027
7028         create_function(ctx);
7029         preload_ring_buffers(ctx);
7030
7031         if (ctx->type == PIPE_SHADER_GEOMETRY) {
7032                 int i;
7033                 for (i = 0; i < 4; i++) {
7034                         ctx->gs_next_vertex[i] =
7035                                 lp_build_alloca(&ctx->gallivm,
7036                                                 ctx->i32, "");
7037                 }
7038         }
7039
7040         if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) {
7041                 fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
7042                 return false;
7043         }
7044
7045         si_llvm_build_ret(ctx, ctx->return_value);
7046         return true;
7047 }
7048
7049 /**
7050  * Compute the VS prolog key, which contains all the information needed to
7051  * build the VS prolog function, and set shader->info bits where needed.
7052  *
7053  * \param info             Shader info of the vertex shader.
7054  * \param num_input_sgprs  Number of input SGPRs for the vertex shader.
7055  * \param prolog_key       Key of the VS prolog
7056  * \param shader_out       The vertex shader, or the next shader if merging LS+HS or ES+GS.
7057  * \param key              Output shader part key.
7058  */
7059 static void si_get_vs_prolog_key(const struct tgsi_shader_info *info,
7060                                  unsigned num_input_sgprs,
7061                                  const struct si_vs_prolog_bits *prolog_key,
7062                                  struct si_shader *shader_out,
7063                                  union si_shader_part_key *key)
7064 {
7065         memset(key, 0, sizeof(*key));
7066         key->vs_prolog.states = *prolog_key;
7067         key->vs_prolog.num_input_sgprs = num_input_sgprs;
7068         key->vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
7069
7070         /* Set the instanceID flag. */
7071         for (unsigned i = 0; i < info->num_inputs; i++)
7072                 if (key->vs_prolog.states.instance_divisors[i])
7073                         shader_out->info.uses_instanceid = true;
7074 }
7075
7076 /**
7077  * Compute the VS epilog key, which contains all the information needed to
7078  * build the VS epilog function, and set the PrimitiveID output offset.
7079  */
7080 static void si_get_vs_epilog_key(struct si_shader *shader,
7081                                  struct si_vs_epilog_bits *states,
7082                                  union si_shader_part_key *key)
7083 {
7084         memset(key, 0, sizeof(*key));
7085         key->vs_epilog.states = *states;
7086
7087         /* Set up the PrimitiveID output. */
7088         if (shader->key.part.vs.epilog.export_prim_id) {
7089                 unsigned index = shader->selector->info.num_outputs;
7090                 unsigned offset = shader->info.nr_param_exports++;
7091
7092                 key->vs_epilog.prim_id_param_offset = offset;
7093                 assert(index < ARRAY_SIZE(shader->info.vs_output_param_offset));
7094                 shader->info.vs_output_param_offset[index] = offset;
7095         }
7096 }
7097
7098 /**
7099  * Compute the PS prolog key, which contains all the information needed to
7100  * build the PS prolog function, and set related bits in shader->config.
7101  */
7102 static void si_get_ps_prolog_key(struct si_shader *shader,
7103                                  union si_shader_part_key *key,
7104                                  bool separate_prolog)
7105 {
7106         struct tgsi_shader_info *info = &shader->selector->info;
7107
7108         memset(key, 0, sizeof(*key));
7109         key->ps_prolog.states = shader->key.part.ps.prolog;
7110         key->ps_prolog.colors_read = info->colors_read;
7111         key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
7112         key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
7113         key->ps_prolog.wqm = info->uses_derivatives &&
7114                 (key->ps_prolog.colors_read ||
7115                  key->ps_prolog.states.force_persp_sample_interp ||
7116                  key->ps_prolog.states.force_linear_sample_interp ||
7117                  key->ps_prolog.states.force_persp_center_interp ||
7118                  key->ps_prolog.states.force_linear_center_interp ||
7119                  key->ps_prolog.states.bc_optimize_for_persp ||
7120                  key->ps_prolog.states.bc_optimize_for_linear);
7121
7122         if (info->colors_read) {
7123                 unsigned *color = shader->selector->color_attr_index;
7124
7125                 if (shader->key.part.ps.prolog.color_two_side) {
7126                         /* BCOLORs are stored after the last input. */
7127                         key->ps_prolog.num_interp_inputs = info->num_inputs;
7128                         key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
7129                         shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
7130                 }
7131
7132                 for (unsigned i = 0; i < 2; i++) {
7133                         unsigned interp = info->input_interpolate[color[i]];
7134                         unsigned location = info->input_interpolate_loc[color[i]];
7135
7136                         if (!(info->colors_read & (0xf << i*4)))
7137                                 continue;
7138
7139                         key->ps_prolog.color_attr_index[i] = color[i];
7140
7141                         if (shader->key.part.ps.prolog.flatshade_colors &&
7142                             interp == TGSI_INTERPOLATE_COLOR)
7143                                 interp = TGSI_INTERPOLATE_CONSTANT;
7144
7145                         switch (interp) {
7146                         case TGSI_INTERPOLATE_CONSTANT:
7147                                 key->ps_prolog.color_interp_vgpr_index[i] = -1;
7148                                 break;
7149                         case TGSI_INTERPOLATE_PERSPECTIVE:
7150                         case TGSI_INTERPOLATE_COLOR:
7151                                 /* Force the interpolation location for colors here. */
7152                                 if (shader->key.part.ps.prolog.force_persp_sample_interp)
7153                                         location = TGSI_INTERPOLATE_LOC_SAMPLE;
7154                                 if (shader->key.part.ps.prolog.force_persp_center_interp)
7155                                         location = TGSI_INTERPOLATE_LOC_CENTER;
7156
7157                                 switch (location) {
7158                                 case TGSI_INTERPOLATE_LOC_SAMPLE:
7159                                         key->ps_prolog.color_interp_vgpr_index[i] = 0;
7160                                         shader->config.spi_ps_input_ena |=
7161                                                 S_0286CC_PERSP_SAMPLE_ENA(1);
7162                                         break;
7163                                 case TGSI_INTERPOLATE_LOC_CENTER:
7164                                         key->ps_prolog.color_interp_vgpr_index[i] = 2;
7165                                         shader->config.spi_ps_input_ena |=
7166                                                 S_0286CC_PERSP_CENTER_ENA(1);
7167                                         break;
7168                                 case TGSI_INTERPOLATE_LOC_CENTROID:
7169                                         key->ps_prolog.color_interp_vgpr_index[i] = 4;
7170                                         shader->config.spi_ps_input_ena |=
7171                                                 S_0286CC_PERSP_CENTROID_ENA(1);
7172                                         break;
7173                                 default:
7174                                         assert(0);
7175                                 }
7176                                 break;
7177                         case TGSI_INTERPOLATE_LINEAR:
7178                                 /* Force the interpolation location for colors here. */
7179                                 if (shader->key.part.ps.prolog.force_linear_sample_interp)
7180                                         location = TGSI_INTERPOLATE_LOC_SAMPLE;
7181                                 if (shader->key.part.ps.prolog.force_linear_center_interp)
7182                                         location = TGSI_INTERPOLATE_LOC_CENTER;
7183
7184                                 /* The VGPR assignment for non-monolithic shaders
7185                                  * works because InitialPSInputAddr is set on the
7186                                  * main shader and PERSP_PULL_MODEL is never used.
7187                                  */
7188                                 switch (location) {
7189                                 case TGSI_INTERPOLATE_LOC_SAMPLE:
7190                                         key->ps_prolog.color_interp_vgpr_index[i] =
7191                                                 separate_prolog ? 6 : 9;
7192                                         shader->config.spi_ps_input_ena |=
7193                                                 S_0286CC_LINEAR_SAMPLE_ENA(1);
7194                                         break;
7195                                 case TGSI_INTERPOLATE_LOC_CENTER:
7196                                         key->ps_prolog.color_interp_vgpr_index[i] =
7197                                                 separate_prolog ? 8 : 11;
7198                                         shader->config.spi_ps_input_ena |=
7199                                                 S_0286CC_LINEAR_CENTER_ENA(1);
7200                                         break;
7201                                 case TGSI_INTERPOLATE_LOC_CENTROID:
7202                                         key->ps_prolog.color_interp_vgpr_index[i] =
7203                                                 separate_prolog ? 10 : 13;
7204                                         shader->config.spi_ps_input_ena |=
7205                                                 S_0286CC_LINEAR_CENTROID_ENA(1);
7206                                         break;
7207                                 default:
7208                                         assert(0);
7209                                 }
7210                                 break;
7211                         default:
7212                                 assert(0);
7213                         }
7214                 }
7215         }
7216 }
7217
7218 /**
7219  * Check whether a PS prolog is required based on the key.
7220  */
7221 static bool si_need_ps_prolog(const union si_shader_part_key *key)
7222 {
7223         return key->ps_prolog.colors_read ||
7224                key->ps_prolog.states.force_persp_sample_interp ||
7225                key->ps_prolog.states.force_linear_sample_interp ||
7226                key->ps_prolog.states.force_persp_center_interp ||
7227                key->ps_prolog.states.force_linear_center_interp ||
7228                key->ps_prolog.states.bc_optimize_for_persp ||
7229                key->ps_prolog.states.bc_optimize_for_linear ||
7230                key->ps_prolog.states.poly_stipple;
7231 }
7232
7233 /**
7234  * Compute the PS epilog key, which contains all the information needed to
7235  * build the PS epilog function.
7236  */
7237 static void si_get_ps_epilog_key(struct si_shader *shader,
7238                                  union si_shader_part_key *key)
7239 {
7240         struct tgsi_shader_info *info = &shader->selector->info;
7241         memset(key, 0, sizeof(*key));
7242         key->ps_epilog.colors_written = info->colors_written;
7243         key->ps_epilog.writes_z = info->writes_z;
7244         key->ps_epilog.writes_stencil = info->writes_stencil;
7245         key->ps_epilog.writes_samplemask = info->writes_samplemask;
7246         key->ps_epilog.states = shader->key.part.ps.epilog;
7247 }
7248
7249 /**
7250  * Build the GS prolog function. Rotate the input vertices for triangle strips
7251  * with adjacency.
7252  */
7253 static void si_build_gs_prolog_function(struct si_shader_context *ctx,
7254                                         union si_shader_part_key *key)
7255 {
7256         const unsigned num_sgprs = SI_GS_NUM_USER_SGPR + 2;
7257         const unsigned num_vgprs = 8;
7258         struct gallivm_state *gallivm = &ctx->gallivm;
7259         LLVMBuilderRef builder = gallivm->builder;
7260         LLVMTypeRef params[32];
7261         LLVMTypeRef returns[32];
7262         LLVMValueRef func, ret;
7263
7264         for (unsigned i = 0; i < num_sgprs; ++i) {
7265                 params[i] = ctx->i32;
7266                 returns[i] = ctx->i32;
7267         }
7268
7269         for (unsigned i = 0; i < num_vgprs; ++i) {
7270                 params[num_sgprs + i] = ctx->i32;
7271                 returns[num_sgprs + i] = ctx->f32;
7272         }
7273
7274         /* Create the function. */
7275         si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs,
7276                            params, num_sgprs + num_vgprs, num_sgprs - 1);
7277         func = ctx->main_fn;
7278
7279         /* Copy inputs to outputs. This should be no-op, as the registers match,
7280          * but it will prevent the compiler from overwriting them unintentionally.
7281          */
7282         ret = ctx->return_value;
7283         for (unsigned i = 0; i < num_sgprs; i++) {
7284                 LLVMValueRef p = LLVMGetParam(func, i);
7285                 ret = LLVMBuildInsertValue(builder, ret, p, i, "");
7286         }
7287         for (unsigned i = 0; i < num_vgprs; i++) {
7288                 LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
7289                 p = LLVMBuildBitCast(builder, p, ctx->f32, "");
7290                 ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, "");
7291         }
7292
7293         if (key->gs_prolog.states.tri_strip_adj_fix) {
7294                 /* Remap the input vertices for every other primitive. */
7295                 const unsigned vtx_params[6] = {
7296                         num_sgprs,
7297                         num_sgprs + 1,
7298                         num_sgprs + 3,
7299                         num_sgprs + 4,
7300                         num_sgprs + 5,
7301                         num_sgprs + 6
7302                 };
7303                 LLVMValueRef prim_id, rotate;
7304
7305                 prim_id = LLVMGetParam(func, num_sgprs + 2);
7306                 rotate = LLVMBuildTrunc(builder, prim_id, ctx->i1, "");
7307
7308                 for (unsigned i = 0; i < 6; ++i) {
7309                         LLVMValueRef base, rotated, actual;
7310                         base = LLVMGetParam(func, vtx_params[i]);
7311                         rotated = LLVMGetParam(func, vtx_params[(i + 4) % 6]);
7312                         actual = LLVMBuildSelect(builder, rotate, rotated, base, "");
7313                         actual = LLVMBuildBitCast(builder, actual, ctx->f32, "");
7314                         ret = LLVMBuildInsertValue(builder, ret, actual, vtx_params[i], "");
7315                 }
7316         }
7317
7318         LLVMBuildRet(builder, ret);
7319 }
7320
7321 /**
7322  * Given a list of shader part functions, build a wrapper function that
7323  * runs them in sequence to form a monolithic shader.
7324  */
7325 static void si_build_wrapper_function(struct si_shader_context *ctx,
7326                                       LLVMValueRef *parts,
7327                                       unsigned num_parts,
7328                                       unsigned main_part)
7329 {
7330         struct gallivm_state *gallivm = &ctx->gallivm;
7331         LLVMBuilderRef builder = ctx->gallivm.builder;
7332         /* PS epilog has one arg per color component */
7333         LLVMTypeRef param_types[48];
7334         LLVMValueRef out[48];
7335         LLVMTypeRef function_type;
7336         unsigned num_params;
7337         unsigned num_out;
7338         MAYBE_UNUSED unsigned num_out_sgpr; /* used in debug checks */
7339         unsigned num_sgprs, num_vgprs;
7340         unsigned last_sgpr_param;
7341         unsigned gprs;
7342
7343         for (unsigned i = 0; i < num_parts; ++i) {
7344                 lp_add_function_attr(parts[i], -1, LP_FUNC_ATTR_ALWAYSINLINE);
7345                 LLVMSetLinkage(parts[i], LLVMPrivateLinkage);
7346         }
7347
7348         /* The parameters of the wrapper function correspond to those of the
7349          * first part in terms of SGPRs and VGPRs, but we use the types of the
7350          * main part to get the right types. This is relevant for the
7351          * dereferenceable attribute on descriptor table pointers.
7352          */
7353         num_sgprs = 0;
7354         num_vgprs = 0;
7355
7356         function_type = LLVMGetElementType(LLVMTypeOf(parts[0]));
7357         num_params = LLVMCountParamTypes(function_type);
7358
7359         for (unsigned i = 0; i < num_params; ++i) {
7360                 LLVMValueRef param = LLVMGetParam(parts[0], i);
7361
7362                 if (ac_is_sgpr_param(param)) {
7363                         assert(num_vgprs == 0);
7364                         num_sgprs += llvm_get_type_size(LLVMTypeOf(param)) / 4;
7365                 } else {
7366                         num_vgprs += llvm_get_type_size(LLVMTypeOf(param)) / 4;
7367                 }
7368         }
7369         assert(num_vgprs + num_sgprs <= ARRAY_SIZE(param_types));
7370
7371         num_params = 0;
7372         last_sgpr_param = 0;
7373         gprs = 0;
7374         while (gprs < num_sgprs + num_vgprs) {
7375                 LLVMValueRef param = LLVMGetParam(parts[main_part], num_params);
7376                 unsigned size;
7377
7378                 param_types[num_params] = LLVMTypeOf(param);
7379                 if (gprs < num_sgprs)
7380                         last_sgpr_param = num_params;
7381                 size = llvm_get_type_size(param_types[num_params]) / 4;
7382                 num_params++;
7383
7384                 assert(ac_is_sgpr_param(param) == (gprs < num_sgprs));
7385                 assert(gprs + size <= num_sgprs + num_vgprs &&
7386                        (gprs >= num_sgprs || gprs + size <= num_sgprs));
7387
7388                 gprs += size;
7389         }
7390
7391         si_create_function(ctx, "wrapper", NULL, 0, param_types, num_params, last_sgpr_param);
7392
7393         /* Record the arguments of the function as if they were an output of
7394          * a previous part.
7395          */
7396         num_out = 0;
7397         num_out_sgpr = 0;
7398
7399         for (unsigned i = 0; i < num_params; ++i) {
7400                 LLVMValueRef param = LLVMGetParam(ctx->main_fn, i);
7401                 LLVMTypeRef param_type = LLVMTypeOf(param);
7402                 LLVMTypeRef out_type = i <= last_sgpr_param ? ctx->i32 : ctx->f32;
7403                 unsigned size = llvm_get_type_size(param_type) / 4;
7404
7405                 if (size == 1) {
7406                         if (param_type != out_type)
7407                                 param = LLVMBuildBitCast(builder, param, out_type, "");
7408                         out[num_out++] = param;
7409                 } else {
7410                         LLVMTypeRef vector_type = LLVMVectorType(out_type, size);
7411
7412                         if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
7413                                 param = LLVMBuildPtrToInt(builder, param, ctx->i64, "");
7414                                 param_type = ctx->i64;
7415                         }
7416
7417                         if (param_type != vector_type)
7418                                 param = LLVMBuildBitCast(builder, param, vector_type, "");
7419
7420                         for (unsigned j = 0; j < size; ++j)
7421                                 out[num_out++] = LLVMBuildExtractElement(
7422                                         builder, param, LLVMConstInt(ctx->i32, j, 0), "");
7423                 }
7424
7425                 if (i <= last_sgpr_param)
7426                         num_out_sgpr = num_out;
7427         }
7428
7429         /* Now chain the parts. */
7430         for (unsigned part = 0; part < num_parts; ++part) {
7431                 LLVMValueRef in[48];
7432                 LLVMValueRef ret;
7433                 LLVMTypeRef ret_type;
7434                 unsigned out_idx = 0;
7435
7436                 num_params = LLVMCountParams(parts[part]);
7437                 assert(num_params <= ARRAY_SIZE(param_types));
7438
7439                 /* Derive arguments for the next part from outputs of the
7440                  * previous one.
7441                  */
7442                 for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) {
7443                         LLVMValueRef param;
7444                         LLVMTypeRef param_type;
7445                         bool is_sgpr;
7446                         unsigned param_size;
7447                         LLVMValueRef arg = NULL;
7448
7449                         param = LLVMGetParam(parts[part], param_idx);
7450                         param_type = LLVMTypeOf(param);
7451                         param_size = llvm_get_type_size(param_type) / 4;
7452                         is_sgpr = ac_is_sgpr_param(param);
7453
7454                         if (is_sgpr) {
7455 #if HAVE_LLVM < 0x0400
7456                                 LLVMRemoveAttribute(param, LLVMByValAttribute);
7457 #else
7458                                 unsigned kind_id = LLVMGetEnumAttributeKindForName("byval", 5);
7459                                 LLVMRemoveEnumAttributeAtIndex(parts[part], param_idx + 1, kind_id);
7460 #endif
7461                                 lp_add_function_attr(parts[part], param_idx + 1, LP_FUNC_ATTR_INREG);
7462                         }
7463
7464                         assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out));
7465                         assert(is_sgpr || out_idx >= num_out_sgpr);
7466
7467                         if (param_size == 1)
7468                                 arg = out[out_idx];
7469                         else
7470                                 arg = lp_build_gather_values(gallivm, &out[out_idx], param_size);
7471
7472                         if (LLVMTypeOf(arg) != param_type) {
7473                                 if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
7474                                         arg = LLVMBuildBitCast(builder, arg, ctx->i64, "");
7475                                         arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
7476                                 } else {
7477                                         arg = LLVMBuildBitCast(builder, arg, param_type, "");
7478                                 }
7479                         }
7480
7481                         in[param_idx] = arg;
7482                         out_idx += param_size;
7483                 }
7484
7485                 ret = LLVMBuildCall(builder, parts[part], in, num_params, "");
7486                 ret_type = LLVMTypeOf(ret);
7487
7488                 /* Extract the returned GPRs. */
7489                 num_out = 0;
7490                 num_out_sgpr = 0;
7491
7492                 if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) {
7493                         assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind);
7494
7495                         unsigned ret_size = LLVMCountStructElementTypes(ret_type);
7496
7497                         for (unsigned i = 0; i < ret_size; ++i) {
7498                                 LLVMValueRef val =
7499                                         LLVMBuildExtractValue(builder, ret, i, "");
7500
7501                                 out[num_out++] = val;
7502
7503                                 if (LLVMTypeOf(val) == ctx->i32) {
7504                                         assert(num_out_sgpr + 1 == num_out);
7505                                         num_out_sgpr = num_out;
7506                                 }
7507                         }
7508                 }
7509         }
7510
7511         LLVMBuildRetVoid(builder);
7512 }
7513
7514 int si_compile_tgsi_shader(struct si_screen *sscreen,
7515                            LLVMTargetMachineRef tm,
7516                            struct si_shader *shader,
7517                            bool is_monolithic,
7518                            struct pipe_debug_callback *debug)
7519 {
7520         struct si_shader_selector *sel = shader->selector;
7521         struct si_shader_context ctx;
7522         int r = -1;
7523
7524         /* Dump TGSI code before doing TGSI->LLVM conversion in case the
7525          * conversion fails. */
7526         if (r600_can_dump_shader(&sscreen->b, sel->info.processor) &&
7527             !(sscreen->b.debug_flags & DBG_NO_TGSI)) {
7528                 tgsi_dump(sel->tokens, 0);
7529                 si_dump_streamout(&sel->so);
7530         }
7531
7532         si_init_shader_ctx(&ctx, sscreen, tm);
7533         si_llvm_context_set_tgsi(&ctx, shader);
7534         ctx.separate_prolog = !is_monolithic;
7535
7536         memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
7537                sizeof(shader->info.vs_output_param_offset));
7538
7539         shader->info.uses_instanceid = sel->info.uses_instanceid;
7540
7541         ctx.load_system_value = declare_system_value;
7542
7543         if (!si_compile_tgsi_main(&ctx, shader)) {
7544                 si_llvm_dispose(&ctx);
7545                 return -1;
7546         }
7547
7548         if (is_monolithic && ctx.type == PIPE_SHADER_VERTEX) {
7549                 LLVMValueRef parts[3];
7550                 bool need_prolog;
7551                 bool need_epilog;
7552
7553                 need_prolog = sel->vs_needs_prolog;
7554                 need_epilog = !shader->key.as_es && !shader->key.as_ls;
7555
7556                 parts[need_prolog ? 1 : 0] = ctx.main_fn;
7557
7558                 if (need_prolog) {
7559                         union si_shader_part_key prolog_key;
7560                         si_get_vs_prolog_key(&sel->info,
7561                                              shader->info.num_input_sgprs,
7562                                              &shader->key.part.vs.prolog,
7563                                              shader, &prolog_key);
7564                         si_build_vs_prolog_function(&ctx, &prolog_key);
7565                         parts[0] = ctx.main_fn;
7566                 }
7567
7568                 if (need_epilog) {
7569                         union si_shader_part_key epilog_key;
7570                         si_get_vs_epilog_key(shader, &shader->key.part.vs.epilog, &epilog_key);
7571                         si_build_vs_epilog_function(&ctx, &epilog_key);
7572                         parts[need_prolog ? 2 : 1] = ctx.main_fn;
7573                 }
7574
7575                 si_build_wrapper_function(&ctx, parts, 1 + need_prolog + need_epilog,
7576                                           need_prolog ? 1 : 0);
7577         } else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
7578                 LLVMValueRef parts[2];
7579                 union si_shader_part_key epilog_key;
7580
7581                 parts[0] = ctx.main_fn;
7582
7583                 memset(&epilog_key, 0, sizeof(epilog_key));
7584                 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
7585                 si_build_tcs_epilog_function(&ctx, &epilog_key);
7586                 parts[1] = ctx.main_fn;
7587
7588                 si_build_wrapper_function(&ctx, parts, 2, 0);
7589         } else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_EVAL &&
7590                    !shader->key.as_es) {
7591                 LLVMValueRef parts[2];
7592                 union si_shader_part_key epilog_key;
7593
7594                 parts[0] = ctx.main_fn;
7595
7596                 si_get_vs_epilog_key(shader, &shader->key.part.tes.epilog, &epilog_key);
7597                 si_build_vs_epilog_function(&ctx, &epilog_key);
7598                 parts[1] = ctx.main_fn;
7599
7600                 si_build_wrapper_function(&ctx, parts, 2, 0);
7601         } else if (is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) {
7602                 LLVMValueRef parts[2];
7603                 union si_shader_part_key prolog_key;
7604
7605                 parts[1] = ctx.main_fn;
7606
7607                 memset(&prolog_key, 0, sizeof(prolog_key));
7608                 prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
7609                 si_build_gs_prolog_function(&ctx, &prolog_key);
7610                 parts[0] = ctx.main_fn;
7611
7612                 si_build_wrapper_function(&ctx, parts, 2, 1);
7613         } else if (is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) {
7614                 LLVMValueRef parts[3];
7615                 union si_shader_part_key prolog_key;
7616                 union si_shader_part_key epilog_key;
7617                 bool need_prolog;
7618
7619                 si_get_ps_prolog_key(shader, &prolog_key, false);
7620                 need_prolog = si_need_ps_prolog(&prolog_key);
7621
7622                 parts[need_prolog ? 1 : 0] = ctx.main_fn;
7623
7624                 if (need_prolog) {
7625                         si_build_ps_prolog_function(&ctx, &prolog_key);
7626                         parts[0] = ctx.main_fn;
7627                 }
7628
7629                 si_get_ps_epilog_key(shader, &epilog_key);
7630                 si_build_ps_epilog_function(&ctx, &epilog_key);
7631                 parts[need_prolog ? 2 : 1] = ctx.main_fn;
7632
7633                 si_build_wrapper_function(&ctx, parts, need_prolog ? 3 : 2, need_prolog ? 1 : 0);
7634         }
7635
7636         /* Dump LLVM IR before any optimization passes */
7637         if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
7638             r600_can_dump_shader(&sscreen->b, ctx.type))
7639                 LLVMDumpModule(ctx.gallivm.module);
7640
7641         si_llvm_finalize_module(&ctx,
7642                                     r600_extra_shader_checks(&sscreen->b, ctx.type));
7643
7644         /* Post-optimization transformations and analysis. */
7645         si_eliminate_const_vs_outputs(&ctx);
7646
7647         if ((debug && debug->debug_message) ||
7648             r600_can_dump_shader(&sscreen->b, ctx.type))
7649                 si_count_scratch_private_memory(&ctx);
7650
7651         /* Compile to bytecode. */
7652         r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm,
7653                             ctx.gallivm.module, debug, ctx.type, "TGSI shader");
7654         si_llvm_dispose(&ctx);
7655         if (r) {
7656                 fprintf(stderr, "LLVM failed to compile shader\n");
7657                 return r;
7658         }
7659
7660         /* Validate SGPR and VGPR usage for compute to detect compiler bugs.
7661          * LLVM 3.9svn has this bug.
7662          */
7663         if (sel->type == PIPE_SHADER_COMPUTE) {
7664                 unsigned wave_size = 64;
7665                 unsigned max_vgprs = 256;
7666                 unsigned max_sgprs = sscreen->b.chip_class >= VI ? 800 : 512;
7667                 unsigned max_sgprs_per_wave = 128;
7668                 unsigned max_block_threads = si_get_max_workgroup_size(shader);
7669                 unsigned min_waves_per_cu = DIV_ROUND_UP(max_block_threads, wave_size);
7670                 unsigned min_waves_per_simd = DIV_ROUND_UP(min_waves_per_cu, 4);
7671
7672                 max_vgprs = max_vgprs / min_waves_per_simd;
7673                 max_sgprs = MIN2(max_sgprs / min_waves_per_simd, max_sgprs_per_wave);
7674
7675                 if (shader->config.num_sgprs > max_sgprs ||
7676                     shader->config.num_vgprs > max_vgprs) {
7677                         fprintf(stderr, "LLVM failed to compile a shader correctly: "
7678                                 "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
7679                                 shader->config.num_sgprs, shader->config.num_vgprs,
7680                                 max_sgprs, max_vgprs);
7681
7682                         /* Just terminate the process, because dependent
7683                          * shaders can hang due to bad input data, but use
7684                          * the env var to allow shader-db to work.
7685                          */
7686                         if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
7687                                 abort();
7688                 }
7689         }
7690
7691         /* Add the scratch offset to input SGPRs. */
7692         if (shader->config.scratch_bytes_per_wave)
7693                 shader->info.num_input_sgprs += 1; /* scratch byte offset */
7694
7695         /* Calculate the number of fragment input VGPRs. */
7696         if (ctx.type == PIPE_SHADER_FRAGMENT) {
7697                 shader->info.num_input_vgprs = 0;
7698                 shader->info.face_vgpr_index = -1;
7699
7700                 if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr))
7701                         shader->info.num_input_vgprs += 2;
7702                 if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr))
7703                         shader->info.num_input_vgprs += 2;
7704                 if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr))
7705                         shader->info.num_input_vgprs += 2;
7706                 if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr))
7707                         shader->info.num_input_vgprs += 3;
7708                 if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr))
7709                         shader->info.num_input_vgprs += 2;
7710                 if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr))
7711                         shader->info.num_input_vgprs += 2;
7712                 if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr))
7713                         shader->info.num_input_vgprs += 2;
7714                 if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr))
7715                         shader->info.num_input_vgprs += 1;
7716                 if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr))
7717                         shader->info.num_input_vgprs += 1;
7718                 if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr))
7719                         shader->info.num_input_vgprs += 1;
7720                 if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr))
7721                         shader->info.num_input_vgprs += 1;
7722                 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr))
7723                         shader->info.num_input_vgprs += 1;
7724                 if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) {
7725                         shader->info.face_vgpr_index = shader->info.num_input_vgprs;
7726                         shader->info.num_input_vgprs += 1;
7727                 }
7728                 if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr))
7729                         shader->info.num_input_vgprs += 1;
7730                 if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr))
7731                         shader->info.num_input_vgprs += 1;
7732                 if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr))
7733                         shader->info.num_input_vgprs += 1;
7734         }
7735
7736         return 0;
7737 }
7738
7739 /**
7740  * Create, compile and return a shader part (prolog or epilog).
7741  *
7742  * \param sscreen       screen
7743  * \param list          list of shader parts of the same category
7744  * \param type          shader type
7745  * \param key           shader part key
7746  * \param prolog        whether the part being requested is a prolog
7747  * \param tm            LLVM target machine
7748  * \param debug         debug callback
7749  * \param build         the callback responsible for building the main function
7750  * \return              non-NULL on success
7751  */
7752 static struct si_shader_part *
7753 si_get_shader_part(struct si_screen *sscreen,
7754                    struct si_shader_part **list,
7755                    enum pipe_shader_type type,
7756                    bool prolog,
7757                    union si_shader_part_key *key,
7758                    LLVMTargetMachineRef tm,
7759                    struct pipe_debug_callback *debug,
7760                    void (*build)(struct si_shader_context *,
7761                                  union si_shader_part_key *),
7762                    const char *name)
7763 {
7764         struct si_shader_part *result;
7765
7766         mtx_lock(&sscreen->shader_parts_mutex);
7767
7768         /* Find existing. */
7769         for (result = *list; result; result = result->next) {
7770                 if (memcmp(&result->key, key, sizeof(*key)) == 0) {
7771                         mtx_unlock(&sscreen->shader_parts_mutex);
7772                         return result;
7773                 }
7774         }
7775
7776         /* Compile a new one. */
7777         result = CALLOC_STRUCT(si_shader_part);
7778         result->key = *key;
7779
7780         struct si_shader shader = {};
7781         struct si_shader_context ctx;
7782         struct gallivm_state *gallivm = &ctx.gallivm;
7783
7784         si_init_shader_ctx(&ctx, sscreen, tm);
7785         ctx.shader = &shader;
7786         ctx.type = type;
7787
7788         switch (type) {
7789         case PIPE_SHADER_VERTEX:
7790                 break;
7791         case PIPE_SHADER_TESS_CTRL:
7792                 assert(!prolog);
7793                 shader.key.part.tcs.epilog = key->tcs_epilog.states;
7794                 break;
7795         case PIPE_SHADER_GEOMETRY:
7796                 assert(prolog);
7797                 break;
7798         case PIPE_SHADER_FRAGMENT:
7799                 if (prolog)
7800                         shader.key.part.ps.prolog = key->ps_prolog.states;
7801                 else
7802                         shader.key.part.ps.epilog = key->ps_epilog.states;
7803                 break;
7804         default:
7805                 unreachable("bad shader part");
7806         }
7807
7808         build(&ctx, key);
7809
7810         /* Compile. */
7811         si_llvm_finalize_module(&ctx,
7812                 r600_extra_shader_checks(&sscreen->b, PIPE_SHADER_FRAGMENT));
7813
7814         if (si_compile_llvm(sscreen, &result->binary, &result->config, tm,
7815                             gallivm->module, debug, ctx.type, name)) {
7816                 FREE(result);
7817                 result = NULL;
7818                 goto out;
7819         }
7820
7821         result->next = *list;
7822         *list = result;
7823
7824 out:
7825         si_llvm_dispose(&ctx);
7826         mtx_unlock(&sscreen->shader_parts_mutex);
7827         return result;
7828 }
7829
7830 /**
7831  * Build the vertex shader prolog function.
7832  *
7833  * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
7834  * All inputs are returned unmodified. The vertex load indices are
7835  * stored after them, which will be used by the API VS for fetching inputs.
7836  *
7837  * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
7838  *   input_v0,
7839  *   input_v1,
7840  *   input_v2,
7841  *   input_v3,
7842  *   (VertexID + BaseVertex),
7843  *   (InstanceID + StartInstance),
7844  *   (InstanceID / 2 + StartInstance)
7845  */
7846 static void si_build_vs_prolog_function(struct si_shader_context *ctx,
7847                                         union si_shader_part_key *key)
7848 {
7849         struct gallivm_state *gallivm = &ctx->gallivm;
7850         LLVMTypeRef *params, *returns;
7851         LLVMValueRef ret, func;
7852         int last_sgpr, num_params, num_returns, i;
7853
7854         ctx->param_vertex_id = key->vs_prolog.num_input_sgprs;
7855         ctx->param_instance_id = key->vs_prolog.num_input_sgprs + 3;
7856
7857         /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
7858         params = alloca((key->vs_prolog.num_input_sgprs + 4) *
7859                         sizeof(LLVMTypeRef));
7860         returns = alloca((key->vs_prolog.num_input_sgprs + 4 +
7861                           key->vs_prolog.last_input + 1) *
7862                          sizeof(LLVMTypeRef));
7863         num_params = 0;
7864         num_returns = 0;
7865
7866         /* Declare input and output SGPRs. */
7867         num_params = 0;
7868         for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
7869                 params[num_params++] = ctx->i32;
7870                 returns[num_returns++] = ctx->i32;
7871         }
7872         last_sgpr = num_params - 1;
7873
7874         /* 4 preloaded VGPRs (outputs must be floats) */
7875         for (i = 0; i < 4; i++) {
7876                 params[num_params++] = ctx->i32;
7877                 returns[num_returns++] = ctx->f32;
7878         }
7879
7880         /* Vertex load indices. */
7881         for (i = 0; i <= key->vs_prolog.last_input; i++)
7882                 returns[num_returns++] = ctx->f32;
7883
7884         /* Create the function. */
7885         si_create_function(ctx, "vs_prolog", returns, num_returns, params,
7886                            num_params, last_sgpr);
7887         func = ctx->main_fn;
7888
7889         /* Copy inputs to outputs. This should be no-op, as the registers match,
7890          * but it will prevent the compiler from overwriting them unintentionally.
7891          */
7892         ret = ctx->return_value;
7893         for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
7894                 LLVMValueRef p = LLVMGetParam(func, i);
7895                 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
7896         }
7897         for (i = num_params - 4; i < num_params; i++) {
7898                 LLVMValueRef p = LLVMGetParam(func, i);
7899                 p = LLVMBuildBitCast(gallivm->builder, p, ctx->f32, "");
7900                 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
7901         }
7902
7903         /* Compute vertex load indices from instance divisors. */
7904         for (i = 0; i <= key->vs_prolog.last_input; i++) {
7905                 unsigned divisor = key->vs_prolog.states.instance_divisors[i];
7906                 LLVMValueRef index;
7907
7908                 if (divisor) {
7909                         /* InstanceID / Divisor + StartInstance */
7910                         index = get_instance_index_for_fetch(ctx,
7911                                                              SI_SGPR_START_INSTANCE,
7912                                                              divisor);
7913                 } else {
7914                         /* VertexID + BaseVertex */
7915                         index = LLVMBuildAdd(gallivm->builder,
7916                                              LLVMGetParam(func, ctx->param_vertex_id),
7917                                              LLVMGetParam(func, SI_SGPR_BASE_VERTEX), "");
7918                 }
7919
7920                 index = LLVMBuildBitCast(gallivm->builder, index, ctx->f32, "");
7921                 ret = LLVMBuildInsertValue(gallivm->builder, ret, index,
7922                                            num_params++, "");
7923         }
7924
7925         si_llvm_build_ret(ctx, ret);
7926 }
7927
7928 /**
7929  * Build the vertex shader epilog function. This is also used by the tessellation
7930  * evaluation shader compiled as VS.
7931  *
7932  * The input is PrimitiveID.
7933  *
7934  * If PrimitiveID is required by the pixel shader, export it.
7935  * Otherwise, do nothing.
7936  */
7937 static void si_build_vs_epilog_function(struct si_shader_context *ctx,
7938                                         union si_shader_part_key *key)
7939 {
7940         struct gallivm_state *gallivm = &ctx->gallivm;
7941         struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
7942         LLVMTypeRef params[5];
7943         int num_params, i;
7944
7945         /* Declare input VGPRs. */
7946         num_params = key->vs_epilog.states.export_prim_id ?
7947                            (VS_EPILOG_PRIMID_LOC + 1) : 0;
7948         assert(num_params <= ARRAY_SIZE(params));
7949
7950         for (i = 0; i < num_params; i++)
7951                 params[i] = ctx->f32;
7952
7953         /* Create the function. */
7954         si_create_function(ctx, "vs_epilog", NULL, 0, params, num_params, -1);
7955
7956         /* Emit exports. */
7957         if (key->vs_epilog.states.export_prim_id) {
7958                 struct lp_build_context *base = &bld_base->base;
7959                 struct ac_export_args args;
7960
7961                 args.enabled_channels = 0x1; /* enabled channels */
7962                 args.valid_mask = 0; /* whether the EXEC mask is valid */
7963                 args.done = 0; /* DONE bit */
7964                 args.target = V_008DFC_SQ_EXP_PARAM +
7965                               key->vs_epilog.prim_id_param_offset;
7966                 args.compr = 0; /* COMPR flag (0 = 32-bit export) */
7967                 args.out[0] = LLVMGetParam(ctx->main_fn,
7968                                        VS_EPILOG_PRIMID_LOC); /* X */
7969                 args.out[1] = base->undef; /* Y */
7970                 args.out[2] = base->undef; /* Z */
7971                 args.out[3] = base->undef; /* W */
7972
7973                 ac_build_export(&ctx->ac, &args);
7974         }
7975
7976         LLVMBuildRetVoid(gallivm->builder);
7977 }
7978
7979 static bool si_get_vs_prolog(struct si_screen *sscreen,
7980                              LLVMTargetMachineRef tm,
7981                              struct si_shader *shader,
7982                              struct pipe_debug_callback *debug,
7983                              struct si_shader *main_part,
7984                              const struct si_vs_prolog_bits *key)
7985 {
7986         struct si_shader_selector *vs = main_part->selector;
7987
7988         /* The prolog is a no-op if there are no inputs. */
7989         if (!vs->vs_needs_prolog)
7990                 return true;
7991
7992         /* Get the prolog. */
7993         union si_shader_part_key prolog_key;
7994         si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs,
7995                              key, shader, &prolog_key);
7996
7997         shader->prolog =
7998                 si_get_shader_part(sscreen, &sscreen->vs_prologs,
7999                                    PIPE_SHADER_VERTEX, true, &prolog_key, tm,
8000                                    debug, si_build_vs_prolog_function,
8001                                    "Vertex Shader Prolog");
8002         return shader->prolog != NULL;
8003 }
8004
8005 /**
8006  * Create & compile a vertex shader epilog. This a helper used by VS and TES.
8007  */
8008 static bool si_get_vs_epilog(struct si_screen *sscreen,
8009                              LLVMTargetMachineRef tm,
8010                              struct si_shader *shader,
8011                              struct pipe_debug_callback *debug,
8012                              struct si_vs_epilog_bits *states)
8013 {
8014         union si_shader_part_key epilog_key;
8015
8016         si_get_vs_epilog_key(shader, states, &epilog_key);
8017
8018         shader->epilog = si_get_shader_part(sscreen, &sscreen->vs_epilogs,
8019                                             PIPE_SHADER_VERTEX, true,
8020                                             &epilog_key, tm, debug,
8021                                             si_build_vs_epilog_function,
8022                                             "Vertex Shader Epilog");
8023         return shader->epilog != NULL;
8024 }
8025
8026 /**
8027  * Select and compile (or reuse) vertex shader parts (prolog & epilog).
8028  */
8029 static bool si_shader_select_vs_parts(struct si_screen *sscreen,
8030                                       LLVMTargetMachineRef tm,
8031                                       struct si_shader *shader,
8032                                       struct pipe_debug_callback *debug)
8033 {
8034         if (!si_get_vs_prolog(sscreen, tm, shader, debug, shader,
8035                               &shader->key.part.vs.prolog))
8036                 return false;
8037
8038         /* Get the epilog. */
8039         if (!shader->key.as_es && !shader->key.as_ls &&
8040             !si_get_vs_epilog(sscreen, tm, shader, debug,
8041                               &shader->key.part.vs.epilog))
8042                 return false;
8043
8044         return true;
8045 }
8046
8047 /**
8048  * Select and compile (or reuse) TES parts (epilog).
8049  */
8050 static bool si_shader_select_tes_parts(struct si_screen *sscreen,
8051                                        LLVMTargetMachineRef tm,
8052                                        struct si_shader *shader,
8053                                        struct pipe_debug_callback *debug)
8054 {
8055         if (shader->key.as_es)
8056                 return true;
8057
8058         /* TES compiled as VS. */
8059         return si_get_vs_epilog(sscreen, tm, shader, debug,
8060                                 &shader->key.part.tes.epilog);
8061 }
8062
8063 /**
8064  * Compile the TCS epilog function. This writes tesselation factors to memory
8065  * based on the output primitive type of the tesselator (determined by TES).
8066  */
8067 static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
8068                                          union si_shader_part_key *key)
8069 {
8070         struct gallivm_state *gallivm = &ctx->gallivm;
8071         struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
8072         LLVMTypeRef params[16];
8073         LLVMValueRef func;
8074         int last_sgpr, num_params = 0;
8075
8076         /* Declare inputs. Only RW_BUFFERS and TESS_FACTOR_OFFSET are used. */
8077         params[ctx->param_rw_buffers = num_params++] =
8078                 const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
8079         params[ctx->param_const_buffers = num_params++] = ctx->i64;
8080         params[ctx->param_samplers = num_params++] = ctx->i64;
8081         params[ctx->param_images = num_params++] = ctx->i64;
8082         params[ctx->param_shader_buffers = num_params++] = ctx->i64;
8083         params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
8084         params[ctx->param_tcs_out_lds_offsets = num_params++] = ctx->i32;
8085         params[ctx->param_tcs_out_lds_layout = num_params++] = ctx->i32;
8086         params[ctx->param_vs_state_bits = num_params++] = ctx->i32;
8087         params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
8088         params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
8089         last_sgpr = num_params - 1;
8090
8091         params[num_params++] = ctx->i32; /* patch index within the wave (REL_PATCH_ID) */
8092         params[num_params++] = ctx->i32; /* invocation ID within the patch */
8093         params[num_params++] = ctx->i32; /* LDS offset where tess factors should be loaded from */
8094
8095         /* Create the function. */
8096         si_create_function(ctx, "tcs_epilog", NULL, 0, params, num_params, last_sgpr);
8097         declare_tess_lds(ctx);
8098         func = ctx->main_fn;
8099
8100         si_write_tess_factors(bld_base,
8101                               LLVMGetParam(func, last_sgpr + 1),
8102                               LLVMGetParam(func, last_sgpr + 2),
8103                               LLVMGetParam(func, last_sgpr + 3));
8104
8105         LLVMBuildRetVoid(gallivm->builder);
8106 }
8107
8108 /**
8109  * Select and compile (or reuse) TCS parts (epilog).
8110  */
8111 static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
8112                                        LLVMTargetMachineRef tm,
8113                                        struct si_shader *shader,
8114                                        struct pipe_debug_callback *debug)
8115 {
8116         if (sscreen->b.chip_class >= GFX9) {
8117                 struct si_shader *ls_main_part =
8118                         shader->key.part.tcs.ls->main_shader_part_ls;
8119
8120                 if (!si_get_vs_prolog(sscreen, tm, shader, debug, ls_main_part,
8121                                       &shader->key.part.tcs.ls_prolog))
8122                         return false;
8123
8124                 shader->previous_stage = ls_main_part;
8125         }
8126
8127         /* Get the epilog. */
8128         union si_shader_part_key epilog_key;
8129         memset(&epilog_key, 0, sizeof(epilog_key));
8130         epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
8131
8132         shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
8133                                             PIPE_SHADER_TESS_CTRL, false,
8134                                             &epilog_key, tm, debug,
8135                                             si_build_tcs_epilog_function,
8136                                             "Tessellation Control Shader Epilog");
8137         return shader->epilog != NULL;
8138 }
8139
8140 /**
8141  * Select and compile (or reuse) GS parts (prolog).
8142  */
8143 static bool si_shader_select_gs_parts(struct si_screen *sscreen,
8144                                       LLVMTargetMachineRef tm,
8145                                       struct si_shader *shader,
8146                                       struct pipe_debug_callback *debug)
8147 {
8148         union si_shader_part_key prolog_key;
8149
8150         if (!shader->key.part.gs.prolog.tri_strip_adj_fix)
8151                 return true;
8152
8153         memset(&prolog_key, 0, sizeof(prolog_key));
8154         prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
8155
8156         shader->prolog = si_get_shader_part(sscreen, &sscreen->gs_prologs,
8157                                             PIPE_SHADER_GEOMETRY, true,
8158                                             &prolog_key, tm, debug,
8159                                             si_build_gs_prolog_function,
8160                                             "Geometry Shader Prolog");
8161         return shader->prolog != NULL;
8162 }
8163
8164 /**
8165  * Build the pixel shader prolog function. This handles:
8166  * - two-side color selection and interpolation
8167  * - overriding interpolation parameters for the API PS
8168  * - polygon stippling
8169  *
8170  * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
8171  * overriden by other states. (e.g. per-sample interpolation)
8172  * Interpolated colors are stored after the preloaded VGPRs.
8173  */
8174 static void si_build_ps_prolog_function(struct si_shader_context *ctx,
8175                                         union si_shader_part_key *key)
8176 {
8177         struct gallivm_state *gallivm = &ctx->gallivm;
8178         LLVMTypeRef *params;
8179         LLVMValueRef ret, func;
8180         int last_sgpr, num_params, num_returns, i, num_color_channels;
8181
8182         assert(si_need_ps_prolog(key));
8183
8184         /* Number of inputs + 8 color elements. */
8185         params = alloca((key->ps_prolog.num_input_sgprs +
8186                          key->ps_prolog.num_input_vgprs + 8) *
8187                         sizeof(LLVMTypeRef));
8188
8189         /* Declare inputs. */
8190         num_params = 0;
8191         for (i = 0; i < key->ps_prolog.num_input_sgprs; i++)
8192                 params[num_params++] = ctx->i32;
8193         last_sgpr = num_params - 1;
8194
8195         for (i = 0; i < key->ps_prolog.num_input_vgprs; i++)
8196                 params[num_params++] = ctx->f32;
8197
8198         /* Declare outputs (same as inputs + add colors if needed) */
8199         num_returns = num_params;
8200         num_color_channels = util_bitcount(key->ps_prolog.colors_read);
8201         for (i = 0; i < num_color_channels; i++)
8202                 params[num_returns++] = ctx->f32;
8203
8204         /* Create the function. */
8205         si_create_function(ctx, "ps_prolog", params, num_returns, params,
8206                            num_params, last_sgpr);
8207         func = ctx->main_fn;
8208
8209         /* Copy inputs to outputs. This should be no-op, as the registers match,
8210          * but it will prevent the compiler from overwriting them unintentionally.
8211          */
8212         ret = ctx->return_value;
8213         for (i = 0; i < num_params; i++) {
8214                 LLVMValueRef p = LLVMGetParam(func, i);
8215                 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
8216         }
8217
8218         /* Polygon stippling. */
8219         if (key->ps_prolog.states.poly_stipple) {
8220                 /* POS_FIXED_PT is always last. */
8221                 unsigned pos = key->ps_prolog.num_input_sgprs +
8222                                key->ps_prolog.num_input_vgprs - 1;
8223                 LLVMValueRef ptr[2], list;
8224
8225                 /* Get the pointer to rw buffers. */
8226                 ptr[0] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS);
8227                 ptr[1] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS_HI);
8228                 list = lp_build_gather_values(gallivm, ptr, 2);
8229                 list = LLVMBuildBitCast(gallivm->builder, list, ctx->i64, "");
8230                 list = LLVMBuildIntToPtr(gallivm->builder, list,
8231                                           const_array(ctx->v16i8, SI_NUM_RW_BUFFERS), "");
8232
8233                 si_llvm_emit_polygon_stipple(ctx, list, pos);
8234         }
8235
8236         if (key->ps_prolog.states.bc_optimize_for_persp ||
8237             key->ps_prolog.states.bc_optimize_for_linear) {
8238                 unsigned i, base = key->ps_prolog.num_input_sgprs;
8239                 LLVMValueRef center[2], centroid[2], tmp, bc_optimize;
8240
8241                 /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
8242                  * The hw doesn't compute CENTROID if the whole wave only
8243                  * contains fully-covered quads.
8244                  *
8245                  * PRIM_MASK is after user SGPRs.
8246                  */
8247                 bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
8248                 bc_optimize = LLVMBuildLShr(gallivm->builder, bc_optimize,
8249                                             LLVMConstInt(ctx->i32, 31, 0), "");
8250                 bc_optimize = LLVMBuildTrunc(gallivm->builder, bc_optimize,
8251                                              ctx->i1, "");
8252
8253                 if (key->ps_prolog.states.bc_optimize_for_persp) {
8254                         /* Read PERSP_CENTER. */
8255                         for (i = 0; i < 2; i++)
8256                                 center[i] = LLVMGetParam(func, base + 2 + i);
8257                         /* Read PERSP_CENTROID. */
8258                         for (i = 0; i < 2; i++)
8259                                 centroid[i] = LLVMGetParam(func, base + 4 + i);
8260                         /* Select PERSP_CENTROID. */
8261                         for (i = 0; i < 2; i++) {
8262                                 tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
8263                                                       center[i], centroid[i], "");
8264                                 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8265                                                            tmp, base + 4 + i, "");
8266                         }
8267                 }
8268                 if (key->ps_prolog.states.bc_optimize_for_linear) {
8269                         /* Read LINEAR_CENTER. */
8270                         for (i = 0; i < 2; i++)
8271                                 center[i] = LLVMGetParam(func, base + 8 + i);
8272                         /* Read LINEAR_CENTROID. */
8273                         for (i = 0; i < 2; i++)
8274                                 centroid[i] = LLVMGetParam(func, base + 10 + i);
8275                         /* Select LINEAR_CENTROID. */
8276                         for (i = 0; i < 2; i++) {
8277                                 tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
8278                                                       center[i], centroid[i], "");
8279                                 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8280                                                            tmp, base + 10 + i, "");
8281                         }
8282                 }
8283         }
8284
8285         /* Force per-sample interpolation. */
8286         if (key->ps_prolog.states.force_persp_sample_interp) {
8287                 unsigned i, base = key->ps_prolog.num_input_sgprs;
8288                 LLVMValueRef persp_sample[2];
8289
8290                 /* Read PERSP_SAMPLE. */
8291                 for (i = 0; i < 2; i++)
8292                         persp_sample[i] = LLVMGetParam(func, base + i);
8293                 /* Overwrite PERSP_CENTER. */
8294                 for (i = 0; i < 2; i++)
8295                         ret = LLVMBuildInsertValue(gallivm->builder, ret,
8296                                                    persp_sample[i], base + 2 + i, "");
8297                 /* Overwrite PERSP_CENTROID. */
8298                 for (i = 0; i < 2; i++)
8299                         ret = LLVMBuildInsertValue(gallivm->builder, ret,
8300                                                    persp_sample[i], base + 4 + i, "");
8301         }
8302         if (key->ps_prolog.states.force_linear_sample_interp) {
8303                 unsigned i, base = key->ps_prolog.num_input_sgprs;
8304                 LLVMValueRef linear_sample[2];
8305
8306                 /* Read LINEAR_SAMPLE. */
8307                 for (i = 0; i < 2; i++)
8308                         linear_sample[i] = LLVMGetParam(func, base + 6 + i);
8309                 /* Overwrite LINEAR_CENTER. */
8310                 for (i = 0; i < 2; i++)
8311                         ret = LLVMBuildInsertValue(gallivm->builder, ret,
8312                                                    linear_sample[i], base + 8 + i, "");
8313                 /* Overwrite LINEAR_CENTROID. */
8314                 for (i = 0; i < 2; i++)
8315                         ret = LLVMBuildInsertValue(gallivm->builder, ret,
8316                                                    linear_sample[i], base + 10 + i, "");
8317         }
8318
8319         /* Force center interpolation. */
8320         if (key->ps_prolog.states.force_persp_center_interp) {
8321                 unsigned i, base = key->ps_prolog.num_input_sgprs;
8322                 LLVMValueRef persp_center[2];
8323
8324                 /* Read PERSP_CENTER. */
8325                 for (i = 0; i < 2; i++)
8326                         persp_center[i] = LLVMGetParam(func, base + 2 + i);
8327                 /* Overwrite PERSP_SAMPLE. */
8328                 for (i = 0; i < 2; i++)
8329                         ret = LLVMBuildInsertValue(gallivm->builder, ret,
8330                                                    persp_center[i], base + i, "");
8331                 /* Overwrite PERSP_CENTROID. */
8332                 for (i = 0; i < 2; i++)
8333                         ret = LLVMBuildInsertValue(gallivm->builder, ret,
8334                                                    persp_center[i], base + 4 + i, "");
8335         }
8336         if (key->ps_prolog.states.force_linear_center_interp) {
8337                 unsigned i, base = key->ps_prolog.num_input_sgprs;
8338                 LLVMValueRef linear_center[2];
8339
8340                 /* Read LINEAR_CENTER. */
8341                 for (i = 0; i < 2; i++)
8342                         linear_center[i] = LLVMGetParam(func, base + 8 + i);
8343                 /* Overwrite LINEAR_SAMPLE. */
8344                 for (i = 0; i < 2; i++)
8345                         ret = LLVMBuildInsertValue(gallivm->builder, ret,
8346                                                    linear_center[i], base + 6 + i, "");
8347                 /* Overwrite LINEAR_CENTROID. */
8348                 for (i = 0; i < 2; i++)
8349                         ret = LLVMBuildInsertValue(gallivm->builder, ret,
8350                                                    linear_center[i], base + 10 + i, "");
8351         }
8352
8353         /* Interpolate colors. */
8354         for (i = 0; i < 2; i++) {
8355                 unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
8356                 unsigned face_vgpr = key->ps_prolog.num_input_sgprs +
8357                                      key->ps_prolog.face_vgpr_index;
8358                 LLVMValueRef interp[2], color[4];
8359                 LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL;
8360
8361                 if (!writemask)
8362                         continue;
8363
8364                 /* If the interpolation qualifier is not CONSTANT (-1). */
8365                 if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
8366                         unsigned interp_vgpr = key->ps_prolog.num_input_sgprs +
8367                                                key->ps_prolog.color_interp_vgpr_index[i];
8368
8369                         /* Get the (i,j) updated by bc_optimize handling. */
8370                         interp[0] = LLVMBuildExtractValue(gallivm->builder, ret,
8371                                                           interp_vgpr, "");
8372                         interp[1] = LLVMBuildExtractValue(gallivm->builder, ret,
8373                                                           interp_vgpr + 1, "");
8374                         interp_ij = lp_build_gather_values(gallivm, interp, 2);
8375                 }
8376
8377                 /* Use the absolute location of the input. */
8378                 prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
8379
8380                 if (key->ps_prolog.states.color_two_side) {
8381                         face = LLVMGetParam(func, face_vgpr);
8382                         face = LLVMBuildBitCast(gallivm->builder, face, ctx->i32, "");
8383                 }
8384
8385                 interp_fs_input(ctx,
8386                                 key->ps_prolog.color_attr_index[i],
8387                                 TGSI_SEMANTIC_COLOR, i,
8388                                 key->ps_prolog.num_interp_inputs,
8389                                 key->ps_prolog.colors_read, interp_ij,
8390                                 prim_mask, face, color);
8391
8392                 while (writemask) {
8393                         unsigned chan = u_bit_scan(&writemask);
8394                         ret = LLVMBuildInsertValue(gallivm->builder, ret, color[chan],
8395                                                    num_params++, "");
8396                 }
8397         }
8398
8399         /* Tell LLVM to insert WQM instruction sequence when needed. */
8400         if (key->ps_prolog.wqm) {
8401                 LLVMAddTargetDependentFunctionAttr(func,
8402                                                    "amdgpu-ps-wqm-outputs", "");
8403         }
8404
8405         si_llvm_build_ret(ctx, ret);
8406 }
8407
8408 /**
8409  * Build the pixel shader epilog function. This handles everything that must be
8410  * emulated for pixel shader exports. (alpha-test, format conversions, etc)
8411  */
8412 static void si_build_ps_epilog_function(struct si_shader_context *ctx,
8413                                         union si_shader_part_key *key)
8414 {
8415         struct gallivm_state *gallivm = &ctx->gallivm;
8416         struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
8417         LLVMTypeRef params[16+8*4+3];
8418         LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
8419         int last_sgpr, num_params = 0, i;
8420         struct si_ps_exports exp = {};
8421
8422         /* Declare input SGPRs. */
8423         params[ctx->param_rw_buffers = num_params++] = ctx->i64;
8424         params[ctx->param_const_buffers = num_params++] = ctx->i64;
8425         params[ctx->param_samplers = num_params++] = ctx->i64;
8426         params[ctx->param_images = num_params++] = ctx->i64;
8427         params[ctx->param_shader_buffers = num_params++] = ctx->i64;
8428         assert(num_params == SI_PARAM_ALPHA_REF);
8429         params[SI_PARAM_ALPHA_REF] = ctx->f32;
8430         last_sgpr = SI_PARAM_ALPHA_REF;
8431
8432         /* Declare input VGPRs. */
8433         num_params = (last_sgpr + 1) +
8434                      util_bitcount(key->ps_epilog.colors_written) * 4 +
8435                      key->ps_epilog.writes_z +
8436                      key->ps_epilog.writes_stencil +
8437                      key->ps_epilog.writes_samplemask;
8438
8439         num_params = MAX2(num_params,
8440                           last_sgpr + 1 + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
8441
8442         assert(num_params <= ARRAY_SIZE(params));
8443
8444         for (i = last_sgpr + 1; i < num_params; i++)
8445                 params[i] = ctx->f32;
8446
8447         /* Create the function. */
8448         si_create_function(ctx, "ps_epilog", NULL, 0, params, num_params, last_sgpr);
8449         /* Disable elimination of unused inputs. */
8450         si_llvm_add_attribute(ctx->main_fn,
8451                                   "InitialPSInputAddr", 0xffffff);
8452
8453         /* Process colors. */
8454         unsigned vgpr = last_sgpr + 1;
8455         unsigned colors_written = key->ps_epilog.colors_written;
8456         int last_color_export = -1;
8457
8458         /* Find the last color export. */
8459         if (!key->ps_epilog.writes_z &&
8460             !key->ps_epilog.writes_stencil &&
8461             !key->ps_epilog.writes_samplemask) {
8462                 unsigned spi_format = key->ps_epilog.states.spi_shader_col_format;
8463
8464                 /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
8465                 if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
8466                         /* Just set this if any of the colorbuffers are enabled. */
8467                         if (spi_format &
8468                             ((1llu << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
8469                                 last_color_export = 0;
8470                 } else {
8471                         for (i = 0; i < 8; i++)
8472                                 if (colors_written & (1 << i) &&
8473                                     (spi_format >> (i * 4)) & 0xf)
8474                                         last_color_export = i;
8475                 }
8476         }
8477
8478         while (colors_written) {
8479                 LLVMValueRef color[4];
8480                 int mrt = u_bit_scan(&colors_written);
8481
8482                 for (i = 0; i < 4; i++)
8483                         color[i] = LLVMGetParam(ctx->main_fn, vgpr++);
8484
8485                 si_export_mrt_color(bld_base, color, mrt,
8486                                     num_params - 1,
8487                                     mrt == last_color_export, &exp);
8488         }
8489
8490         /* Process depth, stencil, samplemask. */
8491         if (key->ps_epilog.writes_z)
8492                 depth = LLVMGetParam(ctx->main_fn, vgpr++);
8493         if (key->ps_epilog.writes_stencil)
8494                 stencil = LLVMGetParam(ctx->main_fn, vgpr++);
8495         if (key->ps_epilog.writes_samplemask)
8496                 samplemask = LLVMGetParam(ctx->main_fn, vgpr++);
8497
8498         if (depth || stencil || samplemask)
8499                 si_export_mrt_z(bld_base, depth, stencil, samplemask, &exp);
8500         else if (last_color_export == -1)
8501                 si_export_null(bld_base);
8502
8503         if (exp.num)
8504                 si_emit_ps_exports(ctx, &exp);
8505
8506         /* Compile. */
8507         LLVMBuildRetVoid(gallivm->builder);
8508 }
8509
8510 /**
8511  * Select and compile (or reuse) pixel shader parts (prolog & epilog).
8512  */
8513 static bool si_shader_select_ps_parts(struct si_screen *sscreen,
8514                                       LLVMTargetMachineRef tm,
8515                                       struct si_shader *shader,
8516                                       struct pipe_debug_callback *debug)
8517 {
8518         union si_shader_part_key prolog_key;
8519         union si_shader_part_key epilog_key;
8520
8521         /* Get the prolog. */
8522         si_get_ps_prolog_key(shader, &prolog_key, true);
8523
8524         /* The prolog is a no-op if these aren't set. */
8525         if (si_need_ps_prolog(&prolog_key)) {
8526                 shader->prolog =
8527                         si_get_shader_part(sscreen, &sscreen->ps_prologs,
8528                                            PIPE_SHADER_FRAGMENT, true,
8529                                            &prolog_key, tm, debug,
8530                                            si_build_ps_prolog_function,
8531                                            "Fragment Shader Prolog");
8532                 if (!shader->prolog)
8533                         return false;
8534         }
8535
8536         /* Get the epilog. */
8537         si_get_ps_epilog_key(shader, &epilog_key);
8538
8539         shader->epilog =
8540                 si_get_shader_part(sscreen, &sscreen->ps_epilogs,
8541                                    PIPE_SHADER_FRAGMENT, false,
8542                                    &epilog_key, tm, debug,
8543                                    si_build_ps_epilog_function,
8544                                    "Fragment Shader Epilog");
8545         if (!shader->epilog)
8546                 return false;
8547
8548         /* Enable POS_FIXED_PT if polygon stippling is enabled. */
8549         if (shader->key.part.ps.prolog.poly_stipple) {
8550                 shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
8551                 assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
8552         }
8553
8554         /* Set up the enable bits for per-sample shading if needed. */
8555         if (shader->key.part.ps.prolog.force_persp_sample_interp &&
8556             (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
8557              G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
8558                 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
8559                 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
8560                 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
8561         }
8562         if (shader->key.part.ps.prolog.force_linear_sample_interp &&
8563             (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
8564              G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
8565                 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
8566                 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
8567                 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
8568         }
8569         if (shader->key.part.ps.prolog.force_persp_center_interp &&
8570             (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
8571              G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
8572                 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
8573                 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
8574                 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
8575         }
8576         if (shader->key.part.ps.prolog.force_linear_center_interp &&
8577             (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
8578              G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
8579                 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
8580                 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
8581                 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
8582         }
8583
8584         /* POW_W_FLOAT requires that one of the perspective weights is enabled. */
8585         if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
8586             !(shader->config.spi_ps_input_ena & 0xf)) {
8587                 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
8588                 assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
8589         }
8590
8591         /* At least one pair of interpolation weights must be enabled. */
8592         if (!(shader->config.spi_ps_input_ena & 0x7f)) {
8593                 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
8594                 assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
8595         }
8596
8597         /* The sample mask input is always enabled, because the API shader always
8598          * passes it through to the epilog. Disable it here if it's unused.
8599          */
8600         if (!shader->key.part.ps.epilog.poly_line_smoothing &&
8601             !shader->selector->info.reads_samplemask)
8602                 shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;
8603
8604         return true;
8605 }
8606
8607 void si_multiwave_lds_size_workaround(struct si_screen *sscreen,
8608                                       unsigned *lds_size)
8609 {
8610         /* SPI barrier management bug:
8611          *   Make sure we have at least 4k of LDS in use to avoid the bug.
8612          *   It applies to workgroup sizes of more than one wavefront.
8613          */
8614         if (sscreen->b.family == CHIP_BONAIRE ||
8615             sscreen->b.family == CHIP_KABINI ||
8616             sscreen->b.family == CHIP_MULLINS)
8617                 *lds_size = MAX2(*lds_size, 8);
8618 }
8619
8620 static void si_fix_resource_usage(struct si_screen *sscreen,
8621                                   struct si_shader *shader)
8622 {
8623         unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
8624
8625         shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
8626
8627         if (shader->selector->type == PIPE_SHADER_COMPUTE &&
8628             si_get_max_workgroup_size(shader) > 64) {
8629                 si_multiwave_lds_size_workaround(sscreen,
8630                                                  &shader->config.lds_size);
8631         }
8632 }
8633
8634 int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
8635                      struct si_shader *shader,
8636                      struct pipe_debug_callback *debug)
8637 {
8638         struct si_shader_selector *sel = shader->selector;
8639         struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key);
8640         int r;
8641
8642         /* LS, ES, VS are compiled on demand if the main part hasn't been
8643          * compiled for that stage.
8644          *
8645          * Vertex shaders are compiled on demand when a vertex fetch
8646          * workaround must be applied.
8647          */
8648         if (shader->is_monolithic) {
8649                 /* Monolithic shader (compiled as a whole, has many variants,
8650                  * may take a long time to compile).
8651                  */
8652                 r = si_compile_tgsi_shader(sscreen, tm, shader, true, debug);
8653                 if (r)
8654                         return r;
8655         } else {
8656                 /* The shader consists of 2-3 parts:
8657                  *
8658                  * - the middle part is the user shader, it has 1 variant only
8659                  *   and it was compiled during the creation of the shader
8660                  *   selector
8661                  * - the prolog part is inserted at the beginning
8662                  * - the epilog part is inserted at the end
8663                  *
8664                  * The prolog and epilog have many (but simple) variants.
8665                  */
8666
8667                 /* Copy the compiled TGSI shader data over. */
8668                 shader->is_binary_shared = true;
8669                 shader->binary = mainp->binary;
8670                 shader->config = mainp->config;
8671                 shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
8672                 shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
8673                 shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
8674                 memcpy(shader->info.vs_output_param_offset,
8675                        mainp->info.vs_output_param_offset,
8676                        sizeof(mainp->info.vs_output_param_offset));
8677                 shader->info.uses_instanceid = mainp->info.uses_instanceid;
8678                 shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
8679                 shader->info.nr_param_exports = mainp->info.nr_param_exports;
8680
8681                 /* Select prologs and/or epilogs. */
8682                 switch (sel->type) {
8683                 case PIPE_SHADER_VERTEX:
8684                         if (!si_shader_select_vs_parts(sscreen, tm, shader, debug))
8685                                 return -1;
8686                         break;
8687                 case PIPE_SHADER_TESS_CTRL:
8688                         if (!si_shader_select_tcs_parts(sscreen, tm, shader, debug))
8689                                 return -1;
8690                         break;
8691                 case PIPE_SHADER_TESS_EVAL:
8692                         if (!si_shader_select_tes_parts(sscreen, tm, shader, debug))
8693                                 return -1;
8694                         break;
8695                 case PIPE_SHADER_GEOMETRY:
8696                         if (!si_shader_select_gs_parts(sscreen, tm, shader, debug))
8697                                 return -1;
8698                         break;
8699                 case PIPE_SHADER_FRAGMENT:
8700                         if (!si_shader_select_ps_parts(sscreen, tm, shader, debug))
8701                                 return -1;
8702
8703                         /* Make sure we have at least as many VGPRs as there
8704                          * are allocated inputs.
8705                          */
8706                         shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
8707                                                         shader->info.num_input_vgprs);
8708                         break;
8709                 }
8710
8711                 /* Update SGPR and VGPR counts. */
8712                 if (shader->prolog) {
8713                         shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
8714                                                         shader->prolog->config.num_sgprs);
8715                         shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
8716                                                         shader->prolog->config.num_vgprs);
8717                 }
8718                 if (shader->previous_stage) {
8719                         shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
8720                                                         shader->previous_stage->config.num_sgprs);
8721                         shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
8722                                                         shader->previous_stage->config.num_vgprs);
8723                         shader->config.spilled_sgprs =
8724                                 MAX2(shader->config.spilled_sgprs,
8725                                      shader->previous_stage->config.spilled_sgprs);
8726                         shader->config.spilled_vgprs =
8727                                 MAX2(shader->config.spilled_vgprs,
8728                                      shader->previous_stage->config.spilled_vgprs);
8729                         shader->config.private_mem_vgprs =
8730                                 MAX2(shader->config.private_mem_vgprs,
8731                                      shader->previous_stage->config.private_mem_vgprs);
8732                         shader->config.scratch_bytes_per_wave =
8733                                 MAX2(shader->config.scratch_bytes_per_wave,
8734                                      shader->previous_stage->config.scratch_bytes_per_wave);
8735                         shader->info.uses_instanceid |=
8736                                 shader->previous_stage->info.uses_instanceid;
8737                 }
8738                 if (shader->epilog) {
8739                         shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
8740                                                         shader->epilog->config.num_sgprs);
8741                         shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
8742                                                         shader->epilog->config.num_vgprs);
8743                 }
8744         }
8745
8746         si_fix_resource_usage(sscreen, shader);
8747         si_shader_dump(sscreen, shader, debug, sel->info.processor,
8748                        stderr, true);
8749
8750         /* Upload. */
8751         r = si_shader_binary_upload(sscreen, shader);
8752         if (r) {
8753                 fprintf(stderr, "LLVM failed to upload shader\n");
8754                 return r;
8755         }
8756
8757         return 0;
8758 }
8759
8760 void si_shader_destroy(struct si_shader *shader)
8761 {
8762         if (shader->scratch_bo)
8763                 r600_resource_reference(&shader->scratch_bo, NULL);
8764
8765         r600_resource_reference(&shader->bo, NULL);
8766
8767         if (!shader->is_binary_shared)
8768                 radeon_shader_binary_clean(&shader->binary);
8769
8770         free(shader->shader_log);
8771 }