src/gallium/drivers/radeonsi/si_shader.c

   1 /*
   2  * Copyright 2012 Advanced Micro Devices, Inc.
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * on the rights to use, copy, modify, merge, publish, distribute, sub
   8  * license, and/or sell copies of the Software, and to permit persons to whom
   9  * the Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  18  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
  19  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  20  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  21  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  22  *
  23  * Authors:
  24  *      Tom Stellard <thomas.stellard@amd.com>
  25  *      Michel Dänzer <michel.daenzer@amd.com>
  26  *      Christian König <christian.koenig@amd.com>
  27  */
  28
  29 #include "gallivm/lp_bld_const.h"
  30 #include "gallivm/lp_bld_gather.h"
  31 #include "gallivm/lp_bld_intr.h"
  32 #include "gallivm/lp_bld_logic.h"
  33 #include "gallivm/lp_bld_arit.h"
  34 #include "gallivm/lp_bld_flow.h"
  35 #include "gallivm/lp_bld_misc.h"
  36 #include "util/u_memory.h"
  37 #include "util/u_string.h"
  38 #include "tgsi/tgsi_build.h"
  39 #include "tgsi/tgsi_util.h"
  40 #include "tgsi/tgsi_dump.h"
  41
  42 #include "ac_binary.h"
  43 #include "ac_llvm_util.h"
  44 #include "ac_exp_param.h"
  45 #include "si_shader_internal.h"
  46 #include "si_pipe.h"
  47 #include "sid.h"
  48
  49
  50 static const char *scratch_rsrc_dword0_symbol =
  51         "SCRATCH_RSRC_DWORD0";
  52
  53 static const char *scratch_rsrc_dword1_symbol =
  54         "SCRATCH_RSRC_DWORD1";
  55
  56 struct si_shader_output_values
  57 {
  58         LLVMValueRef values[4];
  59         unsigned semantic_name;
  60         unsigned semantic_index;
  61         ubyte vertex_stream[4];
  62 };
  63
  64 static void si_init_shader_ctx(struct si_shader_context *ctx,
  65                                struct si_screen *sscreen,
  66                                LLVMTargetMachineRef tm);
  67
  68 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
  69                                  struct lp_build_tgsi_context *bld_base,
  70                                  struct lp_build_emit_data *emit_data);
  71
  72 static void si_dump_shader_key(unsigned processor, struct si_shader *shader,
  73                                FILE *f);
  74
  75 static unsigned llvm_get_type_size(LLVMTypeRef type);
  76
  77 static void si_build_vs_prolog_function(struct si_shader_context *ctx,
  78                                         union si_shader_part_key *key);
  79 static void si_build_vs_epilog_function(struct si_shader_context *ctx,
  80                                         union si_shader_part_key *key);
  81 static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
  82                                          union si_shader_part_key *key);
  83 static void si_build_ps_prolog_function(struct si_shader_context *ctx,
  84                                         union si_shader_part_key *key);
  85 static void si_build_ps_epilog_function(struct si_shader_context *ctx,
  86                                         union si_shader_part_key *key);
  87
  88 /* Ideally pass the sample mask input to the PS epilog as v13, which
  89  * is its usual location, so that the shader doesn't have to add v_mov.
  90  */
  91 #define PS_EPILOG_SAMPLEMASK_MIN_LOC 13
  92
  93 /* The VS location of the PrimitiveID input is the same in the epilog,
  94  * so that the main shader part doesn't have to move it.
  95  */
  96 #define VS_EPILOG_PRIMID_LOC 2
  97
  98 enum {
  99         CONST_ADDR_SPACE = 2,
 100         LOCAL_ADDR_SPACE = 3,
 101 };
 102
 103 /**
 104  * Returns a unique index for a semantic name and index. The index must be
 105  * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
 106  * calculated.
 107  */
 108 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
 109 {
 110         switch (semantic_name) {
 111         case TGSI_SEMANTIC_POSITION:
 112                 return 0;
 113         case TGSI_SEMANTIC_PSIZE:
 114                 return 1;
 115         case TGSI_SEMANTIC_CLIPDIST:
 116                 assert(index <= 1);
 117                 return 2 + index;
 118         case TGSI_SEMANTIC_GENERIC:
 119                 if (index <= 63-4)
 120                         return 4 + index;
 121
 122                 assert(!"invalid generic index");
 123                 return 0;
 124
 125         /* patch indices are completely separate and thus start from 0 */
 126         case TGSI_SEMANTIC_TESSOUTER:
 127                 return 0;
 128         case TGSI_SEMANTIC_TESSINNER:
 129                 return 1;
 130         case TGSI_SEMANTIC_PATCH:
 131                 return 2 + index;
 132
 133         default:
 134                 assert(!"invalid semantic name");
 135                 return 0;
 136         }
 137 }
 138
 139 unsigned si_shader_io_get_unique_index2(unsigned name, unsigned index)
 140 {
 141         switch (name) {
 142         case TGSI_SEMANTIC_FOG:
 143                 return 0;
 144         case TGSI_SEMANTIC_LAYER:
 145                 return 1;
 146         case TGSI_SEMANTIC_VIEWPORT_INDEX:
 147                 return 2;
 148         case TGSI_SEMANTIC_PRIMID:
 149                 return 3;
 150         case TGSI_SEMANTIC_COLOR: /* these alias */
 151         case TGSI_SEMANTIC_BCOLOR:
 152                 return 4 + index;
 153         case TGSI_SEMANTIC_TEXCOORD:
 154                 return 6 + index;
 155         default:
 156                 assert(!"invalid semantic name");
 157                 return 0;
 158         }
 159 }
 160
 161 /**
 162  * Get the value of a shader input parameter and extract a bitfield.
 163  */
 164 static LLVMValueRef unpack_param(struct si_shader_context *ctx,
 165                                  unsigned param, unsigned rshift,
 166                                  unsigned bitwidth)
 167 {
 168         struct gallivm_state *gallivm = &ctx->gallivm;
 169         LLVMValueRef value = LLVMGetParam(ctx->main_fn,
 170                                           param);
 171
 172         if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
 173                 value = bitcast(&ctx->bld_base,
 174                                 TGSI_TYPE_UNSIGNED, value);
 175
 176         if (rshift)
 177                 value = LLVMBuildLShr(gallivm->builder, value,
 178                                       LLVMConstInt(ctx->i32, rshift, 0), "");
 179
 180         if (rshift + bitwidth < 32) {
 181                 unsigned mask = (1 << bitwidth) - 1;
 182                 value = LLVMBuildAnd(gallivm->builder, value,
 183                                      LLVMConstInt(ctx->i32, mask, 0), "");
 184         }
 185
 186         return value;
 187 }
 188
 189 static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx)
 190 {
 191         switch (ctx->type) {
 192         case PIPE_SHADER_TESS_CTRL:
 193                 return unpack_param(ctx, ctx->param_tcs_rel_ids, 0, 8);
 194
 195         case PIPE_SHADER_TESS_EVAL:
 196                 return LLVMGetParam(ctx->main_fn,
 197                                     ctx->param_tes_rel_patch_id);
 198
 199         default:
 200                 assert(0);
 201                 return NULL;
 202         }
 203 }
 204
 205 /* Tessellation shaders pass outputs to the next shader using LDS.
 206  *
 207  * LS outputs = TCS inputs
 208  * TCS outputs = TES inputs
 209  *
 210  * The LDS layout is:
 211  * - TCS inputs for patch 0
 212  * - TCS inputs for patch 1
 213  * - TCS inputs for patch 2             = get_tcs_in_current_patch_offset (if RelPatchID==2)
 214  * - ...
 215  * - TCS outputs for patch 0            = get_tcs_out_patch0_offset
 216  * - Per-patch TCS outputs for patch 0  = get_tcs_out_patch0_patch_data_offset
 217  * - TCS outputs for patch 1
 218  * - Per-patch TCS outputs for patch 1
 219  * - TCS outputs for patch 2            = get_tcs_out_current_patch_offset (if RelPatchID==2)
 220  * - Per-patch TCS outputs for patch 2  = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
 221  * - ...
 222  *
 223  * All three shaders VS(LS), TCS, TES share the same LDS space.
 224  */
 225
 226 static LLVMValueRef
 227 get_tcs_in_patch_stride(struct si_shader_context *ctx)
 228 {
 229         return unpack_param(ctx, ctx->param_vs_state_bits, 8, 13);
 230 }
 231
 232 static LLVMValueRef
 233 get_tcs_out_patch_stride(struct si_shader_context *ctx)
 234 {
 235         return unpack_param(ctx, ctx->param_tcs_out_lds_layout, 0, 13);
 236 }
 237
 238 static LLVMValueRef
 239 get_tcs_out_patch0_offset(struct si_shader_context *ctx)
 240 {
 241         return lp_build_mul_imm(&ctx->bld_base.uint_bld,
 242                                 unpack_param(ctx,
 243                                              ctx->param_tcs_out_lds_offsets,
 244                                              0, 16),
 245                                 4);
 246 }
 247
 248 static LLVMValueRef
 249 get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
 250 {
 251         return lp_build_mul_imm(&ctx->bld_base.uint_bld,
 252                                 unpack_param(ctx,
 253                                              ctx->param_tcs_out_lds_offsets,
 254                                              16, 16),
 255                                 4);
 256 }
 257
 258 static LLVMValueRef
 259 get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
 260 {
 261         struct gallivm_state *gallivm = &ctx->gallivm;
 262         LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
 263         LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
 264
 265         return LLVMBuildMul(gallivm->builder, patch_stride, rel_patch_id, "");
 266 }
 267
 268 static LLVMValueRef
 269 get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
 270 {
 271         struct gallivm_state *gallivm = &ctx->gallivm;
 272         LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
 273         LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
 274         LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
 275
 276         return LLVMBuildAdd(gallivm->builder, patch0_offset,
 277                             LLVMBuildMul(gallivm->builder, patch_stride,
 278                                          rel_patch_id, ""),
 279                             "");
 280 }
 281
 282 static LLVMValueRef
 283 get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
 284 {
 285         struct gallivm_state *gallivm = &ctx->gallivm;
 286         LLVMValueRef patch0_patch_data_offset =
 287                 get_tcs_out_patch0_patch_data_offset(ctx);
 288         LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
 289         LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
 290
 291         return LLVMBuildAdd(gallivm->builder, patch0_patch_data_offset,
 292                             LLVMBuildMul(gallivm->builder, patch_stride,
 293                                          rel_patch_id, ""),
 294                             "");
 295 }
 296
 297 static LLVMValueRef get_instance_index_for_fetch(
 298         struct si_shader_context *ctx,
 299         unsigned param_start_instance, unsigned divisor)
 300 {
 301         struct gallivm_state *gallivm = &ctx->gallivm;
 302
 303         LLVMValueRef result = LLVMGetParam(ctx->main_fn,
 304                                            ctx->param_instance_id);
 305
 306         /* The division must be done before START_INSTANCE is added. */
 307         if (divisor > 1)
 308                 result = LLVMBuildUDiv(gallivm->builder, result,
 309                                 LLVMConstInt(ctx->i32, divisor, 0), "");
 310
 311         return LLVMBuildAdd(gallivm->builder, result,
 312                             LLVMGetParam(ctx->main_fn, param_start_instance), "");
 313 }
 314
 315 /* Bitcast <4 x float> to <2 x double>, extract the component, and convert
 316  * to float. */
 317 static LLVMValueRef extract_double_to_float(struct si_shader_context *ctx,
 318                                             LLVMValueRef vec4,
 319                                             unsigned double_index)
 320 {
 321         LLVMBuilderRef builder = ctx->gallivm.builder;
 322         LLVMTypeRef f64 = LLVMDoubleTypeInContext(ctx->gallivm.context);
 323         LLVMValueRef dvec2 = LLVMBuildBitCast(builder, vec4,
 324                                               LLVMVectorType(f64, 2), "");
 325         LLVMValueRef index = LLVMConstInt(ctx->i32, double_index, 0);
 326         LLVMValueRef value = LLVMBuildExtractElement(builder, dvec2, index, "");
 327         return LLVMBuildFPTrunc(builder, value, ctx->f32, "");
 328 }
 329
 330 static void declare_input_vs(
 331         struct si_shader_context *ctx,
 332         unsigned input_index,
 333         const struct tgsi_full_declaration *decl,
 334         LLVMValueRef out[4])
 335 {
 336         struct gallivm_state *gallivm = &ctx->gallivm;
 337
 338         unsigned chan;
 339         unsigned fix_fetch;
 340         unsigned num_fetches;
 341         unsigned fetch_stride;
 342
 343         LLVMValueRef t_list_ptr;
 344         LLVMValueRef t_offset;
 345         LLVMValueRef t_list;
 346         LLVMValueRef vertex_index;
 347         LLVMValueRef input[3];
 348
 349         /* Load the T list */
 350         t_list_ptr = LLVMGetParam(ctx->main_fn, ctx->param_vertex_buffers);
 351
 352         t_offset = LLVMConstInt(ctx->i32, input_index, 0);
 353
 354         t_list = ac_build_indexed_load_const(&ctx->ac, t_list_ptr, t_offset);
 355
 356         vertex_index = LLVMGetParam(ctx->main_fn,
 357                                     ctx->param_vertex_index0 +
 358                                     input_index);
 359
 360         fix_fetch = ctx->shader->key.mono.vs_fix_fetch[input_index];
 361
 362         /* Do multiple loads for special formats. */
 363         switch (fix_fetch) {
 364         case SI_FIX_FETCH_RGB_64_FLOAT:
 365                 num_fetches = 3; /* 3 2-dword loads */
 366                 fetch_stride = 8;
 367                 break;
 368         case SI_FIX_FETCH_RGBA_64_FLOAT:
 369                 num_fetches = 2; /* 2 4-dword loads */
 370                 fetch_stride = 16;
 371                 break;
 372         case SI_FIX_FETCH_RGB_8:
 373         case SI_FIX_FETCH_RGB_8_INT:
 374                 num_fetches = 3;
 375                 fetch_stride = 1;
 376                 break;
 377         case SI_FIX_FETCH_RGB_16:
 378         case SI_FIX_FETCH_RGB_16_INT:
 379                 num_fetches = 3;
 380                 fetch_stride = 2;
 381                 break;
 382         default:
 383                 num_fetches = 1;
 384                 fetch_stride = 0;
 385         }
 386
 387         for (unsigned i = 0; i < num_fetches; i++) {
 388                 LLVMValueRef voffset = LLVMConstInt(ctx->i32, fetch_stride * i, 0);
 389
 390                 input[i] = ac_build_buffer_load_format(&ctx->ac, t_list,
 391                                                        vertex_index, voffset,
 392                                                        true);
 393         }
 394
 395         /* Break up the vec4 into individual components */
 396         for (chan = 0; chan < 4; chan++) {
 397                 LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);
 398                 out[chan] = LLVMBuildExtractElement(gallivm->builder,
 399                                                     input[0], llvm_chan, "");
 400         }
 401
 402         switch (fix_fetch) {
 403         case SI_FIX_FETCH_A2_SNORM:
 404         case SI_FIX_FETCH_A2_SSCALED:
 405         case SI_FIX_FETCH_A2_SINT: {
 406                 /* The hardware returns an unsigned value; convert it to a
 407                  * signed one.
 408                  */
 409                 LLVMValueRef tmp = out[3];
 410                 LLVMValueRef c30 = LLVMConstInt(ctx->i32, 30, 0);
 411
 412                 /* First, recover the sign-extended signed integer value. */
 413                 if (fix_fetch == SI_FIX_FETCH_A2_SSCALED)
 414                         tmp = LLVMBuildFPToUI(gallivm->builder, tmp, ctx->i32, "");
 415                 else
 416                         tmp = LLVMBuildBitCast(gallivm->builder, tmp, ctx->i32, "");
 417
 418                 /* For the integer-like cases, do a natural sign extension.
 419                  *
 420                  * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
 421                  * and happen to contain 0, 1, 2, 3 as the two LSBs of the
 422                  * exponent.
 423                  */
 424                 tmp = LLVMBuildShl(gallivm->builder, tmp,
 425                                    fix_fetch == SI_FIX_FETCH_A2_SNORM ?
 426                                    LLVMConstInt(ctx->i32, 7, 0) : c30, "");
 427                 tmp = LLVMBuildAShr(gallivm->builder, tmp, c30, "");
 428
 429                 /* Convert back to the right type. */
 430                 if (fix_fetch == SI_FIX_FETCH_A2_SNORM) {
 431                         LLVMValueRef clamp;
 432                         LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
 433                         tmp = LLVMBuildSIToFP(gallivm->builder, tmp, ctx->f32, "");
 434                         clamp = LLVMBuildFCmp(gallivm->builder, LLVMRealULT, tmp, neg_one, "");
 435                         tmp = LLVMBuildSelect(gallivm->builder, clamp, neg_one, tmp, "");
 436                 } else if (fix_fetch == SI_FIX_FETCH_A2_SSCALED) {
 437                         tmp = LLVMBuildSIToFP(gallivm->builder, tmp, ctx->f32, "");
 438                 }
 439
 440                 out[3] = tmp;
 441                 break;
 442         }
 443         case SI_FIX_FETCH_RGBA_32_UNORM:
 444         case SI_FIX_FETCH_RGBX_32_UNORM:
 445                 for (chan = 0; chan < 4; chan++) {
 446                         out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
 447                                                      ctx->i32, "");
 448                         out[chan] = LLVMBuildUIToFP(gallivm->builder,
 449                                                     out[chan], ctx->f32, "");
 450                         out[chan] = LLVMBuildFMul(gallivm->builder, out[chan],
 451                                                   LLVMConstReal(ctx->f32, 1.0 / UINT_MAX), "");
 452                 }
 453                 /* RGBX UINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
 454                 if (fix_fetch == SI_FIX_FETCH_RGBX_32_UNORM)
 455                         out[3] = LLVMConstReal(ctx->f32, 1);
 456                 break;
 457         case SI_FIX_FETCH_RGBA_32_SNORM:
 458         case SI_FIX_FETCH_RGBX_32_SNORM:
 459         case SI_FIX_FETCH_RGBA_32_FIXED:
 460         case SI_FIX_FETCH_RGBX_32_FIXED: {
 461                 double scale;
 462                 if (fix_fetch >= SI_FIX_FETCH_RGBA_32_FIXED)
 463                         scale = 1.0 / 0x10000;
 464                 else
 465                         scale = 1.0 / INT_MAX;
 466
 467                 for (chan = 0; chan < 4; chan++) {
 468                         out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
 469                                                      ctx->i32, "");
 470                         out[chan] = LLVMBuildSIToFP(gallivm->builder,
 471                                                     out[chan], ctx->f32, "");
 472                         out[chan] = LLVMBuildFMul(gallivm->builder, out[chan],
 473                                                   LLVMConstReal(ctx->f32, scale), "");
 474                 }
 475                 /* RGBX SINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
 476                 if (fix_fetch == SI_FIX_FETCH_RGBX_32_SNORM ||
 477                     fix_fetch == SI_FIX_FETCH_RGBX_32_FIXED)
 478                         out[3] = LLVMConstReal(ctx->f32, 1);
 479                 break;
 480         }
 481         case SI_FIX_FETCH_RGBA_32_USCALED:
 482                 for (chan = 0; chan < 4; chan++) {
 483                         out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
 484                                                      ctx->i32, "");
 485                         out[chan] = LLVMBuildUIToFP(gallivm->builder,
 486                                                     out[chan], ctx->f32, "");
 487                 }
 488                 break;
 489         case SI_FIX_FETCH_RGBA_32_SSCALED:
 490                 for (chan = 0; chan < 4; chan++) {
 491                         out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
 492                                                      ctx->i32, "");
 493                         out[chan] = LLVMBuildSIToFP(gallivm->builder,
 494                                                     out[chan], ctx->f32, "");
 495                 }
 496                 break;
 497         case SI_FIX_FETCH_RG_64_FLOAT:
 498                 for (chan = 0; chan < 2; chan++)
 499                         out[chan] = extract_double_to_float(ctx, input[0], chan);
 500
 501                 out[2] = LLVMConstReal(ctx->f32, 0);
 502                 out[3] = LLVMConstReal(ctx->f32, 1);
 503                 break;
 504         case SI_FIX_FETCH_RGB_64_FLOAT:
 505                 for (chan = 0; chan < 3; chan++)
 506                         out[chan] = extract_double_to_float(ctx, input[chan], 0);
 507
 508                 out[3] = LLVMConstReal(ctx->f32, 1);
 509                 break;
 510         case SI_FIX_FETCH_RGBA_64_FLOAT:
 511                 for (chan = 0; chan < 4; chan++) {
 512                         out[chan] = extract_double_to_float(ctx, input[chan / 2],
 513                                                             chan % 2);
 514                 }
 515                 break;
 516         case SI_FIX_FETCH_RGB_8:
 517         case SI_FIX_FETCH_RGB_8_INT:
 518         case SI_FIX_FETCH_RGB_16:
 519         case SI_FIX_FETCH_RGB_16_INT:
 520                 for (chan = 0; chan < 3; chan++) {
 521                         out[chan] = LLVMBuildExtractElement(gallivm->builder,
 522                                                             input[chan],
 523                                                             ctx->i32_0, "");
 524                 }
 525                 if (fix_fetch == SI_FIX_FETCH_RGB_8 ||
 526                     fix_fetch == SI_FIX_FETCH_RGB_16) {
 527                         out[3] = LLVMConstReal(ctx->f32, 1);
 528                 } else {
 529                         out[3] = LLVMBuildBitCast(gallivm->builder, ctx->i32_1,
 530                                                   ctx->f32, "");
 531                 }
 532                 break;
 533         }
 534 }
 535
 536 static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base,
 537                                      unsigned swizzle)
 538 {
 539         struct si_shader_context *ctx = si_shader_context(bld_base);
 540
 541         if (swizzle > 0)
 542                 return ctx->i32_0;
 543
 544         switch (ctx->type) {
 545         case PIPE_SHADER_VERTEX:
 546                 return LLVMGetParam(ctx->main_fn,
 547                                     ctx->param_vs_prim_id);
 548         case PIPE_SHADER_TESS_CTRL:
 549                 return LLVMGetParam(ctx->main_fn,
 550                                     ctx->param_tcs_patch_id);
 551         case PIPE_SHADER_TESS_EVAL:
 552                 return LLVMGetParam(ctx->main_fn,
 553                                     ctx->param_tes_patch_id);
 554         case PIPE_SHADER_GEOMETRY:
 555                 return LLVMGetParam(ctx->main_fn,
 556                                     ctx->param_gs_prim_id);
 557         default:
 558                 assert(0);
 559                 return ctx->i32_0;
 560         }
 561 }
 562
 563 /**
 564  * Return the value of tgsi_ind_register for indexing.
 565  * This is the indirect index with the constant offset added to it.
 566  */
 567 static LLVMValueRef get_indirect_index(struct si_shader_context *ctx,
 568                                        const struct tgsi_ind_register *ind,
 569                                        int rel_index)
 570 {
 571         struct gallivm_state *gallivm = &ctx->gallivm;
 572         LLVMValueRef result;
 573
 574         result = ctx->addrs[ind->Index][ind->Swizzle];
 575         result = LLVMBuildLoad(gallivm->builder, result, "");
 576         result = LLVMBuildAdd(gallivm->builder, result,
 577                               LLVMConstInt(ctx->i32, rel_index, 0), "");
 578         return result;
 579 }
 580
 581 /**
 582  * Like get_indirect_index, but restricts the return value to a (possibly
 583  * undefined) value inside [0..num).
 584  */
 585 static LLVMValueRef get_bounded_indirect_index(struct si_shader_context *ctx,
 586                                                const struct tgsi_ind_register *ind,
 587                                                int rel_index, unsigned num)
 588 {
 589         LLVMValueRef result = get_indirect_index(ctx, ind, rel_index);
 590
 591         /* LLVM 3.8: If indirect resource indexing is used:
 592          * - SI & CIK hang
 593          * - VI crashes
 594          */
 595         if (HAVE_LLVM == 0x0308)
 596                 return LLVMGetUndef(ctx->i32);
 597
 598         return si_llvm_bound_index(ctx, result, num);
 599 }
 600
 601
 602 /**
 603  * Calculate a dword address given an input or output register and a stride.
 604  */
 605 static LLVMValueRef get_dw_address(struct si_shader_context *ctx,
 606                                    const struct tgsi_full_dst_register *dst,
 607                                    const struct tgsi_full_src_register *src,
 608                                    LLVMValueRef vertex_dw_stride,
 609                                    LLVMValueRef base_addr)
 610 {
 611         struct gallivm_state *gallivm = &ctx->gallivm;
 612         struct tgsi_shader_info *info = &ctx->shader->selector->info;
 613         ubyte *name, *index, *array_first;
 614         int first, param;
 615         struct tgsi_full_dst_register reg;
 616
 617         /* Set the register description. The address computation is the same
 618          * for sources and destinations. */
 619         if (src) {
 620                 reg.Register.File = src->Register.File;
 621                 reg.Register.Index = src->Register.Index;
 622                 reg.Register.Indirect = src->Register.Indirect;
 623                 reg.Register.Dimension = src->Register.Dimension;
 624                 reg.Indirect = src->Indirect;
 625                 reg.Dimension = src->Dimension;
 626                 reg.DimIndirect = src->DimIndirect;
 627         } else
 628                 reg = *dst;
 629
 630         /* If the register is 2-dimensional (e.g. an array of vertices
 631          * in a primitive), calculate the base address of the vertex. */
 632         if (reg.Register.Dimension) {
 633                 LLVMValueRef index;
 634
 635                 if (reg.Dimension.Indirect)
 636                         index = get_indirect_index(ctx, &reg.DimIndirect,
 637                                                    reg.Dimension.Index);
 638                 else
 639                         index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0);
 640
 641                 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
 642                                          LLVMBuildMul(gallivm->builder, index,
 643                                                       vertex_dw_stride, ""), "");
 644         }
 645
 646         /* Get information about the register. */
 647         if (reg.Register.File == TGSI_FILE_INPUT) {
 648                 name = info->input_semantic_name;
 649                 index = info->input_semantic_index;
 650                 array_first = info->input_array_first;
 651         } else if (reg.Register.File == TGSI_FILE_OUTPUT) {
 652                 name = info->output_semantic_name;
 653                 index = info->output_semantic_index;
 654                 array_first = info->output_array_first;
 655         } else {
 656                 assert(0);
 657                 return NULL;
 658         }
 659
 660         if (reg.Register.Indirect) {
 661                 /* Add the relative address of the element. */
 662                 LLVMValueRef ind_index;
 663
 664                 if (reg.Indirect.ArrayID)
 665                         first = array_first[reg.Indirect.ArrayID];
 666                 else
 667                         first = reg.Register.Index;
 668
 669                 ind_index = get_indirect_index(ctx, &reg.Indirect,
 670                                            reg.Register.Index - first);
 671
 672                 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
 673                                     LLVMBuildMul(gallivm->builder, ind_index,
 674                                                  LLVMConstInt(ctx->i32, 4, 0), ""), "");
 675
 676                 param = si_shader_io_get_unique_index(name[first], index[first]);
 677         } else {
 678                 param = si_shader_io_get_unique_index(name[reg.Register.Index],
 679                                                       index[reg.Register.Index]);
 680         }
 681
 682         /* Add the base address of the element. */
 683         return LLVMBuildAdd(gallivm->builder, base_addr,
 684                             LLVMConstInt(ctx->i32, param * 4, 0), "");
 685 }
 686
 687 /* The offchip buffer layout for TCS->TES is
 688  *
 689  * - attribute 0 of patch 0 vertex 0
 690  * - attribute 0 of patch 0 vertex 1
 691  * - attribute 0 of patch 0 vertex 2
 692  *   ...
 693  * - attribute 0 of patch 1 vertex 0
 694  * - attribute 0 of patch 1 vertex 1
 695  *   ...
 696  * - attribute 1 of patch 0 vertex 0
 697  * - attribute 1 of patch 0 vertex 1
 698  *   ...
 699  * - per patch attribute 0 of patch 0
 700  * - per patch attribute 0 of patch 1
 701  *   ...
 702  *
 703  * Note that every attribute has 4 components.
 704  */
 705 static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
 706                                                LLVMValueRef rel_patch_id,
 707                                                LLVMValueRef vertex_index,
 708                                                LLVMValueRef param_index)
 709 {
 710         struct gallivm_state *gallivm = &ctx->gallivm;
 711         LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
 712         LLVMValueRef param_stride, constant16;
 713
 714         vertices_per_patch = unpack_param(ctx, ctx->param_tcs_offchip_layout, 9, 6);
 715         num_patches = unpack_param(ctx, ctx->param_tcs_offchip_layout, 0, 9);
 716         total_vertices = LLVMBuildMul(gallivm->builder, vertices_per_patch,
 717                                       num_patches, "");
 718
 719         constant16 = LLVMConstInt(ctx->i32, 16, 0);
 720         if (vertex_index) {
 721                 base_addr = LLVMBuildMul(gallivm->builder, rel_patch_id,
 722                                          vertices_per_patch, "");
 723
 724                 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
 725                                          vertex_index, "");
 726
 727                 param_stride = total_vertices;
 728         } else {
 729                 base_addr = rel_patch_id;
 730                 param_stride = num_patches;
 731         }
 732
 733         base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
 734                                  LLVMBuildMul(gallivm->builder, param_index,
 735                                               param_stride, ""), "");
 736
 737         base_addr = LLVMBuildMul(gallivm->builder, base_addr, constant16, "");
 738
 739         if (!vertex_index) {
 740                 LLVMValueRef patch_data_offset =
 741                            unpack_param(ctx, ctx->param_tcs_offchip_layout, 16, 16);
 742
 743                 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
 744                                          patch_data_offset, "");
 745         }
 746         return base_addr;
 747 }
 748
 749 static LLVMValueRef get_tcs_tes_buffer_address_from_reg(
 750                                        struct si_shader_context *ctx,
 751                                        const struct tgsi_full_dst_register *dst,
 752                                        const struct tgsi_full_src_register *src)
 753 {
 754         struct gallivm_state *gallivm = &ctx->gallivm;
 755         struct tgsi_shader_info *info = &ctx->shader->selector->info;
 756         ubyte *name, *index, *array_first;
 757         struct tgsi_full_src_register reg;
 758         LLVMValueRef vertex_index = NULL;
 759         LLVMValueRef param_index = NULL;
 760         unsigned param_index_base, param_base;
 761
 762         reg = src ? *src : tgsi_full_src_register_from_dst(dst);
 763
 764         if (reg.Register.Dimension) {
 765
 766                 if (reg.Dimension.Indirect)
 767                         vertex_index = get_indirect_index(ctx, &reg.DimIndirect,
 768                                                           reg.Dimension.Index);
 769                 else
 770                         vertex_index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0);
 771         }
 772
 773         /* Get information about the register. */
 774         if (reg.Register.File == TGSI_FILE_INPUT) {
 775                 name = info->input_semantic_name;
 776                 index = info->input_semantic_index;
 777                 array_first = info->input_array_first;
 778         } else if (reg.Register.File == TGSI_FILE_OUTPUT) {
 779                 name = info->output_semantic_name;
 780                 index = info->output_semantic_index;
 781                 array_first = info->output_array_first;
 782         } else {
 783                 assert(0);
 784                 return NULL;
 785         }
 786
 787         if (reg.Register.Indirect) {
 788                 if (reg.Indirect.ArrayID)
 789                         param_base = array_first[reg.Indirect.ArrayID];
 790                 else
 791                         param_base = reg.Register.Index;
 792
 793                 param_index = get_indirect_index(ctx, &reg.Indirect,
 794                                                  reg.Register.Index - param_base);
 795
 796         } else {
 797                 param_base = reg.Register.Index;
 798                 param_index = ctx->i32_0;
 799         }
 800
 801         param_index_base = si_shader_io_get_unique_index(name[param_base],
 802                                                          index[param_base]);
 803
 804         param_index = LLVMBuildAdd(gallivm->builder, param_index,
 805                                    LLVMConstInt(ctx->i32, param_index_base, 0),
 806                                    "");
 807
 808         return get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx),
 809                                           vertex_index, param_index);
 810 }
 811
 812 static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base,
 813                                 enum tgsi_opcode_type type, unsigned swizzle,
 814                                 LLVMValueRef buffer, LLVMValueRef offset,
 815                                 LLVMValueRef base, bool readonly_memory)
 816 {
 817         struct si_shader_context *ctx = si_shader_context(bld_base);
 818         struct gallivm_state *gallivm = &ctx->gallivm;
 819         LLVMValueRef value, value2;
 820         LLVMTypeRef llvm_type = tgsi2llvmtype(bld_base, type);
 821         LLVMTypeRef vec_type = LLVMVectorType(llvm_type, 4);
 822
 823         if (swizzle == ~0) {
 824                 value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
 825                                              0, 1, 0, readonly_memory);
 826
 827                 return LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
 828         }
 829
 830         if (!tgsi_type_is_64bit(type)) {
 831                 value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
 832                                              0, 1, 0, readonly_memory);
 833
 834                 value = LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
 835                 return LLVMBuildExtractElement(gallivm->builder, value,
 836                                     LLVMConstInt(ctx->i32, swizzle, 0), "");
 837         }
 838
 839         value = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
 840                                   swizzle * 4, 1, 0, readonly_memory);
 841
 842         value2 = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
 843                                    swizzle * 4 + 4, 1, 0, readonly_memory);
 844
 845         return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
 846 }
 847
 848 /**
 849  * Load from LDS.
 850  *
 851  * \param type          output value type
 852  * \param swizzle       offset (typically 0..3); it can be ~0, which loads a vec4
 853  * \param dw_addr       address in dwords
 854  */
 855 static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
 856                              enum tgsi_opcode_type type, unsigned swizzle,
 857                              LLVMValueRef dw_addr)
 858 {
 859         struct si_shader_context *ctx = si_shader_context(bld_base);
 860         struct gallivm_state *gallivm = &ctx->gallivm;
 861         LLVMValueRef value;
 862
 863         if (swizzle == ~0) {
 864                 LLVMValueRef values[TGSI_NUM_CHANNELS];
 865
 866                 for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++)
 867                         values[chan] = lds_load(bld_base, type, chan, dw_addr);
 868
 869                 return lp_build_gather_values(gallivm, values,
 870                                               TGSI_NUM_CHANNELS);
 871         }
 872
 873         dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
 874                             LLVMConstInt(ctx->i32, swizzle, 0));
 875
 876         value = ac_build_indexed_load(&ctx->ac, ctx->lds, dw_addr, false);
 877         if (tgsi_type_is_64bit(type)) {
 878                 LLVMValueRef value2;
 879                 dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
 880                                        ctx->i32_1);
 881                 value2 = ac_build_indexed_load(&ctx->ac, ctx->lds, dw_addr, false);
 882                 return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
 883         }
 884
 885         return LLVMBuildBitCast(gallivm->builder, value,
 886                                 tgsi2llvmtype(bld_base, type), "");
 887 }
 888
 889 /**
 890  * Store to LDS.
 891  *
 892  * \param swizzle       offset (typically 0..3)
 893  * \param dw_addr       address in dwords
 894  * \param value         value to store
 895  */
 896 static void lds_store(struct lp_build_tgsi_context *bld_base,
 897                       unsigned swizzle, LLVMValueRef dw_addr,
 898                       LLVMValueRef value)
 899 {
 900         struct si_shader_context *ctx = si_shader_context(bld_base);
 901         struct gallivm_state *gallivm = &ctx->gallivm;
 902
 903         dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
 904                             LLVMConstInt(ctx->i32, swizzle, 0));
 905
 906         value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
 907         ac_build_indexed_store(&ctx->ac, ctx->lds,
 908                                dw_addr, value);
 909 }
 910
 911 static LLVMValueRef fetch_input_tcs(
 912         struct lp_build_tgsi_context *bld_base,
 913         const struct tgsi_full_src_register *reg,
 914         enum tgsi_opcode_type type, unsigned swizzle)
 915 {
 916         struct si_shader_context *ctx = si_shader_context(bld_base);
 917         LLVMValueRef dw_addr, stride;
 918
 919         stride = unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
 920         dw_addr = get_tcs_in_current_patch_offset(ctx);
 921         dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
 922
 923         return lds_load(bld_base, type, swizzle, dw_addr);
 924 }
 925
 926 static LLVMValueRef fetch_output_tcs(
 927                 struct lp_build_tgsi_context *bld_base,
 928                 const struct tgsi_full_src_register *reg,
 929                 enum tgsi_opcode_type type, unsigned swizzle)
 930 {
 931         struct si_shader_context *ctx = si_shader_context(bld_base);
 932         LLVMValueRef dw_addr, stride;
 933
 934         if (reg->Register.Dimension) {
 935                 stride = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 13, 8);
 936                 dw_addr = get_tcs_out_current_patch_offset(ctx);
 937                 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
 938         } else {
 939                 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
 940                 dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr);
 941         }
 942
 943         return lds_load(bld_base, type, swizzle, dw_addr);
 944 }
 945
 946 static LLVMValueRef fetch_input_tes(
 947         struct lp_build_tgsi_context *bld_base,
 948         const struct tgsi_full_src_register *reg,
 949         enum tgsi_opcode_type type, unsigned swizzle)
 950 {
 951         struct si_shader_context *ctx = si_shader_context(bld_base);
 952         LLVMValueRef rw_buffers, buffer, base, addr;
 953
 954         rw_buffers = LLVMGetParam(ctx->main_fn,
 955                                   ctx->param_rw_buffers);
 956         buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
 957                         LLVMConstInt(ctx->i32, SI_HS_RING_TESS_OFFCHIP, 0));
 958
 959         base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
 960         addr = get_tcs_tes_buffer_address_from_reg(ctx, NULL, reg);
 961
 962         return buffer_load(bld_base, type, swizzle, buffer, base, addr, true);
 963 }
 964
 965 static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
 966                              const struct tgsi_full_instruction *inst,
 967                              const struct tgsi_opcode_info *info,
 968                              LLVMValueRef dst[4])
 969 {
 970         struct si_shader_context *ctx = si_shader_context(bld_base);
 971         struct gallivm_state *gallivm = &ctx->gallivm;
 972         const struct tgsi_full_dst_register *reg = &inst->Dst[0];
 973         const struct tgsi_shader_info *sh_info = &ctx->shader->selector->info;
 974         unsigned chan_index;
 975         LLVMValueRef dw_addr, stride;
 976         LLVMValueRef rw_buffers, buffer, base, buf_addr;
 977         LLVMValueRef values[4];
 978         bool skip_lds_store;
 979         bool is_tess_factor = false;
 980
 981         /* Only handle per-patch and per-vertex outputs here.
 982          * Vectors will be lowered to scalars and this function will be called again.
 983          */
 984         if (reg->Register.File != TGSI_FILE_OUTPUT ||
 985             (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) {
 986                 si_llvm_emit_store(bld_base, inst, info, dst);
 987                 return;
 988         }
 989
 990         if (reg->Register.Dimension) {
 991                 stride = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 13, 8);
 992                 dw_addr = get_tcs_out_current_patch_offset(ctx);
 993                 dw_addr = get_dw_address(ctx, reg, NULL, stride, dw_addr);
 994                 skip_lds_store = !sh_info->reads_pervertex_outputs;
 995         } else {
 996                 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
 997                 dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr);
 998                 skip_lds_store = !sh_info->reads_perpatch_outputs;
 999
1000                 if (!reg->Register.Indirect) {
1001                         int name = sh_info->output_semantic_name[reg->Register.Index];
1002
1003                         /* Always write tess factors into LDS for the TCS epilog. */
1004                         if (name == TGSI_SEMANTIC_TESSINNER ||
1005                             name == TGSI_SEMANTIC_TESSOUTER) {
1006                                 skip_lds_store = false;
1007                                 is_tess_factor = true;
1008                         }
1009                 }
1010         }
1011
1012         rw_buffers = LLVMGetParam(ctx->main_fn,
1013                                   ctx->param_rw_buffers);
1014         buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
1015                         LLVMConstInt(ctx->i32, SI_HS_RING_TESS_OFFCHIP, 0));
1016
1017         base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
1018         buf_addr = get_tcs_tes_buffer_address_from_reg(ctx, reg, NULL);
1019
1020
1021         TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan_index) {
1022                 LLVMValueRef value = dst[chan_index];
1023
1024                 if (inst->Instruction.Saturate)
1025                         value = ac_build_clamp(&ctx->ac, value);
1026
1027                 /* Skip LDS stores if there is no LDS read of this output. */
1028                 if (!skip_lds_store)
1029                         lds_store(bld_base, chan_index, dw_addr, value);
1030
1031                 value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
1032                 values[chan_index] = value;
1033
1034                 if (inst->Dst[0].Register.WriteMask != 0xF && !is_tess_factor) {
1035                         ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1,
1036                                                     buf_addr, base,
1037                                                     4 * chan_index, 1, 0, true, false);
1038                 }
1039         }
1040
1041         if (inst->Dst[0].Register.WriteMask == 0xF && !is_tess_factor) {
1042                 LLVMValueRef value = lp_build_gather_values(gallivm,
1043                                                             values, 4);
1044                 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buf_addr,
1045                                             base, 0, 1, 0, true, false);
1046         }
1047 }
1048
1049 static LLVMValueRef fetch_input_gs(
1050         struct lp_build_tgsi_context *bld_base,
1051         const struct tgsi_full_src_register *reg,
1052         enum tgsi_opcode_type type,
1053         unsigned swizzle)
1054 {
1055         struct si_shader_context *ctx = si_shader_context(bld_base);
1056         struct si_shader *shader = ctx->shader;
1057         struct lp_build_context *uint = &ctx->bld_base.uint_bld;
1058         struct gallivm_state *gallivm = &ctx->gallivm;
1059         LLVMValueRef vtx_offset, soffset;
1060         unsigned vtx_offset_param;
1061         struct tgsi_shader_info *info = &shader->selector->info;
1062         unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
1063         unsigned semantic_index = info->input_semantic_index[reg->Register.Index];
1064         unsigned param;
1065         LLVMValueRef value;
1066
1067         if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
1068                 return get_primitive_id(bld_base, swizzle);
1069
1070         if (!reg->Register.Dimension)
1071                 return NULL;
1072
1073         if (swizzle == ~0) {
1074                 LLVMValueRef values[TGSI_NUM_CHANNELS];
1075                 unsigned chan;
1076                 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1077                         values[chan] = fetch_input_gs(bld_base, reg, type, chan);
1078                 }
1079                 return lp_build_gather_values(gallivm, values,
1080                                               TGSI_NUM_CHANNELS);
1081         }
1082
1083         /* Get the vertex offset parameter */
1084         vtx_offset_param = reg->Dimension.Index;
1085         if (vtx_offset_param < 2) {
1086                 vtx_offset_param += ctx->param_gs_vtx0_offset;
1087         } else {
1088                 assert(vtx_offset_param < 6);
1089                 vtx_offset_param += ctx->param_gs_vtx2_offset - 2;
1090         }
1091         vtx_offset = lp_build_mul_imm(uint,
1092                                       LLVMGetParam(ctx->main_fn,
1093                                                    vtx_offset_param),
1094                                       4);
1095
1096         param = si_shader_io_get_unique_index(semantic_name, semantic_index);
1097         soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle) * 256, 0);
1098
1099         value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->i32_0,
1100                                      vtx_offset, soffset, 0, 1, 0, true);
1101         if (tgsi_type_is_64bit(type)) {
1102                 LLVMValueRef value2;
1103                 soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle + 1) * 256, 0);
1104
1105                 value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1,
1106                                               ctx->i32_0, vtx_offset, soffset,
1107                                               0, 1, 0, true);
1108                 return si_llvm_emit_fetch_64bit(bld_base, type,
1109                                                 value, value2);
1110         }
1111         return LLVMBuildBitCast(gallivm->builder,
1112                                 value,
1113                                 tgsi2llvmtype(bld_base, type), "");
1114 }
1115
1116 static int lookup_interp_param_index(unsigned interpolate, unsigned location)
1117 {
1118         switch (interpolate) {
1119         case TGSI_INTERPOLATE_CONSTANT:
1120                 return 0;
1121
1122         case TGSI_INTERPOLATE_LINEAR:
1123                 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1124                         return SI_PARAM_LINEAR_SAMPLE;
1125                 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1126                         return SI_PARAM_LINEAR_CENTROID;
1127                 else
1128                         return SI_PARAM_LINEAR_CENTER;
1129                 break;
1130         case TGSI_INTERPOLATE_COLOR:
1131         case TGSI_INTERPOLATE_PERSPECTIVE:
1132                 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1133                         return SI_PARAM_PERSP_SAMPLE;
1134                 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1135                         return SI_PARAM_PERSP_CENTROID;
1136                 else
1137                         return SI_PARAM_PERSP_CENTER;
1138                 break;
1139         default:
1140                 fprintf(stderr, "Warning: Unhandled interpolation mode.\n");
1141                 return -1;
1142         }
1143 }
1144
1145 /**
1146  * Interpolate a fragment shader input.
1147  *
1148  * @param ctx           context
1149  * @param input_index           index of the input in hardware
1150  * @param semantic_name         TGSI_SEMANTIC_*
1151  * @param semantic_index        semantic index
1152  * @param num_interp_inputs     number of all interpolated inputs (= BCOLOR offset)
1153  * @param colors_read_mask      color components read (4 bits for each color, 8 bits in total)
1154  * @param interp_param          interpolation weights (i,j)
1155  * @param prim_mask             SI_PARAM_PRIM_MASK
1156  * @param face                  SI_PARAM_FRONT_FACE
1157  * @param result                the return value (4 components)
1158  */
1159 static void interp_fs_input(struct si_shader_context *ctx,
1160                             unsigned input_index,
1161                             unsigned semantic_name,
1162                             unsigned semantic_index,
1163                             unsigned num_interp_inputs,
1164                             unsigned colors_read_mask,
1165                             LLVMValueRef interp_param,
1166                             LLVMValueRef prim_mask,
1167                             LLVMValueRef face,
1168                             LLVMValueRef result[4])
1169 {
1170         struct gallivm_state *gallivm = &ctx->gallivm;
1171         LLVMValueRef attr_number;
1172         LLVMValueRef i, j;
1173
1174         unsigned chan;
1175
1176         /* fs.constant returns the param from the middle vertex, so it's not
1177          * really useful for flat shading. It's meant to be used for custom
1178          * interpolation (but the intrinsic can't fetch from the other two
1179          * vertices).
1180          *
1181          * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
1182          * to do the right thing. The only reason we use fs.constant is that
1183          * fs.interp cannot be used on integers, because they can be equal
1184          * to NaN.
1185          *
1186          * When interp is false we will use fs.constant or for newer llvm,
1187          * amdgcn.interp.mov.
1188          */
1189         bool interp = interp_param != NULL;
1190
1191         attr_number = LLVMConstInt(ctx->i32, input_index, 0);
1192
1193         if (interp) {
1194                 interp_param = LLVMBuildBitCast(gallivm->builder, interp_param,
1195                                                 LLVMVectorType(ctx->f32, 2), "");
1196
1197                 i = LLVMBuildExtractElement(gallivm->builder, interp_param,
1198                                                 ctx->i32_0, "");
1199                 j = LLVMBuildExtractElement(gallivm->builder, interp_param,
1200                                                 ctx->i32_1, "");
1201         }
1202
1203         if (semantic_name == TGSI_SEMANTIC_COLOR &&
1204             ctx->shader->key.part.ps.prolog.color_two_side) {
1205                 LLVMValueRef is_face_positive;
1206                 LLVMValueRef back_attr_number;
1207
1208                 /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
1209                  * otherwise it's at offset "num_inputs".
1210                  */
1211                 unsigned back_attr_offset = num_interp_inputs;
1212                 if (semantic_index == 1 && colors_read_mask & 0xf)
1213                         back_attr_offset += 1;
1214
1215                 back_attr_number = LLVMConstInt(ctx->i32, back_attr_offset, 0);
1216
1217                 is_face_positive = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
1218                                                  face, ctx->i32_0, "");
1219
1220                 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1221                         LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);
1222                         LLVMValueRef front, back;
1223
1224                         if (interp) {
1225                                 front = ac_build_fs_interp(&ctx->ac, llvm_chan,
1226                                                         attr_number, prim_mask,
1227                                                         i, j);
1228                                 back = ac_build_fs_interp(&ctx->ac, llvm_chan,
1229                                                         back_attr_number, prim_mask,
1230                                                         i, j);
1231                         } else {
1232                                 front = ac_build_fs_interp_mov(&ctx->ac,
1233                                         LLVMConstInt(ctx->i32, 2, 0), /* P0 */
1234                                         llvm_chan, attr_number, prim_mask);
1235                                 back = ac_build_fs_interp_mov(&ctx->ac,
1236                                         LLVMConstInt(ctx->i32, 2, 0), /* P0 */
1237                                         llvm_chan, back_attr_number, prim_mask);
1238                         }
1239
1240                         result[chan] = LLVMBuildSelect(gallivm->builder,
1241                                                 is_face_positive,
1242                                                 front,
1243                                                 back,
1244                                                 "");
1245                 }
1246         } else if (semantic_name == TGSI_SEMANTIC_FOG) {
1247                 if (interp) {
1248                         result[0] = ac_build_fs_interp(&ctx->ac, ctx->i32_0,
1249                                                        attr_number, prim_mask, i, j);
1250                 } else {
1251                         result[0] = ac_build_fs_interp_mov(&ctx->ac, ctx->i32_0,
1252                                                            LLVMConstInt(ctx->i32, 2, 0), /* P0 */
1253                                                            attr_number, prim_mask);
1254                 }
1255                 result[1] =
1256                 result[2] = LLVMConstReal(ctx->f32, 0.0f);
1257                 result[3] = LLVMConstReal(ctx->f32, 1.0f);
1258         } else {
1259                 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1260                         LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);
1261
1262                         if (interp) {
1263                                 result[chan] = ac_build_fs_interp(&ctx->ac,
1264                                         llvm_chan, attr_number, prim_mask, i, j);
1265                         } else {
1266                                 result[chan] = ac_build_fs_interp_mov(&ctx->ac,
1267                                         LLVMConstInt(ctx->i32, 2, 0), /* P0 */
1268                                         llvm_chan, attr_number, prim_mask);
1269                         }
1270                 }
1271         }
1272 }
1273
1274 static void declare_input_fs(
1275         struct si_shader_context *ctx,
1276         unsigned input_index,
1277         const struct tgsi_full_declaration *decl,
1278         LLVMValueRef out[4])
1279 {
1280         struct lp_build_context *base = &ctx->bld_base.base;
1281         struct si_shader *shader = ctx->shader;
1282         LLVMValueRef main_fn = ctx->main_fn;
1283         LLVMValueRef interp_param = NULL;
1284         int interp_param_idx;
1285
1286         /* Get colors from input VGPRs (set by the prolog). */
1287         if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR) {
1288                 unsigned i = decl->Semantic.Index;
1289                 unsigned colors_read = shader->selector->info.colors_read;
1290                 unsigned mask = colors_read >> (i * 4);
1291                 unsigned offset = SI_PARAM_POS_FIXED_PT + 1 +
1292                                   (i ? util_bitcount(colors_read & 0xf) : 0);
1293
1294                 out[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef;
1295                 out[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef;
1296                 out[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef;
1297                 out[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef;
1298                 return;
1299         }
1300
1301         interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate,
1302                                                      decl->Interp.Location);
1303         if (interp_param_idx == -1)
1304                 return;
1305         else if (interp_param_idx) {
1306                 interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
1307         }
1308
1309         if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR &&
1310             decl->Interp.Interpolate == TGSI_INTERPOLATE_COLOR &&
1311             ctx->shader->key.part.ps.prolog.flatshade_colors)
1312                 interp_param = NULL; /* load the constant color */
1313
1314         interp_fs_input(ctx, input_index, decl->Semantic.Name,
1315                         decl->Semantic.Index, shader->selector->info.num_inputs,
1316                         shader->selector->info.colors_read, interp_param,
1317                         LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK),
1318                         LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE),
1319                         &out[0]);
1320 }
1321
1322 static LLVMValueRef get_sample_id(struct si_shader_context *ctx)
1323 {
1324         return unpack_param(ctx, SI_PARAM_ANCILLARY, 8, 4);
1325 }
1326
1327
1328 /**
1329  * Load a dword from a constant buffer.
1330  */
1331 static LLVMValueRef buffer_load_const(struct si_shader_context *ctx,
1332                                       LLVMValueRef resource,
1333                                       LLVMValueRef offset)
1334 {
1335         LLVMBuilderRef builder = ctx->gallivm.builder;
1336         LLVMValueRef args[2] = {resource, offset};
1337
1338         return lp_build_intrinsic(builder, "llvm.SI.load.const", ctx->f32, args, 2,
1339                                   LP_FUNC_ATTR_READNONE |
1340                                   LP_FUNC_ATTR_LEGACY);
1341 }
1342
1343 static LLVMValueRef load_sample_position(struct si_shader_context *ctx, LLVMValueRef sample_id)
1344 {
1345         struct lp_build_context *uint_bld = &ctx->bld_base.uint_bld;
1346         struct gallivm_state *gallivm = &ctx->gallivm;
1347         LLVMBuilderRef builder = gallivm->builder;
1348         LLVMValueRef desc = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
1349         LLVMValueRef buf_index = LLVMConstInt(ctx->i32, SI_PS_CONST_SAMPLE_POSITIONS, 0);
1350         LLVMValueRef resource = ac_build_indexed_load_const(&ctx->ac, desc, buf_index);
1351
1352         /* offset = sample_id * 8  (8 = 2 floats containing samplepos.xy) */
1353         LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, sample_id, 8);
1354         LLVMValueRef offset1 = LLVMBuildAdd(builder, offset0, LLVMConstInt(ctx->i32, 4, 0), "");
1355
1356         LLVMValueRef pos[4] = {
1357                 buffer_load_const(ctx, resource, offset0),
1358                 buffer_load_const(ctx, resource, offset1),
1359                 LLVMConstReal(ctx->f32, 0),
1360                 LLVMConstReal(ctx->f32, 0)
1361         };
1362
1363         return lp_build_gather_values(gallivm, pos, 4);
1364 }
1365
1366 static void declare_system_value(struct si_shader_context *ctx,
1367                                  unsigned index,
1368                                  const struct tgsi_full_declaration *decl)
1369 {
1370         struct lp_build_context *bld = &ctx->bld_base.base;
1371         struct gallivm_state *gallivm = &ctx->gallivm;
1372         LLVMValueRef value = 0;
1373
1374         assert(index < RADEON_LLVM_MAX_SYSTEM_VALUES);
1375
1376         switch (decl->Semantic.Name) {
1377         case TGSI_SEMANTIC_INSTANCEID:
1378                 value = LLVMGetParam(ctx->main_fn,
1379                                      ctx->param_instance_id);
1380                 break;
1381
1382         case TGSI_SEMANTIC_VERTEXID:
1383                 value = LLVMBuildAdd(gallivm->builder,
1384                                      LLVMGetParam(ctx->main_fn,
1385                                                   ctx->param_vertex_id),
1386                                      LLVMGetParam(ctx->main_fn,
1387                                                   ctx->param_base_vertex), "");
1388                 break;
1389
1390         case TGSI_SEMANTIC_VERTEXID_NOBASE:
1391                 /* Unused. Clarify the meaning in indexed vs. non-indexed
1392                  * draws if this is ever used again. */
1393                 assert(false);
1394                 break;
1395
1396         case TGSI_SEMANTIC_BASEVERTEX:
1397         {
1398                 /* For non-indexed draws, the base vertex set by the driver
1399                  * (for direct draws) or the CP (for indirect draws) is the
1400                  * first vertex ID, but GLSL expects 0 to be returned.
1401                  */
1402                 LLVMValueRef vs_state = LLVMGetParam(ctx->main_fn, ctx->param_vs_state_bits);
1403                 LLVMValueRef indexed;
1404
1405                 indexed = LLVMBuildLShr(gallivm->builder, vs_state, ctx->i32_1, "");
1406                 indexed = LLVMBuildTrunc(gallivm->builder, indexed, ctx->i1, "");
1407
1408                 value = LLVMBuildSelect(gallivm->builder, indexed,
1409                                         LLVMGetParam(ctx->main_fn, ctx->param_base_vertex),
1410                                         ctx->i32_0, "");
1411                 break;
1412         }
1413
1414         case TGSI_SEMANTIC_BASEINSTANCE:
1415                 value = LLVMGetParam(ctx->main_fn, ctx->param_start_instance);
1416                 break;
1417
1418         case TGSI_SEMANTIC_DRAWID:
1419                 value = LLVMGetParam(ctx->main_fn, ctx->param_draw_id);
1420                 break;
1421
1422         case TGSI_SEMANTIC_INVOCATIONID:
1423                 if (ctx->type == PIPE_SHADER_TESS_CTRL)
1424                         value = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
1425                 else if (ctx->type == PIPE_SHADER_GEOMETRY)
1426                         value = LLVMGetParam(ctx->main_fn,
1427                                              ctx->param_gs_instance_id);
1428                 else
1429                         assert(!"INVOCATIONID not implemented");
1430                 break;
1431
1432         case TGSI_SEMANTIC_POSITION:
1433         {
1434                 LLVMValueRef pos[4] = {
1435                         LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT),
1436                         LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT),
1437                         LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Z_FLOAT),
1438                         lp_build_emit_llvm_unary(&ctx->bld_base, TGSI_OPCODE_RCP,
1439                                                  LLVMGetParam(ctx->main_fn,
1440                                                               SI_PARAM_POS_W_FLOAT)),
1441                 };
1442                 value = lp_build_gather_values(gallivm, pos, 4);
1443                 break;
1444         }
1445
1446         case TGSI_SEMANTIC_FACE:
1447                 value = LLVMGetParam(ctx->main_fn, SI_PARAM_FRONT_FACE);
1448                 break;
1449
1450         case TGSI_SEMANTIC_SAMPLEID:
1451                 value = get_sample_id(ctx);
1452                 break;
1453
1454         case TGSI_SEMANTIC_SAMPLEPOS: {
1455                 LLVMValueRef pos[4] = {
1456                         LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT),
1457                         LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT),
1458                         LLVMConstReal(ctx->f32, 0),
1459                         LLVMConstReal(ctx->f32, 0)
1460                 };
1461                 pos[0] = lp_build_emit_llvm_unary(&ctx->bld_base,
1462                                                   TGSI_OPCODE_FRC, pos[0]);
1463                 pos[1] = lp_build_emit_llvm_unary(&ctx->bld_base,
1464                                                   TGSI_OPCODE_FRC, pos[1]);
1465                 value = lp_build_gather_values(gallivm, pos, 4);
1466                 break;
1467         }
1468
1469         case TGSI_SEMANTIC_SAMPLEMASK:
1470                 /* This can only occur with the OpenGL Core profile, which
1471                  * doesn't support smoothing.
1472                  */
1473                 value = LLVMGetParam(ctx->main_fn, SI_PARAM_SAMPLE_COVERAGE);
1474                 break;
1475
1476         case TGSI_SEMANTIC_TESSCOORD:
1477         {
1478                 LLVMValueRef coord[4] = {
1479                         LLVMGetParam(ctx->main_fn, ctx->param_tes_u),
1480                         LLVMGetParam(ctx->main_fn, ctx->param_tes_v),
1481                         bld->zero,
1482                         bld->zero
1483                 };
1484
1485                 /* For triangles, the vector should be (u, v, 1-u-v). */
1486                 if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
1487                     PIPE_PRIM_TRIANGLES)
1488                         coord[2] = lp_build_sub(bld, bld->one,
1489                                                 lp_build_add(bld, coord[0], coord[1]));
1490
1491                 value = lp_build_gather_values(gallivm, coord, 4);
1492                 break;
1493         }
1494
1495         case TGSI_SEMANTIC_VERTICESIN:
1496                 if (ctx->type == PIPE_SHADER_TESS_CTRL)
1497                         value = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 26, 6);
1498                 else if (ctx->type == PIPE_SHADER_TESS_EVAL)
1499                         value = unpack_param(ctx, ctx->param_tcs_offchip_layout, 9, 7);
1500                 else
1501                         assert(!"invalid shader stage for TGSI_SEMANTIC_VERTICESIN");
1502                 break;
1503
1504         case TGSI_SEMANTIC_TESSINNER:
1505         case TGSI_SEMANTIC_TESSOUTER:
1506         {
1507                 LLVMValueRef rw_buffers, buffer, base, addr;
1508                 int param = si_shader_io_get_unique_index(decl->Semantic.Name, 0);
1509
1510                 rw_buffers = LLVMGetParam(ctx->main_fn,
1511                                           ctx->param_rw_buffers);
1512                 buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
1513                         LLVMConstInt(ctx->i32, SI_HS_RING_TESS_OFFCHIP, 0));
1514
1515                 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
1516                 addr = get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), NULL,
1517                                           LLVMConstInt(ctx->i32, param, 0));
1518
1519                 value = buffer_load(&ctx->bld_base, TGSI_TYPE_FLOAT,
1520                                     ~0, buffer, base, addr, true);
1521
1522                 break;
1523         }
1524
1525         case TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI:
1526         case TGSI_SEMANTIC_DEFAULT_TESSINNER_SI:
1527         {
1528                 LLVMValueRef buf, slot, val[4];
1529                 int i, offset;
1530
1531                 slot = LLVMConstInt(ctx->i32, SI_HS_CONST_DEFAULT_TESS_LEVELS, 0);
1532                 buf = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
1533                 buf = ac_build_indexed_load_const(&ctx->ac, buf, slot);
1534                 offset = decl->Semantic.Name == TGSI_SEMANTIC_DEFAULT_TESSINNER_SI ? 4 : 0;
1535
1536                 for (i = 0; i < 4; i++)
1537                         val[i] = buffer_load_const(ctx, buf,
1538                                                    LLVMConstInt(ctx->i32, (offset + i) * 4, 0));
1539                 value = lp_build_gather_values(gallivm, val, 4);
1540                 break;
1541         }
1542
1543         case TGSI_SEMANTIC_PRIMID:
1544                 value = get_primitive_id(&ctx->bld_base, 0);
1545                 break;
1546
1547         case TGSI_SEMANTIC_GRID_SIZE:
1548                 value = LLVMGetParam(ctx->main_fn, SI_PARAM_GRID_SIZE);
1549                 break;
1550
1551         case TGSI_SEMANTIC_BLOCK_SIZE:
1552         {
1553                 LLVMValueRef values[3];
1554                 unsigned i;
1555                 unsigned *properties = ctx->shader->selector->info.properties;
1556
1557                 if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) {
1558                         unsigned sizes[3] = {
1559                                 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
1560                                 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
1561                                 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
1562                         };
1563
1564                         for (i = 0; i < 3; ++i)
1565                                 values[i] = LLVMConstInt(ctx->i32, sizes[i], 0);
1566
1567                         value = lp_build_gather_values(gallivm, values, 3);
1568                 } else {
1569                         value = LLVMGetParam(ctx->main_fn, SI_PARAM_BLOCK_SIZE);
1570                 }
1571                 break;
1572         }
1573
1574         case TGSI_SEMANTIC_BLOCK_ID:
1575                 value = LLVMGetParam(ctx->main_fn, SI_PARAM_BLOCK_ID);
1576                 break;
1577
1578         case TGSI_SEMANTIC_THREAD_ID:
1579                 value = LLVMGetParam(ctx->main_fn, SI_PARAM_THREAD_ID);
1580                 break;
1581
1582         case TGSI_SEMANTIC_HELPER_INVOCATION:
1583                 if (HAVE_LLVM >= 0x0309) {
1584                         value = lp_build_intrinsic(gallivm->builder,
1585                                                    "llvm.amdgcn.ps.live",
1586                                                    ctx->i1, NULL, 0,
1587                                                    LP_FUNC_ATTR_READNONE);
1588                         value = LLVMBuildNot(gallivm->builder, value, "");
1589                         value = LLVMBuildSExt(gallivm->builder, value, ctx->i32, "");
1590                 } else {
1591                         assert(!"TGSI_SEMANTIC_HELPER_INVOCATION unsupported");
1592                         return;
1593                 }
1594                 break;
1595
1596         case TGSI_SEMANTIC_SUBGROUP_SIZE:
1597                 value = LLVMConstInt(ctx->i32, 64, 0);
1598                 break;
1599
1600         case TGSI_SEMANTIC_SUBGROUP_INVOCATION:
1601                 value = ac_get_thread_id(&ctx->ac);
1602                 break;
1603
1604         case TGSI_SEMANTIC_SUBGROUP_EQ_MASK:
1605         {
1606                 LLVMValueRef id = ac_get_thread_id(&ctx->ac);
1607                 id = LLVMBuildZExt(gallivm->builder, id, ctx->i64, "");
1608                 value = LLVMBuildShl(gallivm->builder, LLVMConstInt(ctx->i64, 1, 0), id, "");
1609                 value = LLVMBuildBitCast(gallivm->builder, value, ctx->v2i32, "");
1610                 break;
1611         }
1612
1613         case TGSI_SEMANTIC_SUBGROUP_GE_MASK:
1614         case TGSI_SEMANTIC_SUBGROUP_GT_MASK:
1615         case TGSI_SEMANTIC_SUBGROUP_LE_MASK:
1616         case TGSI_SEMANTIC_SUBGROUP_LT_MASK:
1617         {
1618                 LLVMValueRef id = ac_get_thread_id(&ctx->ac);
1619                 if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_GT_MASK ||
1620                     decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK) {
1621                         /* All bits set except LSB */
1622                         value = LLVMConstInt(ctx->i64, -2, 0);
1623                 } else {
1624                         /* All bits set */
1625                         value = LLVMConstInt(ctx->i64, -1, 0);
1626                 }
1627                 id = LLVMBuildZExt(gallivm->builder, id, ctx->i64, "");
1628                 value = LLVMBuildShl(gallivm->builder, value, id, "");
1629                 if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK ||
1630                     decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LT_MASK)
1631                         value = LLVMBuildNot(gallivm->builder, value, "");
1632                 value = LLVMBuildBitCast(gallivm->builder, value, ctx->v2i32, "");
1633                 break;
1634         }
1635
1636         default:
1637                 assert(!"unknown system value");
1638                 return;
1639         }
1640
1641         ctx->system_values[index] = value;
1642 }
1643
1644 static void declare_compute_memory(struct si_shader_context *ctx,
1645                                    const struct tgsi_full_declaration *decl)
1646 {
1647         struct si_shader_selector *sel = ctx->shader->selector;
1648         struct gallivm_state *gallivm = &ctx->gallivm;
1649
1650         LLVMTypeRef i8p = LLVMPointerType(ctx->i8, LOCAL_ADDR_SPACE);
1651         LLVMValueRef var;
1652
1653         assert(decl->Declaration.MemType == TGSI_MEMORY_TYPE_SHARED);
1654         assert(decl->Range.First == decl->Range.Last);
1655         assert(!ctx->shared_memory);
1656
1657         var = LLVMAddGlobalInAddressSpace(gallivm->module,
1658                                           LLVMArrayType(ctx->i8, sel->local_size),
1659                                           "compute_lds",
1660                                           LOCAL_ADDR_SPACE);
1661         LLVMSetAlignment(var, 4);
1662
1663         ctx->shared_memory = LLVMBuildBitCast(gallivm->builder, var, i8p, "");
1664 }
1665
1666 static LLVMValueRef load_const_buffer_desc(struct si_shader_context *ctx, int i)
1667 {
1668         LLVMValueRef list_ptr = LLVMGetParam(ctx->main_fn,
1669                                              ctx->param_const_buffers);
1670
1671         return ac_build_indexed_load_const(&ctx->ac, list_ptr,
1672                                         LLVMConstInt(ctx->i32, i, 0));
1673 }
1674
1675 static LLVMValueRef fetch_constant(
1676         struct lp_build_tgsi_context *bld_base,
1677         const struct tgsi_full_src_register *reg,
1678         enum tgsi_opcode_type type,
1679         unsigned swizzle)
1680 {
1681         struct si_shader_context *ctx = si_shader_context(bld_base);
1682         struct lp_build_context *base = &bld_base->base;
1683         const struct tgsi_ind_register *ireg = &reg->Indirect;
1684         unsigned buf, idx;
1685
1686         LLVMValueRef addr, bufp;
1687         LLVMValueRef result;
1688
1689         if (swizzle == LP_CHAN_ALL) {
1690                 unsigned chan;
1691                 LLVMValueRef values[4];
1692                 for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
1693                         values[chan] = fetch_constant(bld_base, reg, type, chan);
1694
1695                 return lp_build_gather_values(&ctx->gallivm, values, 4);
1696         }
1697
1698         buf = reg->Register.Dimension ? reg->Dimension.Index : 0;
1699         idx = reg->Register.Index * 4 + swizzle;
1700
1701         if (reg->Register.Dimension && reg->Dimension.Indirect) {
1702                 LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_buffers);
1703                 LLVMValueRef index;
1704                 index = get_bounded_indirect_index(ctx, &reg->DimIndirect,
1705                                                    reg->Dimension.Index,
1706                                                    SI_NUM_CONST_BUFFERS);
1707                 bufp = ac_build_indexed_load_const(&ctx->ac, ptr, index);
1708         } else
1709                 bufp = load_const_buffer_desc(ctx, buf);
1710
1711         if (reg->Register.Indirect) {
1712                 addr = ctx->addrs[ireg->Index][ireg->Swizzle];
1713                 addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg");
1714                 addr = lp_build_mul_imm(&bld_base->uint_bld, addr, 16);
1715                 addr = lp_build_add(&bld_base->uint_bld, addr,
1716                                     LLVMConstInt(ctx->i32, idx * 4, 0));
1717         } else {
1718                 addr = LLVMConstInt(ctx->i32, idx * 4, 0);
1719         }
1720
1721         result = buffer_load_const(ctx, bufp, addr);
1722
1723         if (!tgsi_type_is_64bit(type))
1724                 result = bitcast(bld_base, type, result);
1725         else {
1726                 LLVMValueRef addr2, result2;
1727
1728                 addr2 = lp_build_add(&bld_base->uint_bld, addr,
1729                                      LLVMConstInt(ctx->i32, 4, 0));
1730                 result2 = buffer_load_const(ctx, bufp, addr2);
1731
1732                 result = si_llvm_emit_fetch_64bit(bld_base, type,
1733                                                   result, result2);
1734         }
1735         return result;
1736 }
1737
1738 /* Upper 16 bits must be zero. */
1739 static LLVMValueRef si_llvm_pack_two_int16(struct si_shader_context *ctx,
1740                                            LLVMValueRef val[2])
1741 {
1742         return LLVMBuildOr(ctx->gallivm.builder, val[0],
1743                            LLVMBuildShl(ctx->gallivm.builder, val[1],
1744                                         LLVMConstInt(ctx->i32, 16, 0),
1745                                         ""), "");
1746 }
1747
1748 /* Upper 16 bits are ignored and will be dropped. */
1749 static LLVMValueRef si_llvm_pack_two_int32_as_int16(struct si_shader_context *ctx,
1750                                                     LLVMValueRef val[2])
1751 {
1752         LLVMValueRef v[2] = {
1753                 LLVMBuildAnd(ctx->gallivm.builder, val[0],
1754                              LLVMConstInt(ctx->i32, 0xffff, 0), ""),
1755                 val[1],
1756         };
1757         return si_llvm_pack_two_int16(ctx, v);
1758 }
1759
1760 /* Initialize arguments for the shader export intrinsic */
1761 static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
1762                                      LLVMValueRef *values,
1763                                      unsigned target,
1764                                      struct ac_export_args *args)
1765 {
1766         struct si_shader_context *ctx = si_shader_context(bld_base);
1767         struct lp_build_context *base = &bld_base->base;
1768         LLVMBuilderRef builder = ctx->gallivm.builder;
1769         LLVMValueRef val[4];
1770         unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR;
1771         unsigned chan;
1772         bool is_int8, is_int10;
1773
1774         /* Default is 0xf. Adjusted below depending on the format. */
1775         args->enabled_channels = 0xf; /* writemask */
1776
1777         /* Specify whether the EXEC mask represents the valid mask */
1778         args->valid_mask = 0;
1779
1780         /* Specify whether this is the last export */
1781         args->done = 0;
1782
1783         /* Specify the target we are exporting */
1784         args->target = target;
1785
1786         if (ctx->type == PIPE_SHADER_FRAGMENT) {
1787                 const struct si_shader_key *key = &ctx->shader->key;
1788                 unsigned col_formats = key->part.ps.epilog.spi_shader_col_format;
1789                 int cbuf = target - V_008DFC_SQ_EXP_MRT;
1790
1791                 assert(cbuf >= 0 && cbuf < 8);
1792                 spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
1793                 is_int8 = (key->part.ps.epilog.color_is_int8 >> cbuf) & 0x1;
1794                 is_int10 = (key->part.ps.epilog.color_is_int10 >> cbuf) & 0x1;
1795         }
1796
1797         args->compr = false;
1798         args->out[0] = base->undef;
1799         args->out[1] = base->undef;
1800         args->out[2] = base->undef;
1801         args->out[3] = base->undef;
1802
1803         switch (spi_shader_col_format) {
1804         case V_028714_SPI_SHADER_ZERO:
1805                 args->enabled_channels = 0; /* writemask */
1806                 args->target = V_008DFC_SQ_EXP_NULL;
1807                 break;
1808
1809         case V_028714_SPI_SHADER_32_R:
1810                 args->enabled_channels = 1; /* writemask */
1811                 args->out[0] = values[0];
1812                 break;
1813
1814         case V_028714_SPI_SHADER_32_GR:
1815                 args->enabled_channels = 0x3; /* writemask */
1816                 args->out[0] = values[0];
1817                 args->out[1] = values[1];
1818                 break;
1819
1820         case V_028714_SPI_SHADER_32_AR:
1821                 args->enabled_channels = 0x9; /* writemask */
1822                 args->out[0] = values[0];
1823                 args->out[3] = values[3];
1824                 break;
1825
1826         case V_028714_SPI_SHADER_FP16_ABGR:
1827                 args->compr = 1; /* COMPR flag */
1828
1829                 for (chan = 0; chan < 2; chan++) {
1830                         LLVMValueRef pack_args[2] = {
1831                                 values[2 * chan],
1832                                 values[2 * chan + 1]
1833                         };
1834                         LLVMValueRef packed;
1835
1836                         packed = ac_build_cvt_pkrtz_f16(&ctx->ac, pack_args);
1837                         args->out[chan] =
1838                                 LLVMBuildBitCast(ctx->gallivm.builder,
1839                                                  packed, ctx->f32, "");
1840                 }
1841                 break;
1842
1843         case V_028714_SPI_SHADER_UNORM16_ABGR:
1844                 for (chan = 0; chan < 4; chan++) {
1845                         val[chan] = ac_build_clamp(&ctx->ac, values[chan]);
1846                         val[chan] = LLVMBuildFMul(builder, val[chan],
1847                                                   LLVMConstReal(ctx->f32, 65535), "");
1848                         val[chan] = LLVMBuildFAdd(builder, val[chan],
1849                                                   LLVMConstReal(ctx->f32, 0.5), "");
1850                         val[chan] = LLVMBuildFPToUI(builder, val[chan],
1851                                                     ctx->i32, "");
1852                 }
1853
1854                 args->compr = 1; /* COMPR flag */
1855                 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1856                                   si_llvm_pack_two_int16(ctx, val));
1857                 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1858                                   si_llvm_pack_two_int16(ctx, val+2));
1859                 break;
1860
1861         case V_028714_SPI_SHADER_SNORM16_ABGR:
1862                 for (chan = 0; chan < 4; chan++) {
1863                         /* Clamp between [-1, 1]. */
1864                         val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MIN,
1865                                                               values[chan],
1866                                                               LLVMConstReal(ctx->f32, 1));
1867                         val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MAX,
1868                                                               val[chan],
1869                                                               LLVMConstReal(ctx->f32, -1));
1870                         /* Convert to a signed integer in [-32767, 32767]. */
1871                         val[chan] = LLVMBuildFMul(builder, val[chan],
1872                                                   LLVMConstReal(ctx->f32, 32767), "");
1873                         /* If positive, add 0.5, else add -0.5. */
1874                         val[chan] = LLVMBuildFAdd(builder, val[chan],
1875                                         LLVMBuildSelect(builder,
1876                                                 LLVMBuildFCmp(builder, LLVMRealOGE,
1877                                                               val[chan], base->zero, ""),
1878                                                 LLVMConstReal(ctx->f32, 0.5),
1879                                                 LLVMConstReal(ctx->f32, -0.5), ""), "");
1880                         val[chan] = LLVMBuildFPToSI(builder, val[chan], ctx->i32, "");
1881                 }
1882
1883                 args->compr = 1; /* COMPR flag */
1884                 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1885                                   si_llvm_pack_two_int32_as_int16(ctx, val));
1886                 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1887                                   si_llvm_pack_two_int32_as_int16(ctx, val+2));
1888                 break;
1889
1890         case V_028714_SPI_SHADER_UINT16_ABGR: {
1891                 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
1892                         is_int8 ? 255 : is_int10 ? 1023 : 65535, 0);
1893                 LLVMValueRef max_alpha =
1894                         !is_int10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);
1895
1896                 /* Clamp. */
1897                 for (chan = 0; chan < 4; chan++) {
1898                         val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
1899                         val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_UMIN,
1900                                         val[chan],
1901                                         chan == 3 ? max_alpha : max_rgb);
1902                 }
1903
1904                 args->compr = 1; /* COMPR flag */
1905                 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1906                                   si_llvm_pack_two_int16(ctx, val));
1907                 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1908                                   si_llvm_pack_two_int16(ctx, val+2));
1909                 break;
1910         }
1911
1912         case V_028714_SPI_SHADER_SINT16_ABGR: {
1913                 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
1914                         is_int8 ? 127 : is_int10 ? 511 : 32767, 0);
1915                 LLVMValueRef min_rgb = LLVMConstInt(ctx->i32,
1916                         is_int8 ? -128 : is_int10 ? -512 : -32768, 0);
1917                 LLVMValueRef max_alpha =
1918                         !is_int10 ? max_rgb : ctx->i32_1;
1919                 LLVMValueRef min_alpha =
1920                         !is_int10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);
1921
1922                 /* Clamp. */
1923                 for (chan = 0; chan < 4; chan++) {
1924                         val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
1925                         val[chan] = lp_build_emit_llvm_binary(bld_base,
1926                                         TGSI_OPCODE_IMIN,
1927                                         val[chan], chan == 3 ? max_alpha : max_rgb);
1928                         val[chan] = lp_build_emit_llvm_binary(bld_base,
1929                                         TGSI_OPCODE_IMAX,
1930                                         val[chan], chan == 3 ? min_alpha : min_rgb);
1931                 }
1932
1933                 args->compr = 1; /* COMPR flag */
1934                 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1935                                   si_llvm_pack_two_int32_as_int16(ctx, val));
1936                 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1937                                   si_llvm_pack_two_int32_as_int16(ctx, val+2));
1938                 break;
1939         }
1940
1941         case V_028714_SPI_SHADER_32_ABGR:
1942                 memcpy(&args->out[0], values, sizeof(values[0]) * 4);
1943                 break;
1944         }
1945 }
1946
1947 static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
1948                           LLVMValueRef alpha)
1949 {
1950         struct si_shader_context *ctx = si_shader_context(bld_base);
1951
1952         if (ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_NEVER) {
1953                 LLVMValueRef alpha_ref = LLVMGetParam(ctx->main_fn,
1954                                 SI_PARAM_ALPHA_REF);
1955
1956                 LLVMValueRef alpha_pass =
1957                         lp_build_cmp(&bld_base->base,
1958                                      ctx->shader->key.part.ps.epilog.alpha_func,
1959                                      alpha, alpha_ref);
1960                 LLVMValueRef arg =
1961                         lp_build_select(&bld_base->base,
1962                                         alpha_pass,
1963                                         LLVMConstReal(ctx->f32, 1.0f),
1964                                         LLVMConstReal(ctx->f32, -1.0f));
1965
1966                 ac_build_kill(&ctx->ac, arg);
1967         } else {
1968                 ac_build_kill(&ctx->ac, NULL);
1969         }
1970 }
1971
1972 static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
1973                                                   LLVMValueRef alpha,
1974                                                   unsigned samplemask_param)
1975 {
1976         struct si_shader_context *ctx = si_shader_context(bld_base);
1977         struct gallivm_state *gallivm = &ctx->gallivm;
1978         LLVMValueRef coverage;
1979
1980         /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
1981         coverage = LLVMGetParam(ctx->main_fn,
1982                                 samplemask_param);
1983         coverage = bitcast(bld_base, TGSI_TYPE_SIGNED, coverage);
1984
1985         coverage = lp_build_intrinsic(gallivm->builder, "llvm.ctpop.i32",
1986                                    ctx->i32,
1987                                    &coverage, 1, LP_FUNC_ATTR_READNONE);
1988
1989         coverage = LLVMBuildUIToFP(gallivm->builder, coverage,
1990                                    ctx->f32, "");
1991
1992         coverage = LLVMBuildFMul(gallivm->builder, coverage,
1993                                  LLVMConstReal(ctx->f32,
1994                                         1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
1995
1996         return LLVMBuildFMul(gallivm->builder, alpha, coverage, "");
1997 }
1998
1999 static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context *bld_base,
2000                                     struct ac_export_args *pos, LLVMValueRef *out_elts)
2001 {
2002         struct si_shader_context *ctx = si_shader_context(bld_base);
2003         struct lp_build_context *base = &bld_base->base;
2004         unsigned reg_index;
2005         unsigned chan;
2006         unsigned const_chan;
2007         LLVMValueRef base_elt;
2008         LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
2009         LLVMValueRef constbuf_index = LLVMConstInt(ctx->i32,
2010                                                    SI_VS_CONST_CLIP_PLANES, 0);
2011         LLVMValueRef const_resource = ac_build_indexed_load_const(&ctx->ac, ptr, constbuf_index);
2012
2013         for (reg_index = 0; reg_index < 2; reg_index ++) {
2014                 struct ac_export_args *args = &pos[2 + reg_index];
2015
2016                 args->out[0] =
2017                 args->out[1] =
2018                 args->out[2] =
2019                 args->out[3] = LLVMConstReal(ctx->f32, 0.0f);
2020
2021                 /* Compute dot products of position and user clip plane vectors */
2022                 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2023                         for (const_chan = 0; const_chan < TGSI_NUM_CHANNELS; const_chan++) {
2024                                 LLVMValueRef addr =
2025                                         LLVMConstInt(ctx->i32, ((reg_index * 4 + chan) * 4 +
2026                                                                 const_chan) * 4, 0);
2027                                 base_elt = buffer_load_const(ctx, const_resource,
2028                                                              addr);
2029                                 args->out[chan] =
2030                                         lp_build_add(base, args->out[chan],
2031                                                      lp_build_mul(base, base_elt,
2032                                                                   out_elts[const_chan]));
2033                         }
2034                 }
2035
2036                 args->enabled_channels = 0xf;
2037                 args->valid_mask = 0;
2038                 args->done = 0;
2039                 args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index;
2040                 args->compr = 0;
2041         }
2042 }
2043
2044 static void si_dump_streamout(struct pipe_stream_output_info *so)
2045 {
2046         unsigned i;
2047
2048         if (so->num_outputs)
2049                 fprintf(stderr, "STREAMOUT\n");
2050
2051         for (i = 0; i < so->num_outputs; i++) {
2052                 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
2053                                 so->output[i].start_component;
2054                 fprintf(stderr, "  %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
2055                         i, so->output[i].output_buffer,
2056                         so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
2057                         so->output[i].register_index,
2058                         mask & 1 ? "x" : "",
2059                         mask & 2 ? "y" : "",
2060                         mask & 4 ? "z" : "",
2061                         mask & 8 ? "w" : "");
2062         }
2063 }
2064
2065 static void emit_streamout_output(struct si_shader_context *ctx,
2066                                   LLVMValueRef const *so_buffers,
2067                                   LLVMValueRef const *so_write_offsets,
2068                                   struct pipe_stream_output *stream_out,
2069                                   struct si_shader_output_values *shader_out)
2070 {
2071         struct gallivm_state *gallivm = &ctx->gallivm;
2072         LLVMBuilderRef builder = gallivm->builder;
2073         unsigned buf_idx = stream_out->output_buffer;
2074         unsigned start = stream_out->start_component;
2075         unsigned num_comps = stream_out->num_components;
2076         LLVMValueRef out[4];
2077
2078         assert(num_comps && num_comps <= 4);
2079         if (!num_comps || num_comps > 4)
2080                 return;
2081
2082         /* Load the output as int. */
2083         for (int j = 0; j < num_comps; j++) {
2084                 assert(stream_out->stream == shader_out->vertex_stream[start + j]);
2085
2086                 out[j] = LLVMBuildBitCast(builder,
2087                                           shader_out->values[start + j],
2088                                 ctx->i32, "");
2089         }
2090
2091         /* Pack the output. */
2092         LLVMValueRef vdata = NULL;
2093
2094         switch (num_comps) {
2095         case 1: /* as i32 */
2096                 vdata = out[0];
2097                 break;
2098         case 2: /* as v2i32 */
2099         case 3: /* as v4i32 (aligned to 4) */
2100         case 4: /* as v4i32 */
2101                 vdata = LLVMGetUndef(LLVMVectorType(ctx->i32, util_next_power_of_two(num_comps)));
2102                 for (int j = 0; j < num_comps; j++) {
2103                         vdata = LLVMBuildInsertElement(builder, vdata, out[j],
2104                                                        LLVMConstInt(ctx->i32, j, 0), "");
2105                 }
2106                 break;
2107         }
2108
2109         ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx],
2110                                     vdata, num_comps,
2111                                     so_write_offsets[buf_idx],
2112                                     ctx->i32_0,
2113                                     stream_out->dst_offset * 4, 1, 1, true, false);
2114 }
2115
2116 /**
2117  * Write streamout data to buffers for vertex stream @p stream (different
2118  * vertex streams can occur for GS copy shaders).
2119  */
2120 static void si_llvm_emit_streamout(struct si_shader_context *ctx,
2121                                    struct si_shader_output_values *outputs,
2122                                    unsigned noutput, unsigned stream)
2123 {
2124         struct si_shader_selector *sel = ctx->shader->selector;
2125         struct pipe_stream_output_info *so = &sel->so;
2126         struct gallivm_state *gallivm = &ctx->gallivm;
2127         LLVMBuilderRef builder = gallivm->builder;
2128         int i;
2129         struct lp_build_if_state if_ctx;
2130
2131         /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
2132         LLVMValueRef so_vtx_count =
2133                 unpack_param(ctx, ctx->param_streamout_config, 16, 7);
2134
2135         LLVMValueRef tid = ac_get_thread_id(&ctx->ac);
2136
2137         /* can_emit = tid < so_vtx_count; */
2138         LLVMValueRef can_emit =
2139                 LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");
2140
2141         /* Emit the streamout code conditionally. This actually avoids
2142          * out-of-bounds buffer access. The hw tells us via the SGPR
2143          * (so_vtx_count) which threads are allowed to emit streamout data. */
2144         lp_build_if(&if_ctx, gallivm, can_emit);
2145         {
2146                 /* The buffer offset is computed as follows:
2147                  *   ByteOffset = streamout_offset[buffer_id]*4 +
2148                  *                (streamout_write_index + thread_id)*stride[buffer_id] +
2149                  *                attrib_offset
2150                  */
2151
2152                 LLVMValueRef so_write_index =
2153                         LLVMGetParam(ctx->main_fn,
2154                                      ctx->param_streamout_write_index);
2155
2156                 /* Compute (streamout_write_index + thread_id). */
2157                 so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");
2158
2159                 /* Load the descriptor and compute the write offset for each
2160                  * enabled buffer. */
2161                 LLVMValueRef so_write_offset[4] = {};
2162                 LLVMValueRef so_buffers[4];
2163                 LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
2164                                                     ctx->param_rw_buffers);
2165
2166                 for (i = 0; i < 4; i++) {
2167                         if (!so->stride[i])
2168                                 continue;
2169
2170                         LLVMValueRef offset = LLVMConstInt(ctx->i32,
2171                                                            SI_VS_STREAMOUT_BUF0 + i, 0);
2172
2173                         so_buffers[i] = ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
2174
2175                         LLVMValueRef so_offset = LLVMGetParam(ctx->main_fn,
2176                                                               ctx->param_streamout_offset[i]);
2177                         so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), "");
2178
2179                         so_write_offset[i] = LLVMBuildMul(builder, so_write_index,
2180                                                           LLVMConstInt(ctx->i32, so->stride[i]*4, 0), "");
2181                         so_write_offset[i] = LLVMBuildAdd(builder, so_write_offset[i], so_offset, "");
2182                 }
2183
2184                 /* Write streamout data. */
2185                 for (i = 0; i < so->num_outputs; i++) {
2186                         unsigned reg = so->output[i].register_index;
2187
2188                         if (reg >= noutput)
2189                                 continue;
2190
2191                         if (stream != so->output[i].stream)
2192                                 continue;
2193
2194                         emit_streamout_output(ctx, so_buffers, so_write_offset,
2195                                               &so->output[i], &outputs[reg]);
2196                 }
2197         }
2198         lp_build_endif(&if_ctx);
2199 }
2200
2201
2202 /* Generate export instructions for hardware VS shader stage */
2203 static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base,
2204                               struct si_shader_output_values *outputs,
2205                               unsigned noutput)
2206 {
2207         struct si_shader_context *ctx = si_shader_context(bld_base);
2208         struct si_shader *shader = ctx->shader;
2209         struct lp_build_context *base = &bld_base->base;
2210         struct ac_export_args args, pos_args[4] = {};
2211         LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
2212         unsigned semantic_name, semantic_index;
2213         unsigned target;
2214         unsigned param_count = 0;
2215         unsigned pos_idx;
2216         int i;
2217
2218         for (i = 0; i < noutput; i++) {
2219                 semantic_name = outputs[i].semantic_name;
2220                 semantic_index = outputs[i].semantic_index;
2221                 bool export_param = true;
2222
2223                 switch (semantic_name) {
2224                 case TGSI_SEMANTIC_POSITION: /* ignore these */
2225                 case TGSI_SEMANTIC_PSIZE:
2226                 case TGSI_SEMANTIC_CLIPVERTEX:
2227                 case TGSI_SEMANTIC_EDGEFLAG:
2228                         break;
2229                 case TGSI_SEMANTIC_GENERIC:
2230                 case TGSI_SEMANTIC_CLIPDIST:
2231                         if (shader->key.opt.hw_vs.kill_outputs &
2232                             (1ull << si_shader_io_get_unique_index(semantic_name, semantic_index)))
2233                                 export_param = false;
2234                         break;
2235                 default:
2236                         if (shader->key.opt.hw_vs.kill_outputs2 &
2237                             (1u << si_shader_io_get_unique_index2(semantic_name, semantic_index)))
2238                                 export_param = false;
2239                         break;
2240                 }
2241
2242                 if (outputs[i].vertex_stream[0] != 0 &&
2243                     outputs[i].vertex_stream[1] != 0 &&
2244                     outputs[i].vertex_stream[2] != 0 &&
2245                     outputs[i].vertex_stream[3] != 0)
2246                         export_param = false;
2247
2248 handle_semantic:
2249                 /* Select the correct target */
2250                 switch(semantic_name) {
2251                 case TGSI_SEMANTIC_PSIZE:
2252                         psize_value = outputs[i].values[0];
2253                         continue;
2254                 case TGSI_SEMANTIC_EDGEFLAG:
2255                         edgeflag_value = outputs[i].values[0];
2256                         continue;
2257                 case TGSI_SEMANTIC_LAYER:
2258                         layer_value = outputs[i].values[0];
2259                         semantic_name = TGSI_SEMANTIC_GENERIC;
2260                         goto handle_semantic;
2261                 case TGSI_SEMANTIC_VIEWPORT_INDEX:
2262                         viewport_index_value = outputs[i].values[0];
2263                         semantic_name = TGSI_SEMANTIC_GENERIC;
2264                         goto handle_semantic;
2265                 case TGSI_SEMANTIC_POSITION:
2266                         target = V_008DFC_SQ_EXP_POS;
2267                         break;
2268                 case TGSI_SEMANTIC_CLIPDIST:
2269                         if (shader->key.opt.hw_vs.clip_disable) {
2270                                 semantic_name = TGSI_SEMANTIC_GENERIC;
2271                                 goto handle_semantic;
2272                         }
2273                         target = V_008DFC_SQ_EXP_POS + 2 + semantic_index;
2274                         break;
2275                 case TGSI_SEMANTIC_CLIPVERTEX:
2276                         if (shader->key.opt.hw_vs.clip_disable)
2277                                 continue;
2278                         si_llvm_emit_clipvertex(bld_base, pos_args, outputs[i].values);
2279                         continue;
2280                 case TGSI_SEMANTIC_COLOR:
2281                 case TGSI_SEMANTIC_BCOLOR:
2282                 case TGSI_SEMANTIC_PRIMID:
2283                 case TGSI_SEMANTIC_FOG:
2284                 case TGSI_SEMANTIC_TEXCOORD:
2285                 case TGSI_SEMANTIC_GENERIC:
2286                         if (!export_param)
2287                                 continue;
2288                         target = V_008DFC_SQ_EXP_PARAM + param_count;
2289                         assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
2290                         shader->info.vs_output_param_offset[i] = param_count;
2291                         param_count++;
2292                         break;
2293                 default:
2294                         target = 0;
2295                         fprintf(stderr,
2296                                 "Warning: SI unhandled vs output type:%d\n",
2297                                 semantic_name);
2298                 }
2299
2300                 si_llvm_init_export_args(bld_base, outputs[i].values, target, &args);
2301
2302                 if (target >= V_008DFC_SQ_EXP_POS &&
2303                     target <= (V_008DFC_SQ_EXP_POS + 3)) {
2304                         memcpy(&pos_args[target - V_008DFC_SQ_EXP_POS],
2305                                &args, sizeof(args));
2306                 } else {
2307                         ac_build_export(&ctx->ac, &args);
2308                 }
2309
2310                 if (semantic_name == TGSI_SEMANTIC_CLIPDIST) {
2311                         semantic_name = TGSI_SEMANTIC_GENERIC;
2312                         goto handle_semantic;
2313                 }
2314         }
2315
2316         shader->info.nr_param_exports = param_count;
2317
2318         /* We need to add the position output manually if it's missing. */
2319         if (!pos_args[0].out[0]) {
2320                 pos_args[0].enabled_channels = 0xf; /* writemask */
2321                 pos_args[0].valid_mask = 0; /* EXEC mask */
2322                 pos_args[0].done = 0; /* last export? */
2323                 pos_args[0].target = V_008DFC_SQ_EXP_POS;
2324                 pos_args[0].compr = 0; /* COMPR flag */
2325                 pos_args[0].out[0] = base->zero; /* X */
2326                 pos_args[0].out[1] = base->zero; /* Y */
2327                 pos_args[0].out[2] = base->zero; /* Z */
2328                 pos_args[0].out[3] = base->one;  /* W */
2329         }
2330
2331         /* Write the misc vector (point size, edgeflag, layer, viewport). */
2332         if (shader->selector->info.writes_psize ||
2333             shader->selector->info.writes_edgeflag ||
2334             shader->selector->info.writes_viewport_index ||
2335             shader->selector->info.writes_layer) {
2336                 pos_args[1].enabled_channels = shader->selector->info.writes_psize |
2337                                                (shader->selector->info.writes_edgeflag << 1) |
2338                                                (shader->selector->info.writes_layer << 2) |
2339                                                (shader->selector->info.writes_viewport_index << 3);
2340                 pos_args[1].valid_mask = 0; /* EXEC mask */
2341                 pos_args[1].done = 0; /* last export? */
2342                 pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
2343                 pos_args[1].compr = 0; /* COMPR flag */
2344                 pos_args[1].out[0] = base->zero; /* X */
2345                 pos_args[1].out[1] = base->zero; /* Y */
2346                 pos_args[1].out[2] = base->zero; /* Z */
2347                 pos_args[1].out[3] = base->zero; /* W */
2348
2349                 if (shader->selector->info.writes_psize)
2350                         pos_args[1].out[0] = psize_value;
2351
2352                 if (shader->selector->info.writes_edgeflag) {
2353                         /* The output is a float, but the hw expects an integer
2354                          * with the first bit containing the edge flag. */
2355                         edgeflag_value = LLVMBuildFPToUI(ctx->gallivm.builder,
2356                                                          edgeflag_value,
2357                                                          ctx->i32, "");
2358                         edgeflag_value = lp_build_min(&bld_base->int_bld,
2359                                                       edgeflag_value,
2360                                                       ctx->i32_1);
2361
2362                         /* The LLVM intrinsic expects a float. */
2363                         pos_args[1].out[1] = LLVMBuildBitCast(ctx->gallivm.builder,
2364                                                           edgeflag_value,
2365                                                           ctx->f32, "");
2366                 }
2367
2368                 if (shader->selector->info.writes_layer)
2369                         pos_args[1].out[2] = layer_value;
2370
2371                 if (shader->selector->info.writes_viewport_index)
2372                         pos_args[1].out[3] = viewport_index_value;
2373         }
2374
2375         for (i = 0; i < 4; i++)
2376                 if (pos_args[i].out[0])
2377                         shader->info.nr_pos_exports++;
2378
2379         pos_idx = 0;
2380         for (i = 0; i < 4; i++) {
2381                 if (!pos_args[i].out[0])
2382                         continue;
2383
2384                 /* Specify the target we are exporting */
2385                 pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;
2386
2387                 if (pos_idx == shader->info.nr_pos_exports)
2388                         /* Specify that this is the last export */
2389                         pos_args[i].done = 1;
2390
2391                 ac_build_export(&ctx->ac, &pos_args[i]);
2392         }
2393 }
2394
2395 /**
2396  * Forward all outputs from the vertex shader to the TES. This is only used
2397  * for the fixed function TCS.
2398  */
2399 static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
2400 {
2401         struct si_shader_context *ctx = si_shader_context(bld_base);
2402         struct gallivm_state *gallivm = &ctx->gallivm;
2403         LLVMValueRef invocation_id, rw_buffers, buffer, buffer_offset;
2404         LLVMValueRef lds_vertex_stride, lds_vertex_offset, lds_base;
2405         uint64_t inputs;
2406
2407         invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
2408
2409         rw_buffers = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
2410         buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
2411                         LLVMConstInt(ctx->i32, SI_HS_RING_TESS_OFFCHIP, 0));
2412
2413         buffer_offset = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
2414
2415         lds_vertex_stride = unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
2416         lds_vertex_offset = LLVMBuildMul(gallivm->builder, invocation_id,
2417                                          lds_vertex_stride, "");
2418         lds_base = get_tcs_in_current_patch_offset(ctx);
2419         lds_base = LLVMBuildAdd(gallivm->builder, lds_base, lds_vertex_offset, "");
2420
2421         inputs = ctx->shader->key.mono.ff_tcs_inputs_to_copy;
2422         while (inputs) {
2423                 unsigned i = u_bit_scan64(&inputs);
2424
2425                 LLVMValueRef lds_ptr = LLVMBuildAdd(gallivm->builder, lds_base,
2426                                             LLVMConstInt(ctx->i32, 4 * i, 0),
2427                                              "");
2428
2429                 LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx,
2430                                               get_rel_patch_id(ctx),
2431                                               invocation_id,
2432                                               LLVMConstInt(ctx->i32, i, 0));
2433
2434                 LLVMValueRef value = lds_load(bld_base, TGSI_TYPE_SIGNED, ~0,
2435                                               lds_ptr);
2436
2437                 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buffer_addr,
2438                                             buffer_offset, 0, 1, 0, true, false);
2439         }
2440 }
2441
2442 static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
2443                                   LLVMValueRef rel_patch_id,
2444                                   LLVMValueRef invocation_id,
2445                                   LLVMValueRef tcs_out_current_patch_data_offset)
2446 {
2447         struct si_shader_context *ctx = si_shader_context(bld_base);
2448         struct gallivm_state *gallivm = &ctx->gallivm;
2449         struct si_shader *shader = ctx->shader;
2450         unsigned tess_inner_index, tess_outer_index;
2451         LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
2452         LLVMValueRef out[6], vec0, vec1, rw_buffers, tf_base, inner[4], outer[4];
2453         unsigned stride, outer_comps, inner_comps, i;
2454         struct lp_build_if_state if_ctx, inner_if_ctx;
2455
2456         si_llvm_emit_barrier(NULL, bld_base, NULL);
2457
2458         /* Do this only for invocation 0, because the tess levels are per-patch,
2459          * not per-vertex.
2460          *
2461          * This can't jump, because invocation 0 executes this. It should
2462          * at least mask out the loads and stores for other invocations.
2463          */
2464         lp_build_if(&if_ctx, gallivm,
2465                     LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
2466                                   invocation_id, ctx->i32_0, ""));
2467
2468         /* Determine the layout of one tess factor element in the buffer. */
2469         switch (shader->key.part.tcs.epilog.prim_mode) {
2470         case PIPE_PRIM_LINES:
2471                 stride = 2; /* 2 dwords, 1 vec2 store */
2472                 outer_comps = 2;
2473                 inner_comps = 0;
2474                 break;
2475         case PIPE_PRIM_TRIANGLES:
2476                 stride = 4; /* 4 dwords, 1 vec4 store */
2477                 outer_comps = 3;
2478                 inner_comps = 1;
2479                 break;
2480         case PIPE_PRIM_QUADS:
2481                 stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
2482                 outer_comps = 4;
2483                 inner_comps = 2;
2484                 break;
2485         default:
2486                 assert(0);
2487                 return;
2488         }
2489
2490         /* Load tess_inner and tess_outer from LDS.
2491          * Any invocation can write them, so we can't get them from a temporary.
2492          */
2493         tess_inner_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSINNER, 0);
2494         tess_outer_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSOUTER, 0);
2495
2496         lds_base = tcs_out_current_patch_data_offset;
2497         lds_inner = LLVMBuildAdd(gallivm->builder, lds_base,
2498                                  LLVMConstInt(ctx->i32,
2499                                               tess_inner_index * 4, 0), "");
2500         lds_outer = LLVMBuildAdd(gallivm->builder, lds_base,
2501                                  LLVMConstInt(ctx->i32,
2502                                               tess_outer_index * 4, 0), "");
2503
2504         for (i = 0; i < 4; i++) {
2505                 inner[i] = LLVMGetUndef(ctx->i32);
2506                 outer[i] = LLVMGetUndef(ctx->i32);
2507         }
2508
2509         if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) {
2510                 /* For isolines, the hardware expects tess factors in the
2511                  * reverse order from what GLSL / TGSI specify.
2512                  */
2513                 outer[0] = out[1] = lds_load(bld_base, TGSI_TYPE_SIGNED, 0, lds_outer);
2514                 outer[1] = out[0] = lds_load(bld_base, TGSI_TYPE_SIGNED, 1, lds_outer);
2515         } else {
2516                 for (i = 0; i < outer_comps; i++) {
2517                         outer[i] = out[i] =
2518                                 lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer);
2519                 }
2520                 for (i = 0; i < inner_comps; i++) {
2521                         inner[i] = out[outer_comps+i] =
2522                                 lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_inner);
2523                 }
2524         }
2525
2526         /* Convert the outputs to vectors for stores. */
2527         vec0 = lp_build_gather_values(gallivm, out, MIN2(stride, 4));
2528         vec1 = NULL;
2529
2530         if (stride > 4)
2531                 vec1 = lp_build_gather_values(gallivm, out+4, stride - 4);
2532
2533         /* Get the buffer. */
2534         rw_buffers = LLVMGetParam(ctx->main_fn,
2535                                   ctx->param_rw_buffers);
2536         buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
2537                         LLVMConstInt(ctx->i32, SI_HS_RING_TESS_FACTOR, 0));
2538
2539         /* Get the offset. */
2540         tf_base = LLVMGetParam(ctx->main_fn,
2541                                ctx->param_tcs_factor_offset);
2542         byteoffset = LLVMBuildMul(gallivm->builder, rel_patch_id,
2543                                   LLVMConstInt(ctx->i32, 4 * stride, 0), "");
2544
2545         lp_build_if(&inner_if_ctx, gallivm,
2546                     LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
2547                                   rel_patch_id, ctx->i32_0, ""));
2548
2549         /* Store the dynamic HS control word. */
2550         ac_build_buffer_store_dword(&ctx->ac, buffer,
2551                                     LLVMConstInt(ctx->i32, 0x80000000, 0),
2552                                     1, ctx->i32_0, tf_base,
2553                                     0, 1, 0, true, false);
2554
2555         lp_build_endif(&inner_if_ctx);
2556
2557         /* Store the tessellation factors. */
2558         ac_build_buffer_store_dword(&ctx->ac, buffer, vec0,
2559                                     MIN2(stride, 4), byteoffset, tf_base,
2560                                     4, 1, 0, true, false);
2561         if (vec1)
2562                 ac_build_buffer_store_dword(&ctx->ac, buffer, vec1,
2563                                             stride - 4, byteoffset, tf_base,
2564                                             20, 1, 0, true, false);
2565
2566         /* Store the tess factors into the offchip buffer if TES reads them. */
2567         if (shader->key.part.tcs.epilog.tes_reads_tess_factors) {
2568                 LLVMValueRef buf, base, inner_vec, outer_vec, tf_outer_offset;
2569                 LLVMValueRef tf_inner_offset;
2570                 unsigned param_outer, param_inner;
2571
2572                 buf = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
2573                                 LLVMConstInt(ctx->i32, SI_HS_RING_TESS_OFFCHIP, 0));
2574                 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
2575
2576                 param_outer = si_shader_io_get_unique_index(
2577                                       TGSI_SEMANTIC_TESSOUTER, 0);
2578                 tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
2579                                         LLVMConstInt(ctx->i32, param_outer, 0));
2580
2581                 outer_vec = lp_build_gather_values(gallivm, outer,
2582                                                    util_next_power_of_two(outer_comps));
2583
2584                 ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec,
2585                                             outer_comps, tf_outer_offset,
2586                                             base, 0, 1, 0, true, false);
2587                 if (inner_comps) {
2588                         param_inner = si_shader_io_get_unique_index(
2589                                               TGSI_SEMANTIC_TESSINNER, 0);
2590                         tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
2591                                         LLVMConstInt(ctx->i32, param_inner, 0));
2592
2593                         inner_vec = inner_comps == 1 ? inner[0] :
2594                                     lp_build_gather_values(gallivm, inner, inner_comps);
2595                         ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec,
2596                                                     inner_comps, tf_inner_offset,
2597                                                     base, 0, 1, 0, true, false);
2598                 }
2599         }
2600
2601         lp_build_endif(&if_ctx);
2602 }
2603
2604 /* This only writes the tessellation factor levels. */
2605 static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
2606 {
2607         struct si_shader_context *ctx = si_shader_context(bld_base);
2608         LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;
2609         LLVMValueRef offchip_soffset, offchip_layout;
2610
2611         si_copy_tcs_inputs(bld_base);
2612
2613         rel_patch_id = get_rel_patch_id(ctx);
2614         invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
2615         tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);
2616
2617         /* Return epilog parameters from this function. */
2618         LLVMBuilderRef builder = ctx->gallivm.builder;
2619         LLVMValueRef ret = ctx->return_value;
2620         LLVMValueRef rw_buffers, rw0, rw1, tf_soffset;
2621         unsigned vgpr;
2622
2623         /* RW_BUFFERS pointer */
2624         rw_buffers = LLVMGetParam(ctx->main_fn,
2625                                   ctx->param_rw_buffers);
2626         rw_buffers = LLVMBuildPtrToInt(builder, rw_buffers, ctx->i64, "");
2627         rw_buffers = LLVMBuildBitCast(builder, rw_buffers, ctx->v2i32, "");
2628         rw0 = LLVMBuildExtractElement(builder, rw_buffers,
2629                                       ctx->i32_0, "");
2630         rw1 = LLVMBuildExtractElement(builder, rw_buffers,
2631                                       ctx->i32_1, "");
2632         ret = LLVMBuildInsertValue(builder, ret, rw0, 0, "");
2633         ret = LLVMBuildInsertValue(builder, ret, rw1, 1, "");
2634
2635         /* Tess offchip and factor buffer soffset are after user SGPRs. */
2636         offchip_layout = LLVMGetParam(ctx->main_fn,
2637                                       ctx->param_tcs_offchip_layout);
2638         offchip_soffset = LLVMGetParam(ctx->main_fn,
2639                                        ctx->param_tcs_offchip_offset);
2640         tf_soffset = LLVMGetParam(ctx->main_fn,
2641                                   ctx->param_tcs_factor_offset);
2642         ret = LLVMBuildInsertValue(builder, ret, offchip_layout,
2643                                    GFX6_SGPR_TCS_OFFCHIP_LAYOUT, "");
2644         ret = LLVMBuildInsertValue(builder, ret, offchip_soffset,
2645                                    GFX6_TCS_NUM_USER_SGPR, "");
2646         ret = LLVMBuildInsertValue(builder, ret, tf_soffset,
2647                                    GFX6_TCS_NUM_USER_SGPR + 1, "");
2648
2649         /* VGPRs */
2650         rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id);
2651         invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id);
2652         tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset);
2653
2654         vgpr = GFX6_TCS_NUM_USER_SGPR + 2;
2655         ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
2656         ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
2657         ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
2658         ctx->return_value = ret;
2659 }
2660
2661 static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base)
2662 {
2663         struct si_shader_context *ctx = si_shader_context(bld_base);
2664         struct si_shader *shader = ctx->shader;
2665         struct tgsi_shader_info *info = &shader->selector->info;
2666         struct gallivm_state *gallivm = &ctx->gallivm;
2667         unsigned i, chan;
2668         LLVMValueRef vertex_id = LLVMGetParam(ctx->main_fn,
2669                                               ctx->param_rel_auto_id);
2670         LLVMValueRef vertex_dw_stride =
2671                 unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
2672         LLVMValueRef base_dw_addr = LLVMBuildMul(gallivm->builder, vertex_id,
2673                                                  vertex_dw_stride, "");
2674
2675         /* Write outputs to LDS. The next shader (TCS aka HS) will read
2676          * its inputs from it. */
2677         for (i = 0; i < info->num_outputs; i++) {
2678                 LLVMValueRef *out_ptr = ctx->outputs[i];
2679                 unsigned name = info->output_semantic_name[i];
2680                 unsigned index = info->output_semantic_index[i];
2681
2682                 /* The ARB_shader_viewport_layer_array spec contains the
2683                  * following issue:
2684                  *
2685                  *    2) What happens if gl_ViewportIndex or gl_Layer is
2686                  *    written in the vertex shader and a geometry shader is
2687                  *    present?
2688                  *
2689                  *    RESOLVED: The value written by the last vertex processing
2690                  *    stage is used. If the last vertex processing stage
2691                  *    (vertex, tessellation evaluation or geometry) does not
2692                  *    statically assign to gl_ViewportIndex or gl_Layer, index
2693                  *    or layer zero is assumed.
2694                  *
2695                  * So writes to those outputs in VS-as-LS are simply ignored.
2696                  */
2697                 if (name == TGSI_SEMANTIC_LAYER ||
2698                     name == TGSI_SEMANTIC_VIEWPORT_INDEX)
2699                         continue;
2700
2701                 int param = si_shader_io_get_unique_index(name, index);
2702                 LLVMValueRef dw_addr = LLVMBuildAdd(gallivm->builder, base_dw_addr,
2703                                         LLVMConstInt(ctx->i32, param * 4, 0), "");
2704
2705                 for (chan = 0; chan < 4; chan++) {
2706                         lds_store(bld_base, chan, dw_addr,
2707                                   LLVMBuildLoad(gallivm->builder, out_ptr[chan], ""));
2708                 }
2709         }
2710 }
2711
2712 static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context *bld_base)
2713 {
2714         struct si_shader_context *ctx = si_shader_context(bld_base);
2715         struct gallivm_state *gallivm = &ctx->gallivm;
2716         struct si_shader *es = ctx->shader;
2717         struct tgsi_shader_info *info = &es->selector->info;
2718         LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
2719                                             ctx->param_es2gs_offset);
2720         unsigned chan;
2721         int i;
2722
2723         for (i = 0; i < info->num_outputs; i++) {
2724                 LLVMValueRef *out_ptr = ctx->outputs[i];
2725                 int param_index;
2726
2727                 if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
2728                     info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
2729                         continue;
2730
2731                 param_index = si_shader_io_get_unique_index(info->output_semantic_name[i],
2732                                                             info->output_semantic_index[i]);
2733
2734                 for (chan = 0; chan < 4; chan++) {
2735                         LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
2736                         out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");
2737
2738                         ac_build_buffer_store_dword(&ctx->ac,
2739                                                     ctx->esgs_ring,
2740                                                     out_val, 1, NULL, soffset,
2741                                                     (4 * param_index + chan) * 4,
2742                                                     1, 1, true, true);
2743                 }
2744         }
2745 }
2746
2747 static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
2748 {
2749         struct si_shader_context *ctx = si_shader_context(bld_base);
2750
2751         ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE,
2752                          LLVMGetParam(ctx->main_fn, ctx->param_gs_wave_id));
2753 }
2754
2755 static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base)
2756 {
2757         struct si_shader_context *ctx = si_shader_context(bld_base);
2758         struct gallivm_state *gallivm = &ctx->gallivm;
2759         struct tgsi_shader_info *info = &ctx->shader->selector->info;
2760         struct si_shader_output_values *outputs = NULL;
2761         int i,j;
2762
2763         assert(!ctx->shader->is_gs_copy_shader);
2764
2765         outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));
2766
2767         /* Vertex color clamping.
2768          *
2769          * This uses a state constant loaded in a user data SGPR and
2770          * an IF statement is added that clamps all colors if the constant
2771          * is true.
2772          */
2773         if (ctx->type == PIPE_SHADER_VERTEX) {
2774                 struct lp_build_if_state if_ctx;
2775                 LLVMValueRef cond = NULL;
2776                 LLVMValueRef addr, val;
2777
2778                 for (i = 0; i < info->num_outputs; i++) {
2779                         if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR &&
2780                             info->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR)
2781                                 continue;
2782
2783                         /* We've found a color. */
2784                         if (!cond) {
2785                                 /* The state is in the first bit of the user SGPR. */
2786                                 cond = LLVMGetParam(ctx->main_fn,
2787                                                     ctx->param_vs_state_bits);
2788                                 cond = LLVMBuildTrunc(gallivm->builder, cond,
2789                                                       ctx->i1, "");
2790                                 lp_build_if(&if_ctx, gallivm, cond);
2791                         }
2792
2793                         for (j = 0; j < 4; j++) {
2794                                 addr = ctx->outputs[i][j];
2795                                 val = LLVMBuildLoad(gallivm->builder, addr, "");
2796                                 val = ac_build_clamp(&ctx->ac, val);
2797                                 LLVMBuildStore(gallivm->builder, val, addr);
2798                         }
2799                 }
2800
2801                 if (cond)
2802                         lp_build_endif(&if_ctx);
2803         }
2804
2805         for (i = 0; i < info->num_outputs; i++) {
2806                 outputs[i].semantic_name = info->output_semantic_name[i];
2807                 outputs[i].semantic_index = info->output_semantic_index[i];
2808
2809                 for (j = 0; j < 4; j++) {
2810                         outputs[i].values[j] =
2811                                 LLVMBuildLoad(gallivm->builder,
2812                                               ctx->outputs[i][j],
2813                                               "");
2814                         outputs[i].vertex_stream[j] =
2815                                 (info->output_streams[i] >> (2 * j)) & 3;
2816                 }
2817
2818         }
2819
2820         /* Return the primitive ID from the LLVM function. */
2821         ctx->return_value =
2822                 LLVMBuildInsertValue(gallivm->builder,
2823                                      ctx->return_value,
2824                                      bitcast(bld_base, TGSI_TYPE_FLOAT,
2825                                              get_primitive_id(bld_base, 0)),
2826                                      VS_EPILOG_PRIMID_LOC, "");
2827
2828         if (ctx->shader->selector->so.num_outputs)
2829                 si_llvm_emit_streamout(ctx, outputs, i, 0);
2830         si_llvm_export_vs(bld_base, outputs, i);
2831         FREE(outputs);
2832 }
2833
2834 struct si_ps_exports {
2835         unsigned num;
2836         struct ac_export_args args[10];
2837 };
2838
2839 unsigned si_get_spi_shader_z_format(bool writes_z, bool writes_stencil,
2840                                     bool writes_samplemask)
2841 {
2842         if (writes_z) {
2843                 /* Z needs 32 bits. */
2844                 if (writes_samplemask)
2845                         return V_028710_SPI_SHADER_32_ABGR;
2846                 else if (writes_stencil)
2847                         return V_028710_SPI_SHADER_32_GR;
2848                 else
2849                         return V_028710_SPI_SHADER_32_R;
2850         } else if (writes_stencil || writes_samplemask) {
2851                 /* Both stencil and sample mask need only 16 bits. */
2852                 return V_028710_SPI_SHADER_UINT16_ABGR;
2853         } else {
2854                 return V_028710_SPI_SHADER_ZERO;
2855         }
2856 }
2857
2858 static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
2859                             LLVMValueRef depth, LLVMValueRef stencil,
2860                             LLVMValueRef samplemask, struct si_ps_exports *exp)
2861 {
2862         struct si_shader_context *ctx = si_shader_context(bld_base);
2863         struct lp_build_context *base = &bld_base->base;
2864         struct ac_export_args args;
2865         unsigned mask = 0;
2866         unsigned format = si_get_spi_shader_z_format(depth != NULL,
2867                                                      stencil != NULL,
2868                                                      samplemask != NULL);
2869
2870         assert(depth || stencil || samplemask);
2871
2872         args.valid_mask = 1; /* whether the EXEC mask is valid */
2873         args.done = 1; /* DONE bit */
2874
2875         /* Specify the target we are exporting */
2876         args.target = V_008DFC_SQ_EXP_MRTZ;
2877
2878         args.compr = 0; /* COMP flag */
2879         args.out[0] = base->undef; /* R, depth */
2880         args.out[1] = base->undef; /* G, stencil test value[0:7], stencil op value[8:15] */
2881         args.out[2] = base->undef; /* B, sample mask */
2882         args.out[3] = base->undef; /* A, alpha to mask */
2883
2884         if (format == V_028710_SPI_SHADER_UINT16_ABGR) {
2885                 assert(!depth);
2886                 args.compr = 1; /* COMPR flag */
2887
2888                 if (stencil) {
2889                         /* Stencil should be in X[23:16]. */
2890                         stencil = bitcast(bld_base, TGSI_TYPE_UNSIGNED, stencil);
2891                         stencil = LLVMBuildShl(ctx->gallivm.builder, stencil,
2892                                                LLVMConstInt(ctx->i32, 16, 0), "");
2893                         args.out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT, stencil);
2894                         mask |= 0x3;
2895                 }
2896                 if (samplemask) {
2897                         /* SampleMask should be in Y[15:0]. */
2898                         args.out[1] = samplemask;
2899                         mask |= 0xc;
2900                 }
2901         } else {
2902                 if (depth) {
2903                         args.out[0] = depth;
2904                         mask |= 0x1;
2905                 }
2906                 if (stencil) {
2907                         args.out[1] = stencil;
2908                         mask |= 0x2;
2909                 }
2910                 if (samplemask) {
2911                         args.out[2] = samplemask;
2912                         mask |= 0x4;
2913                 }
2914         }
2915
2916         /* SI (except OLAND and HAINAN) has a bug that it only looks
2917          * at the X writemask component. */
2918         if (ctx->screen->b.chip_class == SI &&
2919             ctx->screen->b.family != CHIP_OLAND &&
2920             ctx->screen->b.family != CHIP_HAINAN)
2921                 mask |= 0x1;
2922
2923         /* Specify which components to enable */
2924         args.enabled_channels = mask;
2925
2926         memcpy(&exp->args[exp->num++], &args, sizeof(args));
2927 }
2928
2929 static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
2930                                 LLVMValueRef *color, unsigned index,
2931                                 unsigned samplemask_param,
2932                                 bool is_last, struct si_ps_exports *exp)
2933 {
2934         struct si_shader_context *ctx = si_shader_context(bld_base);
2935         struct lp_build_context *base = &bld_base->base;
2936         int i;
2937
2938         /* Clamp color */
2939         if (ctx->shader->key.part.ps.epilog.clamp_color)
2940                 for (i = 0; i < 4; i++)
2941                         color[i] = ac_build_clamp(&ctx->ac, color[i]);
2942
2943         /* Alpha to one */
2944         if (ctx->shader->key.part.ps.epilog.alpha_to_one)
2945                 color[3] = base->one;
2946
2947         /* Alpha test */
2948         if (index == 0 &&
2949             ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
2950                 si_alpha_test(bld_base, color[3]);
2951
2952         /* Line & polygon smoothing */
2953         if (ctx->shader->key.part.ps.epilog.poly_line_smoothing)
2954                 color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3],
2955                                                          samplemask_param);
2956
2957         /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
2958         if (ctx->shader->key.part.ps.epilog.last_cbuf > 0) {
2959                 struct ac_export_args args[8];
2960                 int c, last = -1;
2961
2962                 /* Get the export arguments, also find out what the last one is. */
2963                 for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
2964                         si_llvm_init_export_args(bld_base, color,
2965                                                  V_008DFC_SQ_EXP_MRT + c, &args[c]);
2966                         if (args[c].enabled_channels)
2967                                 last = c;
2968                 }
2969
2970                 /* Emit all exports. */
2971                 for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
2972                         if (is_last && last == c) {
2973                                 args[c].valid_mask = 1; /* whether the EXEC mask is valid */
2974                                 args[c].done = 1; /* DONE bit */
2975                         } else if (!args[c].enabled_channels)
2976                                 continue; /* unnecessary NULL export */
2977
2978                         memcpy(&exp->args[exp->num++], &args[c], sizeof(args[c]));
2979                 }
2980         } else {
2981                 struct ac_export_args args;
2982
2983                 /* Export */
2984                 si_llvm_init_export_args(bld_base, color, V_008DFC_SQ_EXP_MRT + index,
2985                                          &args);
2986                 if (is_last) {
2987                         args.valid_mask = 1; /* whether the EXEC mask is valid */
2988                         args.done = 1; /* DONE bit */
2989                 } else if (!args.enabled_channels)
2990                         return; /* unnecessary NULL export */
2991
2992                 memcpy(&exp->args[exp->num++], &args, sizeof(args));
2993         }
2994 }
2995
2996 static void si_emit_ps_exports(struct si_shader_context *ctx,
2997                                struct si_ps_exports *exp)
2998 {
2999         for (unsigned i = 0; i < exp->num; i++)
3000                 ac_build_export(&ctx->ac, &exp->args[i]);
3001 }
3002
3003 static void si_export_null(struct lp_build_tgsi_context *bld_base)
3004 {
3005         struct si_shader_context *ctx = si_shader_context(bld_base);
3006         struct lp_build_context *base = &bld_base->base;
3007         struct ac_export_args args;
3008
3009         args.enabled_channels = 0x0; /* enabled channels */
3010         args.valid_mask = 1; /* whether the EXEC mask is valid */
3011         args.done = 1; /* DONE bit */
3012         args.target = V_008DFC_SQ_EXP_NULL;
3013         args.compr = 0; /* COMPR flag (0 = 32-bit export) */
3014         args.out[0] = base->undef; /* R */
3015         args.out[1] = base->undef; /* G */
3016         args.out[2] = base->undef; /* B */
3017         args.out[3] = base->undef; /* A */
3018
3019         ac_build_export(&ctx->ac, &args);
3020 }
3021
3022 /**
3023  * Return PS outputs in this order:
3024  *
3025  * v[0:3] = color0.xyzw
3026  * v[4:7] = color1.xyzw
3027  * ...
3028  * vN+0 = Depth
3029  * vN+1 = Stencil
3030  * vN+2 = SampleMask
3031  * vN+3 = SampleMaskIn (used for OpenGL smoothing)
3032  *
3033  * The alpha-ref SGPR is returned via its original location.
3034  */
3035 static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base)
3036 {
3037         struct si_shader_context *ctx = si_shader_context(bld_base);
3038         struct si_shader *shader = ctx->shader;
3039         struct tgsi_shader_info *info = &shader->selector->info;
3040         LLVMBuilderRef builder = ctx->gallivm.builder;
3041         unsigned i, j, first_vgpr, vgpr;
3042
3043         LLVMValueRef color[8][4] = {};
3044         LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
3045         LLVMValueRef ret;
3046
3047         /* Read the output values. */
3048         for (i = 0; i < info->num_outputs; i++) {
3049                 unsigned semantic_name = info->output_semantic_name[i];
3050                 unsigned semantic_index = info->output_semantic_index[i];
3051
3052                 switch (semantic_name) {
3053                 case TGSI_SEMANTIC_COLOR:
3054                         assert(semantic_index < 8);
3055                         for (j = 0; j < 4; j++) {
3056                                 LLVMValueRef ptr = ctx->outputs[i][j];
3057                                 LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
3058                                 color[semantic_index][j] = result;
3059                         }
3060                         break;
3061                 case TGSI_SEMANTIC_POSITION:
3062                         depth = LLVMBuildLoad(builder,
3063                                               ctx->outputs[i][2], "");
3064                         break;
3065                 case TGSI_SEMANTIC_STENCIL:
3066                         stencil = LLVMBuildLoad(builder,
3067                                                 ctx->outputs[i][1], "");
3068                         break;
3069                 case TGSI_SEMANTIC_SAMPLEMASK:
3070                         samplemask = LLVMBuildLoad(builder,
3071                                                    ctx->outputs[i][0], "");
3072                         break;
3073                 default:
3074                         fprintf(stderr, "Warning: SI unhandled fs output type:%d\n",
3075                                 semantic_name);
3076                 }
3077         }
3078
3079         /* Fill the return structure. */
3080         ret = ctx->return_value;
3081
3082         /* Set SGPRs. */
3083         ret = LLVMBuildInsertValue(builder, ret,
3084                                    bitcast(bld_base, TGSI_TYPE_SIGNED,
3085                                            LLVMGetParam(ctx->main_fn,
3086                                                         SI_PARAM_ALPHA_REF)),
3087                                    SI_SGPR_ALPHA_REF, "");
3088
3089         /* Set VGPRs */
3090         first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
3091         for (i = 0; i < ARRAY_SIZE(color); i++) {
3092                 if (!color[i][0])
3093                         continue;
3094
3095                 for (j = 0; j < 4; j++)
3096                         ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
3097         }
3098         if (depth)
3099                 ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
3100         if (stencil)
3101                 ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
3102         if (samplemask)
3103                 ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");
3104
3105         /* Add the input sample mask for smoothing at the end. */
3106         if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
3107                 vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
3108         ret = LLVMBuildInsertValue(builder, ret,
3109                                    LLVMGetParam(ctx->main_fn,
3110                                                 SI_PARAM_SAMPLE_COVERAGE), vgpr++, "");
3111
3112         ctx->return_value = ret;
3113 }
3114
3115 /**
3116  * Given a v8i32 resource descriptor for a buffer, extract the size of the
3117  * buffer in number of elements and return it as an i32.
3118  */
3119 static LLVMValueRef get_buffer_size(
3120         struct lp_build_tgsi_context *bld_base,
3121         LLVMValueRef descriptor)
3122 {
3123         struct si_shader_context *ctx = si_shader_context(bld_base);
3124         struct gallivm_state *gallivm = &ctx->gallivm;
3125         LLVMBuilderRef builder = gallivm->builder;
3126         LLVMValueRef size =
3127                 LLVMBuildExtractElement(builder, descriptor,
3128                                         LLVMConstInt(ctx->i32, 2, 0), "");
3129
3130         if (ctx->screen->b.chip_class == VI) {
3131                 /* On VI, the descriptor contains the size in bytes,
3132                  * but TXQ must return the size in elements.
3133                  * The stride is always non-zero for resources using TXQ.
3134                  */
3135                 LLVMValueRef stride =
3136                         LLVMBuildExtractElement(builder, descriptor,
3137                                                 ctx->i32_1, "");
3138                 stride = LLVMBuildLShr(builder, stride,
3139                                        LLVMConstInt(ctx->i32, 16, 0), "");
3140                 stride = LLVMBuildAnd(builder, stride,
3141                                       LLVMConstInt(ctx->i32, 0x3FFF, 0), "");
3142
3143                 size = LLVMBuildUDiv(builder, size, stride, "");
3144         }
3145
3146         return size;
3147 }
3148
3149 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
3150                                 struct lp_build_tgsi_context *bld_base,
3151                                 struct lp_build_emit_data *emit_data);
3152
3153 /* Prevent optimizations (at least of memory accesses) across the current
3154  * point in the program by emitting empty inline assembly that is marked as
3155  * having side effects.
3156  *
3157  * Optionally, a value can be passed through the inline assembly to prevent
3158  * LLVM from hoisting calls to ReadNone functions.
3159  */
3160 static void emit_optimization_barrier(struct si_shader_context *ctx,
3161                                       LLVMValueRef *pvgpr)
3162 {
3163         static int counter = 0;
3164
3165         LLVMBuilderRef builder = ctx->gallivm.builder;
3166         char code[16];
3167
3168         snprintf(code, sizeof(code), "; %d", p_atomic_inc_return(&counter));
3169
3170         if (!pvgpr) {
3171                 LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
3172                 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);
3173                 LLVMBuildCall(builder, inlineasm, NULL, 0, "");
3174         } else {
3175                 LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
3176                 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "=v,0", true, false);
3177                 LLVMValueRef vgpr = *pvgpr;
3178                 LLVMTypeRef vgpr_type = LLVMTypeOf(vgpr);
3179                 unsigned vgpr_size = llvm_get_type_size(vgpr_type);
3180                 LLVMValueRef vgpr0;
3181
3182                 assert(vgpr_size % 4 == 0);
3183
3184                 vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), "");
3185                 vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, "");
3186                 vgpr0 = LLVMBuildCall(builder, inlineasm, &vgpr0, 1, "");
3187                 vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, "");
3188                 vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, "");
3189
3190                 *pvgpr = vgpr;
3191         }
3192 }
3193
3194 /* Combine these with & instead of |. */
3195 #define NOOP_WAITCNT 0xf7f
3196 #define LGKM_CNT 0x07f
3197 #define VM_CNT 0xf70
3198
3199 static void emit_waitcnt(struct si_shader_context *ctx, unsigned simm16)
3200 {
3201         struct gallivm_state *gallivm = &ctx->gallivm;
3202         LLVMBuilderRef builder = gallivm->builder;
3203         LLVMValueRef args[1] = {
3204                 LLVMConstInt(ctx->i32, simm16, 0)
3205         };
3206         lp_build_intrinsic(builder, "llvm.amdgcn.s.waitcnt",
3207                            ctx->voidt, args, 1, 0);
3208 }
3209
3210 static void membar_emit(
3211                 const struct lp_build_tgsi_action *action,
3212                 struct lp_build_tgsi_context *bld_base,
3213                 struct lp_build_emit_data *emit_data)
3214 {
3215         struct si_shader_context *ctx = si_shader_context(bld_base);
3216         LLVMValueRef src0 = lp_build_emit_fetch(bld_base, emit_data->inst, 0, 0);
3217         unsigned flags = LLVMConstIntGetZExtValue(src0);
3218         unsigned waitcnt = NOOP_WAITCNT;
3219
3220         if (flags & TGSI_MEMBAR_THREAD_GROUP)
3221                 waitcnt &= VM_CNT & LGKM_CNT;
3222
3223         if (flags & (TGSI_MEMBAR_ATOMIC_BUFFER |
3224                      TGSI_MEMBAR_SHADER_BUFFER |
3225                      TGSI_MEMBAR_SHADER_IMAGE))
3226                 waitcnt &= VM_CNT;
3227
3228         if (flags & TGSI_MEMBAR_SHARED)
3229                 waitcnt &= LGKM_CNT;
3230
3231         if (waitcnt != NOOP_WAITCNT)
3232                 emit_waitcnt(ctx, waitcnt);
3233 }
3234
3235 static void clock_emit(
3236                 const struct lp_build_tgsi_action *action,
3237                 struct lp_build_tgsi_context *bld_base,
3238                 struct lp_build_emit_data *emit_data)
3239 {
3240         struct si_shader_context *ctx = si_shader_context(bld_base);
3241         struct gallivm_state *gallivm = &ctx->gallivm;
3242         LLVMValueRef tmp;
3243
3244         tmp = lp_build_intrinsic(gallivm->builder, "llvm.readcyclecounter",
3245                                  ctx->i64, NULL, 0, 0);
3246         tmp = LLVMBuildBitCast(gallivm->builder, tmp, ctx->v2i32, "");
3247
3248         emit_data->output[0] =
3249                 LLVMBuildExtractElement(gallivm->builder, tmp, ctx->i32_0, "");
3250         emit_data->output[1] =
3251                 LLVMBuildExtractElement(gallivm->builder, tmp, ctx->i32_1, "");
3252 }
3253
3254 static LLVMValueRef
3255 shader_buffer_fetch_rsrc(struct si_shader_context *ctx,
3256                          const struct tgsi_full_src_register *reg)
3257 {
3258         LLVMValueRef index;
3259         LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn,
3260                                              ctx->param_shader_buffers);
3261
3262         if (!reg->Register.Indirect)
3263                 index = LLVMConstInt(ctx->i32, reg->Register.Index, 0);
3264         else
3265                 index = get_bounded_indirect_index(ctx, &reg->Indirect,
3266                                                    reg->Register.Index,
3267                                                    SI_NUM_SHADER_BUFFERS);
3268
3269         return ac_build_indexed_load_const(&ctx->ac, rsrc_ptr, index);
3270 }
3271
3272 static bool tgsi_is_array_sampler(unsigned target)
3273 {
3274         return target == TGSI_TEXTURE_1D_ARRAY ||
3275                target == TGSI_TEXTURE_SHADOW1D_ARRAY ||
3276                target == TGSI_TEXTURE_2D_ARRAY ||
3277                target == TGSI_TEXTURE_SHADOW2D_ARRAY ||
3278                target == TGSI_TEXTURE_CUBE_ARRAY ||
3279                target == TGSI_TEXTURE_SHADOWCUBE_ARRAY ||
3280                target == TGSI_TEXTURE_2D_ARRAY_MSAA;
3281 }
3282
3283 static bool tgsi_is_array_image(unsigned target)
3284 {
3285         return target == TGSI_TEXTURE_3D ||
3286                target == TGSI_TEXTURE_CUBE ||
3287                target == TGSI_TEXTURE_1D_ARRAY ||
3288                target == TGSI_TEXTURE_2D_ARRAY ||
3289                target == TGSI_TEXTURE_CUBE_ARRAY ||
3290                target == TGSI_TEXTURE_2D_ARRAY_MSAA;
3291 }
3292
3293 /**
3294  * Given a 256-bit resource descriptor, force the DCC enable bit to off.
3295  *
3296  * At least on Tonga, executing image stores on images with DCC enabled and
3297  * non-trivial can eventually lead to lockups. This can occur when an
3298  * application binds an image as read-only but then uses a shader that writes
3299  * to it. The OpenGL spec allows almost arbitrarily bad behavior (including
3300  * program termination) in this case, but it doesn't cost much to be a bit
3301  * nicer: disabling DCC in the shader still leads to undefined results but
3302  * avoids the lockup.
3303  */
3304 static LLVMValueRef force_dcc_off(struct si_shader_context *ctx,
3305                                   LLVMValueRef rsrc)
3306 {
3307         if (ctx->screen->b.chip_class <= CIK) {
3308                 return rsrc;
3309         } else {
3310                 LLVMBuilderRef builder = ctx->gallivm.builder;
3311                 LLVMValueRef i32_6 = LLVMConstInt(ctx->i32, 6, 0);
3312                 LLVMValueRef i32_C = LLVMConstInt(ctx->i32, C_008F28_COMPRESSION_EN, 0);
3313                 LLVMValueRef tmp;
3314
3315                 tmp = LLVMBuildExtractElement(builder, rsrc, i32_6, "");
3316                 tmp = LLVMBuildAnd(builder, tmp, i32_C, "");
3317                 return LLVMBuildInsertElement(builder, rsrc, tmp, i32_6, "");
3318         }
3319 }
3320
3321 static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements)
3322 {
3323         return LLVMPointerType(LLVMArrayType(elem_type, num_elements),
3324                                CONST_ADDR_SPACE);
3325 }
3326
3327 static LLVMValueRef load_image_desc(struct si_shader_context *ctx,
3328                                     LLVMValueRef list, LLVMValueRef index,
3329                                     unsigned target)
3330 {
3331         LLVMBuilderRef builder = ctx->gallivm.builder;
3332
3333         if (target == TGSI_TEXTURE_BUFFER) {
3334                 index = LLVMBuildMul(builder, index,
3335                                      LLVMConstInt(ctx->i32, 2, 0), "");
3336                 index = LLVMBuildAdd(builder, index,
3337                                      ctx->i32_1, "");
3338                 list = LLVMBuildPointerCast(builder, list,
3339                                             const_array(ctx->v4i32, 0), "");
3340         }
3341
3342         return ac_build_indexed_load_const(&ctx->ac, list, index);
3343 }
3344
3345 /**
3346  * Load the resource descriptor for \p image.
3347  */
3348 static void
3349 image_fetch_rsrc(
3350         struct lp_build_tgsi_context *bld_base,
3351         const struct tgsi_full_src_register *image,
3352         bool is_store, unsigned target,
3353         LLVMValueRef *rsrc)
3354 {
3355         struct si_shader_context *ctx = si_shader_context(bld_base);
3356         LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn,
3357                                              ctx->param_images);
3358         LLVMValueRef index;
3359         bool dcc_off = is_store;
3360
3361         assert(image->Register.File == TGSI_FILE_IMAGE);
3362
3363         if (!image->Register.Indirect) {
3364                 const struct tgsi_shader_info *info = bld_base->info;
3365                 unsigned images_writemask = info->images_store |
3366                                             info->images_atomic;
3367
3368                 index = LLVMConstInt(ctx->i32, image->Register.Index, 0);
3369
3370                 if (images_writemask & (1 << image->Register.Index))
3371                         dcc_off = true;
3372         } else {
3373                 /* From the GL_ARB_shader_image_load_store extension spec:
3374                  *
3375                  *    If a shader performs an image load, store, or atomic
3376                  *    operation using an image variable declared as an array,
3377                  *    and if the index used to select an individual element is
3378                  *    negative or greater than or equal to the size of the
3379                  *    array, the results of the operation are undefined but may
3380                  *    not lead to termination.
3381                  */
3382                 index = get_bounded_indirect_index(ctx, &image->Indirect,
3383                                                    image->Register.Index,
3384                                                    SI_NUM_IMAGES);
3385         }
3386
3387         *rsrc = load_image_desc(ctx, rsrc_ptr, index, target);
3388         if (dcc_off && target != TGSI_TEXTURE_BUFFER)
3389                 *rsrc = force_dcc_off(ctx, *rsrc);
3390 }
3391
3392 static LLVMValueRef image_fetch_coords(
3393                 struct lp_build_tgsi_context *bld_base,
3394                 const struct tgsi_full_instruction *inst,
3395                 unsigned src, LLVMValueRef desc)
3396 {
3397         struct si_shader_context *ctx = si_shader_context(bld_base);
3398         struct gallivm_state *gallivm = &ctx->gallivm;
3399         LLVMBuilderRef builder = gallivm->builder;
3400         unsigned target = inst->Memory.Texture;
3401         unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
3402         LLVMValueRef coords[4];
3403         LLVMValueRef tmp;
3404         int chan;
3405
3406         for (chan = 0; chan < num_coords; ++chan) {
3407                 tmp = lp_build_emit_fetch(bld_base, inst, src, chan);
3408                 tmp = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
3409                 coords[chan] = tmp;
3410         }
3411
3412         if (ctx->screen->b.chip_class >= GFX9) {
3413                 /* 1D textures are allocated and used as 2D on GFX9. */
3414                 if (target == TGSI_TEXTURE_1D) {
3415                         coords[1] = ctx->i32_0;
3416                         num_coords++;
3417                 } else if (target == TGSI_TEXTURE_1D_ARRAY) {
3418                         coords[2] = coords[1];
3419                         coords[1] = ctx->i32_0;
3420                         num_coords++;
3421                 } else if (target == TGSI_TEXTURE_2D) {
3422                         /* The hw can't bind a slice of a 3D image as a 2D
3423                          * image, because it ignores BASE_ARRAY if the target
3424                          * is 3D. The workaround is to read BASE_ARRAY and set
3425                          * it as the 3rd address operand for all 2D images.
3426                          */
3427                         LLVMValueRef first_layer, const5, mask;
3428
3429                         const5 = LLVMConstInt(ctx->i32, 5, 0);
3430                         mask = LLVMConstInt(ctx->i32, S_008F24_BASE_ARRAY(~0), 0);
3431                         first_layer = LLVMBuildExtractElement(builder, desc, const5, "");
3432                         first_layer = LLVMBuildAnd(builder, first_layer, mask, "");
3433
3434                         coords[2] = first_layer;
3435                         num_coords++;
3436                 }
3437         }
3438
3439         if (num_coords == 1)
3440                 return coords[0];
3441
3442         if (num_coords == 3) {
3443                 /* LLVM has difficulties lowering 3-element vectors. */
3444                 coords[3] = bld_base->uint_bld.undef;
3445                 num_coords = 4;
3446         }
3447
3448         return lp_build_gather_values(gallivm, coords, num_coords);
3449 }
3450
3451 /**
3452  * Append the extra mode bits that are used by image load and store.
3453  */
3454 static void image_append_args(
3455                 struct si_shader_context *ctx,
3456                 struct lp_build_emit_data * emit_data,
3457                 unsigned target,
3458                 bool atomic,
3459                 bool force_glc)
3460 {
3461         const struct tgsi_full_instruction *inst = emit_data->inst;
3462         LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
3463         LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
3464         LLVMValueRef r128 = i1false;
3465         LLVMValueRef da = tgsi_is_array_image(target) ? i1true : i1false;
3466         LLVMValueRef glc =
3467                 force_glc ||
3468                 inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
3469                 i1true : i1false;
3470         LLVMValueRef slc = i1false;
3471         LLVMValueRef lwe = i1false;
3472
3473         if (atomic || (HAVE_LLVM <= 0x0309)) {
3474                 emit_data->args[emit_data->arg_count++] = r128;
3475                 emit_data->args[emit_data->arg_count++] = da;
3476                 if (!atomic) {
3477                         emit_data->args[emit_data->arg_count++] = glc;
3478                 }
3479                 emit_data->args[emit_data->arg_count++] = slc;
3480                 return;
3481         }
3482
3483         /* HAVE_LLVM >= 0x0400 */
3484         emit_data->args[emit_data->arg_count++] = glc;
3485         emit_data->args[emit_data->arg_count++] = slc;
3486         emit_data->args[emit_data->arg_count++] = lwe;
3487         emit_data->args[emit_data->arg_count++] = da;
3488 }
3489
3490 /**
3491  * Append the resource and indexing arguments for buffer intrinsics.
3492  *
3493  * \param rsrc the v4i32 buffer resource
3494  * \param index index into the buffer (stride-based)
3495  * \param offset byte offset into the buffer
3496  */
3497 static void buffer_append_args(
3498                 struct si_shader_context *ctx,
3499                 struct lp_build_emit_data *emit_data,
3500                 LLVMValueRef rsrc,
3501                 LLVMValueRef index,
3502                 LLVMValueRef offset,
3503                 bool atomic,
3504                 bool force_glc)
3505 {
3506         const struct tgsi_full_instruction *inst = emit_data->inst;
3507         LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
3508         LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
3509
3510         emit_data->args[emit_data->arg_count++] = rsrc;
3511         emit_data->args[emit_data->arg_count++] = index; /* vindex */
3512         emit_data->args[emit_data->arg_count++] = offset; /* voffset */
3513         if (!atomic) {
3514                 emit_data->args[emit_data->arg_count++] =
3515                         force_glc ||
3516                         inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
3517                         i1true : i1false; /* glc */
3518         }
3519         emit_data->args[emit_data->arg_count++] = i1false; /* slc */
3520 }
3521
3522 static void load_fetch_args(
3523                 struct lp_build_tgsi_context * bld_base,
3524                 struct lp_build_emit_data * emit_data)
3525 {
3526         struct si_shader_context *ctx = si_shader_context(bld_base);
3527         struct gallivm_state *gallivm = &ctx->gallivm;
3528         const struct tgsi_full_instruction * inst = emit_data->inst;
3529         unsigned target = inst->Memory.Texture;
3530         LLVMValueRef rsrc;
3531
3532         emit_data->dst_type = ctx->v4f32;
3533
3534         if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
3535                 LLVMBuilderRef builder = gallivm->builder;
3536                 LLVMValueRef offset;
3537                 LLVMValueRef tmp;
3538
3539                 rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);
3540
3541                 tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
3542                 offset = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
3543
3544                 buffer_append_args(ctx, emit_data, rsrc, ctx->i32_0,
3545                                    offset, false, false);
3546         } else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
3547                 LLVMValueRef coords;
3548
3549                 image_fetch_rsrc(bld_base, &inst->Src[0], false, target, &rsrc);
3550                 coords = image_fetch_coords(bld_base, inst, 1, rsrc);
3551
3552                 if (target == TGSI_TEXTURE_BUFFER) {
3553                         buffer_append_args(ctx, emit_data, rsrc, coords,
3554                                            ctx->i32_0, false, false);
3555                 } else {
3556                         emit_data->args[0] = coords;
3557                         emit_data->args[1] = rsrc;
3558                         emit_data->args[2] = LLVMConstInt(ctx->i32, 15, 0); /* dmask */
3559                         emit_data->arg_count = 3;
3560
3561                         image_append_args(ctx, emit_data, target, false, false);
3562                 }
3563         }
3564 }
3565
3566 static unsigned get_load_intr_attribs(bool readonly_memory)
3567 {
3568         /* READNONE means writes can't affect it, while READONLY means that
3569          * writes can affect it. */
3570         return readonly_memory && HAVE_LLVM >= 0x0400 ?
3571                                  LP_FUNC_ATTR_READNONE :
3572                                  LP_FUNC_ATTR_READONLY;
3573 }
3574
3575 static unsigned get_store_intr_attribs(bool writeonly_memory)
3576 {
3577         return writeonly_memory && HAVE_LLVM >= 0x0400 ?
3578                                   LP_FUNC_ATTR_INACCESSIBLE_MEM_ONLY :
3579                                   LP_FUNC_ATTR_WRITEONLY;
3580 }
3581
3582 static void load_emit_buffer(struct si_shader_context *ctx,
3583                              struct lp_build_emit_data *emit_data,
3584                              bool readonly_memory)
3585 {
3586         const struct tgsi_full_instruction *inst = emit_data->inst;
3587         struct gallivm_state *gallivm = &ctx->gallivm;
3588         LLVMBuilderRef builder = gallivm->builder;
3589         uint writemask = inst->Dst[0].Register.WriteMask;
3590         uint count = util_last_bit(writemask);
3591         const char *intrinsic_name;
3592         LLVMTypeRef dst_type;
3593
3594         switch (count) {
3595         case 1:
3596                 intrinsic_name = "llvm.amdgcn.buffer.load.f32";
3597                 dst_type = ctx->f32;
3598                 break;
3599         case 2:
3600                 intrinsic_name = "llvm.amdgcn.buffer.load.v2f32";
3601                 dst_type = LLVMVectorType(ctx->f32, 2);
3602                 break;
3603         default: // 3 & 4
3604                 intrinsic_name = "llvm.amdgcn.buffer.load.v4f32";
3605                 dst_type = ctx->v4f32;
3606                 count = 4;
3607         }
3608
3609         emit_data->output[emit_data->chan] = lp_build_intrinsic(
3610                         builder, intrinsic_name, dst_type,
3611                         emit_data->args, emit_data->arg_count,
3612                         get_load_intr_attribs(readonly_memory));
3613 }
3614
3615 static LLVMValueRef get_memory_ptr(struct si_shader_context *ctx,
3616                                    const struct tgsi_full_instruction *inst,
3617                                    LLVMTypeRef type, int arg)
3618 {
3619         struct gallivm_state *gallivm = &ctx->gallivm;
3620         LLVMBuilderRef builder = gallivm->builder;
3621         LLVMValueRef offset, ptr;
3622         int addr_space;
3623
3624         offset = lp_build_emit_fetch(&ctx->bld_base, inst, arg, 0);
3625         offset = LLVMBuildBitCast(builder, offset, ctx->i32, "");
3626
3627         ptr = ctx->shared_memory;
3628         ptr = LLVMBuildGEP(builder, ptr, &offset, 1, "");
3629         addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
3630         ptr = LLVMBuildBitCast(builder, ptr, LLVMPointerType(type, addr_space), "");
3631
3632         return ptr;
3633 }
3634
3635 static void load_emit_memory(
3636                 struct si_shader_context *ctx,
3637                 struct lp_build_emit_data *emit_data)
3638 {
3639         const struct tgsi_full_instruction *inst = emit_data->inst;
3640         struct gallivm_state *gallivm = &ctx->gallivm;
3641         LLVMBuilderRef builder = gallivm->builder;
3642         unsigned writemask = inst->Dst[0].Register.WriteMask;
3643         LLVMValueRef channels[4], ptr, derived_ptr, index;
3644         int chan;
3645
3646         ptr = get_memory_ptr(ctx, inst, ctx->f32, 1);
3647
3648         for (chan = 0; chan < 4; ++chan) {
3649                 if (!(writemask & (1 << chan))) {
3650                         channels[chan] = LLVMGetUndef(ctx->f32);
3651                         continue;
3652                 }
3653
3654                 index = LLVMConstInt(ctx->i32, chan, 0);
3655                 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
3656                 channels[chan] = LLVMBuildLoad(builder, derived_ptr, "");
3657         }
3658         emit_data->output[emit_data->chan] = lp_build_gather_values(gallivm, channels, 4);
3659 }
3660
3661 /**
3662  * Return true if the memory accessed by a LOAD or STORE instruction is
3663  * read-only or write-only, respectively.
3664  *
3665  * \param shader_buffers_reverse_access_mask
3666  *      For LOAD, set this to (store | atomic) slot usage in the shader.
3667  *      For STORE, set this to (load | atomic) slot usage in the shader.
3668  * \param images_reverse_access_mask  Same as above, but for images.
3669  */
3670 static bool is_oneway_access_only(const struct tgsi_full_instruction *inst,
3671                                   const struct tgsi_shader_info *info,
3672                                   unsigned shader_buffers_reverse_access_mask,
3673                                   unsigned images_reverse_access_mask)
3674 {
3675         /* RESTRICT means NOALIAS.
3676          * If there are no writes, we can assume the accessed memory is read-only.
3677          * If there are no reads, we can assume the accessed memory is write-only.
3678          */
3679         if (inst->Memory.Qualifier & TGSI_MEMORY_RESTRICT) {
3680                 unsigned reverse_access_mask;
3681
3682                 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
3683                         reverse_access_mask = shader_buffers_reverse_access_mask;
3684                 } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
3685                         reverse_access_mask = info->images_buffers &
3686                                               images_reverse_access_mask;
3687                 } else {
3688                         reverse_access_mask = ~info->images_buffers &
3689                                               images_reverse_access_mask;
3690                 }
3691
3692                 if (inst->Src[0].Register.Indirect) {
3693                         if (!reverse_access_mask)
3694                                 return true;
3695                 } else {
3696                         if (!(reverse_access_mask &
3697                               (1u << inst->Src[0].Register.Index)))
3698                                 return true;
3699                 }
3700         }
3701
3702         /* If there are no buffer writes (for both shader buffers & image
3703          * buffers), it implies that buffer memory is read-only.
3704          * If there are no buffer reads (for both shader buffers & image
3705          * buffers), it implies that buffer memory is write-only.
3706          *
3707          * Same for the case when there are no writes/reads for non-buffer
3708          * images.
3709          */
3710         if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
3711             (inst->Src[0].Register.File == TGSI_FILE_IMAGE &&
3712              inst->Memory.Texture == TGSI_TEXTURE_BUFFER)) {
3713                 if (!shader_buffers_reverse_access_mask &&
3714                     !(info->images_buffers & images_reverse_access_mask))
3715                         return true;
3716         } else {
3717                 if (!(~info->images_buffers & images_reverse_access_mask))
3718                         return true;
3719         }
3720         return false;
3721 }
3722
3723 static void load_emit(
3724                 const struct lp_build_tgsi_action *action,
3725                 struct lp_build_tgsi_context *bld_base,
3726                 struct lp_build_emit_data *emit_data)
3727 {
3728         struct si_shader_context *ctx = si_shader_context(bld_base);
3729         struct gallivm_state *gallivm = &ctx->gallivm;
3730         LLVMBuilderRef builder = gallivm->builder;
3731         const struct tgsi_full_instruction * inst = emit_data->inst;
3732         const struct tgsi_shader_info *info = &ctx->shader->selector->info;
3733         char intrinsic_name[64];
3734         bool readonly_memory = false;
3735
3736         if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
3737                 load_emit_memory(ctx, emit_data);
3738                 return;
3739         }
3740
3741         if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
3742                 emit_waitcnt(ctx, VM_CNT);
3743
3744         readonly_memory = !(inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE) &&
3745                           is_oneway_access_only(inst, info,
3746                                                 info->shader_buffers_store |
3747                                                 info->shader_buffers_atomic,
3748                                                 info->images_store |
3749                                                 info->images_atomic);
3750
3751         if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
3752                 load_emit_buffer(ctx, emit_data, readonly_memory);
3753                 return;
3754         }
3755
3756         if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
3757                 emit_data->output[emit_data->chan] =
3758                         lp_build_intrinsic(
3759                                 builder, "llvm.amdgcn.buffer.load.format.v4f32", emit_data->dst_type,
3760                                 emit_data->args, emit_data->arg_count,
3761                                 get_load_intr_attribs(readonly_memory));
3762         } else {
3763                 ac_get_image_intr_name("llvm.amdgcn.image.load",
3764                                        emit_data->dst_type,             /* vdata */
3765                                        LLVMTypeOf(emit_data->args[0]), /* coords */
3766                                        LLVMTypeOf(emit_data->args[1]), /* rsrc */
3767                                        intrinsic_name, sizeof(intrinsic_name));
3768
3769                 emit_data->output[emit_data->chan] =
3770                         lp_build_intrinsic(
3771                                 builder, intrinsic_name, emit_data->dst_type,
3772                                 emit_data->args, emit_data->arg_count,
3773                                 get_load_intr_attribs(readonly_memory));
3774         }
3775 }
3776
3777 static void store_fetch_args(
3778                 struct lp_build_tgsi_context * bld_base,
3779                 struct lp_build_emit_data * emit_data)
3780 {
3781         struct si_shader_context *ctx = si_shader_context(bld_base);
3782         struct gallivm_state *gallivm = &ctx->gallivm;
3783         LLVMBuilderRef builder = gallivm->builder;
3784         const struct tgsi_full_instruction * inst = emit_data->inst;
3785         struct tgsi_full_src_register memory;
3786         LLVMValueRef chans[4];
3787         LLVMValueRef data;
3788         LLVMValueRef rsrc;
3789         unsigned chan;
3790
3791         emit_data->dst_type = LLVMVoidTypeInContext(gallivm->context);
3792
3793         for (chan = 0; chan < 4; ++chan) {
3794                 chans[chan] = lp_build_emit_fetch(bld_base, inst, 1, chan);
3795         }
3796         data = lp_build_gather_values(gallivm, chans, 4);
3797
3798         emit_data->args[emit_data->arg_count++] = data;
3799
3800         memory = tgsi_full_src_register_from_dst(&inst->Dst[0]);
3801
3802         if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
3803                 LLVMValueRef offset;
3804                 LLVMValueRef tmp;
3805
3806                 rsrc = shader_buffer_fetch_rsrc(ctx, &memory);
3807
3808                 tmp = lp_build_emit_fetch(bld_base, inst, 0, 0);
3809                 offset = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
3810
3811                 buffer_append_args(ctx, emit_data, rsrc, ctx->i32_0,
3812                                    offset, false, false);
3813         } else if (inst->Dst[0].Register.File == TGSI_FILE_IMAGE) {
3814                 unsigned target = inst->Memory.Texture;
3815                 LLVMValueRef coords;
3816
3817                 /* 8bit/16bit TC L1 write corruption bug on SI.
3818                  * All store opcodes not aligned to a dword are affected.
3819                  *
3820                  * The only way to get unaligned stores in radeonsi is through
3821                  * shader images.
3822                  */
3823                 bool force_glc = ctx->screen->b.chip_class == SI;
3824
3825                 image_fetch_rsrc(bld_base, &memory, true, target, &rsrc);
3826                 coords = image_fetch_coords(bld_base, inst, 0, rsrc);
3827
3828                 if (target == TGSI_TEXTURE_BUFFER) {
3829                         buffer_append_args(ctx, emit_data, rsrc, coords,
3830                                            ctx->i32_0, false, force_glc);
3831                 } else {
3832                         emit_data->args[1] = coords;
3833                         emit_data->args[2] = rsrc;
3834                         emit_data->args[3] = LLVMConstInt(ctx->i32, 15, 0); /* dmask */
3835                         emit_data->arg_count = 4;
3836
3837                         image_append_args(ctx, emit_data, target, false, force_glc);
3838                 }
3839         }
3840 }
3841
3842 static void store_emit_buffer(
3843                 struct si_shader_context *ctx,
3844                 struct lp_build_emit_data *emit_data,
3845                 bool writeonly_memory)
3846 {
3847         const struct tgsi_full_instruction *inst = emit_data->inst;
3848         struct gallivm_state *gallivm = &ctx->gallivm;
3849         LLVMBuilderRef builder = gallivm->builder;
3850         LLVMValueRef base_data = emit_data->args[0];
3851         LLVMValueRef base_offset = emit_data->args[3];
3852         unsigned writemask = inst->Dst[0].Register.WriteMask;
3853
3854         while (writemask) {
3855                 int start, count;
3856                 const char *intrinsic_name;
3857                 LLVMValueRef data;
3858                 LLVMValueRef offset;
3859                 LLVMValueRef tmp;
3860
3861                 u_bit_scan_consecutive_range(&writemask, &start, &count);
3862
3863                 /* Due to an LLVM limitation, split 3-element writes
3864                  * into a 2-element and a 1-element write. */
3865                 if (count == 3) {
3866                         writemask |= 1 << (start + 2);
3867                         count = 2;
3868                 }
3869
3870                 if (count == 4) {
3871                         data = base_data;
3872                         intrinsic_name = "llvm.amdgcn.buffer.store.v4f32";
3873                 } else if (count == 2) {
3874                         LLVMTypeRef v2f32 = LLVMVectorType(ctx->f32, 2);
3875
3876                         tmp = LLVMBuildExtractElement(
3877                                 builder, base_data,
3878                                 LLVMConstInt(ctx->i32, start, 0), "");
3879                         data = LLVMBuildInsertElement(
3880                                 builder, LLVMGetUndef(v2f32), tmp,
3881                                 ctx->i32_0, "");
3882
3883                         tmp = LLVMBuildExtractElement(
3884                                 builder, base_data,
3885                                 LLVMConstInt(ctx->i32, start + 1, 0), "");
3886                         data = LLVMBuildInsertElement(
3887                                 builder, data, tmp, ctx->i32_1, "");
3888
3889                         intrinsic_name = "llvm.amdgcn.buffer.store.v2f32";
3890                 } else {
3891                         assert(count == 1);
3892                         data = LLVMBuildExtractElement(
3893                                 builder, base_data,
3894                                 LLVMConstInt(ctx->i32, start, 0), "");
3895                         intrinsic_name = "llvm.amdgcn.buffer.store.f32";
3896                 }
3897
3898                 offset = base_offset;
3899                 if (start != 0) {
3900                         offset = LLVMBuildAdd(
3901                                 builder, offset,
3902                                 LLVMConstInt(ctx->i32, start * 4, 0), "");
3903                 }
3904
3905                 emit_data->args[0] = data;
3906                 emit_data->args[3] = offset;
3907
3908                 lp_build_intrinsic(
3909                         builder, intrinsic_name, emit_data->dst_type,
3910                         emit_data->args, emit_data->arg_count,
3911                         get_store_intr_attribs(writeonly_memory));
3912         }
3913 }
3914
3915 static void store_emit_memory(
3916                 struct si_shader_context *ctx,
3917                 struct lp_build_emit_data *emit_data)
3918 {
3919         const struct tgsi_full_instruction *inst = emit_data->inst;
3920         struct gallivm_state *gallivm = &ctx->gallivm;
3921         LLVMBuilderRef builder = gallivm->builder;
3922         unsigned writemask = inst->Dst[0].Register.WriteMask;
3923         LLVMValueRef ptr, derived_ptr, data, index;
3924         int chan;
3925
3926         ptr = get_memory_ptr(ctx, inst, ctx->f32, 0);
3927
3928         for (chan = 0; chan < 4; ++chan) {
3929                 if (!(writemask & (1 << chan))) {
3930                         continue;
3931                 }
3932                 data = lp_build_emit_fetch(&ctx->bld_base, inst, 1, chan);
3933                 index = LLVMConstInt(ctx->i32, chan, 0);
3934                 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
3935                 LLVMBuildStore(builder, data, derived_ptr);
3936         }
3937 }
3938
3939 static void store_emit(
3940                 const struct lp_build_tgsi_action *action,
3941                 struct lp_build_tgsi_context *bld_base,
3942                 struct lp_build_emit_data *emit_data)
3943 {
3944         struct si_shader_context *ctx = si_shader_context(bld_base);
3945         struct gallivm_state *gallivm = &ctx->gallivm;
3946         LLVMBuilderRef builder = gallivm->builder;
3947         const struct tgsi_full_instruction * inst = emit_data->inst;
3948         const struct tgsi_shader_info *info = &ctx->shader->selector->info;
3949         unsigned target = inst->Memory.Texture;
3950         char intrinsic_name[64];
3951         bool writeonly_memory = false;
3952
3953         if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY) {
3954                 store_emit_memory(ctx, emit_data);
3955                 return;
3956         }
3957
3958         if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
3959                 emit_waitcnt(ctx, VM_CNT);
3960
3961         writeonly_memory = is_oneway_access_only(inst, info,
3962                                                  info->shader_buffers_load |
3963                                                  info->shader_buffers_atomic,
3964                                                  info->images_load |
3965                                                  info->images_atomic);
3966
3967         if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
3968                 store_emit_buffer(ctx, emit_data, writeonly_memory);
3969                 return;
3970         }
3971
3972         if (target == TGSI_TEXTURE_BUFFER) {
3973                 emit_data->output[emit_data->chan] = lp_build_intrinsic(
3974                         builder, "llvm.amdgcn.buffer.store.format.v4f32",
3975                         emit_data->dst_type, emit_data->args,
3976                         emit_data->arg_count,
3977                         get_store_intr_attribs(writeonly_memory));
3978         } else {
3979                 ac_get_image_intr_name("llvm.amdgcn.image.store",
3980                                        LLVMTypeOf(emit_data->args[0]), /* vdata */
3981                                        LLVMTypeOf(emit_data->args[1]), /* coords */
3982                                        LLVMTypeOf(emit_data->args[2]), /* rsrc */
3983                                        intrinsic_name, sizeof(intrinsic_name));
3984
3985                 emit_data->output[emit_data->chan] =
3986                         lp_build_intrinsic(
3987                                 builder, intrinsic_name, emit_data->dst_type,
3988                                 emit_data->args, emit_data->arg_count,
3989                                 get_store_intr_attribs(writeonly_memory));
3990         }
3991 }
3992
3993 static void atomic_fetch_args(
3994                 struct lp_build_tgsi_context * bld_base,
3995                 struct lp_build_emit_data * emit_data)
3996 {
3997         struct si_shader_context *ctx = si_shader_context(bld_base);
3998         struct gallivm_state *gallivm = &ctx->gallivm;
3999         LLVMBuilderRef builder = gallivm->builder;
4000         const struct tgsi_full_instruction * inst = emit_data->inst;
4001         LLVMValueRef data1, data2;
4002         LLVMValueRef rsrc;
4003         LLVMValueRef tmp;
4004
4005         emit_data->dst_type = ctx->f32;
4006
4007         tmp = lp_build_emit_fetch(bld_base, inst, 2, 0);
4008         data1 = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
4009
4010         if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
4011                 tmp = lp_build_emit_fetch(bld_base, inst, 3, 0);
4012                 data2 = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
4013         }
4014
4015         /* llvm.amdgcn.image/buffer.atomic.cmpswap reflect the hardware order
4016          * of arguments, which is reversed relative to TGSI (and GLSL)
4017          */
4018         if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
4019                 emit_data->args[emit_data->arg_count++] = data2;
4020         emit_data->args[emit_data->arg_count++] = data1;
4021
4022         if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
4023                 LLVMValueRef offset;
4024
4025                 rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);
4026
4027                 tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
4028                 offset = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
4029
4030                 buffer_append_args(ctx, emit_data, rsrc, ctx->i32_0,
4031                                    offset, true, false);
4032         } else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
4033                 unsigned target = inst->Memory.Texture;
4034                 LLVMValueRef coords;
4035
4036                 image_fetch_rsrc(bld_base, &inst->Src[0], true, target, &rsrc);
4037                 coords = image_fetch_coords(bld_base, inst, 1, rsrc);
4038
4039                 if (target == TGSI_TEXTURE_BUFFER) {
4040                         buffer_append_args(ctx, emit_data, rsrc, coords,
4041                                            ctx->i32_0, true, false);
4042                 } else {
4043                         emit_data->args[emit_data->arg_count++] = coords;
4044                         emit_data->args[emit_data->arg_count++] = rsrc;
4045
4046                         image_append_args(ctx, emit_data, target, true, false);
4047                 }
4048         }
4049 }
4050
4051 static void atomic_emit_memory(struct si_shader_context *ctx,
4052                                struct lp_build_emit_data *emit_data) {
4053         struct gallivm_state *gallivm = &ctx->gallivm;
4054         LLVMBuilderRef builder = gallivm->builder;
4055         const struct tgsi_full_instruction * inst = emit_data->inst;
4056         LLVMValueRef ptr, result, arg;
4057
4058         ptr = get_memory_ptr(ctx, inst, ctx->i32, 1);
4059
4060         arg = lp_build_emit_fetch(&ctx->bld_base, inst, 2, 0);
4061         arg = LLVMBuildBitCast(builder, arg, ctx->i32, "");
4062
4063         if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
4064                 LLVMValueRef new_data;
4065                 new_data = lp_build_emit_fetch(&ctx->bld_base,
4066                                                inst, 3, 0);
4067
4068                 new_data = LLVMBuildBitCast(builder, new_data, ctx->i32, "");
4069
4070 #if HAVE_LLVM >= 0x309
4071                 result = LLVMBuildAtomicCmpXchg(builder, ptr, arg, new_data,
4072                                        LLVMAtomicOrderingSequentiallyConsistent,
4073                                        LLVMAtomicOrderingSequentiallyConsistent,
4074                                        false);
4075 #endif
4076
4077                 result = LLVMBuildExtractValue(builder, result, 0, "");
4078         } else {
4079                 LLVMAtomicRMWBinOp op;
4080
4081                 switch(inst->Instruction.Opcode) {
4082                         case TGSI_OPCODE_ATOMUADD:
4083                                 op = LLVMAtomicRMWBinOpAdd;
4084                                 break;
4085                         case TGSI_OPCODE_ATOMXCHG:
4086                                 op = LLVMAtomicRMWBinOpXchg;
4087                                 break;
4088                         case TGSI_OPCODE_ATOMAND:
4089                                 op = LLVMAtomicRMWBinOpAnd;
4090                                 break;
4091                         case TGSI_OPCODE_ATOMOR:
4092                                 op = LLVMAtomicRMWBinOpOr;
4093                                 break;
4094                         case TGSI_OPCODE_ATOMXOR:
4095                                 op = LLVMAtomicRMWBinOpXor;
4096                                 break;
4097                         case TGSI_OPCODE_ATOMUMIN:
4098                                 op = LLVMAtomicRMWBinOpUMin;
4099                                 break;
4100                         case TGSI_OPCODE_ATOMUMAX:
4101                                 op = LLVMAtomicRMWBinOpUMax;
4102                                 break;
4103                         case TGSI_OPCODE_ATOMIMIN:
4104                                 op = LLVMAtomicRMWBinOpMin;
4105                                 break;
4106                         case TGSI_OPCODE_ATOMIMAX:
4107                                 op = LLVMAtomicRMWBinOpMax;
4108                                 break;
4109                         default:
4110                                 unreachable("unknown atomic opcode");
4111                 }
4112
4113                 result = LLVMBuildAtomicRMW(builder, op, ptr, arg,
4114                                        LLVMAtomicOrderingSequentiallyConsistent,
4115                                        false);
4116         }
4117         emit_data->output[emit_data->chan] = LLVMBuildBitCast(builder, result, emit_data->dst_type, "");
4118 }
4119
4120 static void atomic_emit(
4121                 const struct lp_build_tgsi_action *action,
4122                 struct lp_build_tgsi_context *bld_base,
4123                 struct lp_build_emit_data *emit_data)
4124 {
4125         struct si_shader_context *ctx = si_shader_context(bld_base);
4126         struct gallivm_state *gallivm = &ctx->gallivm;
4127         LLVMBuilderRef builder = gallivm->builder;
4128         const struct tgsi_full_instruction * inst = emit_data->inst;
4129         char intrinsic_name[40];
4130         LLVMValueRef tmp;
4131
4132         if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
4133                 atomic_emit_memory(ctx, emit_data);
4134                 return;
4135         }
4136
4137         if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
4138             inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
4139                 snprintf(intrinsic_name, sizeof(intrinsic_name),
4140                          "llvm.amdgcn.buffer.atomic.%s", action->intr_name);
4141         } else {
4142                 LLVMValueRef coords;
4143                 char coords_type[8];
4144
4145                 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
4146                         coords = emit_data->args[2];
4147                 else
4148                         coords = emit_data->args[1];
4149
4150                 ac_build_type_name_for_intr(LLVMTypeOf(coords), coords_type, sizeof(coords_type));
4151                 snprintf(intrinsic_name, sizeof(intrinsic_name),
4152                          "llvm.amdgcn.image.atomic.%s.%s",
4153                          action->intr_name, coords_type);
4154         }
4155
4156         tmp = lp_build_intrinsic(
4157                 builder, intrinsic_name, ctx->i32,
4158                 emit_data->args, emit_data->arg_count, 0);
4159         emit_data->output[emit_data->chan] =
4160                 LLVMBuildBitCast(builder, tmp, ctx->f32, "");
4161 }
4162
4163 static void set_tex_fetch_args(struct si_shader_context *ctx,
4164                                struct lp_build_emit_data *emit_data,
4165                                unsigned target,
4166                                LLVMValueRef res_ptr, LLVMValueRef samp_ptr,
4167                                LLVMValueRef *param, unsigned count,
4168                                unsigned dmask)
4169 {
4170         struct gallivm_state *gallivm = &ctx->gallivm;
4171         struct ac_image_args args = {};
4172
4173         /* Pad to power of two vector */
4174         while (count < util_next_power_of_two(count))
4175                 param[count++] = LLVMGetUndef(ctx->i32);
4176
4177         if (count > 1)
4178                 args.addr = lp_build_gather_values(gallivm, param, count);
4179         else
4180                 args.addr = param[0];
4181
4182         args.resource = res_ptr;
4183         args.sampler = samp_ptr;
4184         args.dmask = dmask;
4185         args.unorm = target == TGSI_TEXTURE_RECT ||
4186                      target == TGSI_TEXTURE_SHADOWRECT;
4187         args.da = tgsi_is_array_sampler(target);
4188
4189         /* Ugly, but we seem to have no other choice right now. */
4190         STATIC_ASSERT(sizeof(args) <= sizeof(emit_data->args));
4191         memcpy(emit_data->args, &args, sizeof(args));
4192 }
4193
4194 static LLVMValueRef fix_resinfo(struct si_shader_context *ctx,
4195                                 unsigned target, LLVMValueRef out)
4196 {
4197         LLVMBuilderRef builder = ctx->gallivm.builder;
4198
4199         /* 1D textures are allocated and used as 2D on GFX9. */
4200         if (ctx->screen->b.chip_class >= GFX9 &&
4201             (target == TGSI_TEXTURE_1D_ARRAY ||
4202              target == TGSI_TEXTURE_SHADOW1D_ARRAY)) {
4203                 LLVMValueRef layers =
4204                         LLVMBuildExtractElement(builder, out,
4205                                                 LLVMConstInt(ctx->i32, 2, 0), "");
4206                 out = LLVMBuildInsertElement(builder, out, layers,
4207                                              ctx->i32_1, "");
4208         }
4209
4210         /* Divide the number of layers by 6 to get the number of cubes. */
4211         if (target == TGSI_TEXTURE_CUBE_ARRAY ||
4212             target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
4213                 LLVMValueRef imm2 = LLVMConstInt(ctx->i32, 2, 0);
4214
4215                 LLVMValueRef z = LLVMBuildExtractElement(builder, out, imm2, "");
4216                 z = LLVMBuildSDiv(builder, z, LLVMConstInt(ctx->i32, 6, 0), "");
4217
4218                 out = LLVMBuildInsertElement(builder, out, z, imm2, "");
4219         }
4220         return out;
4221 }
4222
4223 static void resq_fetch_args(
4224                 struct lp_build_tgsi_context * bld_base,
4225                 struct lp_build_emit_data * emit_data)
4226 {
4227         struct si_shader_context *ctx = si_shader_context(bld_base);
4228         const struct tgsi_full_instruction *inst = emit_data->inst;
4229         const struct tgsi_full_src_register *reg = &inst->Src[0];
4230
4231         emit_data->dst_type = ctx->v4i32;
4232
4233         if (reg->Register.File == TGSI_FILE_BUFFER) {
4234                 emit_data->args[0] = shader_buffer_fetch_rsrc(ctx, reg);
4235                 emit_data->arg_count = 1;
4236         } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
4237                 image_fetch_rsrc(bld_base, reg, false, inst->Memory.Texture,
4238                                  &emit_data->args[0]);
4239                 emit_data->arg_count = 1;
4240         } else {
4241                 LLVMValueRef res_ptr;
4242                 unsigned image_target;
4243
4244                 if (inst->Memory.Texture == TGSI_TEXTURE_3D)
4245                         image_target = TGSI_TEXTURE_2D_ARRAY;
4246                 else
4247                         image_target = inst->Memory.Texture;
4248
4249                 image_fetch_rsrc(bld_base, reg, false, inst->Memory.Texture,
4250                                  &res_ptr);
4251                 set_tex_fetch_args(ctx, emit_data, image_target,
4252                                    res_ptr, NULL, &ctx->i32_0, 1,
4253                                    0xf);
4254         }
4255 }
4256
4257 static void resq_emit(
4258                 const struct lp_build_tgsi_action *action,
4259                 struct lp_build_tgsi_context *bld_base,
4260                 struct lp_build_emit_data *emit_data)
4261 {
4262         struct si_shader_context *ctx = si_shader_context(bld_base);
4263         struct gallivm_state *gallivm = &ctx->gallivm;
4264         LLVMBuilderRef builder = gallivm->builder;
4265         const struct tgsi_full_instruction *inst = emit_data->inst;
4266         LLVMValueRef out;
4267
4268         if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
4269                 out = LLVMBuildExtractElement(builder, emit_data->args[0],
4270                                               LLVMConstInt(ctx->i32, 2, 0), "");
4271         } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
4272                 out = get_buffer_size(bld_base, emit_data->args[0]);
4273         } else {
4274                 struct ac_image_args args;
4275
4276                 memcpy(&args, emit_data->args, sizeof(args)); /* ugly */
4277                 args.opcode = ac_image_get_resinfo;
4278                 out = ac_build_image_opcode(&ctx->ac, &args);
4279
4280                 out = fix_resinfo(ctx, inst->Memory.Texture, out);
4281         }
4282
4283         emit_data->output[emit_data->chan] = out;
4284 }
4285
/* Forward declaration: tex_fetch_args() passes &tex_action to
 * build_tex_intrinsic() when it reads FMASK.
 * NOTE(review): the initialized definition is presumably further down in
 * this file - confirm.
 */
static const struct lp_build_tgsi_action tex_action;
4287
/* Selects which descriptor load_sampler_desc() reads for a given index. */
enum desc_type {
        DESC_IMAGE,   /* image view, at [0:7] */
        DESC_BUFFER,  /* buffer view, in [4:7] */
        DESC_FMASK,   /* FMASK view, at [8:15] */
        DESC_SAMPLER, /* sampler state, at [12:15] */
};
4294
4295 /**
4296  * Load an image view, fmask view. or sampler state descriptor.
4297  */
4298 static LLVMValueRef load_sampler_desc(struct si_shader_context *ctx,
4299                                       LLVMValueRef list, LLVMValueRef index,
4300                                       enum desc_type type)
4301 {
4302         struct gallivm_state *gallivm = &ctx->gallivm;
4303         LLVMBuilderRef builder = gallivm->builder;
4304
4305         switch (type) {
4306         case DESC_IMAGE:
4307                 /* The image is at [0:7]. */
4308                 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
4309                 break;
4310         case DESC_BUFFER:
4311                 /* The buffer is in [4:7]. */
4312                 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
4313                 index = LLVMBuildAdd(builder, index, ctx->i32_1, "");
4314                 list = LLVMBuildPointerCast(builder, list,
4315                                             const_array(ctx->v4i32, 0), "");
4316                 break;
4317         case DESC_FMASK:
4318                 /* The FMASK is at [8:15]. */
4319                 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
4320                 index = LLVMBuildAdd(builder, index, ctx->i32_1, "");
4321                 break;
4322         case DESC_SAMPLER:
4323                 /* The sampler state is at [12:15]. */
4324                 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
4325                 index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 3, 0), "");
4326                 list = LLVMBuildPointerCast(builder, list,
4327                                             const_array(ctx->v4i32, 0), "");
4328                 break;
4329         }
4330
4331         return ac_build_indexed_load_const(&ctx->ac, list, index);
4332 }
4333
4334 /* Disable anisotropic filtering if BASE_LEVEL == LAST_LEVEL.
4335  *
4336  * SI-CI:
4337  *   If BASE_LEVEL == LAST_LEVEL, the shader must disable anisotropic
4338  *   filtering manually. The driver sets img7 to a mask clearing
4339  *   MAX_ANISO_RATIO if BASE_LEVEL == LAST_LEVEL. The shader must do:
4340  *     s_and_b32 samp0, samp0, img7
4341  *
4342  * VI:
4343  *   The ANISO_OVERRIDE sampler field enables this fix in TA.
4344  */
4345 static LLVMValueRef sici_fix_sampler_aniso(struct si_shader_context *ctx,
4346                                            LLVMValueRef res, LLVMValueRef samp)
4347 {
4348         LLVMBuilderRef builder = ctx->gallivm.builder;
4349         LLVMValueRef img7, samp0;
4350
4351         if (ctx->screen->b.chip_class >= VI)
4352                 return samp;
4353
4354         img7 = LLVMBuildExtractElement(builder, res,
4355                                        LLVMConstInt(ctx->i32, 7, 0), "");
4356         samp0 = LLVMBuildExtractElement(builder, samp,
4357                                         ctx->i32_0, "");
4358         samp0 = LLVMBuildAnd(builder, samp0, img7, "");
4359         return LLVMBuildInsertElement(builder, samp, samp0,
4360                                       ctx->i32_0, "");
4361 }
4362
4363 static void tex_fetch_ptrs(
4364         struct lp_build_tgsi_context *bld_base,
4365         struct lp_build_emit_data *emit_data,
4366         LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr, LLVMValueRef *fmask_ptr)
4367 {
4368         struct si_shader_context *ctx = si_shader_context(bld_base);
4369         LLVMValueRef list = LLVMGetParam(ctx->main_fn, ctx->param_samplers);
4370         const struct tgsi_full_instruction *inst = emit_data->inst;
4371         const struct tgsi_full_src_register *reg;
4372         unsigned target = inst->Texture.Texture;
4373         unsigned sampler_src;
4374         LLVMValueRef index;
4375
4376         sampler_src = emit_data->inst->Instruction.NumSrcRegs - 1;
4377         reg = &emit_data->inst->Src[sampler_src];
4378
4379         if (reg->Register.Indirect) {
4380                 index = get_bounded_indirect_index(ctx,
4381                                                    &reg->Indirect,
4382                                                    reg->Register.Index,
4383                                                    SI_NUM_SAMPLERS);
4384         } else {
4385                 index = LLVMConstInt(ctx->i32, reg->Register.Index, 0);
4386         }
4387
4388         if (target == TGSI_TEXTURE_BUFFER)
4389                 *res_ptr = load_sampler_desc(ctx, list, index, DESC_BUFFER);
4390         else
4391                 *res_ptr = load_sampler_desc(ctx, list, index, DESC_IMAGE);
4392
4393         if (samp_ptr)
4394                 *samp_ptr = NULL;
4395         if (fmask_ptr)
4396                 *fmask_ptr = NULL;
4397
4398         if (target == TGSI_TEXTURE_2D_MSAA ||
4399             target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
4400                 if (fmask_ptr)
4401                         *fmask_ptr = load_sampler_desc(ctx, list, index,
4402                                                        DESC_FMASK);
4403         } else if (target != TGSI_TEXTURE_BUFFER) {
4404                 if (samp_ptr) {
4405                         *samp_ptr = load_sampler_desc(ctx, list, index,
4406                                                       DESC_SAMPLER);
4407                         *samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr);
4408                 }
4409         }
4410 }
4411
4412 static void txq_fetch_args(
4413         struct lp_build_tgsi_context *bld_base,
4414         struct lp_build_emit_data *emit_data)
4415 {
4416         struct si_shader_context *ctx = si_shader_context(bld_base);
4417         const struct tgsi_full_instruction *inst = emit_data->inst;
4418         unsigned target = inst->Texture.Texture;
4419         LLVMValueRef res_ptr;
4420         LLVMValueRef address;
4421
4422         tex_fetch_ptrs(bld_base, emit_data, &res_ptr, NULL, NULL);
4423
4424         if (target == TGSI_TEXTURE_BUFFER) {
4425                 /* Read the size from the buffer descriptor directly. */
4426                 emit_data->args[0] = get_buffer_size(bld_base, res_ptr);
4427                 return;
4428         }
4429
4430         /* Textures - set the mip level. */
4431         address = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_X);
4432
4433         set_tex_fetch_args(ctx, emit_data, target, res_ptr,
4434                            NULL, &address, 1, 0xf);
4435 }
4436
4437 static void txq_emit(const struct lp_build_tgsi_action *action,
4438                      struct lp_build_tgsi_context *bld_base,
4439                      struct lp_build_emit_data *emit_data)
4440 {
4441         struct si_shader_context *ctx = si_shader_context(bld_base);
4442         struct ac_image_args args;
4443         unsigned target = emit_data->inst->Texture.Texture;
4444
4445         if (target == TGSI_TEXTURE_BUFFER) {
4446                 /* Just return the buffer size. */
4447                 emit_data->output[emit_data->chan] = emit_data->args[0];
4448                 return;
4449         }
4450
4451         memcpy(&args, emit_data->args, sizeof(args)); /* ugly */
4452
4453         args.opcode = ac_image_get_resinfo;
4454         LLVMValueRef result = ac_build_image_opcode(&ctx->ac, &args);
4455
4456         emit_data->output[emit_data->chan] = fix_resinfo(ctx, target, result);
4457 }
4458
4459 static void tex_fetch_args(
4460         struct lp_build_tgsi_context *bld_base,
4461         struct lp_build_emit_data *emit_data)
4462 {
4463         struct si_shader_context *ctx = si_shader_context(bld_base);
4464         struct gallivm_state *gallivm = &ctx->gallivm;
4465         const struct tgsi_full_instruction *inst = emit_data->inst;
4466         unsigned opcode = inst->Instruction.Opcode;
4467         unsigned target = inst->Texture.Texture;
4468         LLVMValueRef coords[5], derivs[6];
4469         LLVMValueRef address[16];
4470         unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
4471         int ref_pos = tgsi_util_get_shadow_ref_src_index(target);
4472         unsigned count = 0;
4473         unsigned chan;
4474         unsigned num_deriv_channels = 0;
4475         bool has_offset = inst->Texture.NumOffsets > 0;
4476         LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
4477         unsigned dmask = 0xf;
4478
4479         tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
4480
4481         if (target == TGSI_TEXTURE_BUFFER) {
4482                 emit_data->dst_type = ctx->v4f32;
4483                 emit_data->args[0] = LLVMBuildBitCast(gallivm->builder, res_ptr,
4484                                                       ctx->v16i8, "");
4485                 emit_data->args[1] = ctx->i32_0;
4486                 emit_data->args[2] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
4487                 emit_data->arg_count = 3;
4488                 return;
4489         }
4490
4491         /* Fetch and project texture coordinates */
4492         coords[3] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_W);
4493         for (chan = 0; chan < 3; chan++ ) {
4494                 coords[chan] = lp_build_emit_fetch(bld_base,
4495                                                    emit_data->inst, 0,
4496                                                    chan);
4497                 if (opcode == TGSI_OPCODE_TXP)
4498                         coords[chan] = lp_build_emit_llvm_binary(bld_base,
4499                                                                  TGSI_OPCODE_DIV,
4500                                                                  coords[chan],
4501                                                                  coords[3]);
4502         }
4503
4504         if (opcode == TGSI_OPCODE_TXP)
4505                 coords[3] = bld_base->base.one;
4506
4507         /* Pack offsets. */
4508         if (has_offset &&
4509             opcode != TGSI_OPCODE_TXF &&
4510             opcode != TGSI_OPCODE_TXF_LZ) {
4511                 /* The offsets are six-bit signed integers packed like this:
4512                  *   X=[5:0], Y=[13:8], and Z=[21:16].
4513                  */
4514                 LLVMValueRef offset[3], pack;
4515
4516                 assert(inst->Texture.NumOffsets == 1);
4517
4518                 for (chan = 0; chan < 3; chan++) {
4519                         offset[chan] = lp_build_emit_fetch_texoffset(bld_base,
4520                                                                      emit_data->inst, 0, chan);
4521                         offset[chan] = LLVMBuildAnd(gallivm->builder, offset[chan],
4522                                                     LLVMConstInt(ctx->i32, 0x3f, 0), "");
4523                         if (chan)
4524                                 offset[chan] = LLVMBuildShl(gallivm->builder, offset[chan],
4525                                                             LLVMConstInt(ctx->i32, chan*8, 0), "");
4526                 }
4527
4528                 pack = LLVMBuildOr(gallivm->builder, offset[0], offset[1], "");
4529                 pack = LLVMBuildOr(gallivm->builder, pack, offset[2], "");
4530                 address[count++] = pack;
4531         }
4532
4533         /* Pack LOD bias value */
4534         if (opcode == TGSI_OPCODE_TXB)
4535                 address[count++] = coords[3];
4536         if (opcode == TGSI_OPCODE_TXB2)
4537                 address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4538
4539         /* Pack depth comparison value */
4540         if (tgsi_is_shadow_target(target) && opcode != TGSI_OPCODE_LODQ) {
4541                 LLVMValueRef z;
4542
4543                 if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
4544                         z = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4545                 } else {
4546                         assert(ref_pos >= 0);
4547                         z = coords[ref_pos];
4548                 }
4549
4550                 /* TC-compatible HTILE promotes Z16 and Z24 to Z32_FLOAT,
4551                  * so the depth comparison value isn't clamped for Z16 and
4552                  * Z24 anymore. Do it manually here.
4553                  *
4554                  * It's unnecessary if the original texture format was
4555                  * Z32_FLOAT, but we don't know that here.
4556                  */
4557                 if (ctx->screen->b.chip_class == VI)
4558                         z = ac_build_clamp(&ctx->ac, z);
4559
4560                 address[count++] = z;
4561         }
4562
4563         /* Pack user derivatives */
4564         if (opcode == TGSI_OPCODE_TXD) {
4565                 int param, num_src_deriv_channels, num_dst_deriv_channels;
4566
4567                 switch (target) {
4568                 case TGSI_TEXTURE_3D:
4569                         num_src_deriv_channels = 3;
4570                         num_dst_deriv_channels = 3;
4571                         num_deriv_channels = 3;
4572                         break;
4573                 case TGSI_TEXTURE_2D:
4574                 case TGSI_TEXTURE_SHADOW2D:
4575                 case TGSI_TEXTURE_RECT:
4576                 case TGSI_TEXTURE_SHADOWRECT:
4577                 case TGSI_TEXTURE_2D_ARRAY:
4578                 case TGSI_TEXTURE_SHADOW2D_ARRAY:
4579                         num_src_deriv_channels = 2;
4580                         num_dst_deriv_channels = 2;
4581                         num_deriv_channels = 2;
4582                         break;
4583                 case TGSI_TEXTURE_CUBE:
4584                 case TGSI_TEXTURE_SHADOWCUBE:
4585                 case TGSI_TEXTURE_CUBE_ARRAY:
4586                 case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
4587                         /* Cube derivatives will be converted to 2D. */
4588                         num_src_deriv_channels = 3;
4589                         num_dst_deriv_channels = 3;
4590                         num_deriv_channels = 2;
4591                         break;
4592                 case TGSI_TEXTURE_1D:
4593                 case TGSI_TEXTURE_SHADOW1D:
4594                 case TGSI_TEXTURE_1D_ARRAY:
4595                 case TGSI_TEXTURE_SHADOW1D_ARRAY:
4596                         num_src_deriv_channels = 1;
4597
4598                         /* 1D textures are allocated and used as 2D on GFX9. */
4599                         if (ctx->screen->b.chip_class >= GFX9) {
4600                                 num_dst_deriv_channels = 2;
4601                                 num_deriv_channels = 2;
4602                         } else {
4603                                 num_dst_deriv_channels = 1;
4604                                 num_deriv_channels = 1;
4605                         }
4606                         break;
4607                 default:
4608                         unreachable("invalid target");
4609                 }
4610
4611                 for (param = 0; param < 2; param++) {
4612                         for (chan = 0; chan < num_src_deriv_channels; chan++)
4613                                 derivs[param * num_dst_deriv_channels + chan] =
4614                                         lp_build_emit_fetch(bld_base, inst, param+1, chan);
4615
4616                         /* Fill in the rest with zeros. */
4617                         for (chan = num_src_deriv_channels;
4618                              chan < num_dst_deriv_channels; chan++)
4619                                 derivs[param * num_dst_deriv_channels + chan] =
4620                                         bld_base->base.zero;
4621                 }
4622         }
4623
4624         if (target == TGSI_TEXTURE_CUBE ||
4625             target == TGSI_TEXTURE_CUBE_ARRAY ||
4626             target == TGSI_TEXTURE_SHADOWCUBE ||
4627             target == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
4628                 ac_prepare_cube_coords(&ctx->ac,
4629                                        opcode == TGSI_OPCODE_TXD,
4630                                        target == TGSI_TEXTURE_CUBE_ARRAY ||
4631                                        target == TGSI_TEXTURE_SHADOWCUBE_ARRAY,
4632                                        coords, derivs);
4633
4634         if (opcode == TGSI_OPCODE_TXD)
4635                 for (int i = 0; i < num_deriv_channels * 2; i++)
4636                         address[count++] = derivs[i];
4637
4638         /* Pack texture coordinates */
4639         address[count++] = coords[0];
4640         if (num_coords > 1)
4641                 address[count++] = coords[1];
4642         if (num_coords > 2)
4643                 address[count++] = coords[2];
4644
4645         /* 1D textures are allocated and used as 2D on GFX9. */
4646         if (ctx->screen->b.chip_class >= GFX9) {
4647                 LLVMValueRef filler;
4648
4649                 /* Use 0.5, so that we don't sample the border color. */
4650                 if (opcode == TGSI_OPCODE_TXF)
4651                         filler = ctx->i32_0;
4652                 else
4653                         filler = LLVMConstReal(ctx->f32, 0.5);
4654
4655                 if (target == TGSI_TEXTURE_1D ||
4656                     target == TGSI_TEXTURE_SHADOW1D) {
4657                         address[count++] = filler;
4658                 } else if (target == TGSI_TEXTURE_1D_ARRAY ||
4659                            target == TGSI_TEXTURE_SHADOW1D_ARRAY) {
4660                         address[count] = address[count - 1];
4661                         address[count - 1] = filler;
4662                         count++;
4663                 }
4664         }
4665
4666         /* Pack LOD or sample index */
4667         if (opcode == TGSI_OPCODE_TXL || opcode == TGSI_OPCODE_TXF)
4668                 address[count++] = coords[3];
4669         else if (opcode == TGSI_OPCODE_TXL2)
4670                 address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4671
4672         if (count > 16) {
4673                 assert(!"Cannot handle more than 16 texture address parameters");
4674                 count = 16;
4675         }
4676
4677         for (chan = 0; chan < count; chan++ ) {
4678                 address[chan] = LLVMBuildBitCast(gallivm->builder,
4679                                                  address[chan], ctx->i32, "");
4680         }
4681
4682         /* Adjust the sample index according to FMASK.
4683          *
4684          * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
4685          * which is the identity mapping. Each nibble says which physical sample
4686          * should be fetched to get that sample.
4687          *
4688          * For example, 0x11111100 means there are only 2 samples stored and
4689          * the second sample covers 3/4 of the pixel. When reading samples 0
4690          * and 1, return physical sample 0 (determined by the first two 0s
4691          * in FMASK), otherwise return physical sample 1.
4692          *
4693          * The sample index should be adjusted as follows:
4694          *   sample_index = (fmask >> (sample_index * 4)) & 0xF;
4695          */
4696         if (target == TGSI_TEXTURE_2D_MSAA ||
4697             target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
4698                 struct lp_build_emit_data txf_emit_data = *emit_data;
4699                 LLVMValueRef txf_address[4];
4700                 /* We only need .xy for non-arrays, and .xyz for arrays. */
4701                 unsigned txf_count = target == TGSI_TEXTURE_2D_MSAA ? 2 : 3;
4702                 struct tgsi_full_instruction inst = {};
4703
4704                 memcpy(txf_address, address, sizeof(txf_address));
4705
4706                 /* Read FMASK using TXF_LZ. */
4707                 inst.Instruction.Opcode = TGSI_OPCODE_TXF_LZ;
4708                 inst.Texture.Texture = target;
4709                 txf_emit_data.inst = &inst;
4710                 txf_emit_data.chan = 0;
4711                 set_tex_fetch_args(ctx, &txf_emit_data,
4712                                    target, fmask_ptr, NULL,
4713                                    txf_address, txf_count, 0xf);
4714                 build_tex_intrinsic(&tex_action, bld_base, &txf_emit_data);
4715
4716                 /* Initialize some constants. */
4717                 LLVMValueRef four = LLVMConstInt(ctx->i32, 4, 0);
4718                 LLVMValueRef F = LLVMConstInt(ctx->i32, 0xF, 0);
4719
4720                 /* Apply the formula. */
4721                 LLVMValueRef fmask =
4722                         LLVMBuildExtractElement(gallivm->builder,
4723                                                 txf_emit_data.output[0],
4724                                                 ctx->i32_0, "");
4725
4726                 unsigned sample_chan = txf_count; /* the sample index is last */
4727
4728                 LLVMValueRef sample_index4 =
4729                         LLVMBuildMul(gallivm->builder, address[sample_chan], four, "");
4730
4731                 LLVMValueRef shifted_fmask =
4732                         LLVMBuildLShr(gallivm->builder, fmask, sample_index4, "");
4733
4734                 LLVMValueRef final_sample =
4735                         LLVMBuildAnd(gallivm->builder, shifted_fmask, F, "");
4736
4737                 /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
4738                  * resource descriptor is 0 (invalid),
4739                  */
4740                 LLVMValueRef fmask_desc =
4741                         LLVMBuildBitCast(gallivm->builder, fmask_ptr,
4742                                          ctx->v8i32, "");
4743
4744                 LLVMValueRef fmask_word1 =
4745                         LLVMBuildExtractElement(gallivm->builder, fmask_desc,
4746                                                 ctx->i32_1, "");
4747
4748                 LLVMValueRef word1_is_nonzero =
4749                         LLVMBuildICmp(gallivm->builder, LLVMIntNE,
4750                                       fmask_word1, ctx->i32_0, "");
4751
4752                 /* Replace the MSAA sample index. */
4753                 address[sample_chan] =
4754                         LLVMBuildSelect(gallivm->builder, word1_is_nonzero,
4755                                         final_sample, address[sample_chan], "");
4756         }
4757
4758         if (opcode == TGSI_OPCODE_TXF ||
4759             opcode == TGSI_OPCODE_TXF_LZ) {
4760                 /* add tex offsets */
4761                 if (inst->Texture.NumOffsets) {
4762                         struct lp_build_context *uint_bld = &bld_base->uint_bld;
4763                         const struct tgsi_texture_offset *off = inst->TexOffsets;
4764
4765                         assert(inst->Texture.NumOffsets == 1);
4766
4767                         switch (target) {
4768                         case TGSI_TEXTURE_3D:
4769                                 address[2] = lp_build_add(uint_bld, address[2],
4770                                                 ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleZ]);
4771                                 /* fall through */
4772                         case TGSI_TEXTURE_2D:
4773                         case TGSI_TEXTURE_SHADOW2D:
4774                         case TGSI_TEXTURE_RECT:
4775                         case TGSI_TEXTURE_SHADOWRECT:
4776                         case TGSI_TEXTURE_2D_ARRAY:
4777                         case TGSI_TEXTURE_SHADOW2D_ARRAY:
4778                                 address[1] =
4779                                         lp_build_add(uint_bld, address[1],
4780                                                 ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleY]);
4781                                 /* fall through */
4782                         case TGSI_TEXTURE_1D:
4783                         case TGSI_TEXTURE_SHADOW1D:
4784                         case TGSI_TEXTURE_1D_ARRAY:
4785                         case TGSI_TEXTURE_SHADOW1D_ARRAY:
4786                                 address[0] =
4787                                         lp_build_add(uint_bld, address[0],
4788                                                 ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleX]);
4789                                 break;
4790                                 /* texture offsets do not apply to other texture targets */
4791                         }
4792                 }
4793         }
4794
4795         if (opcode == TGSI_OPCODE_TG4) {
4796                 unsigned gather_comp = 0;
4797
4798                 /* DMASK was repurposed for GATHER4. 4 components are always
4799                  * returned and DMASK works like a swizzle - it selects
4800                  * the component to fetch. The only valid DMASK values are
4801                  * 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
4802                  * (red,red,red,red) etc.) The ISA document doesn't mention
4803                  * this.
4804                  */
4805
4806                 /* Get the component index from src1.x for Gather4. */
4807                 if (!tgsi_is_shadow_target(target)) {
4808                         LLVMValueRef comp_imm;
4809                         struct tgsi_src_register src1 = inst->Src[1].Register;
4810
4811                         assert(src1.File == TGSI_FILE_IMMEDIATE);
4812
4813                         comp_imm = ctx->imms[src1.Index * TGSI_NUM_CHANNELS + src1.SwizzleX];
4814                         gather_comp = LLVMConstIntGetZExtValue(comp_imm);
4815                         gather_comp = CLAMP(gather_comp, 0, 3);
4816                 }
4817
4818                 dmask = 1 << gather_comp;
4819         }
4820
4821         set_tex_fetch_args(ctx, emit_data, target, res_ptr,
4822                            samp_ptr, address, count, dmask);
4823 }
4824
4825 /* Gather4 should follow the same rules as bilinear filtering, but the hardware
4826  * incorrectly forces nearest filtering if the texture format is integer.
4827  * The only effect it has on Gather4, which always returns 4 texels for
4828  * bilinear filtering, is that the final coordinates are off by 0.5 of
4829  * the texel size.
4830  *
4831  * The workaround is to subtract 0.5 from the unnormalized coordinates,
4832  * or (0.5 / size) from the normalized coordinates.
4833  */
4834 static void si_lower_gather4_integer(struct si_shader_context *ctx,
4835                                      struct ac_image_args *args,
4836                                      unsigned target)
4837 {
4838         LLVMBuilderRef builder = ctx->gallivm.builder;
4839         LLVMValueRef coord = args->addr;
4840         LLVMValueRef half_texel[2];
4841         /* Texture coordinates start after:
4842          *   {offset, bias, z-compare, derivatives}
4843          * Only the offset and z-compare can occur here.
4844          */
4845         unsigned coord_vgpr_index = (int)args->offset + (int)args->compare;
4846         int c;
4847
4848         if (target == TGSI_TEXTURE_RECT ||
4849             target == TGSI_TEXTURE_SHADOWRECT) {
4850                 half_texel[0] = half_texel[1] = LLVMConstReal(ctx->f32, -0.5);
4851         } else {
4852                 struct tgsi_full_instruction txq_inst = {};
4853                 struct lp_build_emit_data txq_emit_data = {};
4854
4855                 /* Query the texture size. */
4856                 txq_inst.Texture.Texture = target;
4857                 txq_emit_data.inst = &txq_inst;
4858                 txq_emit_data.dst_type = ctx->v4i32;
4859                 set_tex_fetch_args(ctx, &txq_emit_data, target,
4860                                    args->resource, NULL, &ctx->i32_0,
4861                                    1, 0xf);
4862                 txq_emit(NULL, &ctx->bld_base, &txq_emit_data);
4863
4864                 /* Compute -0.5 / size. */
4865                 for (c = 0; c < 2; c++) {
4866                         half_texel[c] =
4867                                 LLVMBuildExtractElement(builder, txq_emit_data.output[0],
4868                                                         LLVMConstInt(ctx->i32, c, 0), "");
4869                         half_texel[c] = LLVMBuildUIToFP(builder, half_texel[c], ctx->f32, "");
4870                         half_texel[c] =
4871                                 lp_build_emit_llvm_unary(&ctx->bld_base,
4872                                                          TGSI_OPCODE_RCP, half_texel[c]);
4873                         half_texel[c] = LLVMBuildFMul(builder, half_texel[c],
4874                                                       LLVMConstReal(ctx->f32, -0.5), "");
4875                 }
4876         }
4877
4878         for (c = 0; c < 2; c++) {
4879                 LLVMValueRef tmp;
4880                 LLVMValueRef index = LLVMConstInt(ctx->i32, coord_vgpr_index + c, 0);
4881
4882                 tmp = LLVMBuildExtractElement(builder, coord, index, "");
4883                 tmp = LLVMBuildBitCast(builder, tmp, ctx->f32, "");
4884                 tmp = LLVMBuildFAdd(builder, tmp, half_texel[c], "");
4885                 tmp = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
4886                 coord = LLVMBuildInsertElement(builder, coord, tmp, index, "");
4887         }
4888
4889         args->addr = coord;
4890 }
4891
4892 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
4893                                 struct lp_build_tgsi_context *bld_base,
4894                                 struct lp_build_emit_data *emit_data)
4895 {
4896         struct si_shader_context *ctx = si_shader_context(bld_base);
4897         const struct tgsi_full_instruction *inst = emit_data->inst;
4898         struct ac_image_args args;
4899         unsigned opcode = inst->Instruction.Opcode;
4900         unsigned target = inst->Texture.Texture;
4901
4902         if (target == TGSI_TEXTURE_BUFFER) {
4903                 emit_data->output[emit_data->chan] =
4904                         ac_build_buffer_load_format(&ctx->ac,
4905                                                     emit_data->args[0],
4906                                                     emit_data->args[2],
4907                                                     emit_data->args[1],
4908                                                     true);
4909                 return;
4910         }
4911
4912         memcpy(&args, emit_data->args, sizeof(args)); /* ugly */
4913
4914         args.opcode = ac_image_sample;
4915         args.compare = tgsi_is_shadow_target(target);
4916         args.offset = inst->Texture.NumOffsets > 0;
4917
4918         switch (opcode) {
4919         case TGSI_OPCODE_TXF:
4920         case TGSI_OPCODE_TXF_LZ:
4921                 args.opcode = opcode == TGSI_OPCODE_TXF_LZ ||
4922                               target == TGSI_TEXTURE_2D_MSAA ||
4923                               target == TGSI_TEXTURE_2D_ARRAY_MSAA ?
4924                                       ac_image_load : ac_image_load_mip;
4925                 args.compare = false;
4926                 args.offset = false;
4927                 break;
4928         case TGSI_OPCODE_LODQ:
4929                 args.opcode = ac_image_get_lod;
4930                 args.compare = false;
4931                 args.offset = false;
4932                 break;
4933         case TGSI_OPCODE_TEX:
4934         case TGSI_OPCODE_TEX2:
4935         case TGSI_OPCODE_TXP:
4936                 if (ctx->type != PIPE_SHADER_FRAGMENT)
4937                         args.level_zero = true;
4938                 break;
4939         case TGSI_OPCODE_TEX_LZ:
4940                 args.level_zero = true;
4941                 break;
4942         case TGSI_OPCODE_TXB:
4943         case TGSI_OPCODE_TXB2:
4944                 assert(ctx->type == PIPE_SHADER_FRAGMENT);
4945                 args.bias = true;
4946                 break;
4947         case TGSI_OPCODE_TXL:
4948         case TGSI_OPCODE_TXL2:
4949                 args.lod = true;
4950                 break;
4951         case TGSI_OPCODE_TXD:
4952                 args.deriv = true;
4953                 break;
4954         case TGSI_OPCODE_TG4:
4955                 args.opcode = ac_image_gather4;
4956                 args.level_zero = true;
4957                 break;
4958         default:
4959                 assert(0);
4960                 return;
4961         }
4962
4963         /* The hardware needs special lowering for Gather4 with integer formats. */
4964         if (ctx->screen->b.chip_class <= VI &&
4965             opcode == TGSI_OPCODE_TG4) {
4966                 struct tgsi_shader_info *info = &ctx->shader->selector->info;
4967                 /* This will also work with non-constant indexing because of how
4968                  * glsl_to_tgsi works and we intent to preserve that behavior.
4969                  */
4970                 const unsigned src_idx = 2;
4971                 unsigned sampler = inst->Src[src_idx].Register.Index;
4972
4973                 assert(inst->Src[src_idx].Register.File == TGSI_FILE_SAMPLER);
4974
4975                 if (info->sampler_type[sampler] == TGSI_RETURN_TYPE_SINT ||
4976                     info->sampler_type[sampler] == TGSI_RETURN_TYPE_UINT)
4977                         si_lower_gather4_integer(ctx, &args, target);
4978         }
4979
4980         emit_data->output[emit_data->chan] =
4981                 ac_build_image_opcode(&ctx->ac, &args);
4982 }
4983
4984 static void si_llvm_emit_txqs(
4985         const struct lp_build_tgsi_action *action,
4986         struct lp_build_tgsi_context *bld_base,
4987         struct lp_build_emit_data *emit_data)
4988 {
4989         struct si_shader_context *ctx = si_shader_context(bld_base);
4990         struct gallivm_state *gallivm = &ctx->gallivm;
4991         LLVMBuilderRef builder = gallivm->builder;
4992         LLVMValueRef res, samples;
4993         LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
4994
4995         tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
4996
4997
4998         /* Read the samples from the descriptor directly. */
4999         res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, "");
5000         samples = LLVMBuildExtractElement(
5001                 builder, res,
5002                 LLVMConstInt(ctx->i32, 3, 0), "");
5003         samples = LLVMBuildLShr(builder, samples,
5004                                 LLVMConstInt(ctx->i32, 16, 0), "");
5005         samples = LLVMBuildAnd(builder, samples,
5006                                LLVMConstInt(ctx->i32, 0xf, 0), "");
5007         samples = LLVMBuildShl(builder, ctx->i32_1,
5008                                samples, "");
5009
5010         emit_data->output[emit_data->chan] = samples;
5011 }
5012
5013 static void si_llvm_emit_ddxy(
5014         const struct lp_build_tgsi_action *action,
5015         struct lp_build_tgsi_context *bld_base,
5016         struct lp_build_emit_data *emit_data)
5017 {
5018         struct si_shader_context *ctx = si_shader_context(bld_base);
5019         struct gallivm_state *gallivm = &ctx->gallivm;
5020         unsigned opcode = emit_data->info->opcode;
5021         LLVMValueRef val;
5022         int idx;
5023         unsigned mask;
5024
5025         if (opcode == TGSI_OPCODE_DDX_FINE)
5026                 mask = AC_TID_MASK_LEFT;
5027         else if (opcode == TGSI_OPCODE_DDY_FINE)
5028                 mask = AC_TID_MASK_TOP;
5029         else
5030                 mask = AC_TID_MASK_TOP_LEFT;
5031
5032         /* for DDX we want to next X pixel, DDY next Y pixel. */
5033         idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;
5034
5035         val = LLVMBuildBitCast(gallivm->builder, emit_data->args[0], ctx->i32, "");
5036         val = ac_build_ddxy(&ctx->ac, ctx->screen->has_ds_bpermute,
5037                             mask, idx, ctx->lds, val);
5038         emit_data->output[emit_data->chan] = val;
5039 }
5040
5041 /*
5042  * this takes an I,J coordinate pair,
5043  * and works out the X and Y derivatives.
5044  * it returns DDX(I), DDX(J), DDY(I), DDY(J).
5045  */
5046 static LLVMValueRef si_llvm_emit_ddxy_interp(
5047         struct lp_build_tgsi_context *bld_base,
5048         LLVMValueRef interp_ij)
5049 {
5050         struct si_shader_context *ctx = si_shader_context(bld_base);
5051         struct gallivm_state *gallivm = &ctx->gallivm;
5052         LLVMValueRef result[4], a;
5053         unsigned i;
5054
5055         for (i = 0; i < 2; i++) {
5056                 a = LLVMBuildExtractElement(gallivm->builder, interp_ij,
5057                                             LLVMConstInt(ctx->i32, i, 0), "");
5058                 result[i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDX, a);
5059                 result[2+i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDY, a);
5060         }
5061
5062         return lp_build_gather_values(gallivm, result, 4);
5063 }
5064
5065 static void interp_fetch_args(
5066         struct lp_build_tgsi_context *bld_base,
5067         struct lp_build_emit_data *emit_data)
5068 {
5069         struct si_shader_context *ctx = si_shader_context(bld_base);
5070         struct gallivm_state *gallivm = &ctx->gallivm;
5071         const struct tgsi_full_instruction *inst = emit_data->inst;
5072
5073         if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
5074                 /* offset is in second src, first two channels */
5075                 emit_data->args[0] = lp_build_emit_fetch(bld_base,
5076                                                          emit_data->inst, 1,
5077                                                          TGSI_CHAN_X);
5078                 emit_data->args[1] = lp_build_emit_fetch(bld_base,
5079                                                          emit_data->inst, 1,
5080                                                          TGSI_CHAN_Y);
5081                 emit_data->arg_count = 2;
5082         } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
5083                 LLVMValueRef sample_position;
5084                 LLVMValueRef sample_id;
5085                 LLVMValueRef halfval = LLVMConstReal(ctx->f32, 0.5f);
5086
5087                 /* fetch sample ID, then fetch its sample position,
5088                  * and place into first two channels.
5089                  */
5090                 sample_id = lp_build_emit_fetch(bld_base,
5091                                                 emit_data->inst, 1, TGSI_CHAN_X);
5092                 sample_id = LLVMBuildBitCast(gallivm->builder, sample_id,
5093                                              ctx->i32, "");
5094                 sample_position = load_sample_position(ctx, sample_id);
5095
5096                 emit_data->args[0] = LLVMBuildExtractElement(gallivm->builder,
5097                                                              sample_position,
5098                                                              ctx->i32_0, "");
5099
5100                 emit_data->args[0] = LLVMBuildFSub(gallivm->builder, emit_data->args[0], halfval, "");
5101                 emit_data->args[1] = LLVMBuildExtractElement(gallivm->builder,
5102                                                              sample_position,
5103                                                              ctx->i32_1, "");
5104                 emit_data->args[1] = LLVMBuildFSub(gallivm->builder, emit_data->args[1], halfval, "");
5105                 emit_data->arg_count = 2;
5106         }
5107 }
5108
5109 static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
5110                                 struct lp_build_tgsi_context *bld_base,
5111                                 struct lp_build_emit_data *emit_data)
5112 {
5113         struct si_shader_context *ctx = si_shader_context(bld_base);
5114         struct si_shader *shader = ctx->shader;
5115         struct gallivm_state *gallivm = &ctx->gallivm;
5116         LLVMValueRef interp_param;
5117         const struct tgsi_full_instruction *inst = emit_data->inst;
5118         int input_index = inst->Src[0].Register.Index;
5119         int chan;
5120         int i;
5121         LLVMValueRef attr_number;
5122         LLVMValueRef params = LLVMGetParam(ctx->main_fn, SI_PARAM_PRIM_MASK);
5123         int interp_param_idx;
5124         unsigned interp = shader->selector->info.input_interpolate[input_index];
5125         unsigned location;
5126
5127         assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);
5128
5129         if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
5130             inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE)
5131                 location = TGSI_INTERPOLATE_LOC_CENTER;
5132         else
5133                 location = TGSI_INTERPOLATE_LOC_CENTROID;
5134
5135         interp_param_idx = lookup_interp_param_index(interp, location);
5136         if (interp_param_idx == -1)
5137                 return;
5138         else if (interp_param_idx)
5139                 interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
5140         else
5141                 interp_param = NULL;
5142
5143         attr_number = LLVMConstInt(ctx->i32, input_index, 0);
5144
5145         if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
5146             inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
5147                 LLVMValueRef ij_out[2];
5148                 LLVMValueRef ddxy_out = si_llvm_emit_ddxy_interp(bld_base, interp_param);
5149
5150                 /*
5151                  * take the I then J parameters, and the DDX/Y for it, and
5152                  * calculate the IJ inputs for the interpolator.
5153                  * temp1 = ddx * offset/sample.x + I;
5154                  * interp_param.I = ddy * offset/sample.y + temp1;
5155                  * temp1 = ddx * offset/sample.x + J;
5156                  * interp_param.J = ddy * offset/sample.y + temp1;
5157                  */
5158                 for (i = 0; i < 2; i++) {
5159                         LLVMValueRef ix_ll = LLVMConstInt(ctx->i32, i, 0);
5160                         LLVMValueRef iy_ll = LLVMConstInt(ctx->i32, i + 2, 0);
5161                         LLVMValueRef ddx_el = LLVMBuildExtractElement(gallivm->builder,
5162                                                                       ddxy_out, ix_ll, "");
5163                         LLVMValueRef ddy_el = LLVMBuildExtractElement(gallivm->builder,
5164                                                                       ddxy_out, iy_ll, "");
5165                         LLVMValueRef interp_el = LLVMBuildExtractElement(gallivm->builder,
5166                                                                          interp_param, ix_ll, "");
5167                         LLVMValueRef temp1, temp2;
5168
5169                         interp_el = LLVMBuildBitCast(gallivm->builder, interp_el,
5170                                                      ctx->f32, "");
5171
5172                         temp1 = LLVMBuildFMul(gallivm->builder, ddx_el, emit_data->args[0], "");
5173
5174                         temp1 = LLVMBuildFAdd(gallivm->builder, temp1, interp_el, "");
5175
5176                         temp2 = LLVMBuildFMul(gallivm->builder, ddy_el, emit_data->args[1], "");
5177
5178                         ij_out[i] = LLVMBuildFAdd(gallivm->builder, temp2, temp1, "");
5179                 }
5180                 interp_param = lp_build_gather_values(gallivm, ij_out, 2);
5181         }
5182
5183         for (chan = 0; chan < 4; chan++) {
5184                 LLVMValueRef llvm_chan;
5185                 unsigned schan;
5186
5187                 schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan);
5188                 llvm_chan = LLVMConstInt(ctx->i32, schan, 0);
5189
5190                 if (interp_param) {
5191                         interp_param = LLVMBuildBitCast(gallivm->builder,
5192                                 interp_param, LLVMVectorType(ctx->f32, 2), "");
5193                         LLVMValueRef i = LLVMBuildExtractElement(
5194                                 gallivm->builder, interp_param, ctx->i32_0, "");
5195                         LLVMValueRef j = LLVMBuildExtractElement(
5196                                 gallivm->builder, interp_param, ctx->i32_1, "");
5197                         emit_data->output[chan] = ac_build_fs_interp(&ctx->ac,
5198                                 llvm_chan, attr_number, params,
5199                                 i, j);
5200                 } else {
5201                         emit_data->output[chan] = ac_build_fs_interp_mov(&ctx->ac,
5202                                 LLVMConstInt(ctx->i32, 2, 0), /* P0 */
5203                                 llvm_chan, attr_number, params);
5204                 }
5205         }
5206 }
5207
5208 static LLVMValueRef si_emit_ballot(struct si_shader_context *ctx,
5209                                    LLVMValueRef value)
5210 {
5211         struct gallivm_state *gallivm = &ctx->gallivm;
5212         LLVMValueRef args[3] = {
5213                 value,
5214                 ctx->i32_0,
5215                 LLVMConstInt(ctx->i32, LLVMIntNE, 0)
5216         };
5217
5218         /* We currently have no other way to prevent LLVM from lifting the icmp
5219          * calls to a dominating basic block.
5220          */
5221         emit_optimization_barrier(ctx, &args[0]);
5222
5223         if (LLVMTypeOf(args[0]) != ctx->i32)
5224                 args[0] = LLVMBuildBitCast(gallivm->builder, args[0], ctx->i32, "");
5225
5226         return lp_build_intrinsic(gallivm->builder,
5227                                   "llvm.amdgcn.icmp.i32",
5228                                   ctx->i64, args, 3,
5229                                   LP_FUNC_ATTR_NOUNWIND |
5230                                   LP_FUNC_ATTR_READNONE |
5231                                   LP_FUNC_ATTR_CONVERGENT);
5232 }
5233
5234 static void vote_all_emit(
5235         const struct lp_build_tgsi_action *action,
5236         struct lp_build_tgsi_context *bld_base,
5237         struct lp_build_emit_data *emit_data)
5238 {
5239         struct si_shader_context *ctx = si_shader_context(bld_base);
5240         struct gallivm_state *gallivm = &ctx->gallivm;
5241         LLVMValueRef active_set, vote_set;
5242         LLVMValueRef tmp;
5243
5244         active_set = si_emit_ballot(ctx, ctx->i32_1);
5245         vote_set = si_emit_ballot(ctx, emit_data->args[0]);
5246
5247         tmp = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, vote_set, active_set, "");
5248         emit_data->output[emit_data->chan] =
5249                 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
5250 }
5251
5252 static void vote_any_emit(
5253         const struct lp_build_tgsi_action *action,
5254         struct lp_build_tgsi_context *bld_base,
5255         struct lp_build_emit_data *emit_data)
5256 {
5257         struct si_shader_context *ctx = si_shader_context(bld_base);
5258         struct gallivm_state *gallivm = &ctx->gallivm;
5259         LLVMValueRef vote_set;
5260         LLVMValueRef tmp;
5261
5262         vote_set = si_emit_ballot(ctx, emit_data->args[0]);
5263
5264         tmp = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
5265                             vote_set, LLVMConstInt(ctx->i64, 0, 0), "");
5266         emit_data->output[emit_data->chan] =
5267                 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
5268 }
5269
5270 static void vote_eq_emit(
5271         const struct lp_build_tgsi_action *action,
5272         struct lp_build_tgsi_context *bld_base,
5273         struct lp_build_emit_data *emit_data)
5274 {
5275         struct si_shader_context *ctx = si_shader_context(bld_base);
5276         struct gallivm_state *gallivm = &ctx->gallivm;
5277         LLVMValueRef active_set, vote_set;
5278         LLVMValueRef all, none, tmp;
5279
5280         active_set = si_emit_ballot(ctx, ctx->i32_1);
5281         vote_set = si_emit_ballot(ctx, emit_data->args[0]);
5282
5283         all = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, vote_set, active_set, "");
5284         none = LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
5285                              vote_set, LLVMConstInt(ctx->i64, 0, 0), "");
5286         tmp = LLVMBuildOr(gallivm->builder, all, none, "");
5287         emit_data->output[emit_data->chan] =
5288                 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
5289 }
5290
5291 static void ballot_emit(
5292         const struct lp_build_tgsi_action *action,
5293         struct lp_build_tgsi_context *bld_base,
5294         struct lp_build_emit_data *emit_data)
5295 {
5296         struct si_shader_context *ctx = si_shader_context(bld_base);
5297         LLVMBuilderRef builder = ctx->gallivm.builder;
5298         LLVMValueRef tmp;
5299
5300         tmp = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
5301         tmp = si_emit_ballot(ctx, tmp);
5302         tmp = LLVMBuildBitCast(builder, tmp, ctx->v2i32, "");
5303
5304         emit_data->output[0] = LLVMBuildExtractElement(builder, tmp, ctx->i32_0, "");
5305         emit_data->output[1] = LLVMBuildExtractElement(builder, tmp, ctx->i32_1, "");
5306 }
5307
5308 static void read_invoc_fetch_args(
5309         struct lp_build_tgsi_context *bld_base,
5310         struct lp_build_emit_data *emit_data)
5311 {
5312         emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst,
5313                                                  0, emit_data->src_chan);
5314
5315         /* Always read the source invocation (= lane) from the X channel. */
5316         emit_data->args[1] = lp_build_emit_fetch(bld_base, emit_data->inst,
5317                                                  1, TGSI_CHAN_X);
5318         emit_data->arg_count = 2;
5319 }
5320
5321 static void read_lane_emit(
5322         const struct lp_build_tgsi_action *action,
5323         struct lp_build_tgsi_context *bld_base,
5324         struct lp_build_emit_data *emit_data)
5325 {
5326         struct si_shader_context *ctx = si_shader_context(bld_base);
5327         LLVMBuilderRef builder = ctx->gallivm.builder;
5328
5329         /* We currently have no other way to prevent LLVM from lifting the icmp
5330          * calls to a dominating basic block.
5331          */
5332         emit_optimization_barrier(ctx, &emit_data->args[0]);
5333
5334         for (unsigned i = 0; i < emit_data->arg_count; ++i) {
5335                 emit_data->args[i] = LLVMBuildBitCast(builder, emit_data->args[i],
5336                                                       ctx->i32, "");
5337         }
5338
5339         emit_data->output[emit_data->chan] =
5340                 ac_build_intrinsic(&ctx->ac, action->intr_name,
5341                                    ctx->i32, emit_data->args, emit_data->arg_count,
5342                                    AC_FUNC_ATTR_READNONE |
5343                                    AC_FUNC_ATTR_CONVERGENT);
5344 }
5345
5346 static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base,
5347                                        struct lp_build_emit_data *emit_data)
5348 {
5349         struct si_shader_context *ctx = si_shader_context(bld_base);
5350         struct tgsi_src_register src0 = emit_data->inst->Src[0].Register;
5351         LLVMValueRef imm;
5352         unsigned stream;
5353
5354         assert(src0.File == TGSI_FILE_IMMEDIATE);
5355
5356         imm = ctx->imms[src0.Index * TGSI_NUM_CHANNELS + src0.SwizzleX];
5357         stream = LLVMConstIntGetZExtValue(imm) & 0x3;
5358         return stream;
5359 }
5360
5361 /* Emit one vertex from the geometry shader */
5362 static void si_llvm_emit_vertex(
5363         const struct lp_build_tgsi_action *action,
5364         struct lp_build_tgsi_context *bld_base,
5365         struct lp_build_emit_data *emit_data)
5366 {
5367         struct si_shader_context *ctx = si_shader_context(bld_base);
5368         struct lp_build_context *uint = &bld_base->uint_bld;
5369         struct si_shader *shader = ctx->shader;
5370         struct tgsi_shader_info *info = &shader->selector->info;
5371         struct gallivm_state *gallivm = &ctx->gallivm;
5372         struct lp_build_if_state if_state;
5373         LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
5374                                             ctx->param_gs2vs_offset);
5375         LLVMValueRef gs_next_vertex;
5376         LLVMValueRef can_emit, kill;
5377         unsigned chan, offset;
5378         int i;
5379         unsigned stream;
5380
5381         stream = si_llvm_get_stream(bld_base, emit_data);
5382
5383         /* Write vertex attribute values to GSVS ring */
5384         gs_next_vertex = LLVMBuildLoad(gallivm->builder,
5385                                        ctx->gs_next_vertex[stream],
5386                                        "");
5387
5388         /* If this thread has already emitted the declared maximum number of
5389          * vertices, skip the write: excessive vertex emissions are not
5390          * supposed to have any effect.
5391          *
5392          * If the shader has no writes to memory, kill it instead. This skips
5393          * further memory loads and may allow LLVM to skip to the end
5394          * altogether.
5395          */
5396         can_emit = LLVMBuildICmp(gallivm->builder, LLVMIntULT, gs_next_vertex,
5397                                  LLVMConstInt(ctx->i32,
5398                                               shader->selector->gs_max_out_vertices, 0), "");
5399
5400         bool use_kill = !info->writes_memory;
5401         if (use_kill) {
5402                 kill = lp_build_select(&bld_base->base, can_emit,
5403                                        LLVMConstReal(ctx->f32, 1.0f),
5404                                        LLVMConstReal(ctx->f32, -1.0f));
5405
5406                 ac_build_kill(&ctx->ac, kill);
5407         } else {
5408                 lp_build_if(&if_state, gallivm, can_emit);
5409         }
5410
5411         offset = 0;
5412         for (i = 0; i < info->num_outputs; i++) {
5413                 LLVMValueRef *out_ptr = ctx->outputs[i];
5414
5415                 for (chan = 0; chan < 4; chan++) {
5416                         if (!(info->output_usagemask[i] & (1 << chan)) ||
5417                             ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
5418                                 continue;
5419
5420                         LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
5421                         LLVMValueRef voffset =
5422                                 LLVMConstInt(ctx->i32, offset *
5423                                              shader->selector->gs_max_out_vertices, 0);
5424                         offset++;
5425
5426                         voffset = lp_build_add(uint, voffset, gs_next_vertex);
5427                         voffset = lp_build_mul_imm(uint, voffset, 4);
5428
5429                         out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");
5430
5431                         ac_build_buffer_store_dword(&ctx->ac,
5432                                                     ctx->gsvs_ring[stream],
5433                                                     out_val, 1,
5434                                                     voffset, soffset, 0,
5435                                                     1, 1, true, true);
5436                 }
5437         }
5438
5439         gs_next_vertex = lp_build_add(uint, gs_next_vertex,
5440                                       ctx->i32_1);
5441
5442         LLVMBuildStore(gallivm->builder, gs_next_vertex, ctx->gs_next_vertex[stream]);
5443
5444         /* Signal vertex emission */
5445         ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
5446                          LLVMGetParam(ctx->main_fn, ctx->param_gs_wave_id));
5447         if (!use_kill)
5448                 lp_build_endif(&if_state);
5449 }
5450
5451 /* Cut one primitive from the geometry shader */
5452 static void si_llvm_emit_primitive(
5453         const struct lp_build_tgsi_action *action,
5454         struct lp_build_tgsi_context *bld_base,
5455         struct lp_build_emit_data *emit_data)
5456 {
5457         struct si_shader_context *ctx = si_shader_context(bld_base);
5458         unsigned stream;
5459
5460         /* Signal primitive cut */
5461         stream = si_llvm_get_stream(bld_base, emit_data);
5462         ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
5463                          LLVMGetParam(ctx->main_fn, ctx->param_gs_wave_id));
5464 }
5465
5466 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
5467                                  struct lp_build_tgsi_context *bld_base,
5468                                  struct lp_build_emit_data *emit_data)
5469 {
5470         struct si_shader_context *ctx = si_shader_context(bld_base);
5471         struct gallivm_state *gallivm = &ctx->gallivm;
5472
5473         /* SI only (thanks to a hw bug workaround):
5474          * The real barrier instruction isn’t needed, because an entire patch
5475          * always fits into a single wave.
5476          */
5477         if (HAVE_LLVM >= 0x0309 &&
5478             ctx->screen->b.chip_class == SI &&
5479             ctx->type == PIPE_SHADER_TESS_CTRL) {
5480                 emit_waitcnt(ctx, LGKM_CNT & VM_CNT);
5481                 return;
5482         }
5483
5484         lp_build_intrinsic(gallivm->builder,
5485                            HAVE_LLVM >= 0x0309 ? "llvm.amdgcn.s.barrier"
5486                                                : "llvm.AMDGPU.barrier.local",
5487                            ctx->voidt, NULL, 0, LP_FUNC_ATTR_CONVERGENT);
5488 }
5489
5490 static const struct lp_build_tgsi_action tex_action = {
5491         .fetch_args = tex_fetch_args,
5492         .emit = build_tex_intrinsic,
5493 };
5494
5495 static const struct lp_build_tgsi_action interp_action = {
5496         .fetch_args = interp_fetch_args,
5497         .emit = build_interp_intrinsic,
5498 };
5499
5500 static void si_create_function(struct si_shader_context *ctx,
5501                                const char *name,
5502                                LLVMTypeRef *returns, unsigned num_returns,
5503                                LLVMTypeRef *params, unsigned num_params,
5504                                int last_sgpr)
5505 {
5506         int i;
5507
5508         si_llvm_create_func(ctx, name, returns, num_returns,
5509                             params, num_params);
5510         si_llvm_shader_type(ctx->main_fn, ctx->type);
5511         ctx->return_value = LLVMGetUndef(ctx->return_type);
5512
5513         for (i = 0; i <= last_sgpr; ++i) {
5514                 LLVMValueRef P = LLVMGetParam(ctx->main_fn, i);
5515
5516                 /* The combination of:
5517                  * - ByVal
5518                  * - dereferenceable
5519                  * - invariant.load
5520                  * allows the optimization passes to move loads and reduces
5521                  * SGPR spilling significantly.
5522                  */
5523                 if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
5524                         lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_BYVAL);
5525                         lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_NOALIAS);
5526                         ac_add_attr_dereferenceable(P, UINT64_MAX);
5527                 } else
5528                         lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_INREG);
5529         }
5530
5531         LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
5532                                            "no-signed-zeros-fp-math",
5533                                            "true");
5534
5535         if (ctx->screen->b.debug_flags & DBG_UNSAFE_MATH) {
5536                 /* These were copied from some LLVM test. */
5537                 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
5538                                                    "less-precise-fpmad",
5539                                                    "true");
5540                 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
5541                                                    "no-infs-fp-math",
5542                                                    "true");
5543                 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
5544                                                    "no-nans-fp-math",
5545                                                    "true");
5546                 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
5547                                                    "unsafe-fp-math",
5548                                                    "true");
5549         }
5550 }
5551
5552 static void declare_streamout_params(struct si_shader_context *ctx,
5553                                      struct pipe_stream_output_info *so,
5554                                      LLVMTypeRef *params, LLVMTypeRef i32,
5555                                      unsigned *num_params)
5556 {
5557         int i;
5558
5559         /* Streamout SGPRs. */
5560         if (so->num_outputs) {
5561                 if (ctx->type != PIPE_SHADER_TESS_EVAL)
5562                         params[ctx->param_streamout_config = (*num_params)++] = i32;
5563                 else
5564                         ctx->param_streamout_config = *num_params - 1;
5565
5566                 params[ctx->param_streamout_write_index = (*num_params)++] = i32;
5567         }
5568         /* A streamout buffer offset is loaded if the stride is non-zero. */
5569         for (i = 0; i < 4; i++) {
5570                 if (!so->stride[i])
5571                         continue;
5572
5573                 params[ctx->param_streamout_offset[i] = (*num_params)++] = i32;
5574         }
5575 }
5576
5577 static unsigned llvm_get_type_size(LLVMTypeRef type)
5578 {
5579         LLVMTypeKind kind = LLVMGetTypeKind(type);
5580
5581         switch (kind) {
5582         case LLVMIntegerTypeKind:
5583                 return LLVMGetIntTypeWidth(type) / 8;
5584         case LLVMFloatTypeKind:
5585                 return 4;
5586         case LLVMPointerTypeKind:
5587                 return 8;
5588         case LLVMVectorTypeKind:
5589                 return LLVMGetVectorSize(type) *
5590                        llvm_get_type_size(LLVMGetElementType(type));
5591         case LLVMArrayTypeKind:
5592                 return LLVMGetArrayLength(type) *
5593                        llvm_get_type_size(LLVMGetElementType(type));
5594         default:
5595                 assert(0);
5596                 return 0;
5597         }
5598 }
5599
5600 static void declare_tess_lds(struct si_shader_context *ctx)
5601 {
5602         struct gallivm_state *gallivm = &ctx->gallivm;
5603
5604         unsigned lds_size = ctx->screen->b.chip_class >= CIK ? 65536 : 32768;
5605         ctx->lds = LLVMBuildIntToPtr(gallivm->builder, ctx->i32_0,
5606                 LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), LOCAL_ADDR_SPACE),
5607                 "tess_lds");
5608 }
5609
5610 static unsigned si_get_max_workgroup_size(struct si_shader *shader)
5611 {
5612         const unsigned *properties = shader->selector->info.properties;
5613         unsigned max_work_group_size =
5614                        properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
5615                        properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
5616                        properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
5617
5618         if (!max_work_group_size) {
5619                 /* This is a variable group size compute shader,
5620                  * compile it for the maximum possible group size.
5621                  */
5622                 max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
5623         }
5624         return max_work_group_size;
5625 }
5626
5627 static void create_function(struct si_shader_context *ctx)
5628 {
5629         struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
5630         struct gallivm_state *gallivm = &ctx->gallivm;
5631         struct si_shader *shader = ctx->shader;
5632         LLVMTypeRef params[SI_NUM_PARAMS + SI_MAX_ATTRIBS], v3i32;
5633         LLVMTypeRef returns[16+32*4];
5634         unsigned i, last_sgpr, num_params = 0, num_return_sgprs;
5635         unsigned num_returns = 0;
5636         unsigned num_prolog_vgprs = 0;
5637
5638         v3i32 = LLVMVectorType(ctx->i32, 3);
5639
5640         params[ctx->param_rw_buffers = num_params++] =
5641                 const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
5642         params[ctx->param_const_buffers = num_params++] =
5643                 const_array(ctx->v16i8, SI_NUM_CONST_BUFFERS);
5644         params[ctx->param_samplers = num_params++] =
5645                 const_array(ctx->v8i32, SI_NUM_SAMPLERS);
5646         params[ctx->param_images = num_params++] =
5647                 const_array(ctx->v8i32, SI_NUM_IMAGES);
5648         params[ctx->param_shader_buffers = num_params++] =
5649                 const_array(ctx->v4i32, SI_NUM_SHADER_BUFFERS);
5650
5651         switch (ctx->type) {
5652         case PIPE_SHADER_VERTEX:
5653                 params[ctx->param_vertex_buffers = num_params++] =
5654                         const_array(ctx->v16i8, SI_NUM_VERTEX_BUFFERS);
5655                 params[ctx->param_base_vertex = num_params++] = ctx->i32;
5656                 params[ctx->param_start_instance = num_params++] = ctx->i32;
5657                 params[ctx->param_draw_id = num_params++] = ctx->i32;
5658                 params[ctx->param_vs_state_bits = num_params++] = ctx->i32;
5659
5660                 if (shader->key.as_es) {
5661                         params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
5662                 } else if (shader->key.as_ls) {
5663                         /* no extra parameters */
5664                 } else {
5665                         if (shader->is_gs_copy_shader)
5666                                 num_params = ctx->param_rw_buffers + 1;
5667
5668                         /* The locations of the other parameters are assigned dynamically. */
5669                         declare_streamout_params(ctx, &shader->selector->so,
5670                                                  params, ctx->i32, &num_params);
5671                 }
5672
5673                 last_sgpr = num_params-1;
5674
5675                 /* VGPRs */
5676                 params[ctx->param_vertex_id = num_params++] = ctx->i32;
5677                 params[ctx->param_rel_auto_id = num_params++] = ctx->i32;
5678                 params[ctx->param_vs_prim_id = num_params++] = ctx->i32;
5679                 params[ctx->param_instance_id = num_params++] = ctx->i32;
5680
5681                 if (!shader->is_gs_copy_shader) {
5682                         /* Vertex load indices. */
5683                         ctx->param_vertex_index0 = num_params;
5684
5685                         for (i = 0; i < shader->selector->info.num_inputs; i++)
5686                                 params[num_params++] = ctx->i32;
5687
5688                         num_prolog_vgprs += shader->selector->info.num_inputs;
5689
5690                         /* PrimitiveID output. */
5691                         if (!shader->key.as_es && !shader->key.as_ls)
5692                                 for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
5693                                         returns[num_returns++] = ctx->f32;
5694                 }
5695                 break;
5696
5697         case PIPE_SHADER_TESS_CTRL:
5698                 params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
5699                 params[ctx->param_tcs_out_lds_offsets = num_params++] = ctx->i32;
5700                 params[ctx->param_tcs_out_lds_layout = num_params++] = ctx->i32;
5701                 params[ctx->param_vs_state_bits = num_params++] = ctx->i32;
5702                 params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
5703                 params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
5704                 last_sgpr = num_params - 1;
5705
5706                 /* VGPRs */
5707                 params[ctx->param_tcs_patch_id = num_params++] = ctx->i32;
5708                 params[ctx->param_tcs_rel_ids = num_params++] = ctx->i32;
5709
5710                 /* param_tcs_offchip_offset and param_tcs_factor_offset are
5711                  * placed after the user SGPRs.
5712                  */
5713                 for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
5714                         returns[num_returns++] = ctx->i32; /* SGPRs */
5715
5716                 for (i = 0; i < 3; i++)
5717                         returns[num_returns++] = ctx->f32; /* VGPRs */
5718                 break;
5719
5720         case PIPE_SHADER_TESS_EVAL:
5721                 params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
5722
5723                 if (shader->key.as_es) {
5724                         params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
5725                         params[num_params++] = ctx->i32;
5726                         params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
5727                 } else {
5728                         params[num_params++] = ctx->i32;
5729                         declare_streamout_params(ctx, &shader->selector->so,
5730                                                  params, ctx->i32, &num_params);
5731                         params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
5732                 }
5733                 last_sgpr = num_params - 1;
5734
5735                 /* VGPRs */
5736                 params[ctx->param_tes_u = num_params++] = ctx->f32;
5737                 params[ctx->param_tes_v = num_params++] = ctx->f32;
5738                 params[ctx->param_tes_rel_patch_id = num_params++] = ctx->i32;
5739                 params[ctx->param_tes_patch_id = num_params++] = ctx->i32;
5740
5741                 /* PrimitiveID output. */
5742                 if (!shader->key.as_es)
5743                         for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
5744                                 returns[num_returns++] = ctx->f32;
5745                 break;
5746
5747         case PIPE_SHADER_GEOMETRY:
5748                 params[ctx->param_gs2vs_offset = num_params++] = ctx->i32;
5749                 params[ctx->param_gs_wave_id = num_params++] = ctx->i32;
5750                 last_sgpr = num_params - 1;
5751
5752                 /* VGPRs */
5753                 params[ctx->param_gs_vtx0_offset = num_params++] = ctx->i32;
5754                 params[ctx->param_gs_vtx1_offset = num_params++] = ctx->i32;
5755                 params[ctx->param_gs_prim_id = num_params++] = ctx->i32;
5756                 params[ctx->param_gs_vtx2_offset = num_params++] = ctx->i32;
5757                 params[ctx->param_gs_vtx3_offset = num_params++] = ctx->i32;
5758                 params[ctx->param_gs_vtx4_offset = num_params++] = ctx->i32;
5759                 params[ctx->param_gs_vtx5_offset = num_params++] = ctx->i32;
5760                 params[ctx->param_gs_instance_id = num_params++] = ctx->i32;
5761                 break;
5762
5763         case PIPE_SHADER_FRAGMENT:
5764                 params[SI_PARAM_ALPHA_REF] = ctx->f32;
5765                 params[SI_PARAM_PRIM_MASK] = ctx->i32;
5766                 last_sgpr = SI_PARAM_PRIM_MASK;
5767                 params[SI_PARAM_PERSP_SAMPLE] = ctx->v2i32;
5768                 params[SI_PARAM_PERSP_CENTER] = ctx->v2i32;
5769                 params[SI_PARAM_PERSP_CENTROID] = ctx->v2i32;
5770                 params[SI_PARAM_PERSP_PULL_MODEL] = v3i32;
5771                 params[SI_PARAM_LINEAR_SAMPLE] = ctx->v2i32;
5772                 params[SI_PARAM_LINEAR_CENTER] = ctx->v2i32;
5773                 params[SI_PARAM_LINEAR_CENTROID] = ctx->v2i32;
5774                 params[SI_PARAM_LINE_STIPPLE_TEX] = ctx->f32;
5775                 params[SI_PARAM_POS_X_FLOAT] = ctx->f32;
5776                 params[SI_PARAM_POS_Y_FLOAT] = ctx->f32;
5777                 params[SI_PARAM_POS_Z_FLOAT] = ctx->f32;
5778                 params[SI_PARAM_POS_W_FLOAT] = ctx->f32;
5779                 params[SI_PARAM_FRONT_FACE] = ctx->i32;
5780                 shader->info.face_vgpr_index = 20;
5781                 params[SI_PARAM_ANCILLARY] = ctx->i32;
5782                 params[SI_PARAM_SAMPLE_COVERAGE] = ctx->f32;
5783                 params[SI_PARAM_POS_FIXED_PT] = ctx->i32;
5784                 num_params = SI_PARAM_POS_FIXED_PT+1;
5785
5786                 /* Color inputs from the prolog. */
5787                 if (shader->selector->info.colors_read) {
5788                         unsigned num_color_elements =
5789                                 util_bitcount(shader->selector->info.colors_read);
5790
5791                         assert(num_params + num_color_elements <= ARRAY_SIZE(params));
5792                         for (i = 0; i < num_color_elements; i++)
5793                                 params[num_params++] = ctx->f32;
5794
5795                         num_prolog_vgprs += num_color_elements;
5796                 }
5797
5798                 /* Outputs for the epilog. */
5799                 num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
5800                 num_returns =
5801                         num_return_sgprs +
5802                         util_bitcount(shader->selector->info.colors_written) * 4 +
5803                         shader->selector->info.writes_z +
5804                         shader->selector->info.writes_stencil +
5805                         shader->selector->info.writes_samplemask +
5806                         1 /* SampleMaskIn */;
5807
5808                 num_returns = MAX2(num_returns,
5809                                    num_return_sgprs +
5810                                    PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
5811
5812                 for (i = 0; i < num_return_sgprs; i++)
5813                         returns[i] = ctx->i32;
5814                 for (; i < num_returns; i++)
5815                         returns[i] = ctx->f32;
5816                 break;
5817
5818         case PIPE_SHADER_COMPUTE:
5819                 params[SI_PARAM_GRID_SIZE] = v3i32;
5820                 params[SI_PARAM_BLOCK_SIZE] = v3i32;
5821                 params[SI_PARAM_BLOCK_ID] = v3i32;
5822                 last_sgpr = SI_PARAM_BLOCK_ID;
5823
5824                 params[SI_PARAM_THREAD_ID] = v3i32;
5825                 num_params = SI_PARAM_THREAD_ID + 1;
5826                 break;
5827         default:
5828                 assert(0 && "unimplemented shader");
5829                 return;
5830         }
5831
5832         assert(num_params <= ARRAY_SIZE(params));
5833
5834         si_create_function(ctx, "main", returns, num_returns, params,
5835                            num_params, last_sgpr);
5836
5837         /* Reserve register locations for VGPR inputs the PS prolog may need. */
5838         if (ctx->type == PIPE_SHADER_FRAGMENT &&
5839             ctx->separate_prolog) {
5840                 si_llvm_add_attribute(ctx->main_fn,
5841                                       "InitialPSInputAddr",
5842                                       S_0286D0_PERSP_SAMPLE_ENA(1) |
5843                                       S_0286D0_PERSP_CENTER_ENA(1) |
5844                                       S_0286D0_PERSP_CENTROID_ENA(1) |
5845                                       S_0286D0_LINEAR_SAMPLE_ENA(1) |
5846                                       S_0286D0_LINEAR_CENTER_ENA(1) |
5847                                       S_0286D0_LINEAR_CENTROID_ENA(1) |
5848                                       S_0286D0_FRONT_FACE_ENA(1) |
5849                                       S_0286D0_POS_FIXED_PT_ENA(1));
5850         } else if (ctx->type == PIPE_SHADER_COMPUTE) {
5851                 si_llvm_add_attribute(ctx->main_fn,
5852                                       "amdgpu-max-work-group-size",
5853                                       si_get_max_workgroup_size(shader));
5854         }
5855
5856         shader->info.num_input_sgprs = 0;
5857         shader->info.num_input_vgprs = 0;
5858
5859         for (i = 0; i <= last_sgpr; ++i)
5860                 shader->info.num_input_sgprs += llvm_get_type_size(params[i]) / 4;
5861
5862         for (; i < num_params; ++i)
5863                 shader->info.num_input_vgprs += llvm_get_type_size(params[i]) / 4;
5864
5865         assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
5866         shader->info.num_input_vgprs -= num_prolog_vgprs;
5867
5868         if (!ctx->screen->has_ds_bpermute &&
5869             bld_base->info &&
5870             (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 ||
5871              bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 ||
5872              bld_base->info->opcode_count[TGSI_OPCODE_DDX_FINE] > 0 ||
5873              bld_base->info->opcode_count[TGSI_OPCODE_DDY_FINE] > 0 ||
5874              bld_base->info->opcode_count[TGSI_OPCODE_INTERP_OFFSET] > 0 ||
5875              bld_base->info->opcode_count[TGSI_OPCODE_INTERP_SAMPLE] > 0))
5876                 ctx->lds =
5877                         LLVMAddGlobalInAddressSpace(gallivm->module,
5878                                                     LLVMArrayType(ctx->i32, 64),
5879                                                     "ddxy_lds",
5880                                                     LOCAL_ADDR_SPACE);
5881
5882         if ((ctx->type == PIPE_SHADER_VERTEX && shader->key.as_ls) ||
5883             ctx->type == PIPE_SHADER_TESS_CTRL)
5884                 declare_tess_lds(ctx);
5885 }
5886
5887 /**
5888  * Load ESGS and GSVS ring buffer resource descriptors and save the variables
5889  * for later use.
5890  */
5891 static void preload_ring_buffers(struct si_shader_context *ctx)
5892 {
5893         struct gallivm_state *gallivm = &ctx->gallivm;
5894         LLVMBuilderRef builder = gallivm->builder;
5895
5896         LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
5897                                             ctx->param_rw_buffers);
5898
5899         if ((ctx->type == PIPE_SHADER_VERTEX &&
5900              ctx->shader->key.as_es) ||
5901             (ctx->type == PIPE_SHADER_TESS_EVAL &&
5902              ctx->shader->key.as_es) ||
5903             ctx->type == PIPE_SHADER_GEOMETRY) {
5904                 unsigned ring =
5905                         ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS
5906                                                              : SI_ES_RING_ESGS;
5907                 LLVMValueRef offset = LLVMConstInt(ctx->i32, ring, 0);
5908
5909                 ctx->esgs_ring =
5910                         ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
5911         }
5912
5913         if (ctx->shader->is_gs_copy_shader) {
5914                 LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);
5915
5916                 ctx->gsvs_ring[0] =
5917                         ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
5918         } else if (ctx->type == PIPE_SHADER_GEOMETRY) {
5919                 const struct si_shader_selector *sel = ctx->shader->selector;
5920                 LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);
5921                 LLVMValueRef base_ring;
5922
5923                 base_ring = ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
5924
5925                 /* The conceptual layout of the GSVS ring is
5926                  *   v0c0 .. vLv0 v0c1 .. vLc1 ..
5927                  * but the real memory layout is swizzled across
5928                  * threads:
5929                  *   t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
5930                  *   t16v0c0 ..
5931                  * Override the buffer descriptor accordingly.
5932                  */
5933                 LLVMTypeRef v2i64 = LLVMVectorType(ctx->i64, 2);
5934                 uint64_t stream_offset = 0;
5935
5936                 for (unsigned stream = 0; stream < 4; ++stream) {
5937                         unsigned num_components;
5938                         unsigned stride;
5939                         unsigned num_records;
5940                         LLVMValueRef ring, tmp;
5941
5942                         num_components = sel->info.num_stream_output_components[stream];
5943                         if (!num_components)
5944                                 continue;
5945
5946                         stride = 4 * num_components * sel->gs_max_out_vertices;
5947
5948                         /* Limit on the stride field for <= CIK. */
5949                         assert(stride < (1 << 14));
5950
5951                         num_records = 64;
5952
5953                         ring = LLVMBuildBitCast(builder, base_ring, v2i64, "");
5954                         tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_0, "");
5955                         tmp = LLVMBuildAdd(builder, tmp,
5956                                            LLVMConstInt(ctx->i64,
5957                                                         stream_offset, 0), "");
5958                         stream_offset += stride * 64;
5959
5960                         ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_0, "");
5961                         ring = LLVMBuildBitCast(builder, ring, ctx->v4i32, "");
5962                         tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_1, "");
5963                         tmp = LLVMBuildOr(builder, tmp,
5964                                 LLVMConstInt(ctx->i32,
5965                                              S_008F04_STRIDE(stride) |
5966                                              S_008F04_SWIZZLE_ENABLE(1), 0), "");
5967                         ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_1, "");
5968                         ring = LLVMBuildInsertElement(builder, ring,
5969                                         LLVMConstInt(ctx->i32, num_records, 0),
5970                                         LLVMConstInt(ctx->i32, 2, 0), "");
5971                         ring = LLVMBuildInsertElement(builder, ring,
5972                                 LLVMConstInt(ctx->i32,
5973                                              S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
5974                                              S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
5975                                              S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
5976                                              S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
5977                                              S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
5978                                              S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
5979                                              S_008F0C_ELEMENT_SIZE(1) | /* element_size = 4 (bytes) */
5980                                              S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
5981                                              S_008F0C_ADD_TID_ENABLE(1),
5982                                              0),
5983                                 LLVMConstInt(ctx->i32, 3, 0), "");
5984                         ring = LLVMBuildBitCast(builder, ring, ctx->v16i8, "");
5985
5986                         ctx->gsvs_ring[stream] = ring;
5987                 }
5988         }
5989 }
5990
5991 static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
5992                                          LLVMValueRef param_rw_buffers,
5993                                          unsigned param_pos_fixed_pt)
5994 {
5995         struct gallivm_state *gallivm = &ctx->gallivm;
5996         LLVMBuilderRef builder = gallivm->builder;
5997         LLVMValueRef slot, desc, offset, row, bit, address[2];
5998
5999         /* Use the fixed-point gl_FragCoord input.
6000          * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
6001          * per coordinate to get the repeating effect.
6002          */
6003         address[0] = unpack_param(ctx, param_pos_fixed_pt, 0, 5);
6004         address[1] = unpack_param(ctx, param_pos_fixed_pt, 16, 5);
6005
6006         /* Load the buffer descriptor. */
6007         slot = LLVMConstInt(ctx->i32, SI_PS_CONST_POLY_STIPPLE, 0);
6008         desc = ac_build_indexed_load_const(&ctx->ac, param_rw_buffers, slot);
6009
6010         /* The stipple pattern is 32x32, each row has 32 bits. */
6011         offset = LLVMBuildMul(builder, address[1],
6012                               LLVMConstInt(ctx->i32, 4, 0), "");
6013         row = buffer_load_const(ctx, desc, offset);
6014         row = LLVMBuildBitCast(builder, row, ctx->i32, "");
6015         bit = LLVMBuildLShr(builder, row, address[0], "");
6016         bit = LLVMBuildTrunc(builder, bit, ctx->i1, "");
6017
6018         /* The intrinsic kills the thread if arg < 0. */
6019         bit = LLVMBuildSelect(builder, bit, LLVMConstReal(ctx->f32, 0),
6020                               LLVMConstReal(ctx->f32, -1), "");
6021         ac_build_kill(&ctx->ac, bit);
6022 }
6023
6024 void si_shader_binary_read_config(struct ac_shader_binary *binary,
6025                                   struct si_shader_config *conf,
6026                                   unsigned symbol_offset)
6027 {
6028         unsigned i;
6029         const unsigned char *config =
6030                 ac_shader_binary_config_start(binary, symbol_offset);
6031         bool really_needs_scratch = false;
6032
6033         /* LLVM adds SGPR spills to the scratch size.
6034          * Find out if we really need the scratch buffer.
6035          */
6036         for (i = 0; i < binary->reloc_count; i++) {
6037                 const struct ac_shader_reloc *reloc = &binary->relocs[i];
6038
6039                 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
6040                     !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
6041                         really_needs_scratch = true;
6042                         break;
6043                 }
6044         }
6045
6046         /* XXX: We may be able to emit some of these values directly rather than
6047          * extracting fields to be emitted later.
6048          */
6049
6050         for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
6051                 unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
6052                 unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
6053                 switch (reg) {
6054                 case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
6055                 case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
6056                 case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
6057                 case R_00B848_COMPUTE_PGM_RSRC1:
6058                         conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
6059                         conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
6060                         conf->float_mode =  G_00B028_FLOAT_MODE(value);
6061                         conf->rsrc1 = value;
6062                         break;
6063                 case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
6064                         conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
6065                         break;
6066                 case R_00B84C_COMPUTE_PGM_RSRC2:
6067                         conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
6068                         conf->rsrc2 = value;
6069                         break;
6070                 case R_0286CC_SPI_PS_INPUT_ENA:
6071                         conf->spi_ps_input_ena = value;
6072                         break;
6073                 case R_0286D0_SPI_PS_INPUT_ADDR:
6074                         conf->spi_ps_input_addr = value;
6075                         break;
6076                 case R_0286E8_SPI_TMPRING_SIZE:
6077                 case R_00B860_COMPUTE_TMPRING_SIZE:
6078                         /* WAVESIZE is in units of 256 dwords. */
6079                         if (really_needs_scratch)
6080                                 conf->scratch_bytes_per_wave =
6081                                         G_00B860_WAVESIZE(value) * 256 * 4;
6082                         break;
6083                 case 0x4: /* SPILLED_SGPRS */
6084                         conf->spilled_sgprs = value;
6085                         break;
6086                 case 0x8: /* SPILLED_VGPRS */
6087                         conf->spilled_vgprs = value;
6088                         break;
6089                 default:
6090                         {
6091                                 static bool printed;
6092
6093                                 if (!printed) {
6094                                         fprintf(stderr, "Warning: LLVM emitted unknown "
6095                                                 "config register: 0x%x\n", reg);
6096                                         printed = true;
6097                                 }
6098                         }
6099                         break;
6100                 }
6101         }
6102
6103         if (!conf->spi_ps_input_addr)
6104                 conf->spi_ps_input_addr = conf->spi_ps_input_ena;
6105 }
6106
6107 void si_shader_apply_scratch_relocs(struct si_context *sctx,
6108                         struct si_shader *shader,
6109                         struct si_shader_config *config,
6110                         uint64_t scratch_va)
6111 {
6112         unsigned i;
6113         uint32_t scratch_rsrc_dword0 = scratch_va;
6114         uint32_t scratch_rsrc_dword1 =
6115                 S_008F04_BASE_ADDRESS_HI(scratch_va >> 32);
6116
6117         /* Enable scratch coalescing if LLVM sets ELEMENT_SIZE & INDEX_STRIDE
6118          * correctly.
6119          */
6120         if (HAVE_LLVM >= 0x0309)
6121                 scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1);
6122         else
6123                 scratch_rsrc_dword1 |=
6124                         S_008F04_STRIDE(config->scratch_bytes_per_wave / 64);
6125
6126         for (i = 0 ; i < shader->binary.reloc_count; i++) {
6127                 const struct ac_shader_reloc *reloc =
6128                                         &shader->binary.relocs[i];
6129                 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
6130                         util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
6131                         &scratch_rsrc_dword0, 4);
6132                 } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
6133                         util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
6134                         &scratch_rsrc_dword1, 4);
6135                 }
6136         }
6137 }
6138
6139 static unsigned si_get_shader_binary_size(struct si_shader *shader)
6140 {
6141         unsigned size = shader->binary.code_size;
6142
6143         if (shader->prolog)
6144                 size += shader->prolog->binary.code_size;
6145         if (shader->previous_stage)
6146                 size += shader->previous_stage->binary.code_size;
6147         if (shader->epilog)
6148                 size += shader->epilog->binary.code_size;
6149         return size;
6150 }
6151
6152 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
6153 {
6154         const struct ac_shader_binary *prolog =
6155                 shader->prolog ? &shader->prolog->binary : NULL;
6156         const struct ac_shader_binary *previous_stage =
6157                 shader->previous_stage ? &shader->previous_stage->binary : NULL;
6158         const struct ac_shader_binary *epilog =
6159                 shader->epilog ? &shader->epilog->binary : NULL;
6160         const struct ac_shader_binary *mainb = &shader->binary;
6161         unsigned bo_size = si_get_shader_binary_size(shader) +
6162                            (!epilog ? mainb->rodata_size : 0);
6163         unsigned char *ptr;
6164
6165         assert(!prolog || !prolog->rodata_size);
6166         assert(!previous_stage || !previous_stage->rodata_size);
6167         assert((!prolog && !previous_stage && !epilog) || !mainb->rodata_size);
6168         assert(!epilog || !epilog->rodata_size);
6169
6170         /* GFX9 can fetch at most 128 bytes past the end of the shader.
6171          * Prevent VM faults.
6172          */
6173         if (sscreen->b.chip_class >= GFX9)
6174                 bo_size += 128;
6175
6176         r600_resource_reference(&shader->bo, NULL);
6177         shader->bo = (struct r600_resource*)
6178                      pipe_buffer_create(&sscreen->b.b, 0,
6179                                         PIPE_USAGE_IMMUTABLE,
6180                                         align(bo_size, SI_CPDMA_ALIGNMENT));
6181         if (!shader->bo)
6182                 return -ENOMEM;
6183
6184         /* Upload. */
6185         ptr = sscreen->b.ws->buffer_map(shader->bo->buf, NULL,
6186                                         PIPE_TRANSFER_READ_WRITE |
6187                                         PIPE_TRANSFER_UNSYNCHRONIZED);
6188
6189         if (prolog) {
6190                 util_memcpy_cpu_to_le32(ptr, prolog->code, prolog->code_size);
6191                 ptr += prolog->code_size;
6192         }
6193         if (previous_stage) {
6194                 util_memcpy_cpu_to_le32(ptr, previous_stage->code,
6195                                         previous_stage->code_size);
6196                 ptr += previous_stage->code_size;
6197         }
6198
6199         util_memcpy_cpu_to_le32(ptr, mainb->code, mainb->code_size);
6200         ptr += mainb->code_size;
6201
6202         if (epilog)
6203                 util_memcpy_cpu_to_le32(ptr, epilog->code, epilog->code_size);
6204         else if (mainb->rodata_size > 0)
6205                 util_memcpy_cpu_to_le32(ptr, mainb->rodata, mainb->rodata_size);
6206
6207         sscreen->b.ws->buffer_unmap(shader->bo->buf);
6208         return 0;
6209 }
6210
6211 static void si_shader_dump_disassembly(const struct ac_shader_binary *binary,
6212                                        struct pipe_debug_callback *debug,
6213                                        const char *name, FILE *file)
6214 {
6215         char *line, *p;
6216         unsigned i, count;
6217
6218         if (binary->disasm_string) {
6219                 fprintf(file, "Shader %s disassembly:\n", name);
6220                 fprintf(file, "%s", binary->disasm_string);
6221
6222                 if (debug && debug->debug_message) {
6223                         /* Very long debug messages are cut off, so send the
6224                          * disassembly one line at a time. This causes more
6225                          * overhead, but on the plus side it simplifies
6226                          * parsing of resulting logs.
6227                          */
6228                         pipe_debug_message(debug, SHADER_INFO,
6229                                            "Shader Disassembly Begin");
6230
6231                         line = binary->disasm_string;
6232                         while (*line) {
6233                                 p = util_strchrnul(line, '\n');
6234                                 count = p - line;
6235
6236                                 if (count) {
6237                                         pipe_debug_message(debug, SHADER_INFO,
6238                                                            "%.*s", count, line);
6239                                 }
6240
6241                                 if (!*p)
6242                                         break;
6243                                 line = p + 1;
6244                         }
6245
6246                         pipe_debug_message(debug, SHADER_INFO,
6247                                            "Shader Disassembly End");
6248                 }
6249         } else {
6250                 fprintf(file, "Shader %s binary:\n", name);
6251                 for (i = 0; i < binary->code_size; i += 4) {
6252                         fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i,
6253                                 binary->code[i + 3], binary->code[i + 2],
6254                                 binary->code[i + 1], binary->code[i]);
6255                 }
6256         }
6257 }
6258
6259 static void si_shader_dump_stats(struct si_screen *sscreen,
6260                                  struct si_shader *shader,
6261                                  struct pipe_debug_callback *debug,
6262                                  unsigned processor,
6263                                  FILE *file,
6264                                  bool check_debug_option)
6265 {
6266         struct si_shader_config *conf = &shader->config;
6267         unsigned num_inputs = shader->selector ? shader->selector->info.num_inputs : 0;
6268         unsigned code_size = si_get_shader_binary_size(shader);
6269         unsigned lds_increment = sscreen->b.chip_class >= CIK ? 512 : 256;
6270         unsigned lds_per_wave = 0;
6271         unsigned max_simd_waves = 10;
6272
6273         /* Compute LDS usage for PS. */
6274         switch (processor) {
6275         case PIPE_SHADER_FRAGMENT:
6276                 /* The minimum usage per wave is (num_inputs * 48). The maximum
6277                  * usage is (num_inputs * 48 * 16).
6278                  * We can get anything in between and it varies between waves.
6279                  *
6280                  * The 48 bytes per input for a single primitive is equal to
6281                  * 4 bytes/component * 4 components/input * 3 points.
6282                  *
6283                  * Other stages don't know the size at compile time or don't
6284                  * allocate LDS per wave, but instead they do it per thread group.
6285                  */
6286                 lds_per_wave = conf->lds_size * lds_increment +
6287                                align(num_inputs * 48, lds_increment);
6288                 break;
6289         case PIPE_SHADER_COMPUTE:
6290                 if (shader->selector) {
6291                         unsigned max_workgroup_size =
6292                                 si_get_max_workgroup_size(shader);
6293                         lds_per_wave = (conf->lds_size * lds_increment) /
6294                                        DIV_ROUND_UP(max_workgroup_size, 64);
6295                 }
6296                 break;
6297         }
6298
6299         /* Compute the per-SIMD wave counts. */
6300         if (conf->num_sgprs) {
6301                 if (sscreen->b.chip_class >= VI)
6302                         max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs);
6303                 else
6304                         max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs);
6305         }
6306
6307         if (conf->num_vgprs)
6308                 max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs);
6309
6310         /* LDS is 64KB per CU (4 SIMDs), which is 16KB per SIMD (usage above
6311          * 16KB makes some SIMDs unoccupied). */
6312         if (lds_per_wave)
6313                 max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);
6314
6315         if (!check_debug_option ||
6316             r600_can_dump_shader(&sscreen->b, processor)) {
6317                 if (processor == PIPE_SHADER_FRAGMENT) {
6318                         fprintf(file, "*** SHADER CONFIG ***\n"
6319                                 "SPI_PS_INPUT_ADDR = 0x%04x\n"
6320                                 "SPI_PS_INPUT_ENA  = 0x%04x\n",
6321                                 conf->spi_ps_input_addr, conf->spi_ps_input_ena);
6322                 }
6323
6324                 fprintf(file, "*** SHADER STATS ***\n"
6325                         "SGPRS: %d\n"
6326                         "VGPRS: %d\n"
6327                         "Spilled SGPRs: %d\n"
6328                         "Spilled VGPRs: %d\n"
6329                         "Private memory VGPRs: %d\n"
6330                         "Code Size: %d bytes\n"
6331                         "LDS: %d blocks\n"
6332                         "Scratch: %d bytes per wave\n"
6333                         "Max Waves: %d\n"
6334                         "********************\n\n\n",
6335                         conf->num_sgprs, conf->num_vgprs,
6336                         conf->spilled_sgprs, conf->spilled_vgprs,
6337                         conf->private_mem_vgprs, code_size,
6338                         conf->lds_size, conf->scratch_bytes_per_wave,
6339                         max_simd_waves);
6340         }
6341
6342         pipe_debug_message(debug, SHADER_INFO,
6343                            "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
6344                            "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
6345                            "Spilled VGPRs: %d PrivMem VGPRs: %d",
6346                            conf->num_sgprs, conf->num_vgprs, code_size,
6347                            conf->lds_size, conf->scratch_bytes_per_wave,
6348                            max_simd_waves, conf->spilled_sgprs,
6349                            conf->spilled_vgprs, conf->private_mem_vgprs);
6350 }
6351
6352 const char *si_get_shader_name(struct si_shader *shader, unsigned processor)
6353 {
6354         switch (processor) {
6355         case PIPE_SHADER_VERTEX:
6356                 if (shader->key.as_es)
6357                         return "Vertex Shader as ES";
6358                 else if (shader->key.as_ls)
6359                         return "Vertex Shader as LS";
6360                 else
6361                         return "Vertex Shader as VS";
6362         case PIPE_SHADER_TESS_CTRL:
6363                 return "Tessellation Control Shader";
6364         case PIPE_SHADER_TESS_EVAL:
6365                 if (shader->key.as_es)
6366                         return "Tessellation Evaluation Shader as ES";
6367                 else
6368                         return "Tessellation Evaluation Shader as VS";
6369         case PIPE_SHADER_GEOMETRY:
6370                 if (shader->is_gs_copy_shader)
6371                         return "GS Copy Shader as VS";
6372                 else
6373                         return "Geometry Shader";
6374         case PIPE_SHADER_FRAGMENT:
6375                 return "Pixel Shader";
6376         case PIPE_SHADER_COMPUTE:
6377                 return "Compute Shader";
6378         default:
6379                 return "Unknown Shader";
6380         }
6381 }
6382
6383 void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
6384                     struct pipe_debug_callback *debug, unsigned processor,
6385                     FILE *file, bool check_debug_option)
6386 {
6387         if (!check_debug_option ||
6388             r600_can_dump_shader(&sscreen->b, processor))
6389                 si_dump_shader_key(processor, shader, file);
6390
6391         if (!check_debug_option && shader->binary.llvm_ir_string) {
6392                 fprintf(file, "\n%s - main shader part - LLVM IR:\n\n",
6393                         si_get_shader_name(shader, processor));
6394                 fprintf(file, "%s\n", shader->binary.llvm_ir_string);
6395         }
6396
6397         if (!check_debug_option ||
6398             (r600_can_dump_shader(&sscreen->b, processor) &&
6399              !(sscreen->b.debug_flags & DBG_NO_ASM))) {
6400                 fprintf(file, "\n%s:\n", si_get_shader_name(shader, processor));
6401
6402                 if (shader->prolog)
6403                         si_shader_dump_disassembly(&shader->prolog->binary,
6404                                                    debug, "prolog", file);
6405                 if (shader->previous_stage)
6406                         si_shader_dump_disassembly(&shader->previous_stage->binary,
6407                                                    debug, "previous stage", file);
6408
6409                 si_shader_dump_disassembly(&shader->binary, debug, "main", file);
6410
6411                 if (shader->epilog)
6412                         si_shader_dump_disassembly(&shader->epilog->binary,
6413                                                    debug, "epilog", file);
6414                 fprintf(file, "\n");
6415         }
6416
6417         si_shader_dump_stats(sscreen, shader, debug, processor, file,
6418                              check_debug_option);
6419 }
6420
6421 int si_compile_llvm(struct si_screen *sscreen,
6422                     struct ac_shader_binary *binary,
6423                     struct si_shader_config *conf,
6424                     LLVMTargetMachineRef tm,
6425                     LLVMModuleRef mod,
6426                     struct pipe_debug_callback *debug,
6427                     unsigned processor,
6428                     const char *name)
6429 {
6430         int r = 0;
6431         unsigned count = p_atomic_inc_return(&sscreen->b.num_compilations);
6432
6433         if (r600_can_dump_shader(&sscreen->b, processor)) {
6434                 fprintf(stderr, "radeonsi: Compiling shader %d\n", count);
6435
6436                 if (!(sscreen->b.debug_flags & (DBG_NO_IR | DBG_PREOPT_IR))) {
6437                         fprintf(stderr, "%s LLVM IR:\n\n", name);
6438                         ac_dump_module(mod);
6439                         fprintf(stderr, "\n");
6440                 }
6441         }
6442
6443         if (sscreen->record_llvm_ir) {
6444                 char *ir = LLVMPrintModuleToString(mod);
6445                 binary->llvm_ir_string = strdup(ir);
6446                 LLVMDisposeMessage(ir);
6447         }
6448
6449         if (!si_replace_shader(count, binary)) {
6450                 r = si_llvm_compile(mod, binary, tm, debug);
6451                 if (r)
6452                         return r;
6453         }
6454
6455         si_shader_binary_read_config(binary, conf, 0);
6456
6457         /* Enable 64-bit and 16-bit denormals, because there is no performance
6458          * cost.
6459          *
6460          * If denormals are enabled, all floating-point output modifiers are
6461          * ignored.
6462          *
6463          * Don't enable denormals for 32-bit floats, because:
6464          * - Floating-point output modifiers would be ignored by the hw.
6465          * - Some opcodes don't support denormals, such as v_mad_f32. We would
6466          *   have to stop using those.
6467          * - SI & CI would be very slow.
6468          */
6469         conf->float_mode |= V_00B028_FP_64_DENORMS;
6470
6471         FREE(binary->config);
6472         FREE(binary->global_symbol_offsets);
6473         binary->config = NULL;
6474         binary->global_symbol_offsets = NULL;
6475
6476         /* Some shaders can't have rodata because their binaries can be
6477          * concatenated.
6478          */
6479         if (binary->rodata_size &&
6480             (processor == PIPE_SHADER_VERTEX ||
6481              processor == PIPE_SHADER_TESS_CTRL ||
6482              processor == PIPE_SHADER_TESS_EVAL ||
6483              processor == PIPE_SHADER_FRAGMENT)) {
6484                 fprintf(stderr, "radeonsi: The shader can't have rodata.");
6485                 return -EINVAL;
6486         }
6487
6488         return r;
6489 }
6490
6491 static void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret)
6492 {
6493         if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
6494                 LLVMBuildRetVoid(ctx->gallivm.builder);
6495         else
6496                 LLVMBuildRet(ctx->gallivm.builder, ret);
6497 }
6498
6499 /* Generate code for the hardware VS shader stage to go with a geometry shader */
6500 struct si_shader *
6501 si_generate_gs_copy_shader(struct si_screen *sscreen,
6502                            LLVMTargetMachineRef tm,
6503                            struct si_shader_selector *gs_selector,
6504                            struct pipe_debug_callback *debug)
6505 {
6506         struct si_shader_context ctx;
6507         struct si_shader *shader;
6508         struct gallivm_state *gallivm = &ctx.gallivm;
6509         LLVMBuilderRef builder;
6510         struct lp_build_tgsi_context *bld_base = &ctx.bld_base;
6511         struct lp_build_context *uint = &bld_base->uint_bld;
6512         struct si_shader_output_values *outputs;
6513         struct tgsi_shader_info *gsinfo = &gs_selector->info;
6514         int i, r;
6515
6516         outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0]));
6517
6518         if (!outputs)
6519                 return NULL;
6520
6521         shader = CALLOC_STRUCT(si_shader);
6522         if (!shader) {
6523                 FREE(outputs);
6524                 return NULL;
6525         }
6526
6527
6528         shader->selector = gs_selector;
6529         shader->is_gs_copy_shader = true;
6530
6531         si_init_shader_ctx(&ctx, sscreen, tm);
6532         ctx.shader = shader;
6533         ctx.type = PIPE_SHADER_VERTEX;
6534
6535         builder = gallivm->builder;
6536
6537         create_function(&ctx);
6538         preload_ring_buffers(&ctx);
6539
6540         LLVMValueRef voffset =
6541                 lp_build_mul_imm(uint, LLVMGetParam(ctx.main_fn,
6542                                                     ctx.param_vertex_id), 4);
6543
6544         /* Fetch the vertex stream ID.*/
6545         LLVMValueRef stream_id;
6546
6547         if (gs_selector->so.num_outputs)
6548                 stream_id = unpack_param(&ctx, ctx.param_streamout_config, 24, 2);
6549         else
6550                 stream_id = ctx.i32_0;
6551
6552         /* Fill in output information. */
6553         for (i = 0; i < gsinfo->num_outputs; ++i) {
6554                 outputs[i].semantic_name = gsinfo->output_semantic_name[i];
6555                 outputs[i].semantic_index = gsinfo->output_semantic_index[i];
6556
6557                 for (int chan = 0; chan < 4; chan++) {
6558                         outputs[i].vertex_stream[chan] =
6559                                 (gsinfo->output_streams[i] >> (2 * chan)) & 3;
6560                 }
6561         }
6562
6563         LLVMBasicBlockRef end_bb;
6564         LLVMValueRef switch_inst;
6565
6566         end_bb = LLVMAppendBasicBlockInContext(gallivm->context, ctx.main_fn, "end");
6567         switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4);
6568
6569         for (int stream = 0; stream < 4; stream++) {
6570                 LLVMBasicBlockRef bb;
6571                 unsigned offset;
6572
6573                 if (!gsinfo->num_stream_output_components[stream])
6574                         continue;
6575
6576                 if (stream > 0 && !gs_selector->so.num_outputs)
6577                         continue;
6578
6579                 bb = LLVMInsertBasicBlockInContext(gallivm->context, end_bb, "out");
6580                 LLVMAddCase(switch_inst, LLVMConstInt(ctx.i32, stream, 0), bb);
6581                 LLVMPositionBuilderAtEnd(builder, bb);
6582
6583                 /* Fetch vertex data from GSVS ring */
6584                 offset = 0;
6585                 for (i = 0; i < gsinfo->num_outputs; ++i) {
6586                         for (unsigned chan = 0; chan < 4; chan++) {
6587                                 if (!(gsinfo->output_usagemask[i] & (1 << chan)) ||
6588                                     outputs[i].vertex_stream[chan] != stream) {
6589                                         outputs[i].values[chan] = ctx.bld_base.base.undef;
6590                                         continue;
6591                                 }
6592
6593                                 LLVMValueRef soffset = LLVMConstInt(ctx.i32,
6594                                         offset * gs_selector->gs_max_out_vertices * 16 * 4, 0);
6595                                 offset++;
6596
6597                                 outputs[i].values[chan] =
6598                                         ac_build_buffer_load(&ctx.ac,
6599                                                              ctx.gsvs_ring[0], 1,
6600                                                              ctx.i32_0, voffset,
6601                                                              soffset, 0, 1, 1, true);
6602                         }
6603                 }
6604
6605                 /* Streamout and exports. */
6606                 if (gs_selector->so.num_outputs) {
6607                         si_llvm_emit_streamout(&ctx, outputs,
6608                                                gsinfo->num_outputs,
6609                                                stream);
6610                 }
6611
6612                 if (stream == 0)
6613                         si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs);
6614
6615                 LLVMBuildBr(builder, end_bb);
6616         }
6617
6618         LLVMPositionBuilderAtEnd(builder, end_bb);
6619
6620         LLVMBuildRetVoid(gallivm->builder);
6621
6622         /* Dump LLVM IR before any optimization passes */
6623         if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
6624             r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
6625                 ac_dump_module(ctx.gallivm.module);
6626
6627         si_llvm_finalize_module(&ctx,
6628                 r600_extra_shader_checks(&sscreen->b, PIPE_SHADER_GEOMETRY));
6629
6630         r = si_compile_llvm(sscreen, &ctx.shader->binary,
6631                             &ctx.shader->config, ctx.tm,
6632                             ctx.gallivm.module,
6633                             debug, PIPE_SHADER_GEOMETRY,
6634                             "GS Copy Shader");
6635         if (!r) {
6636                 if (r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
6637                         fprintf(stderr, "GS Copy Shader:\n");
6638                 si_shader_dump(sscreen, ctx.shader, debug,
6639                                PIPE_SHADER_GEOMETRY, stderr, true);
6640                 r = si_shader_binary_upload(sscreen, ctx.shader);
6641         }
6642
6643         si_llvm_dispose(&ctx);
6644
6645         FREE(outputs);
6646
6647         if (r != 0) {
6648                 FREE(shader);
6649                 shader = NULL;
6650         }
6651         return shader;
6652 }
6653
6654 static void si_dump_shader_key_vs(struct si_shader_key *key,
6655                                   struct si_vs_prolog_bits *prolog,
6656                                   const char *prefix, FILE *f)
6657 {
6658         fprintf(f, "  %s.instance_divisors = {", prefix);
6659         for (int i = 0; i < ARRAY_SIZE(prolog->instance_divisors); i++) {
6660                 fprintf(f, !i ? "%u" : ", %u",
6661                         prolog->instance_divisors[i]);
6662         }
6663         fprintf(f, "}\n");
6664
6665         fprintf(f, "  mono.vs.fix_fetch = {");
6666         for (int i = 0; i < SI_MAX_ATTRIBS; i++)
6667                 fprintf(f, !i ? "%u" : ", %u", key->mono.vs_fix_fetch[i]);
6668         fprintf(f, "}\n");
6669 }
6670
6671 static void si_dump_shader_key(unsigned processor, struct si_shader *shader,
6672                                FILE *f)
6673 {
6674         struct si_shader_key *key = &shader->key;
6675
6676         fprintf(f, "SHADER KEY\n");
6677
6678         switch (processor) {
6679         case PIPE_SHADER_VERTEX:
6680                 si_dump_shader_key_vs(key, &key->part.vs.prolog,
6681                                       "part.vs.prolog", f);
6682                 fprintf(f, "  as_es = %u\n", key->as_es);
6683                 fprintf(f, "  as_ls = %u\n", key->as_ls);
6684                 fprintf(f, "  part.vs.epilog.export_prim_id = %u\n",
6685                         key->part.vs.epilog.export_prim_id);
6686                 break;
6687
6688         case PIPE_SHADER_TESS_CTRL:
6689                 if (shader->selector->screen->b.chip_class >= GFX9) {
6690                         si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog,
6691                                               "part.tcs.ls_prolog", f);
6692                 }
6693                 fprintf(f, "  part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode);
6694                 fprintf(f, "  mono.ff_tcs_inputs_to_copy = 0x%"PRIx64"\n", key->mono.ff_tcs_inputs_to_copy);
6695                 break;
6696
6697         case PIPE_SHADER_TESS_EVAL:
6698                 fprintf(f, "  part.tes.epilog.export_prim_id = %u\n", key->part.tes.epilog.export_prim_id);
6699                 fprintf(f, "  as_es = %u\n", key->as_es);
6700                 break;
6701
6702         case PIPE_SHADER_GEOMETRY:
6703                 fprintf(f, "  part.gs.prolog.tri_strip_adj_fix = %u\n", key->part.gs.prolog.tri_strip_adj_fix);
6704                 break;
6705
6706         case PIPE_SHADER_COMPUTE:
6707                 break;
6708
6709         case PIPE_SHADER_FRAGMENT:
6710                 fprintf(f, "  part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side);
6711                 fprintf(f, "  part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors);
6712                 fprintf(f, "  part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple);
6713                 fprintf(f, "  part.ps.prolog.force_persp_sample_interp = %u\n", key->part.ps.prolog.force_persp_sample_interp);
6714                 fprintf(f, "  part.ps.prolog.force_linear_sample_interp = %u\n", key->part.ps.prolog.force_linear_sample_interp);
6715                 fprintf(f, "  part.ps.prolog.force_persp_center_interp = %u\n", key->part.ps.prolog.force_persp_center_interp);
6716                 fprintf(f, "  part.ps.prolog.force_linear_center_interp = %u\n", key->part.ps.prolog.force_linear_center_interp);
6717                 fprintf(f, "  part.ps.prolog.bc_optimize_for_persp = %u\n", key->part.ps.prolog.bc_optimize_for_persp);
6718                 fprintf(f, "  part.ps.prolog.bc_optimize_for_linear = %u\n", key->part.ps.prolog.bc_optimize_for_linear);
6719                 fprintf(f, "  part.ps.epilog.spi_shader_col_format = 0x%x\n", key->part.ps.epilog.spi_shader_col_format);
6720                 fprintf(f, "  part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8);
6721                 fprintf(f, "  part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10);
6722                 fprintf(f, "  part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf);
6723                 fprintf(f, "  part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func);
6724                 fprintf(f, "  part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one);
6725                 fprintf(f, "  part.ps.epilog.poly_line_smoothing = %u\n", key->part.ps.epilog.poly_line_smoothing);
6726                 fprintf(f, "  part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color);
6727                 break;
6728
6729         default:
6730                 assert(0);
6731         }
6732
6733         if ((processor == PIPE_SHADER_GEOMETRY ||
6734              processor == PIPE_SHADER_TESS_EVAL ||
6735              processor == PIPE_SHADER_VERTEX) &&
6736             !key->as_es && !key->as_ls) {
6737                 fprintf(f, "  opt.hw_vs.kill_outputs = 0x%"PRIx64"\n", key->opt.hw_vs.kill_outputs);
6738                 fprintf(f, "  opt.hw_vs.kill_outputs2 = 0x%x\n", key->opt.hw_vs.kill_outputs2);
6739                 fprintf(f, "  opt.hw_vs.clip_disable = %u\n", key->opt.hw_vs.clip_disable);
6740         }
6741 }
6742
6743 static void si_init_shader_ctx(struct si_shader_context *ctx,
6744                                struct si_screen *sscreen,
6745                                LLVMTargetMachineRef tm)
6746 {
6747         struct lp_build_tgsi_context *bld_base;
6748         struct lp_build_tgsi_action tmpl = {};
6749
6750         si_llvm_context_init(ctx, sscreen, tm);
6751
6752         bld_base = &ctx->bld_base;
6753         bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;
6754
6755         bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action;
6756         bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE] = interp_action;
6757         bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET] = interp_action;
6758
6759         bld_base->op_actions[TGSI_OPCODE_TEX] = tex_action;
6760         bld_base->op_actions[TGSI_OPCODE_TEX_LZ] = tex_action;
6761         bld_base->op_actions[TGSI_OPCODE_TEX2] = tex_action;
6762         bld_base->op_actions[TGSI_OPCODE_TXB] = tex_action;
6763         bld_base->op_actions[TGSI_OPCODE_TXB2] = tex_action;
6764         bld_base->op_actions[TGSI_OPCODE_TXD] = tex_action;
6765         bld_base->op_actions[TGSI_OPCODE_TXF] = tex_action;
6766         bld_base->op_actions[TGSI_OPCODE_TXF_LZ] = tex_action;
6767         bld_base->op_actions[TGSI_OPCODE_TXL] = tex_action;
6768         bld_base->op_actions[TGSI_OPCODE_TXL2] = tex_action;
6769         bld_base->op_actions[TGSI_OPCODE_TXP] = tex_action;
6770         bld_base->op_actions[TGSI_OPCODE_TXQ].fetch_args = txq_fetch_args;
6771         bld_base->op_actions[TGSI_OPCODE_TXQ].emit = txq_emit;
6772         bld_base->op_actions[TGSI_OPCODE_TG4] = tex_action;
6773         bld_base->op_actions[TGSI_OPCODE_LODQ] = tex_action;
6774         bld_base->op_actions[TGSI_OPCODE_TXQS].emit = si_llvm_emit_txqs;
6775
6776         bld_base->op_actions[TGSI_OPCODE_LOAD].fetch_args = load_fetch_args;
6777         bld_base->op_actions[TGSI_OPCODE_LOAD].emit = load_emit;
6778         bld_base->op_actions[TGSI_OPCODE_STORE].fetch_args = store_fetch_args;
6779         bld_base->op_actions[TGSI_OPCODE_STORE].emit = store_emit;
6780         bld_base->op_actions[TGSI_OPCODE_RESQ].fetch_args = resq_fetch_args;
6781         bld_base->op_actions[TGSI_OPCODE_RESQ].emit = resq_emit;
6782
6783         tmpl.fetch_args = atomic_fetch_args;
6784         tmpl.emit = atomic_emit;
6785         bld_base->op_actions[TGSI_OPCODE_ATOMUADD] = tmpl;
6786         bld_base->op_actions[TGSI_OPCODE_ATOMUADD].intr_name = "add";
6787         bld_base->op_actions[TGSI_OPCODE_ATOMXCHG] = tmpl;
6788         bld_base->op_actions[TGSI_OPCODE_ATOMXCHG].intr_name = "swap";
6789         bld_base->op_actions[TGSI_OPCODE_ATOMCAS] = tmpl;
6790         bld_base->op_actions[TGSI_OPCODE_ATOMCAS].intr_name = "cmpswap";
6791         bld_base->op_actions[TGSI_OPCODE_ATOMAND] = tmpl;
6792         bld_base->op_actions[TGSI_OPCODE_ATOMAND].intr_name = "and";
6793         bld_base->op_actions[TGSI_OPCODE_ATOMOR] = tmpl;
6794         bld_base->op_actions[TGSI_OPCODE_ATOMOR].intr_name = "or";
6795         bld_base->op_actions[TGSI_OPCODE_ATOMXOR] = tmpl;
6796         bld_base->op_actions[TGSI_OPCODE_ATOMXOR].intr_name = "xor";
6797         bld_base->op_actions[TGSI_OPCODE_ATOMUMIN] = tmpl;
6798         bld_base->op_actions[TGSI_OPCODE_ATOMUMIN].intr_name = "umin";
6799         bld_base->op_actions[TGSI_OPCODE_ATOMUMAX] = tmpl;
6800         bld_base->op_actions[TGSI_OPCODE_ATOMUMAX].intr_name = "umax";
6801         bld_base->op_actions[TGSI_OPCODE_ATOMIMIN] = tmpl;
6802         bld_base->op_actions[TGSI_OPCODE_ATOMIMIN].intr_name = "smin";
6803         bld_base->op_actions[TGSI_OPCODE_ATOMIMAX] = tmpl;
6804         bld_base->op_actions[TGSI_OPCODE_ATOMIMAX].intr_name = "smax";
6805
6806         bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit;
6807
6808         bld_base->op_actions[TGSI_OPCODE_CLOCK].emit = clock_emit;
6809
6810         bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy;
6811         bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy;
6812         bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy;
6813         bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy;
6814
6815         bld_base->op_actions[TGSI_OPCODE_VOTE_ALL].emit = vote_all_emit;
6816         bld_base->op_actions[TGSI_OPCODE_VOTE_ANY].emit = vote_any_emit;
6817         bld_base->op_actions[TGSI_OPCODE_VOTE_EQ].emit = vote_eq_emit;
6818         bld_base->op_actions[TGSI_OPCODE_BALLOT].emit = ballot_emit;
6819         bld_base->op_actions[TGSI_OPCODE_READ_FIRST].intr_name = "llvm.amdgcn.readfirstlane";
6820         bld_base->op_actions[TGSI_OPCODE_READ_FIRST].emit = read_lane_emit;
6821         bld_base->op_actions[TGSI_OPCODE_READ_INVOC].intr_name = "llvm.amdgcn.readlane";
6822         bld_base->op_actions[TGSI_OPCODE_READ_INVOC].fetch_args = read_invoc_fetch_args;
6823         bld_base->op_actions[TGSI_OPCODE_READ_INVOC].emit = read_lane_emit;
6824
6825         bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_llvm_emit_vertex;
6826         bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive;
6827         bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier;
6828 }
6829
6830 static void si_eliminate_const_vs_outputs(struct si_shader_context *ctx)
6831 {
6832         struct si_shader *shader = ctx->shader;
6833         struct tgsi_shader_info *info = &shader->selector->info;
6834
6835         if (ctx->type == PIPE_SHADER_FRAGMENT ||
6836             ctx->type == PIPE_SHADER_COMPUTE ||
6837             shader->key.as_es ||
6838             shader->key.as_ls)
6839                 return;
6840
6841         ac_eliminate_const_vs_outputs(&ctx->ac,
6842                                       ctx->main_fn,
6843                                       shader->info.vs_output_param_offset,
6844                                       info->num_outputs,
6845                                       &shader->info.nr_param_exports);
6846 }
6847
6848 static void si_count_scratch_private_memory(struct si_shader_context *ctx)
6849 {
6850         ctx->shader->config.private_mem_vgprs = 0;
6851
6852         /* Process all LLVM instructions. */
6853         LLVMBasicBlockRef bb = LLVMGetFirstBasicBlock(ctx->main_fn);
6854         while (bb) {
6855                 LLVMValueRef next = LLVMGetFirstInstruction(bb);
6856
6857                 while (next) {
6858                         LLVMValueRef inst = next;
6859                         next = LLVMGetNextInstruction(next);
6860
6861                         if (LLVMGetInstructionOpcode(inst) != LLVMAlloca)
6862                                 continue;
6863
6864                         LLVMTypeRef type = LLVMGetElementType(LLVMTypeOf(inst));
6865                         /* No idea why LLVM aligns allocas to 4 elements. */
6866                         unsigned alignment = LLVMGetAlignment(inst);
6867                         unsigned dw_size = align(llvm_get_type_size(type) / 4, alignment);
6868                         ctx->shader->config.private_mem_vgprs += dw_size;
6869                 }
6870                 bb = LLVMGetNextBasicBlock(bb);
6871         }
6872 }
6873
6874 static bool si_compile_tgsi_main(struct si_shader_context *ctx,
6875                                  struct si_shader *shader)
6876 {
6877         struct si_shader_selector *sel = shader->selector;
6878         struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
6879
6880         switch (ctx->type) {
6881         case PIPE_SHADER_VERTEX:
6882                 ctx->load_input = declare_input_vs;
6883                 if (shader->key.as_ls)
6884                         bld_base->emit_epilogue = si_llvm_emit_ls_epilogue;
6885                 else if (shader->key.as_es)
6886                         bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
6887                 else
6888                         bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
6889                 break;
6890         case PIPE_SHADER_TESS_CTRL:
6891                 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs;
6892                 bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = fetch_output_tcs;
6893                 bld_base->emit_store = store_output_tcs;
6894                 bld_base->emit_epilogue = si_llvm_emit_tcs_epilogue;
6895                 break;
6896         case PIPE_SHADER_TESS_EVAL:
6897                 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes;
6898                 if (shader->key.as_es)
6899                         bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
6900                 else
6901                         bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
6902                 break;
6903         case PIPE_SHADER_GEOMETRY:
6904                 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs;
6905                 bld_base->emit_epilogue = si_llvm_emit_gs_epilogue;
6906                 break;
6907         case PIPE_SHADER_FRAGMENT:
6908                 ctx->load_input = declare_input_fs;
6909                 bld_base->emit_epilogue = si_llvm_return_fs_outputs;
6910                 break;
6911         case PIPE_SHADER_COMPUTE:
6912                 ctx->declare_memory_region = declare_compute_memory;
6913                 break;
6914         default:
6915                 assert(!"Unsupported shader type");
6916                 return false;
6917         }
6918
6919         create_function(ctx);
6920         preload_ring_buffers(ctx);
6921
6922         if (ctx->type == PIPE_SHADER_GEOMETRY) {
6923                 int i;
6924                 for (i = 0; i < 4; i++) {
6925                         ctx->gs_next_vertex[i] =
6926                                 lp_build_alloca(&ctx->gallivm,
6927                                                 ctx->i32, "");
6928                 }
6929         }
6930
6931         if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) {
6932                 fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
6933                 return false;
6934         }
6935
6936         si_llvm_build_ret(ctx, ctx->return_value);
6937         return true;
6938 }
6939
6940 /**
6941  * Compute the VS prolog key, which contains all the information needed to
6942  * build the VS prolog function, and set shader->info bits where needed.
6943  *
6944  * \param info             Shader info of the vertex shader.
6945  * \param num_input_sgprs  Number of input SGPRs for the vertex shader.
6946  * \param prolog_key       Key of the VS prolog
6947  * \param shader_out       The vertex shader, or the next shader if merging LS+HS or ES+GS.
6948  * \param key              Output shader part key.
6949  */
6950 static void si_get_vs_prolog_key(const struct tgsi_shader_info *info,
6951                                  unsigned num_input_sgprs,
6952                                  const struct si_vs_prolog_bits *prolog_key,
6953                                  struct si_shader *shader_out,
6954                                  union si_shader_part_key *key)
6955 {
6956         memset(key, 0, sizeof(*key));
6957         key->vs_prolog.states = *prolog_key;
6958         key->vs_prolog.num_input_sgprs = num_input_sgprs;
6959         key->vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
6960
6961         /* Set the instanceID flag. */
6962         for (unsigned i = 0; i < info->num_inputs; i++)
6963                 if (key->vs_prolog.states.instance_divisors[i])
6964                         shader_out->info.uses_instanceid = true;
6965 }
6966
6967 /**
6968  * Compute the VS epilog key, which contains all the information needed to
6969  * build the VS epilog function, and set the PrimitiveID output offset.
6970  */
6971 static void si_get_vs_epilog_key(struct si_shader *shader,
6972                                  struct si_vs_epilog_bits *states,
6973                                  union si_shader_part_key *key)
6974 {
6975         memset(key, 0, sizeof(*key));
6976         key->vs_epilog.states = *states;
6977
6978         /* Set up the PrimitiveID output. */
6979         if (shader->key.part.vs.epilog.export_prim_id) {
6980                 unsigned index = shader->selector->info.num_outputs;
6981                 unsigned offset = shader->info.nr_param_exports++;
6982
6983                 key->vs_epilog.prim_id_param_offset = offset;
6984                 assert(index < ARRAY_SIZE(shader->info.vs_output_param_offset));
6985                 shader->info.vs_output_param_offset[index] = offset;
6986         }
6987 }
6988
6989 /**
6990  * Compute the PS prolog key, which contains all the information needed to
6991  * build the PS prolog function, and set related bits in shader->config.
6992  */
6993 static void si_get_ps_prolog_key(struct si_shader *shader,
6994                                  union si_shader_part_key *key,
6995                                  bool separate_prolog)
6996 {
6997         struct tgsi_shader_info *info = &shader->selector->info;
6998
6999         memset(key, 0, sizeof(*key));
7000         key->ps_prolog.states = shader->key.part.ps.prolog;
7001         key->ps_prolog.colors_read = info->colors_read;
7002         key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
7003         key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
7004         key->ps_prolog.wqm = info->uses_derivatives &&
7005                 (key->ps_prolog.colors_read ||
7006                  key->ps_prolog.states.force_persp_sample_interp ||
7007                  key->ps_prolog.states.force_linear_sample_interp ||
7008                  key->ps_prolog.states.force_persp_center_interp ||
7009                  key->ps_prolog.states.force_linear_center_interp ||
7010                  key->ps_prolog.states.bc_optimize_for_persp ||
7011                  key->ps_prolog.states.bc_optimize_for_linear);
7012
7013         if (info->colors_read) {
7014                 unsigned *color = shader->selector->color_attr_index;
7015
7016                 if (shader->key.part.ps.prolog.color_two_side) {
7017                         /* BCOLORs are stored after the last input. */
7018                         key->ps_prolog.num_interp_inputs = info->num_inputs;
7019                         key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
7020                         shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
7021                 }
7022
7023                 for (unsigned i = 0; i < 2; i++) {
7024                         unsigned interp = info->input_interpolate[color[i]];
7025                         unsigned location = info->input_interpolate_loc[color[i]];
7026
7027                         if (!(info->colors_read & (0xf << i*4)))
7028                                 continue;
7029
7030                         key->ps_prolog.color_attr_index[i] = color[i];
7031
7032                         if (shader->key.part.ps.prolog.flatshade_colors &&
7033                             interp == TGSI_INTERPOLATE_COLOR)
7034                                 interp = TGSI_INTERPOLATE_CONSTANT;
7035
7036                         switch (interp) {
7037                         case TGSI_INTERPOLATE_CONSTANT:
7038                                 key->ps_prolog.color_interp_vgpr_index[i] = -1;
7039                                 break;
7040                         case TGSI_INTERPOLATE_PERSPECTIVE:
7041                         case TGSI_INTERPOLATE_COLOR:
7042                                 /* Force the interpolation location for colors here. */
7043                                 if (shader->key.part.ps.prolog.force_persp_sample_interp)
7044                                         location = TGSI_INTERPOLATE_LOC_SAMPLE;
7045                                 if (shader->key.part.ps.prolog.force_persp_center_interp)
7046                                         location = TGSI_INTERPOLATE_LOC_CENTER;
7047
7048                                 switch (location) {
7049                                 case TGSI_INTERPOLATE_LOC_SAMPLE:
7050                                         key->ps_prolog.color_interp_vgpr_index[i] = 0;
7051                                         shader->config.spi_ps_input_ena |=
7052                                                 S_0286CC_PERSP_SAMPLE_ENA(1);
7053                                         break;
7054                                 case TGSI_INTERPOLATE_LOC_CENTER:
7055                                         key->ps_prolog.color_interp_vgpr_index[i] = 2;
7056                                         shader->config.spi_ps_input_ena |=
7057                                                 S_0286CC_PERSP_CENTER_ENA(1);
7058                                         break;
7059                                 case TGSI_INTERPOLATE_LOC_CENTROID:
7060                                         key->ps_prolog.color_interp_vgpr_index[i] = 4;
7061                                         shader->config.spi_ps_input_ena |=
7062                                                 S_0286CC_PERSP_CENTROID_ENA(1);
7063                                         break;
7064                                 default:
7065                                         assert(0);
7066                                 }
7067                                 break;
7068                         case TGSI_INTERPOLATE_LINEAR:
7069                                 /* Force the interpolation location for colors here. */
7070                                 if (shader->key.part.ps.prolog.force_linear_sample_interp)
7071                                         location = TGSI_INTERPOLATE_LOC_SAMPLE;
7072                                 if (shader->key.part.ps.prolog.force_linear_center_interp)
7073                                         location = TGSI_INTERPOLATE_LOC_CENTER;
7074
7075                                 /* The VGPR assignment for non-monolithic shaders
7076                                  * works because InitialPSInputAddr is set on the
7077                                  * main shader and PERSP_PULL_MODEL is never used.
7078                                  */
7079                                 switch (location) {
7080                                 case TGSI_INTERPOLATE_LOC_SAMPLE:
7081                                         key->ps_prolog.color_interp_vgpr_index[i] =
7082                                                 separate_prolog ? 6 : 9;
7083                                         shader->config.spi_ps_input_ena |=
7084                                                 S_0286CC_LINEAR_SAMPLE_ENA(1);
7085                                         break;
7086                                 case TGSI_INTERPOLATE_LOC_CENTER:
7087                                         key->ps_prolog.color_interp_vgpr_index[i] =
7088                                                 separate_prolog ? 8 : 11;
7089                                         shader->config.spi_ps_input_ena |=
7090                                                 S_0286CC_LINEAR_CENTER_ENA(1);
7091                                         break;
7092                                 case TGSI_INTERPOLATE_LOC_CENTROID:
7093                                         key->ps_prolog.color_interp_vgpr_index[i] =
7094                                                 separate_prolog ? 10 : 13;
7095                                         shader->config.spi_ps_input_ena |=
7096                                                 S_0286CC_LINEAR_CENTROID_ENA(1);
7097                                         break;
7098                                 default:
7099                                         assert(0);
7100                                 }
7101                                 break;
7102                         default:
7103                                 assert(0);
7104                         }
7105                 }
7106         }
7107 }
7108
7109 /**
7110  * Check whether a PS prolog is required based on the key.
7111  */
7112 static bool si_need_ps_prolog(const union si_shader_part_key *key)
7113 {
7114         return key->ps_prolog.colors_read ||
7115                key->ps_prolog.states.force_persp_sample_interp ||
7116                key->ps_prolog.states.force_linear_sample_interp ||
7117                key->ps_prolog.states.force_persp_center_interp ||
7118                key->ps_prolog.states.force_linear_center_interp ||
7119                key->ps_prolog.states.bc_optimize_for_persp ||
7120                key->ps_prolog.states.bc_optimize_for_linear ||
7121                key->ps_prolog.states.poly_stipple;
7122 }
7123
7124 /**
7125  * Compute the PS epilog key, which contains all the information needed to
7126  * build the PS epilog function.
7127  */
7128 static void si_get_ps_epilog_key(struct si_shader *shader,
7129                                  union si_shader_part_key *key)
7130 {
7131         struct tgsi_shader_info *info = &shader->selector->info;
7132         memset(key, 0, sizeof(*key));
7133         key->ps_epilog.colors_written = info->colors_written;
7134         key->ps_epilog.writes_z = info->writes_z;
7135         key->ps_epilog.writes_stencil = info->writes_stencil;
7136         key->ps_epilog.writes_samplemask = info->writes_samplemask;
7137         key->ps_epilog.states = shader->key.part.ps.epilog;
7138 }
7139
7140 /**
7141  * Build the GS prolog function. Rotate the input vertices for triangle strips
7142  * with adjacency.
7143  */
7144 static void si_build_gs_prolog_function(struct si_shader_context *ctx,
7145                                         union si_shader_part_key *key)
7146 {
7147         const unsigned num_sgprs = SI_GS_NUM_USER_SGPR + 2;
7148         const unsigned num_vgprs = 8;
7149         struct gallivm_state *gallivm = &ctx->gallivm;
7150         LLVMBuilderRef builder = gallivm->builder;
7151         LLVMTypeRef params[32];
7152         LLVMTypeRef returns[32];
7153         LLVMValueRef func, ret;
7154
7155         for (unsigned i = 0; i < num_sgprs; ++i) {
7156                 params[i] = ctx->i32;
7157                 returns[i] = ctx->i32;
7158         }
7159
7160         for (unsigned i = 0; i < num_vgprs; ++i) {
7161                 params[num_sgprs + i] = ctx->i32;
7162                 returns[num_sgprs + i] = ctx->f32;
7163         }
7164
7165         /* Create the function. */
7166         si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs,
7167                            params, num_sgprs + num_vgprs, num_sgprs - 1);
7168         func = ctx->main_fn;
7169
7170         /* Copy inputs to outputs. This should be no-op, as the registers match,
7171          * but it will prevent the compiler from overwriting them unintentionally.
7172          */
7173         ret = ctx->return_value;
7174         for (unsigned i = 0; i < num_sgprs; i++) {
7175                 LLVMValueRef p = LLVMGetParam(func, i);
7176                 ret = LLVMBuildInsertValue(builder, ret, p, i, "");
7177         }
7178         for (unsigned i = 0; i < num_vgprs; i++) {
7179                 LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
7180                 p = LLVMBuildBitCast(builder, p, ctx->f32, "");
7181                 ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, "");
7182         }
7183
7184         if (key->gs_prolog.states.tri_strip_adj_fix) {
7185                 /* Remap the input vertices for every other primitive. */
7186                 const unsigned vtx_params[6] = {
7187                         num_sgprs,
7188                         num_sgprs + 1,
7189                         num_sgprs + 3,
7190                         num_sgprs + 4,
7191                         num_sgprs + 5,
7192                         num_sgprs + 6
7193                 };
7194                 LLVMValueRef prim_id, rotate;
7195
7196                 prim_id = LLVMGetParam(func, num_sgprs + 2);
7197                 rotate = LLVMBuildTrunc(builder, prim_id, ctx->i1, "");
7198
7199                 for (unsigned i = 0; i < 6; ++i) {
7200                         LLVMValueRef base, rotated, actual;
7201                         base = LLVMGetParam(func, vtx_params[i]);
7202                         rotated = LLVMGetParam(func, vtx_params[(i + 4) % 6]);
7203                         actual = LLVMBuildSelect(builder, rotate, rotated, base, "");
7204                         actual = LLVMBuildBitCast(builder, actual, ctx->f32, "");
7205                         ret = LLVMBuildInsertValue(builder, ret, actual, vtx_params[i], "");
7206                 }
7207         }
7208
7209         LLVMBuildRet(builder, ret);
7210 }
7211
7212 /**
7213  * Given a list of shader part functions, build a wrapper function that
7214  * runs them in sequence to form a monolithic shader.
7215  */
7216 static void si_build_wrapper_function(struct si_shader_context *ctx,
7217                                       LLVMValueRef *parts,
7218                                       unsigned num_parts,
7219                                       unsigned main_part)
7220 {
7221         struct gallivm_state *gallivm = &ctx->gallivm;
7222         LLVMBuilderRef builder = ctx->gallivm.builder;
7223         /* PS epilog has one arg per color component */
7224         LLVMTypeRef param_types[48];
7225         LLVMValueRef out[48];
7226         LLVMTypeRef function_type;
7227         unsigned num_params;
7228         unsigned num_out;
7229         MAYBE_UNUSED unsigned num_out_sgpr; /* used in debug checks */
7230         unsigned num_sgprs, num_vgprs;
7231         unsigned last_sgpr_param;
7232         unsigned gprs;
7233
7234         for (unsigned i = 0; i < num_parts; ++i) {
7235                 lp_add_function_attr(parts[i], -1, LP_FUNC_ATTR_ALWAYSINLINE);
7236                 LLVMSetLinkage(parts[i], LLVMPrivateLinkage);
7237         }
7238
7239         /* The parameters of the wrapper function correspond to those of the
7240          * first part in terms of SGPRs and VGPRs, but we use the types of the
7241          * main part to get the right types. This is relevant for the
7242          * dereferenceable attribute on descriptor table pointers.
7243          */
7244         num_sgprs = 0;
7245         num_vgprs = 0;
7246
7247         function_type = LLVMGetElementType(LLVMTypeOf(parts[0]));
7248         num_params = LLVMCountParamTypes(function_type);
7249
7250         for (unsigned i = 0; i < num_params; ++i) {
7251                 LLVMValueRef param = LLVMGetParam(parts[0], i);
7252
7253                 if (ac_is_sgpr_param(param)) {
7254                         assert(num_vgprs == 0);
7255                         num_sgprs += llvm_get_type_size(LLVMTypeOf(param)) / 4;
7256                 } else {
7257                         num_vgprs += llvm_get_type_size(LLVMTypeOf(param)) / 4;
7258                 }
7259         }
7260         assert(num_vgprs + num_sgprs <= ARRAY_SIZE(param_types));
7261
7262         num_params = 0;
7263         last_sgpr_param = 0;
7264         gprs = 0;
7265         while (gprs < num_sgprs + num_vgprs) {
7266                 LLVMValueRef param = LLVMGetParam(parts[main_part], num_params);
7267                 unsigned size;
7268
7269                 param_types[num_params] = LLVMTypeOf(param);
7270                 if (gprs < num_sgprs)
7271                         last_sgpr_param = num_params;
7272                 size = llvm_get_type_size(param_types[num_params]) / 4;
7273                 num_params++;
7274
7275                 assert(ac_is_sgpr_param(param) == (gprs < num_sgprs));
7276                 assert(gprs + size <= num_sgprs + num_vgprs &&
7277                        (gprs >= num_sgprs || gprs + size <= num_sgprs));
7278
7279                 gprs += size;
7280         }
7281
7282         si_create_function(ctx, "wrapper", NULL, 0, param_types, num_params, last_sgpr_param);
7283
7284         /* Record the arguments of the function as if they were an output of
7285          * a previous part.
7286          */
7287         num_out = 0;
7288         num_out_sgpr = 0;
7289
7290         for (unsigned i = 0; i < num_params; ++i) {
7291                 LLVMValueRef param = LLVMGetParam(ctx->main_fn, i);
7292                 LLVMTypeRef param_type = LLVMTypeOf(param);
7293                 LLVMTypeRef out_type = i <= last_sgpr_param ? ctx->i32 : ctx->f32;
7294                 unsigned size = llvm_get_type_size(param_type) / 4;
7295
7296                 if (size == 1) {
7297                         if (param_type != out_type)
7298                                 param = LLVMBuildBitCast(builder, param, out_type, "");
7299                         out[num_out++] = param;
7300                 } else {
7301                         LLVMTypeRef vector_type = LLVMVectorType(out_type, size);
7302
7303                         if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
7304                                 param = LLVMBuildPtrToInt(builder, param, ctx->i64, "");
7305                                 param_type = ctx->i64;
7306                         }
7307
7308                         if (param_type != vector_type)
7309                                 param = LLVMBuildBitCast(builder, param, vector_type, "");
7310
7311                         for (unsigned j = 0; j < size; ++j)
7312                                 out[num_out++] = LLVMBuildExtractElement(
7313                                         builder, param, LLVMConstInt(ctx->i32, j, 0), "");
7314                 }
7315
7316                 if (i <= last_sgpr_param)
7317                         num_out_sgpr = num_out;
7318         }
7319
7320         /* Now chain the parts. */
7321         for (unsigned part = 0; part < num_parts; ++part) {
7322                 LLVMValueRef in[48];
7323                 LLVMValueRef ret;
7324                 LLVMTypeRef ret_type;
7325                 unsigned out_idx = 0;
7326
7327                 num_params = LLVMCountParams(parts[part]);
7328                 assert(num_params <= ARRAY_SIZE(param_types));
7329
7330                 /* Derive arguments for the next part from outputs of the
7331                  * previous one.
7332                  */
7333                 for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) {
7334                         LLVMValueRef param;
7335                         LLVMTypeRef param_type;
7336                         bool is_sgpr;
7337                         unsigned param_size;
7338                         LLVMValueRef arg = NULL;
7339
7340                         param = LLVMGetParam(parts[part], param_idx);
7341                         param_type = LLVMTypeOf(param);
7342                         param_size = llvm_get_type_size(param_type) / 4;
7343                         is_sgpr = ac_is_sgpr_param(param);
7344
7345                         if (is_sgpr) {
7346 #if HAVE_LLVM < 0x0400
7347                                 LLVMRemoveAttribute(param, LLVMByValAttribute);
7348 #else
7349                                 unsigned kind_id = LLVMGetEnumAttributeKindForName("byval", 5);
7350                                 LLVMRemoveEnumAttributeAtIndex(parts[part], param_idx + 1, kind_id);
7351 #endif
7352                                 lp_add_function_attr(parts[part], param_idx + 1, LP_FUNC_ATTR_INREG);
7353                         }
7354
7355                         assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out));
7356                         assert(is_sgpr || out_idx >= num_out_sgpr);
7357
7358                         if (param_size == 1)
7359                                 arg = out[out_idx];
7360                         else
7361                                 arg = lp_build_gather_values(gallivm, &out[out_idx], param_size);
7362
7363                         if (LLVMTypeOf(arg) != param_type) {
7364                                 if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
7365                                         arg = LLVMBuildBitCast(builder, arg, ctx->i64, "");
7366                                         arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
7367                                 } else {
7368                                         arg = LLVMBuildBitCast(builder, arg, param_type, "");
7369                                 }
7370                         }
7371
7372                         in[param_idx] = arg;
7373                         out_idx += param_size;
7374                 }
7375
7376                 ret = LLVMBuildCall(builder, parts[part], in, num_params, "");
7377                 ret_type = LLVMTypeOf(ret);
7378
7379                 /* Extract the returned GPRs. */
7380                 num_out = 0;
7381                 num_out_sgpr = 0;
7382
7383                 if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) {
7384                         assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind);
7385
7386                         unsigned ret_size = LLVMCountStructElementTypes(ret_type);
7387
7388                         for (unsigned i = 0; i < ret_size; ++i) {
7389                                 LLVMValueRef val =
7390                                         LLVMBuildExtractValue(builder, ret, i, "");
7391
7392                                 out[num_out++] = val;
7393
7394                                 if (LLVMTypeOf(val) == ctx->i32) {
7395                                         assert(num_out_sgpr + 1 == num_out);
7396                                         num_out_sgpr = num_out;
7397                                 }
7398                         }
7399                 }
7400         }
7401
7402         LLVMBuildRetVoid(builder);
7403 }
7404
7405 int si_compile_tgsi_shader(struct si_screen *sscreen,
7406                            LLVMTargetMachineRef tm,
7407                            struct si_shader *shader,
7408                            bool is_monolithic,
7409                            struct pipe_debug_callback *debug)
7410 {
7411         struct si_shader_selector *sel = shader->selector;
7412         struct si_shader_context ctx;
7413         int r = -1;
7414
7415         /* Dump TGSI code before doing TGSI->LLVM conversion in case the
7416          * conversion fails. */
7417         if (r600_can_dump_shader(&sscreen->b, sel->info.processor) &&
7418             !(sscreen->b.debug_flags & DBG_NO_TGSI)) {
7419                 tgsi_dump(sel->tokens, 0);
7420                 si_dump_streamout(&sel->so);
7421         }
7422
7423         si_init_shader_ctx(&ctx, sscreen, tm);
7424         si_llvm_context_set_tgsi(&ctx, shader);
7425         ctx.separate_prolog = !is_monolithic;
7426
7427         memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
7428                sizeof(shader->info.vs_output_param_offset));
7429
7430         shader->info.uses_instanceid = sel->info.uses_instanceid;
7431
7432         ctx.load_system_value = declare_system_value;
7433
7434         if (!si_compile_tgsi_main(&ctx, shader)) {
7435                 si_llvm_dispose(&ctx);
7436                 return -1;
7437         }
7438
7439         if (is_monolithic && ctx.type == PIPE_SHADER_VERTEX) {
7440                 LLVMValueRef parts[3];
7441                 bool need_prolog;
7442                 bool need_epilog;
7443
7444                 need_prolog = sel->vs_needs_prolog;
7445                 need_epilog = !shader->key.as_es && !shader->key.as_ls;
7446
7447                 parts[need_prolog ? 1 : 0] = ctx.main_fn;
7448
7449                 if (need_prolog) {
7450                         union si_shader_part_key prolog_key;
7451                         si_get_vs_prolog_key(&sel->info,
7452                                              shader->info.num_input_sgprs,
7453                                              &shader->key.part.vs.prolog,
7454                                              shader, &prolog_key);
7455                         si_build_vs_prolog_function(&ctx, &prolog_key);
7456                         parts[0] = ctx.main_fn;
7457                 }
7458
7459                 if (need_epilog) {
7460                         union si_shader_part_key epilog_key;
7461                         si_get_vs_epilog_key(shader, &shader->key.part.vs.epilog, &epilog_key);
7462                         si_build_vs_epilog_function(&ctx, &epilog_key);
7463                         parts[need_prolog ? 2 : 1] = ctx.main_fn;
7464                 }
7465
7466                 si_build_wrapper_function(&ctx, parts, 1 + need_prolog + need_epilog,
7467                                           need_prolog ? 1 : 0);
7468         } else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
7469                 LLVMValueRef parts[2];
7470                 union si_shader_part_key epilog_key;
7471
7472                 parts[0] = ctx.main_fn;
7473
7474                 memset(&epilog_key, 0, sizeof(epilog_key));
7475                 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
7476                 si_build_tcs_epilog_function(&ctx, &epilog_key);
7477                 parts[1] = ctx.main_fn;
7478
7479                 si_build_wrapper_function(&ctx, parts, 2, 0);
7480         } else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_EVAL &&
7481                    !shader->key.as_es) {
7482                 LLVMValueRef parts[2];
7483                 union si_shader_part_key epilog_key;
7484
7485                 parts[0] = ctx.main_fn;
7486
7487                 si_get_vs_epilog_key(shader, &shader->key.part.tes.epilog, &epilog_key);
7488                 si_build_vs_epilog_function(&ctx, &epilog_key);
7489                 parts[1] = ctx.main_fn;
7490
7491                 si_build_wrapper_function(&ctx, parts, 2, 0);
7492         } else if (is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) {
7493                 LLVMValueRef parts[2];
7494                 union si_shader_part_key prolog_key;
7495
7496                 parts[1] = ctx.main_fn;
7497
7498                 memset(&prolog_key, 0, sizeof(prolog_key));
7499                 prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
7500                 si_build_gs_prolog_function(&ctx, &prolog_key);
7501                 parts[0] = ctx.main_fn;
7502
7503                 si_build_wrapper_function(&ctx, parts, 2, 1);
7504         } else if (is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) {
7505                 LLVMValueRef parts[3];
7506                 union si_shader_part_key prolog_key;
7507                 union si_shader_part_key epilog_key;
7508                 bool need_prolog;
7509
7510                 si_get_ps_prolog_key(shader, &prolog_key, false);
7511                 need_prolog = si_need_ps_prolog(&prolog_key);
7512
7513                 parts[need_prolog ? 1 : 0] = ctx.main_fn;
7514
7515                 if (need_prolog) {
7516                         si_build_ps_prolog_function(&ctx, &prolog_key);
7517                         parts[0] = ctx.main_fn;
7518                 }
7519
7520                 si_get_ps_epilog_key(shader, &epilog_key);
7521                 si_build_ps_epilog_function(&ctx, &epilog_key);
7522                 parts[need_prolog ? 2 : 1] = ctx.main_fn;
7523
7524                 si_build_wrapper_function(&ctx, parts, need_prolog ? 3 : 2, need_prolog ? 1 : 0);
7525         }
7526
7527         /* Dump LLVM IR before any optimization passes */
7528         if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
7529             r600_can_dump_shader(&sscreen->b, ctx.type))
7530                 LLVMDumpModule(ctx.gallivm.module);
7531
7532         si_llvm_finalize_module(&ctx,
7533                                     r600_extra_shader_checks(&sscreen->b, ctx.type));
7534
7535         /* Post-optimization transformations and analysis. */
7536         si_eliminate_const_vs_outputs(&ctx);
7537
7538         if ((debug && debug->debug_message) ||
7539             r600_can_dump_shader(&sscreen->b, ctx.type))
7540                 si_count_scratch_private_memory(&ctx);
7541
7542         /* Compile to bytecode. */
7543         r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm,
7544                             ctx.gallivm.module, debug, ctx.type, "TGSI shader");
7545         si_llvm_dispose(&ctx);
7546         if (r) {
7547                 fprintf(stderr, "LLVM failed to compile shader\n");
7548                 return r;
7549         }
7550
7551         /* Validate SGPR and VGPR usage for compute to detect compiler bugs.
7552          * LLVM 3.9svn has this bug.
7553          */
7554         if (sel->type == PIPE_SHADER_COMPUTE) {
7555                 unsigned wave_size = 64;
7556                 unsigned max_vgprs = 256;
7557                 unsigned max_sgprs = sscreen->b.chip_class >= VI ? 800 : 512;
7558                 unsigned max_sgprs_per_wave = 128;
7559                 unsigned max_block_threads = si_get_max_workgroup_size(shader);
7560                 unsigned min_waves_per_cu = DIV_ROUND_UP(max_block_threads, wave_size);
7561                 unsigned min_waves_per_simd = DIV_ROUND_UP(min_waves_per_cu, 4);
7562
7563                 max_vgprs = max_vgprs / min_waves_per_simd;
7564                 max_sgprs = MIN2(max_sgprs / min_waves_per_simd, max_sgprs_per_wave);
7565
7566                 if (shader->config.num_sgprs > max_sgprs ||
7567                     shader->config.num_vgprs > max_vgprs) {
7568                         fprintf(stderr, "LLVM failed to compile a shader correctly: "
7569                                 "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
7570                                 shader->config.num_sgprs, shader->config.num_vgprs,
7571                                 max_sgprs, max_vgprs);
7572
7573                         /* Just terminate the process, because dependent
7574                          * shaders can hang due to bad input data, but use
7575                          * the env var to allow shader-db to work.
7576                          */
7577                         if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
7578                                 abort();
7579                 }
7580         }
7581
7582         /* Add the scratch offset to input SGPRs. */
7583         if (shader->config.scratch_bytes_per_wave)
7584                 shader->info.num_input_sgprs += 1; /* scratch byte offset */
7585
7586         /* Calculate the number of fragment input VGPRs. */
7587         if (ctx.type == PIPE_SHADER_FRAGMENT) {
7588                 shader->info.num_input_vgprs = 0;
7589                 shader->info.face_vgpr_index = -1;
7590
7591                 if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr))
7592                         shader->info.num_input_vgprs += 2;
7593                 if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr))
7594                         shader->info.num_input_vgprs += 2;
7595                 if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr))
7596                         shader->info.num_input_vgprs += 2;
7597                 if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr))
7598                         shader->info.num_input_vgprs += 3;
7599                 if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr))
7600                         shader->info.num_input_vgprs += 2;
7601                 if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr))
7602                         shader->info.num_input_vgprs += 2;
7603                 if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr))
7604                         shader->info.num_input_vgprs += 2;
7605                 if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr))
7606                         shader->info.num_input_vgprs += 1;
7607                 if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr))
7608                         shader->info.num_input_vgprs += 1;
7609                 if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr))
7610                         shader->info.num_input_vgprs += 1;
7611                 if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr))
7612                         shader->info.num_input_vgprs += 1;
7613                 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr))
7614                         shader->info.num_input_vgprs += 1;
7615                 if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) {
7616                         shader->info.face_vgpr_index = shader->info.num_input_vgprs;
7617                         shader->info.num_input_vgprs += 1;
7618                 }
7619                 if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr))
7620                         shader->info.num_input_vgprs += 1;
7621                 if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr))
7622                         shader->info.num_input_vgprs += 1;
7623                 if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr))
7624                         shader->info.num_input_vgprs += 1;
7625         }
7626
7627         return 0;
7628 }
7629
7630 /**
7631  * Create, compile and return a shader part (prolog or epilog).
7632  *
7633  * \param sscreen       screen
7634  * \param list          list of shader parts of the same category
7635  * \param type          shader type
7636  * \param key           shader part key
7637  * \param prolog        whether the part being requested is a prolog
7638  * \param tm            LLVM target machine
7639  * \param debug         debug callback
7640  * \param build         the callback responsible for building the main function
7641  * \return              non-NULL on success
7642  */
7643 static struct si_shader_part *
7644 si_get_shader_part(struct si_screen *sscreen,
7645                    struct si_shader_part **list,
7646                    enum pipe_shader_type type,
7647                    bool prolog,
7648                    union si_shader_part_key *key,
7649                    LLVMTargetMachineRef tm,
7650                    struct pipe_debug_callback *debug,
7651                    void (*build)(struct si_shader_context *,
7652                                  union si_shader_part_key *),
7653                    const char *name)
7654 {
7655         struct si_shader_part *result;
7656
7657         mtx_lock(&sscreen->shader_parts_mutex);
7658
7659         /* Find existing. */
7660         for (result = *list; result; result = result->next) {
7661                 if (memcmp(&result->key, key, sizeof(*key)) == 0) {
7662                         mtx_unlock(&sscreen->shader_parts_mutex);
7663                         return result;
7664                 }
7665         }
7666
7667         /* Compile a new one. */
7668         result = CALLOC_STRUCT(si_shader_part);
7669         result->key = *key;
7670
7671         struct si_shader shader = {};
7672         struct si_shader_context ctx;
7673         struct gallivm_state *gallivm = &ctx.gallivm;
7674
7675         si_init_shader_ctx(&ctx, sscreen, tm);
7676         ctx.shader = &shader;
7677         ctx.type = type;
7678
7679         switch (type) {
7680         case PIPE_SHADER_VERTEX:
7681                 break;
7682         case PIPE_SHADER_TESS_CTRL:
7683                 assert(!prolog);
7684                 shader.key.part.tcs.epilog = key->tcs_epilog.states;
7685                 break;
7686         case PIPE_SHADER_GEOMETRY:
7687                 assert(prolog);
7688                 break;
7689         case PIPE_SHADER_FRAGMENT:
7690                 if (prolog)
7691                         shader.key.part.ps.prolog = key->ps_prolog.states;
7692                 else
7693                         shader.key.part.ps.epilog = key->ps_epilog.states;
7694                 break;
7695         default:
7696                 unreachable("bad shader part");
7697         }
7698
7699         build(&ctx, key);
7700
7701         /* Compile. */
7702         si_llvm_finalize_module(&ctx,
7703                 r600_extra_shader_checks(&sscreen->b, PIPE_SHADER_FRAGMENT));
7704
7705         if (si_compile_llvm(sscreen, &result->binary, &result->config, tm,
7706                             gallivm->module, debug, ctx.type, name)) {
7707                 FREE(result);
7708                 result = NULL;
7709                 goto out;
7710         }
7711
7712         result->next = *list;
7713         *list = result;
7714
7715 out:
7716         si_llvm_dispose(&ctx);
7717         mtx_unlock(&sscreen->shader_parts_mutex);
7718         return result;
7719 }
7720
7721 /**
7722  * Build the vertex shader prolog function.
7723  *
7724  * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
7725  * All inputs are returned unmodified. The vertex load indices are
7726  * stored after them, which will be used by the API VS for fetching inputs.
7727  *
7728  * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
7729  *   input_v0,
7730  *   input_v1,
7731  *   input_v2,
7732  *   input_v3,
7733  *   (VertexID + BaseVertex),
7734  *   (InstanceID + StartInstance),
7735  *   (InstanceID / 2 + StartInstance)
7736  */
7737 static void si_build_vs_prolog_function(struct si_shader_context *ctx,
7738                                         union si_shader_part_key *key)
7739 {
7740         struct gallivm_state *gallivm = &ctx->gallivm;
7741         LLVMTypeRef *params, *returns;
7742         LLVMValueRef ret, func;
7743         int last_sgpr, num_params, num_returns, i;
7744
7745         ctx->param_vertex_id = key->vs_prolog.num_input_sgprs;
7746         ctx->param_instance_id = key->vs_prolog.num_input_sgprs + 3;
7747
7748         /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
7749         params = alloca((key->vs_prolog.num_input_sgprs + 4) *
7750                         sizeof(LLVMTypeRef));
7751         returns = alloca((key->vs_prolog.num_input_sgprs + 4 +
7752                           key->vs_prolog.last_input + 1) *
7753                          sizeof(LLVMTypeRef));
7754         num_params = 0;
7755         num_returns = 0;
7756
7757         /* Declare input and output SGPRs. */
7758         num_params = 0;
7759         for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
7760                 params[num_params++] = ctx->i32;
7761                 returns[num_returns++] = ctx->i32;
7762         }
7763         last_sgpr = num_params - 1;
7764
7765         /* 4 preloaded VGPRs (outputs must be floats) */
7766         for (i = 0; i < 4; i++) {
7767                 params[num_params++] = ctx->i32;
7768                 returns[num_returns++] = ctx->f32;
7769         }
7770
7771         /* Vertex load indices. */
7772         for (i = 0; i <= key->vs_prolog.last_input; i++)
7773                 returns[num_returns++] = ctx->f32;
7774
7775         /* Create the function. */
7776         si_create_function(ctx, "vs_prolog", returns, num_returns, params,
7777                            num_params, last_sgpr);
7778         func = ctx->main_fn;
7779
7780         /* Copy inputs to outputs. This should be no-op, as the registers match,
7781          * but it will prevent the compiler from overwriting them unintentionally.
7782          */
7783         ret = ctx->return_value;
7784         for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
7785                 LLVMValueRef p = LLVMGetParam(func, i);
7786                 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
7787         }
7788         for (i = num_params - 4; i < num_params; i++) {
7789                 LLVMValueRef p = LLVMGetParam(func, i);
7790                 p = LLVMBuildBitCast(gallivm->builder, p, ctx->f32, "");
7791                 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
7792         }
7793
7794         /* Compute vertex load indices from instance divisors. */
7795         for (i = 0; i <= key->vs_prolog.last_input; i++) {
7796                 unsigned divisor = key->vs_prolog.states.instance_divisors[i];
7797                 LLVMValueRef index;
7798
7799                 if (divisor) {
7800                         /* InstanceID / Divisor + StartInstance */
7801                         index = get_instance_index_for_fetch(ctx,
7802                                                              SI_SGPR_START_INSTANCE,
7803                                                              divisor);
7804                 } else {
7805                         /* VertexID + BaseVertex */
7806                         index = LLVMBuildAdd(gallivm->builder,
7807                                              LLVMGetParam(func, ctx->param_vertex_id),
7808                                              LLVMGetParam(func, SI_SGPR_BASE_VERTEX), "");
7809                 }
7810
7811                 index = LLVMBuildBitCast(gallivm->builder, index, ctx->f32, "");
7812                 ret = LLVMBuildInsertValue(gallivm->builder, ret, index,
7813                                            num_params++, "");
7814         }
7815
7816         si_llvm_build_ret(ctx, ret);
7817 }
7818
7819 /**
7820  * Build the vertex shader epilog function. This is also used by the tessellation
7821  * evaluation shader compiled as VS.
7822  *
7823  * The input is PrimitiveID.
7824  *
7825  * If PrimitiveID is required by the pixel shader, export it.
7826  * Otherwise, do nothing.
7827  */
7828 static void si_build_vs_epilog_function(struct si_shader_context *ctx,
7829                                         union si_shader_part_key *key)
7830 {
7831         struct gallivm_state *gallivm = &ctx->gallivm;
7832         struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
7833         LLVMTypeRef params[5];
7834         int num_params, i;
7835
7836         /* Declare input VGPRs. */
7837         num_params = key->vs_epilog.states.export_prim_id ?
7838                            (VS_EPILOG_PRIMID_LOC + 1) : 0;
7839         assert(num_params <= ARRAY_SIZE(params));
7840
7841         for (i = 0; i < num_params; i++)
7842                 params[i] = ctx->f32;
7843
7844         /* Create the function. */
7845         si_create_function(ctx, "vs_epilog", NULL, 0, params, num_params, -1);
7846
7847         /* Emit exports. */
7848         if (key->vs_epilog.states.export_prim_id) {
7849                 struct lp_build_context *base = &bld_base->base;
7850                 struct ac_export_args args;
7851
7852                 args.enabled_channels = 0x1; /* enabled channels */
7853                 args.valid_mask = 0; /* whether the EXEC mask is valid */
7854                 args.done = 0; /* DONE bit */
7855                 args.target = V_008DFC_SQ_EXP_PARAM +
7856                               key->vs_epilog.prim_id_param_offset;
7857                 args.compr = 0; /* COMPR flag (0 = 32-bit export) */
7858                 args.out[0] = LLVMGetParam(ctx->main_fn,
7859                                        VS_EPILOG_PRIMID_LOC); /* X */
7860                 args.out[1] = base->undef; /* Y */
7861                 args.out[2] = base->undef; /* Z */
7862                 args.out[3] = base->undef; /* W */
7863
7864                 ac_build_export(&ctx->ac, &args);
7865         }
7866
7867         LLVMBuildRetVoid(gallivm->builder);
7868 }
7869
7870 static bool si_get_vs_prolog(struct si_screen *sscreen,
7871                              LLVMTargetMachineRef tm,
7872                              struct si_shader *shader,
7873                              struct pipe_debug_callback *debug,
7874                              struct si_shader *main_part,
7875                              const struct si_vs_prolog_bits *key)
7876 {
7877         struct si_shader_selector *vs = main_part->selector;
7878
7879         /* The prolog is a no-op if there are no inputs. */
7880         if (!vs->vs_needs_prolog)
7881                 return true;
7882
7883         /* Get the prolog. */
7884         union si_shader_part_key prolog_key;
7885         si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs,
7886                              key, shader, &prolog_key);
7887
7888         shader->prolog =
7889                 si_get_shader_part(sscreen, &sscreen->vs_prologs,
7890                                    PIPE_SHADER_VERTEX, true, &prolog_key, tm,
7891                                    debug, si_build_vs_prolog_function,
7892                                    "Vertex Shader Prolog");
7893         return shader->prolog != NULL;
7894 }
7895
7896 /**
7897  * Create & compile a vertex shader epilog. This a helper used by VS and TES.
7898  */
7899 static bool si_get_vs_epilog(struct si_screen *sscreen,
7900                              LLVMTargetMachineRef tm,
7901                              struct si_shader *shader,
7902                              struct pipe_debug_callback *debug,
7903                              struct si_vs_epilog_bits *states)
7904 {
7905         union si_shader_part_key epilog_key;
7906
7907         si_get_vs_epilog_key(shader, states, &epilog_key);
7908
7909         shader->epilog = si_get_shader_part(sscreen, &sscreen->vs_epilogs,
7910                                             PIPE_SHADER_VERTEX, true,
7911                                             &epilog_key, tm, debug,
7912                                             si_build_vs_epilog_function,
7913                                             "Vertex Shader Epilog");
7914         return shader->epilog != NULL;
7915 }
7916
7917 /**
7918  * Select and compile (or reuse) vertex shader parts (prolog & epilog).
7919  */
7920 static bool si_shader_select_vs_parts(struct si_screen *sscreen,
7921                                       LLVMTargetMachineRef tm,
7922                                       struct si_shader *shader,
7923                                       struct pipe_debug_callback *debug)
7924 {
7925         if (!si_get_vs_prolog(sscreen, tm, shader, debug, shader,
7926                               &shader->key.part.vs.prolog))
7927                 return false;
7928
7929         /* Get the epilog. */
7930         if (!shader->key.as_es && !shader->key.as_ls &&
7931             !si_get_vs_epilog(sscreen, tm, shader, debug,
7932                               &shader->key.part.vs.epilog))
7933                 return false;
7934
7935         return true;
7936 }
7937
7938 /**
7939  * Select and compile (or reuse) TES parts (epilog).
7940  */
7941 static bool si_shader_select_tes_parts(struct si_screen *sscreen,
7942                                        LLVMTargetMachineRef tm,
7943                                        struct si_shader *shader,
7944                                        struct pipe_debug_callback *debug)
7945 {
7946         if (shader->key.as_es)
7947                 return true;
7948
7949         /* TES compiled as VS. */
7950         return si_get_vs_epilog(sscreen, tm, shader, debug,
7951                                 &shader->key.part.tes.epilog);
7952 }
7953
7954 /**
7955  * Compile the TCS epilog function. This writes tessellation factors to memory
7956  * based on the output primitive type of the tessellator (determined by TES).
      *
      * The function is created under the name "tcs_epilog" with no return
      * types. Eleven parameters are declared up to and including
      * TESS_FACTOR_OFFSET; the index of the last of them is handed to
      * si_create_function() as last_sgpr. Three further i32 parameters follow
      * and are forwarded, in declaration order, to si_write_tess_factors().
      *
      * The key argument is not read by this function.
7957  */
7958 static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
7959                                          union si_shader_part_key *key)
7960 {
7961         struct gallivm_state *gallivm = &ctx->gallivm;
7962         struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
7963         LLVMTypeRef params[16];
7964         LLVMValueRef func;
7965         int last_sgpr, num_params = 0;
7966
7967         /* Declare inputs. Only RW_BUFFERS and TESS_FACTOR_OFFSET are used. */
             /* Each assignment also stores the parameter's index in the matching
              * ctx->param_* field before num_params is advanced.
              */
7968         params[ctx->param_rw_buffers = num_params++] =
7969                 const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
7970         params[ctx->param_const_buffers = num_params++] = ctx->i64;
7971         params[ctx->param_samplers = num_params++] = ctx->i64;
7972         params[ctx->param_images = num_params++] = ctx->i64;
7973         params[ctx->param_shader_buffers = num_params++] = ctx->i64;
7974         params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
7975         params[ctx->param_tcs_out_lds_offsets = num_params++] = ctx->i32;
7976         params[ctx->param_tcs_out_lds_layout = num_params++] = ctx->i32;
7977         params[ctx->param_vs_state_bits = num_params++] = ctx->i32;
7978         params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
7979         params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
             /* Index of the last parameter declared so far. */
7980         last_sgpr = num_params - 1;
7981
7982         params[num_params++] = ctx->i32; /* patch index within the wave (REL_PATCH_ID) */
7983         params[num_params++] = ctx->i32; /* invocation ID within the patch */
7984         params[num_params++] = ctx->i32; /* LDS offset where tess factors should be loaded from */
7985
7986         /* Create the function. */
7987         si_create_function(ctx, "tcs_epilog", NULL, 0, params, num_params, last_sgpr);
7988         declare_tess_lds(ctx);
7989         func = ctx->main_fn;
7990
             /* Pass on the three parameters that follow last_sgpr:
              * REL_PATCH_ID, invocation ID and the tess factor LDS offset.
              */
7991         si_write_tess_factors(bld_base,
7992                               LLVMGetParam(func, last_sgpr + 1),
7993                               LLVMGetParam(func, last_sgpr + 2),
7994                               LLVMGetParam(func, last_sgpr + 3));
7995
7996         LLVMBuildRetVoid(gallivm->builder);
7997 }
7998
7999 /**
8000  * Select and compile (or reuse) TCS parts (epilog).
      *
      * On GFX9 and later this first obtains a VS prolog (si_get_vs_prolog) for
      * the LS main part referenced by the TCS key and records that LS main
      * part as shader->previous_stage. The TCS epilog is then looked up
      * through si_get_shader_part() and stored in shader->epilog.
      *
      * Returns true on success; false if the prolog lookup fails or no epilog
      * is returned.
8001  */
8002 static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
8003                                        LLVMTargetMachineRef tm,
8004                                        struct si_shader *shader,
8005                                        struct pipe_debug_callback *debug)
8006 {
8007         if (sscreen->b.chip_class >= GFX9) {
8008                 struct si_shader *ls_main_part =
8009                         shader->key.part.tcs.ls->main_shader_part_ls;
8010
8011                 if (!si_get_vs_prolog(sscreen, tm, shader, debug, ls_main_part,
8012                                       &shader->key.part.tcs.ls_prolog))
8013                         return false;
8014
8015                 shader->previous_stage = ls_main_part;
8016         }
8017
8018         /* Get the epilog. */
             /* The key union is zeroed in full before the TCS epilog states are
              * copied in. NOTE(review): presumably so unused bytes cannot affect
              * key comparison inside si_get_shader_part() - confirm there.
              */
8019         union si_shader_part_key epilog_key;
8020         memset(&epilog_key, 0, sizeof(epilog_key));
8021         epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
8022
             /* The boolean argument is false for the epilog lookups in this file
              * and true for the prolog lookups.
              */
8023         shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
8024                                             PIPE_SHADER_TESS_CTRL, false,
8025                                             &epilog_key, tm, debug,
8026                                             si_build_tcs_epilog_function,
8027                                             "Tessellation Control Shader Epilog");
8028         return shader->epilog != NULL;
8029 }
8030
8031 /**
8032  * Select and compile (or reuse) GS parts (prolog).
      *
      * A prolog is only requested when the tri_strip_adj_fix state is set in
      * the GS prolog key; otherwise the function returns true without
      * touching shader->prolog.
      *
      * Returns true on success, false if si_get_shader_part() returns NULL.
8033  */
8034 static bool si_shader_select_gs_parts(struct si_screen *sscreen,
8035                                       LLVMTargetMachineRef tm,
8036                                       struct si_shader *shader,
8037                                       struct pipe_debug_callback *debug)
8038 {
8039         union si_shader_part_key prolog_key;
8040
             /* Nothing to select unless the fix is enabled. */
8041         if (!shader->key.part.gs.prolog.tri_strip_adj_fix)
8042                 return true;
8043
             /* Zero the whole union, then copy in the GS prolog states. */
8044         memset(&prolog_key, 0, sizeof(prolog_key));
8045         prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
8046
8047         shader->prolog = si_get_shader_part(sscreen, &sscreen->gs_prologs,
8048                                             PIPE_SHADER_GEOMETRY, true,
8049                                             &prolog_key, tm, debug,
8050                                             si_build_gs_prolog_function,
8051                                             "Geometry Shader Prolog");
8052         return shader->prolog != NULL;
8053 }
8054
8055 /**
8056  * Build the pixel shader prolog function. This handles:
8057  * - two-side color selection and interpolation
8058  * - overriding interpolation parameters for the API PS
8059  * - polygon stippling
8060  *
8061  * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
8062  * overridden by other states. (e.g. per-sample interpolation)
8063  * Interpolated colors are stored after the preloaded VGPRs.
      *
      * Indices used by the overrides below, relative to
      * base = key->ps_prolog.num_input_sgprs (two values each):
      *   base + 0:  PERSP_SAMPLE       base + 6:  LINEAR_SAMPLE
      *   base + 2:  PERSP_CENTER       base + 8:  LINEAR_CENTER
      *   base + 4:  PERSP_CENTROID     base + 10: LINEAR_CENTROID
      *
      * The overrides run in source order (bc_optimize, force-sample,
      * force-center). Each one reads its source values from the function
      * parameters and inserts into the accumulated return value, so a later
      * insert at the same index replaces an earlier one. The color
      * interpolation at the end reads (i,j) back from that return value.
8064  */
8065 static void si_build_ps_prolog_function(struct si_shader_context *ctx,
8066                                         union si_shader_part_key *key)
8067 {
8068         struct gallivm_state *gallivm = &ctx->gallivm;
8069         LLVMTypeRef *params;
8070         LLVMValueRef ret, func;
8071         int last_sgpr, num_params, num_returns, i, num_color_channels;
8072
8073         assert(si_need_ps_prolog(key));
8074
8075         /* Number of inputs + 8 color elements. */
             /* The one array serves as both the parameter type list and the
              * return type list (see the si_create_function call below). The 8
              * extra slots cover two colors with a 4-bit channel mask each.
              * NOTE(review): assumes colors_read has at most 8 bits set.
              */
8076         params = alloca((key->ps_prolog.num_input_sgprs +
8077                          key->ps_prolog.num_input_vgprs + 8) *
8078                         sizeof(LLVMTypeRef));
8079
8080         /* Declare inputs. */
8081         num_params = 0;
8082         for (i = 0; i < key->ps_prolog.num_input_sgprs; i++)
8083                 params[num_params++] = ctx->i32;
8084         last_sgpr = num_params - 1;
8085
8086         for (i = 0; i < key->ps_prolog.num_input_vgprs; i++)
8087                 params[num_params++] = ctx->f32;
8088
8089         /* Declare outputs (same as inputs + add colors if needed) */
8090         num_returns = num_params;
8091         num_color_channels = util_bitcount(key->ps_prolog.colors_read);
8092         for (i = 0; i < num_color_channels; i++)
8093                 params[num_returns++] = ctx->f32;
8094
8095         /* Create the function. */
             /* The first num_returns entries of params are passed as the return
              * types, the first num_params entries as the parameter types.
              */
8096         si_create_function(ctx, "ps_prolog", params, num_returns, params,
8097                            num_params, last_sgpr);
8098         func = ctx->main_fn;
8099
8100         /* Copy inputs to outputs. This should be no-op, as the registers match,
8101          * but it will prevent the compiler from overwriting them unintentionally.
8102          */
8103         ret = ctx->return_value;
8104         for (i = 0; i < num_params; i++) {
8105                 LLVMValueRef p = LLVMGetParam(func, i);
8106                 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
8107         }
8108
8109         /* Polygon stippling. */
8110         if (key->ps_prolog.states.poly_stipple) {
8111                 /* POS_FIXED_PT is always last. */
8112                 unsigned pos = key->ps_prolog.num_input_sgprs +
8113                                key->ps_prolog.num_input_vgprs - 1;
8114                 LLVMValueRef ptr[2], list;
8115
8116                 /* Get the pointer to rw buffers. */
                     /* The two 32-bit halves are gathered into a vector, bitcast
                      * to i64 and then converted to a pointer to the RW buffer
                      * descriptor array.
                      */
8117                 ptr[0] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS);
8118                 ptr[1] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS_HI);
8119                 list = lp_build_gather_values(gallivm, ptr, 2);
8120                 list = LLVMBuildBitCast(gallivm->builder, list, ctx->i64, "");
8121                 list = LLVMBuildIntToPtr(gallivm->builder, list,
8122                                           const_array(ctx->v16i8, SI_NUM_RW_BUFFERS), "");
8123
8124                 si_llvm_emit_polygon_stipple(ctx, list, pos);
8125         }
8126
8127         if (key->ps_prolog.states.bc_optimize_for_persp ||
8128             key->ps_prolog.states.bc_optimize_for_linear) {
                     /* NOTE(review): this 'unsigned i' shadows the function-level
                      * 'int i'; the same shadowing recurs in the override blocks
                      * below.
                      */
8129                 unsigned i, base = key->ps_prolog.num_input_sgprs;
8130                 LLVMValueRef center[2], centroid[2], tmp, bc_optimize;
8131
8132                 /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
8133                  * The hw doesn't compute CENTROID if the whole wave only
8134                  * contains fully-covered quads.
8135                  *
8136                  * PRIM_MASK is after user SGPRs.
8137                  */
                     /* Isolate bit 31 as an i1: shift right by 31, then truncate. */
8138                 bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
8139                 bc_optimize = LLVMBuildLShr(gallivm->builder, bc_optimize,
8140                                             LLVMConstInt(ctx->i32, 31, 0), "");
8141                 bc_optimize = LLVMBuildTrunc(gallivm->builder, bc_optimize,
8142                                              ctx->i1, "");
8143
8144                 if (key->ps_prolog.states.bc_optimize_for_persp) {
8145                         /* Read PERSP_CENTER. */
8146                         for (i = 0; i < 2; i++)
8147                                 center[i] = LLVMGetParam(func, base + 2 + i);
8148                         /* Read PERSP_CENTROID. */
8149                         for (i = 0; i < 2; i++)
8150                                 centroid[i] = LLVMGetParam(func, base + 4 + i);
8151                         /* Select PERSP_CENTROID. */
                             /* CENTER is chosen when bc_optimize is set, else the
                              * incoming CENTROID; the result lands in the CENTROID
                              * slot.
                              */
8152                         for (i = 0; i < 2; i++) {
8153                                 tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
8154                                                       center[i], centroid[i], "");
8155                                 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8156                                                            tmp, base + 4 + i, "");
8157                         }
8158                 }
8159                 if (key->ps_prolog.states.bc_optimize_for_linear) {
8160                         /* Read LINEAR_CENTER. */
8161                         for (i = 0; i < 2; i++)
8162                                 center[i] = LLVMGetParam(func, base + 8 + i);
8163                         /* Read LINEAR_CENTROID. */
8164                         for (i = 0; i < 2; i++)
8165                                 centroid[i] = LLVMGetParam(func, base + 10 + i);
8166                         /* Select LINEAR_CENTROID. */
8167                         for (i = 0; i < 2; i++) {
8168                                 tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
8169                                                       center[i], centroid[i], "");
8170                                 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8171                                                            tmp, base + 10 + i, "");
8172                         }
8173                 }
8174         }
8175
8176         /* Force per-sample interpolation. */
8177         if (key->ps_prolog.states.force_persp_sample_interp) {
8178                 unsigned i, base = key->ps_prolog.num_input_sgprs;
8179                 LLVMValueRef persp_sample[2];
8180
8181                 /* Read PERSP_SAMPLE. */
8182                 for (i = 0; i < 2; i++)
8183                         persp_sample[i] = LLVMGetParam(func, base + i);
8184                 /* Overwrite PERSP_CENTER. */
8185                 for (i = 0; i < 2; i++)
8186                         ret = LLVMBuildInsertValue(gallivm->builder, ret,
8187                                                    persp_sample[i], base + 2 + i, "");
8188                 /* Overwrite PERSP_CENTROID. */
8189                 for (i = 0; i < 2; i++)
8190                         ret = LLVMBuildInsertValue(gallivm->builder, ret,
8191                                                    persp_sample[i], base + 4 + i, "");
8192         }
8193         if (key->ps_prolog.states.force_linear_sample_interp) {
8194                 unsigned i, base = key->ps_prolog.num_input_sgprs;
8195                 LLVMValueRef linear_sample[2];
8196
8197                 /* Read LINEAR_SAMPLE. */
8198                 for (i = 0; i < 2; i++)
8199                         linear_sample[i] = LLVMGetParam(func, base + 6 + i);
8200                 /* Overwrite LINEAR_CENTER. */
8201                 for (i = 0; i < 2; i++)
8202                         ret = LLVMBuildInsertValue(gallivm->builder, ret,
8203                                                    linear_sample[i], base + 8 + i, "");
8204                 /* Overwrite LINEAR_CENTROID. */
8205                 for (i = 0; i < 2; i++)
8206                         ret = LLVMBuildInsertValue(gallivm->builder, ret,
8207                                                    linear_sample[i], base + 10 + i, "");
8208         }
8209
8210         /* Force center interpolation. */
8211         if (key->ps_prolog.states.force_persp_center_interp) {
8212                 unsigned i, base = key->ps_prolog.num_input_sgprs;
8213                 LLVMValueRef persp_center[2];
8214
8215                 /* Read PERSP_CENTER. */
8216                 for (i = 0; i < 2; i++)
8217                         persp_center[i] = LLVMGetParam(func, base + 2 + i);
8218                 /* Overwrite PERSP_SAMPLE. */
8219                 for (i = 0; i < 2; i++)
8220                         ret = LLVMBuildInsertValue(gallivm->builder, ret,
8221                                                    persp_center[i], base + i, "");
8222                 /* Overwrite PERSP_CENTROID. */
8223                 for (i = 0; i < 2; i++)
8224                         ret = LLVMBuildInsertValue(gallivm->builder, ret,
8225                                                    persp_center[i], base + 4 + i, "");
8226         }
8227         if (key->ps_prolog.states.force_linear_center_interp) {
8228                 unsigned i, base = key->ps_prolog.num_input_sgprs;
8229                 LLVMValueRef linear_center[2];
8230
8231                 /* Read LINEAR_CENTER. */
8232                 for (i = 0; i < 2; i++)
8233                         linear_center[i] = LLVMGetParam(func, base + 8 + i);
8234                 /* Overwrite LINEAR_SAMPLE. */
8235                 for (i = 0; i < 2; i++)
8236                         ret = LLVMBuildInsertValue(gallivm->builder, ret,
8237                                                    linear_center[i], base + 6 + i, "");
8238                 /* Overwrite LINEAR_CENTROID. */
8239                 for (i = 0; i < 2; i++)
8240                         ret = LLVMBuildInsertValue(gallivm->builder, ret,
8241                                                    linear_center[i], base + 10 + i, "");
8242         }
8243
8244         /* Interpolate colors. */
             /* Two colors are handled; color i takes bits [4*i, 4*i + 3] of
              * colors_read as its channel mask and is skipped when that mask is 0.
              */
8245         for (i = 0; i < 2; i++) {
8246                 unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
8247                 unsigned face_vgpr = key->ps_prolog.num_input_sgprs +
8248                                      key->ps_prolog.face_vgpr_index;
8249                 LLVMValueRef interp[2], color[4];
8250                 LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL;
8251
8252                 if (!writemask)
8253                         continue;
8254
8255                 /* If the interpolation qualifier is not CONSTANT (-1). */
8256                 if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
8257                         unsigned interp_vgpr = key->ps_prolog.num_input_sgprs +
8258                                                key->ps_prolog.color_interp_vgpr_index[i];
8259
8260                         /* Get the (i,j) updated by bc_optimize handling. */
                             /* Reading from 'ret' rather than from the function
                              * parameters also picks up the force-sample and
                              * force-center overrides inserted above.
                              */
8261                         interp[0] = LLVMBuildExtractValue(gallivm->builder, ret,
8262                                                           interp_vgpr, "");
8263                         interp[1] = LLVMBuildExtractValue(gallivm->builder, ret,
8264                                                           interp_vgpr + 1, "");
8265                         interp_ij = lp_build_gather_values(gallivm, interp, 2);
8266                 }
8267
8268                 /* Use the absolute location of the input. */
8269                 prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
8270
                     /* 'face' stays NULL unless two-sided color is enabled. */
8271                 if (key->ps_prolog.states.color_two_side) {
8272                         face = LLVMGetParam(func, face_vgpr);
8273                         face = LLVMBuildBitCast(gallivm->builder, face, ctx->i32, "");
8274                 }
8275
8276                 interp_fs_input(ctx,
8277                                 key->ps_prolog.color_attr_index[i],
8278                                 TGSI_SEMANTIC_COLOR, i,
8279                                 key->ps_prolog.num_interp_inputs,
8280                                 key->ps_prolog.colors_read, interp_ij,
8281                                 prim_mask, face, color);
8282
                     /* num_params is advanced past the declared parameter count
                      * here: it now indexes the color slots appended to the
                      * return value.
                      */
8283                 while (writemask) {
8284                         unsigned chan = u_bit_scan(&writemask);
8285                         ret = LLVMBuildInsertValue(gallivm->builder, ret, color[chan],
8286                                                    num_params++, "");
8287                 }
8288         }
8289
8290         /* Tell LLVM to insert WQM instruction sequence when needed. */
8291         if (key->ps_prolog.wqm) {
8292                 LLVMAddTargetDependentFunctionAttr(func,
8293                                                    "amdgpu-ps-wqm-outputs", "");
8294         }
8295
8296         si_llvm_build_ret(ctx, ret);
8297 }
8298
8299 /**
8300  * Build the pixel shader epilog function. This handles everything that must be
8301  * emulated for pixel shader exports. (alpha-test, format conversions, etc)
      *
      * Parameters of the generated "ps_epilog" function: five i64 values and
      * the f32 ALPHA_REF (the last SGPR), then f32 values - four per color set
      * in key->ps_epilog.colors_written, followed by one each for Z, stencil
      * and sample mask when the key says they are written. The parameter
      * count is never below last_sgpr + 1 + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1.
      * The function has no return types.
8302  */
8303 static void si_build_ps_epilog_function(struct si_shader_context *ctx,
8304                                         union si_shader_part_key *key)
8305 {
8306         struct gallivm_state *gallivm = &ctx->gallivm;
8307         struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
             /* Capacity: 16 + 8 colors * 4 channels + 3; checked by the assert on
              * num_params below.
              */
8308         LLVMTypeRef params[16+8*4+3];
8309         LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
8310         int last_sgpr, num_params = 0, i;
8311         struct si_ps_exports exp = {};
8312
8313         /* Declare input SGPRs. */
8314         params[ctx->param_rw_buffers = num_params++] = ctx->i64;
8315         params[ctx->param_const_buffers = num_params++] = ctx->i64;
8316         params[ctx->param_samplers = num_params++] = ctx->i64;
8317         params[ctx->param_images = num_params++] = ctx->i64;
8318         params[ctx->param_shader_buffers = num_params++] = ctx->i64;
8319         assert(num_params == SI_PARAM_ALPHA_REF);
8320         params[SI_PARAM_ALPHA_REF] = ctx->f32;
8321         last_sgpr = SI_PARAM_ALPHA_REF;
8322
8323         /* Declare input VGPRs. */
             /* num_params is recomputed from scratch: the SGPRs, four floats per
              * written color, and one float per written Z/stencil/samplemask.
              */
8324         num_params = (last_sgpr + 1) +
8325                      util_bitcount(key->ps_epilog.colors_written) * 4 +
8326                      key->ps_epilog.writes_z +
8327                      key->ps_epilog.writes_stencil +
8328                      key->ps_epilog.writes_samplemask;
8329
             /* Enforce a minimum parameter count.
              * NOTE(review): presumably this guarantees a fixed minimum location
              * for the sample mask - confirm against the definition of
              * PS_EPILOG_SAMPLEMASK_MIN_LOC.
              */
8330         num_params = MAX2(num_params,
8331                           last_sgpr + 1 + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
8332
8333         assert(num_params <= ARRAY_SIZE(params));
8334
8335         for (i = last_sgpr + 1; i < num_params; i++)
8336                 params[i] = ctx->f32;
8337
8338         /* Create the function. */
8339         si_create_function(ctx, "ps_epilog", NULL, 0, params, num_params, last_sgpr);
8340         /* Disable elimination of unused inputs. */
8341         si_llvm_add_attribute(ctx->main_fn,
8342                                   "InitialPSInputAddr", 0xffffff);
8343
8344         /* Process colors. */
             /* 'vgpr' walks the parameters after the SGPRs; last_color_export
              * stays -1 when no color export qualifies as the last one.
              */
8345         unsigned vgpr = last_sgpr + 1;
8346         unsigned colors_written = key->ps_epilog.colors_written;
8347         int last_color_export = -1;
8348
8349         /* Find the last color export. */
             /* Only looked for when none of Z, stencil or sample mask is written. */
8350         if (!key->ps_epilog.writes_z &&
8351             !key->ps_epilog.writes_stencil &&
8352             !key->ps_epilog.writes_samplemask) {
8353                 unsigned spi_format = key->ps_epilog.states.spi_shader_col_format;
8354
8355                 /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
8356                 if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
8357                         /* Just set this if any of the colorbuffers are enabled. */
                             /* The mask covers the 4-bit format fields of
                              * colorbuffers 0..last_cbuf.
                              */
8358                         if (spi_format &
8359                             ((1llu << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
8360                                 last_color_export = 0;
8361                 } else {
                             /* Highest index i < 8 that is both written and has a
                              * non-zero 4-bit format field.
                              */
8362                         for (i = 0; i < 8; i++)
8363                                 if (colors_written & (1 << i) &&
8364                                     (spi_format >> (i * 4)) & 0xf)
8365                                         last_color_export = i;
8366                 }
8367         }
8368
             /* One iteration per set bit of colors_written; each written color
              * consumes four consecutive parameters.
              * NOTE(review): the num_params - 1 argument is the index of the last
              * parameter - presumably where the sample mask lives; confirm
              * against si_export_mrt_color().
              */
8369         while (colors_written) {
8370                 LLVMValueRef color[4];
8371                 int mrt = u_bit_scan(&colors_written);
8372
8373                 for (i = 0; i < 4; i++)
8374                         color[i] = LLVMGetParam(ctx->main_fn, vgpr++);
8375
8376                 si_export_mrt_color(bld_base, color, mrt,
8377                                     num_params - 1,
8378                                     mrt == last_color_export, &exp);
8379         }
8380
8381         /* Process depth, stencil, samplemask. */
             /* These follow the colors, in this order, each present only when
              * the key says it is written.
              */
8382         if (key->ps_epilog.writes_z)
8383                 depth = LLVMGetParam(ctx->main_fn, vgpr++);
8384         if (key->ps_epilog.writes_stencil)
8385                 stencil = LLVMGetParam(ctx->main_fn, vgpr++);
8386         if (key->ps_epilog.writes_samplemask)
8387                 samplemask = LLVMGetParam(ctx->main_fn, vgpr++);
8388
             /* A null export is emitted only when there is neither a Z-type
              * export nor a last color export.
              */
8389         if (depth || stencil || samplemask)
8390                 si_export_mrt_z(bld_base, depth, stencil, samplemask, &exp);
8391         else if (last_color_export == -1)
8392                 si_export_null(bld_base);
8393
8394         if (exp.num)
8395                 si_emit_ps_exports(ctx, &exp);
8396
8397         /* Terminate the function with a void return. */
8398         LLVMBuildRetVoid(gallivm->builder);
8399 }
8400
8401 /**
8402  * Select and compile (or reuse) pixel shader parts (prolog & epilog).
      *
      * The prolog is only requested when si_need_ps_prolog() says so; the
      * epilog is always requested. Afterwards shader->config.spi_ps_input_ena
      * is adjusted to match the interpolation overrides in the prolog key.
      *
      * Returns true on success, false if a requested part could not be
      * obtained.
8403  */
8404 static bool si_shader_select_ps_parts(struct si_screen *sscreen,
8405                                       LLVMTargetMachineRef tm,
8406                                       struct si_shader *shader,
8407                                       struct pipe_debug_callback *debug)
8408 {
8409         union si_shader_part_key prolog_key;
8410         union si_shader_part_key epilog_key;
8411
8412         /* Get the prolog. */
8413         si_get_ps_prolog_key(shader, &prolog_key, true);
8414
8415         /* The prolog is a no-op if these aren't set. */
8416         if (si_need_ps_prolog(&prolog_key)) {
8417                 shader->prolog =
8418                         si_get_shader_part(sscreen, &sscreen->ps_prologs,
8419                                            PIPE_SHADER_FRAGMENT, true,
8420                                            &prolog_key, tm, debug,
8421                                            si_build_ps_prolog_function,
8422                                            "Fragment Shader Prolog");
8423                 if (!shader->prolog)
8424                         return false;
8425         }
8426
8427         /* Get the epilog. */
8428         si_get_ps_epilog_key(shader, &epilog_key);
8429
8430         shader->epilog =
8431                 si_get_shader_part(sscreen, &sscreen->ps_epilogs,
8432                                    PIPE_SHADER_FRAGMENT, false,
8433                                    &epilog_key, tm, debug,
8434                                    si_build_ps_epilog_function,
8435                                    "Fragment Shader Epilog");
8436         if (!shader->epilog)
8437                 return false;
8438
             /* In the register macros below, G_* tests a field, S_*(1) yields the
              * bit to OR in, and C_* is ANDed in to clear a field.
              */
8439         /* Enable POS_FIXED_PT if polygon stippling is enabled. */
8440         if (shader->key.part.ps.prolog.poly_stipple) {
8441                 shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
8442                 assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
8443         }
8444
8445         /* Set up the enable bits for per-sample shading if needed. */
             /* Each of the four blocks below fires only if one of the inputs being
              * replaced is currently enabled; it then clears both and enables
              * the replacement.
              */
8446         if (shader->key.part.ps.prolog.force_persp_sample_interp &&
8447             (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
8448              G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
8449                 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
8450                 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
8451                 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
8452         }
8453         if (shader->key.part.ps.prolog.force_linear_sample_interp &&
8454             (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
8455              G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
8456                 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
8457                 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
8458                 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
8459         }
             /* Same pattern for forced center interpolation. */
8460         if (shader->key.part.ps.prolog.force_persp_center_interp &&
8461             (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
8462              G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
8463                 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
8464                 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
8465                 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
8466         }
8467         if (shader->key.part.ps.prolog.force_linear_center_interp &&
8468             (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
8469              G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
8470                 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
8471                 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
8472                 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
8473         }
8474
8475         /* POS_W_FLOAT requires that one of the perspective weights is enabled. */
             /* NOTE(review): the 0xf mask presumably covers the perspective
              * enable bits - confirm against the SPI_PS_INPUT_ENA layout.
              */
8476         if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
8477             !(shader->config.spi_ps_input_ena & 0xf)) {
8478                 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
8479                 assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
8480         }
8481
8482         /* At least one pair of interpolation weights must be enabled. */
             /* NOTE(review): 0x7f presumably covers all interpolation weight
              * enable bits - confirm against the SPI_PS_INPUT_ENA layout.
              */
8483         if (!(shader->config.spi_ps_input_ena & 0x7f)) {
8484                 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
8485                 assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
8486         }
8487
8488         /* The sample mask input is always enabled, because the API shader always
8489          * passes it through to the epilog. Disable it here if it's unused.
8490          */
8491         if (!shader->key.part.ps.epilog.poly_line_smoothing &&
8492             !shader->selector->info.reads_samplemask)
8493                 shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;
8494
8495         return true;
8496 }
8497
     /**
      * Raise *lds_size to at least 8 on Bonaire, Kabini and Mullins; on any
      * other chip family the value is left untouched.
      *
      * NOTE(review): the comment below speaks of 4k of LDS, so lds_size is
      * presumably counted in 512-byte units - confirm against the users of
      * config.lds_size.
      */
8498 void si_multiwave_lds_size_workaround(struct si_screen *sscreen,
8499                                       unsigned *lds_size)
8500 {
8501         /* SPI barrier management bug:
8502          *   Make sure we have at least 4k of LDS in use to avoid the bug.
8503          *   It applies to workgroup sizes of more than one wavefront.
8504          */
8505         if (sscreen->b.family == CHIP_BONAIRE ||
8506             sscreen->b.family == CHIP_KABINI ||
8507             sscreen->b.family == CHIP_MULLINS)
8508                 *lds_size = MAX2(*lds_size, 8);
8509 }
8510
     /**
      * Apply final adjustments to shader->config: raise num_sgprs to at least
      * the number of input SGPRs plus two, and run the multi-wave LDS size
      * workaround for compute shaders whose maximum workgroup size exceeds 64.
      */
8511 static void si_fix_resource_usage(struct si_screen *sscreen,
8512                                   struct si_shader *shader)
8513 {
8514         unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
8515
8516         shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
8517
             /* NOTE(review): 64 is presumably the wavefront size, matching the
              * "more than one wavefront" condition named in the workaround -
              * confirm.
              */
8518         if (shader->selector->type == PIPE_SHADER_COMPUTE &&
8519             si_get_max_workgroup_size(shader) > 64) {
8520                 si_multiwave_lds_size_workaround(sscreen,
8521                                                  &shader->config.lds_size);
8522         }
8523 }
8524
8525 int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
8526                      struct si_shader *shader,
8527                      struct pipe_debug_callback *debug)
8528 {
8529         struct si_shader_selector *sel = shader->selector;
8530         struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key);
8531         int r;
8532
8533         /* LS, ES, VS are compiled on demand if the main part hasn't been
8534          * compiled for that stage.
8535          *
8536          * Vertex shaders are compiled on demand when a vertex fetch
8537          * workaround must be applied.
8538          */
8539         if (shader->is_monolithic) {
8540                 /* Monolithic shader (compiled as a whole, has many variants,
8541                  * may take a long time to compile).
8542                  */
8543                 r = si_compile_tgsi_shader(sscreen, tm, shader, true, debug);
8544                 if (r)
8545                         return r;
8546         } else {
8547                 /* The shader consists of 2-3 parts:
8548                  *
8549                  * - the middle part is the user shader, it has 1 variant only
8550                  *   and it was compiled during the creation of the shader
8551                  *   selector
8552                  * - the prolog part is inserted at the beginning
8553                  * - the epilog part is inserted at the end
8554                  *
8555                  * The prolog and epilog have many (but simple) variants.
8556                  */
8557
8558                 /* Copy the compiled TGSI shader data over. */
8559                 shader->is_binary_shared = true;
8560                 shader->binary = mainp->binary;
8561                 shader->config = mainp->config;
8562                 shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
8563                 shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
8564                 shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
8565                 memcpy(shader->info.vs_output_param_offset,
8566                        mainp->info.vs_output_param_offset,
8567                        sizeof(mainp->info.vs_output_param_offset));
8568                 shader->info.uses_instanceid = mainp->info.uses_instanceid;
8569                 shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
8570                 shader->info.nr_param_exports = mainp->info.nr_param_exports;
8571
8572                 /* Select prologs and/or epilogs. */
8573                 switch (sel->type) {
8574                 case PIPE_SHADER_VERTEX:
8575                         if (!si_shader_select_vs_parts(sscreen, tm, shader, debug))
8576                                 return -1;
8577                         break;
8578                 case PIPE_SHADER_TESS_CTRL:
8579                         if (!si_shader_select_tcs_parts(sscreen, tm, shader, debug))
8580                                 return -1;
8581                         break;
8582                 case PIPE_SHADER_TESS_EVAL:
8583                         if (!si_shader_select_tes_parts(sscreen, tm, shader, debug))
8584                                 return -1;
8585                         break;
8586                 case PIPE_SHADER_GEOMETRY:
8587                         if (!si_shader_select_gs_parts(sscreen, tm, shader, debug))
8588                                 return -1;
8589                         break;
8590                 case PIPE_SHADER_FRAGMENT:
8591                         if (!si_shader_select_ps_parts(sscreen, tm, shader, debug))
8592                                 return -1;
8593
8594                         /* Make sure we have at least as many VGPRs as there
8595                          * are allocated inputs.
8596                          */
8597                         shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
8598                                                         shader->info.num_input_vgprs);
8599                         break;
8600                 }
8601
8602                 /* Update SGPR and VGPR counts. */
8603                 if (shader->prolog) {
8604                         shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
8605                                                         shader->prolog->config.num_sgprs);
8606                         shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
8607                                                         shader->prolog->config.num_vgprs);
8608                 }
8609                 if (shader->previous_stage) {
8610                         shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
8611                                                         shader->previous_stage->config.num_sgprs);
8612                         shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
8613                                                         shader->previous_stage->config.num_vgprs);
8614                         shader->config.spilled_sgprs =
8615                                 MAX2(shader->config.spilled_sgprs,
8616                                      shader->previous_stage->config.spilled_sgprs);
8617                         shader->config.spilled_vgprs =
8618                                 MAX2(shader->config.spilled_vgprs,
8619                                      shader->previous_stage->config.spilled_vgprs);
8620                         shader->config.private_mem_vgprs =
8621                                 MAX2(shader->config.private_mem_vgprs,
8622                                      shader->previous_stage->config.private_mem_vgprs);
8623                         shader->config.scratch_bytes_per_wave =
8624                                 MAX2(shader->config.scratch_bytes_per_wave,
8625                                      shader->previous_stage->config.scratch_bytes_per_wave);
8626                         shader->info.uses_instanceid |=
8627                                 shader->previous_stage->info.uses_instanceid;
8628                 }
8629                 if (shader->epilog) {
8630                         shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
8631                                                         shader->epilog->config.num_sgprs);
8632                         shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
8633                                                         shader->epilog->config.num_vgprs);
8634                 }
8635         }
8636
8637         si_fix_resource_usage(sscreen, shader);
8638         si_shader_dump(sscreen, shader, debug, sel->info.processor,
8639                        stderr, true);
8640
8641         /* Upload. */
8642         r = si_shader_binary_upload(sscreen, shader);
8643         if (r) {
8644                 fprintf(stderr, "LLVM failed to upload shader\n");
8645                 return r;
8646         }
8647
8648         return 0;
8649 }
8650
8651 void si_shader_destroy(struct si_shader *shader)
8652 {
8653         if (shader->scratch_bo)
8654                 r600_resource_reference(&shader->scratch_bo, NULL);
8655
8656         r600_resource_reference(&shader->bo, NULL);
8657
8658         if (!shader->is_binary_shared)
8659                 radeon_shader_binary_clean(&shader->binary);
8660
8661         free(shader->shader_log);
8662 }