src/gallium/drivers/radeonsi/si_shader.c

   1 /*
   2  * Copyright 2012 Advanced Micro Devices, Inc.
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * on the rights to use, copy, modify, merge, publish, distribute, sub
   8  * license, and/or sell copies of the Software, and to permit persons to whom
   9  * the Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  18  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
  19  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  20  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  21  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  22  *
  23  * Authors:
  24  *      Tom Stellard <thomas.stellard@amd.com>
  25  *      Michel Dänzer <michel.daenzer@amd.com>
  26  *      Christian König <christian.koenig@amd.com>
  27  */
  28
  29 #include "gallivm/lp_bld_const.h"
  30 #include "gallivm/lp_bld_gather.h"
  31 #include "gallivm/lp_bld_intr.h"
  32 #include "gallivm/lp_bld_logic.h"
  33 #include "gallivm/lp_bld_arit.h"
  34 #include "gallivm/lp_bld_flow.h"
  35 #include "gallivm/lp_bld_misc.h"
  36 #include "radeon/radeon_elf_util.h"
  37 #include "util/u_memory.h"
  38 #include "util/u_string.h"
  39 #include "tgsi/tgsi_build.h"
  40 #include "tgsi/tgsi_util.h"
  41 #include "tgsi/tgsi_dump.h"
  42
  43 #include "ac_llvm_util.h"
  44 #include "si_shader_internal.h"
  45 #include "si_pipe.h"
  46 #include "sid.h"
  47
  48
  49 static const char *scratch_rsrc_dword0_symbol =
  50         "SCRATCH_RSRC_DWORD0";
  51
  52 static const char *scratch_rsrc_dword1_symbol =
  53         "SCRATCH_RSRC_DWORD1";
  54
  55 struct si_shader_output_values
  56 {
  57         LLVMValueRef values[4];
  58         unsigned semantic_name;
  59         unsigned semantic_index;
  60         ubyte vertex_stream[4];
  61 };
  62
  63 static void si_init_shader_ctx(struct si_shader_context *ctx,
  64                                struct si_screen *sscreen,
  65                                struct si_shader *shader,
  66                                LLVMTargetMachineRef tm);
  67
  68 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
  69                                  struct lp_build_tgsi_context *bld_base,
  70                                  struct lp_build_emit_data *emit_data);
  71
  72 static void si_dump_shader_key(unsigned shader, struct si_shader_key *key,
  73                                FILE *f);
  74
  75 static void si_build_vs_prolog_function(struct si_shader_context *ctx,
  76                                         union si_shader_part_key *key);
  77 static void si_build_vs_epilog_function(struct si_shader_context *ctx,
  78                                         union si_shader_part_key *key);
  79 static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
  80                                          union si_shader_part_key *key);
  81 static void si_build_ps_prolog_function(struct si_shader_context *ctx,
  82                                         union si_shader_part_key *key);
  83 static void si_build_ps_epilog_function(struct si_shader_context *ctx,
  84                                         union si_shader_part_key *key);
  85
  86 /* Ideally pass the sample mask input to the PS epilog as v13, which
  87  * is its usual location, so that the shader doesn't have to add v_mov.
  88  */
  89 #define PS_EPILOG_SAMPLEMASK_MIN_LOC 13
  90
  91 /* The VS location of the PrimitiveID input is the same in the epilog,
  92  * so that the main shader part doesn't have to move it.
  93  */
  94 #define VS_EPILOG_PRIMID_LOC 2
  95
  96 enum {
  97         CONST_ADDR_SPACE = 2,
  98         LOCAL_ADDR_SPACE = 3,
  99 };
 100
 101 #define SENDMSG_GS 2
 102 #define SENDMSG_GS_DONE 3
 103
 104 #define SENDMSG_GS_OP_NOP      (0 << 4)
 105 #define SENDMSG_GS_OP_CUT      (1 << 4)
 106 #define SENDMSG_GS_OP_EMIT     (2 << 4)
 107 #define SENDMSG_GS_OP_EMIT_CUT (3 << 4)
 108
 109 /**
 110  * Returns a unique index for a semantic name and index. The index must be
 111  * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
 112  * calculated.
 113  */
 114 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
 115 {
 116         switch (semantic_name) {
 117         case TGSI_SEMANTIC_POSITION:
 118                 return 0;
 119         case TGSI_SEMANTIC_PSIZE:
 120                 return 1;
 121         case TGSI_SEMANTIC_CLIPDIST:
 122                 assert(index <= 1);
 123                 return 2 + index;
 124         case TGSI_SEMANTIC_GENERIC:
 125                 if (index <= 63-4)
 126                         return 4 + index;
 127
 128                 assert(!"invalid generic index");
 129                 return 0;
 130
 131         /* patch indices are completely separate and thus start from 0 */
 132         case TGSI_SEMANTIC_TESSOUTER:
 133                 return 0;
 134         case TGSI_SEMANTIC_TESSINNER:
 135                 return 1;
 136         case TGSI_SEMANTIC_PATCH:
 137                 return 2 + index;
 138
 139         default:
 140                 assert(!"invalid semantic name");
 141                 return 0;
 142         }
 143 }
 144
 145 unsigned si_shader_io_get_unique_index2(unsigned name, unsigned index)
 146 {
 147         switch (name) {
 148         case TGSI_SEMANTIC_FOG:
 149                 return 0;
 150         case TGSI_SEMANTIC_LAYER:
 151                 return 1;
 152         case TGSI_SEMANTIC_VIEWPORT_INDEX:
 153                 return 2;
 154         case TGSI_SEMANTIC_PRIMID:
 155                 return 3;
 156         case TGSI_SEMANTIC_COLOR: /* these alias */
 157         case TGSI_SEMANTIC_BCOLOR:
 158                 return 4 + index;
 159         case TGSI_SEMANTIC_TEXCOORD:
 160                 return 6 + index;
 161         default:
 162                 assert(!"invalid semantic name");
 163                 return 0;
 164         }
 165 }
 166
 167 /**
 168  * Get the value of a shader input parameter and extract a bitfield.
 169  */
 170 static LLVMValueRef unpack_param(struct si_shader_context *ctx,
 171                                  unsigned param, unsigned rshift,
 172                                  unsigned bitwidth)
 173 {
 174         struct gallivm_state *gallivm = &ctx->gallivm;
 175         LLVMValueRef value = LLVMGetParam(ctx->main_fn,
 176                                           param);
 177
 178         if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
 179                 value = bitcast(&ctx->bld_base,
 180                                 TGSI_TYPE_UNSIGNED, value);
 181
 182         if (rshift)
 183                 value = LLVMBuildLShr(gallivm->builder, value,
 184                                       lp_build_const_int32(gallivm, rshift), "");
 185
 186         if (rshift + bitwidth < 32) {
 187                 unsigned mask = (1 << bitwidth) - 1;
 188                 value = LLVMBuildAnd(gallivm->builder, value,
 189                                      lp_build_const_int32(gallivm, mask), "");
 190         }
 191
 192         return value;
 193 }
 194
 195 static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx)
 196 {
 197         switch (ctx->type) {
 198         case PIPE_SHADER_TESS_CTRL:
 199                 return unpack_param(ctx, SI_PARAM_REL_IDS, 0, 8);
 200
 201         case PIPE_SHADER_TESS_EVAL:
 202                 return LLVMGetParam(ctx->main_fn,
 203                                     ctx->param_tes_rel_patch_id);
 204
 205         default:
 206                 assert(0);
 207                 return NULL;
 208         }
 209 }
 210
 211 /* Tessellation shaders pass outputs to the next shader using LDS.
 212  *
 213  * LS outputs = TCS inputs
 214  * TCS outputs = TES inputs
 215  *
 216  * The LDS layout is:
 217  * - TCS inputs for patch 0
 218  * - TCS inputs for patch 1
 219  * - TCS inputs for patch 2             = get_tcs_in_current_patch_offset (if RelPatchID==2)
 220  * - ...
 221  * - TCS outputs for patch 0            = get_tcs_out_patch0_offset
 222  * - Per-patch TCS outputs for patch 0  = get_tcs_out_patch0_patch_data_offset
 223  * - TCS outputs for patch 1
 224  * - Per-patch TCS outputs for patch 1
 225  * - TCS outputs for patch 2            = get_tcs_out_current_patch_offset (if RelPatchID==2)
 226  * - Per-patch TCS outputs for patch 2  = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
 227  * - ...
 228  *
 229  * All three shaders VS(LS), TCS, TES share the same LDS space.
 230  */
 231
 232 static LLVMValueRef
 233 get_tcs_in_patch_stride(struct si_shader_context *ctx)
 234 {
 235         if (ctx->type == PIPE_SHADER_VERTEX)
 236                 return unpack_param(ctx, SI_PARAM_LS_OUT_LAYOUT, 0, 13);
 237         else if (ctx->type == PIPE_SHADER_TESS_CTRL)
 238                 return unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 0, 13);
 239         else {
 240                 assert(0);
 241                 return NULL;
 242         }
 243 }
 244
 245 static LLVMValueRef
 246 get_tcs_out_patch_stride(struct si_shader_context *ctx)
 247 {
 248         return unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 0, 13);
 249 }
 250
 251 static LLVMValueRef
 252 get_tcs_out_patch0_offset(struct si_shader_context *ctx)
 253 {
 254         return lp_build_mul_imm(&ctx->bld_base.uint_bld,
 255                                 unpack_param(ctx,
 256                                              SI_PARAM_TCS_OUT_OFFSETS,
 257                                              0, 16),
 258                                 4);
 259 }
 260
 261 static LLVMValueRef
 262 get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
 263 {
 264         return lp_build_mul_imm(&ctx->bld_base.uint_bld,
 265                                 unpack_param(ctx,
 266                                              SI_PARAM_TCS_OUT_OFFSETS,
 267                                              16, 16),
 268                                 4);
 269 }
 270
 271 static LLVMValueRef
 272 get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
 273 {
 274         struct gallivm_state *gallivm = &ctx->gallivm;
 275         LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
 276         LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
 277
 278         return LLVMBuildMul(gallivm->builder, patch_stride, rel_patch_id, "");
 279 }
 280
 281 static LLVMValueRef
 282 get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
 283 {
 284         struct gallivm_state *gallivm = &ctx->gallivm;
 285         LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
 286         LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
 287         LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
 288
 289         return LLVMBuildAdd(gallivm->builder, patch0_offset,
 290                             LLVMBuildMul(gallivm->builder, patch_stride,
 291                                          rel_patch_id, ""),
 292                             "");
 293 }
 294
 295 static LLVMValueRef
 296 get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
 297 {
 298         struct gallivm_state *gallivm = &ctx->gallivm;
 299         LLVMValueRef patch0_patch_data_offset =
 300                 get_tcs_out_patch0_patch_data_offset(ctx);
 301         LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
 302         LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
 303
 304         return LLVMBuildAdd(gallivm->builder, patch0_patch_data_offset,
 305                             LLVMBuildMul(gallivm->builder, patch_stride,
 306                                          rel_patch_id, ""),
 307                             "");
 308 }
 309
 310 static LLVMValueRef get_instance_index_for_fetch(
 311         struct si_shader_context *radeon_bld,
 312         unsigned param_start_instance, unsigned divisor)
 313 {
 314         struct si_shader_context *ctx =
 315                 si_shader_context(&radeon_bld->bld_base);
 316         struct gallivm_state *gallivm = radeon_bld->bld_base.base.gallivm;
 317
 318         LLVMValueRef result = LLVMGetParam(radeon_bld->main_fn,
 319                                            ctx->param_instance_id);
 320
 321         /* The division must be done before START_INSTANCE is added. */
 322         if (divisor > 1)
 323                 result = LLVMBuildUDiv(gallivm->builder, result,
 324                                 lp_build_const_int32(gallivm, divisor), "");
 325
 326         return LLVMBuildAdd(gallivm->builder, result,
 327                             LLVMGetParam(radeon_bld->main_fn, param_start_instance), "");
 328 }
 329
 330 static void declare_input_vs(
 331         struct si_shader_context *ctx,
 332         unsigned input_index,
 333         const struct tgsi_full_declaration *decl,
 334         LLVMValueRef out[4])
 335 {
 336         struct lp_build_context *base = &ctx->bld_base.base;
 337         struct gallivm_state *gallivm = base->gallivm;
 338
 339         unsigned chan;
 340         unsigned fix_fetch;
 341
 342         LLVMValueRef t_list_ptr;
 343         LLVMValueRef t_offset;
 344         LLVMValueRef t_list;
 345         LLVMValueRef attribute_offset;
 346         LLVMValueRef buffer_index;
 347         LLVMValueRef args[3];
 348         LLVMValueRef input;
 349
 350         /* Load the T list */
 351         t_list_ptr = LLVMGetParam(ctx->main_fn, SI_PARAM_VERTEX_BUFFERS);
 352
 353         t_offset = lp_build_const_int32(gallivm, input_index);
 354
 355         t_list = ac_build_indexed_load_const(&ctx->ac, t_list_ptr, t_offset);
 356
 357         /* Build the attribute offset */
 358         attribute_offset = lp_build_const_int32(gallivm, 0);
 359
 360         buffer_index = LLVMGetParam(ctx->main_fn,
 361                                     ctx->param_vertex_index0 +
 362                                     input_index);
 363
 364         args[0] = t_list;
 365         args[1] = attribute_offset;
 366         args[2] = buffer_index;
 367         input = lp_build_intrinsic(gallivm->builder,
 368                 "llvm.SI.vs.load.input", ctx->v4f32, args, 3,
 369                 LP_FUNC_ATTR_READNONE);
 370
 371         /* Break up the vec4 into individual components */
 372         for (chan = 0; chan < 4; chan++) {
 373                 LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
 374                 out[chan] = LLVMBuildExtractElement(gallivm->builder,
 375                                                     input, llvm_chan, "");
 376         }
 377
 378         fix_fetch = (ctx->shader->key.mono.vs.fix_fetch >> (4 * input_index)) & 0xf;
 379
 380         switch (fix_fetch) {
 381         case SI_FIX_FETCH_A2_SNORM:
 382         case SI_FIX_FETCH_A2_SSCALED:
 383         case SI_FIX_FETCH_A2_SINT: {
 384                 /* The hardware returns an unsigned value; convert it to a
 385                  * signed one.
 386                  */
 387                 LLVMValueRef tmp = out[3];
 388                 LLVMValueRef c30 = LLVMConstInt(ctx->i32, 30, 0);
 389
 390                 /* First, recover the sign-extended signed integer value. */
 391                 if (fix_fetch == SI_FIX_FETCH_A2_SSCALED)
 392                         tmp = LLVMBuildFPToUI(gallivm->builder, tmp, ctx->i32, "");
 393                 else
 394                         tmp = LLVMBuildBitCast(gallivm->builder, tmp, ctx->i32, "");
 395
 396                 /* For the integer-like cases, do a natural sign extension.
 397                  *
 398                  * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
 399                  * and happen to contain 0, 1, 2, 3 as the two LSBs of the
 400                  * exponent.
 401                  */
 402                 tmp = LLVMBuildShl(gallivm->builder, tmp,
 403                                    fix_fetch == SI_FIX_FETCH_A2_SNORM ?
 404                                    LLVMConstInt(ctx->i32, 7, 0) : c30, "");
 405                 tmp = LLVMBuildAShr(gallivm->builder, tmp, c30, "");
 406
 407                 /* Convert back to the right type. */
 408                 if (fix_fetch == SI_FIX_FETCH_A2_SNORM) {
 409                         LLVMValueRef clamp;
 410                         LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
 411                         tmp = LLVMBuildSIToFP(gallivm->builder, tmp, ctx->f32, "");
 412                         clamp = LLVMBuildFCmp(gallivm->builder, LLVMRealULT, tmp, neg_one, "");
 413                         tmp = LLVMBuildSelect(gallivm->builder, clamp, neg_one, tmp, "");
 414                 } else if (fix_fetch == SI_FIX_FETCH_A2_SSCALED) {
 415                         tmp = LLVMBuildSIToFP(gallivm->builder, tmp, ctx->f32, "");
 416                 }
 417
 418                 out[3] = tmp;
 419                 break;
 420         }
 421         case SI_FIX_FETCH_RGBA_32_UNORM:
 422         case SI_FIX_FETCH_RGBX_32_UNORM:
 423                 for (chan = 0; chan < 4; chan++) {
 424                         out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
 425                                                      ctx->i32, "");
 426                         out[chan] = LLVMBuildUIToFP(gallivm->builder,
 427                                                     out[chan], ctx->f32, "");
 428                         out[chan] = LLVMBuildFMul(gallivm->builder, out[chan],
 429                                                   LLVMConstReal(ctx->f32, 1.0 / UINT_MAX), "");
 430                 }
 431                 /* RGBX UINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
 432                 if (fix_fetch == SI_FIX_FETCH_RGBX_32_UNORM)
 433                         out[3] = LLVMConstReal(ctx->f32, 1);
 434                 break;
 435         case SI_FIX_FETCH_RGBA_32_SNORM:
 436         case SI_FIX_FETCH_RGBX_32_SNORM:
 437         case SI_FIX_FETCH_RGBA_32_FIXED:
 438         case SI_FIX_FETCH_RGBX_32_FIXED: {
 439                 double scale;
 440                 if (fix_fetch >= SI_FIX_FETCH_RGBA_32_FIXED)
 441                         scale = 1.0 / 0x10000;
 442                 else
 443                         scale = 1.0 / INT_MAX;
 444
 445                 for (chan = 0; chan < 4; chan++) {
 446                         out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
 447                                                      ctx->i32, "");
 448                         out[chan] = LLVMBuildSIToFP(gallivm->builder,
 449                                                     out[chan], ctx->f32, "");
 450                         out[chan] = LLVMBuildFMul(gallivm->builder, out[chan],
 451                                                   LLVMConstReal(ctx->f32, scale), "");
 452                 }
 453                 /* RGBX SINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
 454                 if (fix_fetch == SI_FIX_FETCH_RGBX_32_SNORM ||
 455                     fix_fetch == SI_FIX_FETCH_RGBX_32_FIXED)
 456                         out[3] = LLVMConstReal(ctx->f32, 1);
 457                 break;
 458         }
 459         case SI_FIX_FETCH_RGBA_32_USCALED:
 460                 for (chan = 0; chan < 4; chan++) {
 461                         out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
 462                                                      ctx->i32, "");
 463                         out[chan] = LLVMBuildUIToFP(gallivm->builder,
 464                                                     out[chan], ctx->f32, "");
 465                 }
 466                 break;
 467         case SI_FIX_FETCH_RGBA_32_SSCALED:
 468                 for (chan = 0; chan < 4; chan++) {
 469                         out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
 470                                                      ctx->i32, "");
 471                         out[chan] = LLVMBuildSIToFP(gallivm->builder,
 472                                                     out[chan], ctx->f32, "");
 473                 }
 474                 break;
 475         }
 476 }
 477
 478 static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base,
 479                                      unsigned swizzle)
 480 {
 481         struct si_shader_context *ctx = si_shader_context(bld_base);
 482
 483         if (swizzle > 0)
 484                 return bld_base->uint_bld.zero;
 485
 486         switch (ctx->type) {
 487         case PIPE_SHADER_VERTEX:
 488                 return LLVMGetParam(ctx->main_fn,
 489                                     ctx->param_vs_prim_id);
 490         case PIPE_SHADER_TESS_CTRL:
 491                 return LLVMGetParam(ctx->main_fn,
 492                                     SI_PARAM_PATCH_ID);
 493         case PIPE_SHADER_TESS_EVAL:
 494                 return LLVMGetParam(ctx->main_fn,
 495                                     ctx->param_tes_patch_id);
 496         case PIPE_SHADER_GEOMETRY:
 497                 return LLVMGetParam(ctx->main_fn,
 498                                     SI_PARAM_PRIMITIVE_ID);
 499         default:
 500                 assert(0);
 501                 return bld_base->uint_bld.zero;
 502         }
 503 }
 504
 505 /**
 506  * Return the value of tgsi_ind_register for indexing.
 507  * This is the indirect index with the constant offset added to it.
 508  */
 509 static LLVMValueRef get_indirect_index(struct si_shader_context *ctx,
 510                                        const struct tgsi_ind_register *ind,
 511                                        int rel_index)
 512 {
 513         struct gallivm_state *gallivm = ctx->bld_base.base.gallivm;
 514         LLVMValueRef result;
 515
 516         result = ctx->addrs[ind->Index][ind->Swizzle];
 517         result = LLVMBuildLoad(gallivm->builder, result, "");
 518         result = LLVMBuildAdd(gallivm->builder, result,
 519                               lp_build_const_int32(gallivm, rel_index), "");
 520         return result;
 521 }
 522
 523 /**
 524  * Like get_indirect_index, but restricts the return value to a (possibly
 525  * undefined) value inside [0..num).
 526  */
 527 static LLVMValueRef get_bounded_indirect_index(struct si_shader_context *ctx,
 528                                                const struct tgsi_ind_register *ind,
 529                                                int rel_index, unsigned num)
 530 {
 531         LLVMValueRef result = get_indirect_index(ctx, ind, rel_index);
 532
 533         /* LLVM 3.8: If indirect resource indexing is used:
 534          * - SI & CIK hang
 535          * - VI crashes
 536          */
 537         if (HAVE_LLVM <= 0x0308)
 538                 return LLVMGetUndef(ctx->i32);
 539
 540         return si_llvm_bound_index(ctx, result, num);
 541 }
 542
 543
 544 /**
 545  * Calculate a dword address given an input or output register and a stride.
 546  */
 547 static LLVMValueRef get_dw_address(struct si_shader_context *ctx,
 548                                    const struct tgsi_full_dst_register *dst,
 549                                    const struct tgsi_full_src_register *src,
 550                                    LLVMValueRef vertex_dw_stride,
 551                                    LLVMValueRef base_addr)
 552 {
 553         struct gallivm_state *gallivm = ctx->bld_base.base.gallivm;
 554         struct tgsi_shader_info *info = &ctx->shader->selector->info;
 555         ubyte *name, *index, *array_first;
 556         int first, param;
 557         struct tgsi_full_dst_register reg;
 558
 559         /* Set the register description. The address computation is the same
 560          * for sources and destinations. */
 561         if (src) {
 562                 reg.Register.File = src->Register.File;
 563                 reg.Register.Index = src->Register.Index;
 564                 reg.Register.Indirect = src->Register.Indirect;
 565                 reg.Register.Dimension = src->Register.Dimension;
 566                 reg.Indirect = src->Indirect;
 567                 reg.Dimension = src->Dimension;
 568                 reg.DimIndirect = src->DimIndirect;
 569         } else
 570                 reg = *dst;
 571
 572         /* If the register is 2-dimensional (e.g. an array of vertices
 573          * in a primitive), calculate the base address of the vertex. */
 574         if (reg.Register.Dimension) {
 575                 LLVMValueRef index;
 576
 577                 if (reg.Dimension.Indirect)
 578                         index = get_indirect_index(ctx, &reg.DimIndirect,
 579                                                    reg.Dimension.Index);
 580                 else
 581                         index = lp_build_const_int32(gallivm, reg.Dimension.Index);
 582
 583                 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
 584                                          LLVMBuildMul(gallivm->builder, index,
 585                                                       vertex_dw_stride, ""), "");
 586         }
 587
 588         /* Get information about the register. */
 589         if (reg.Register.File == TGSI_FILE_INPUT) {
 590                 name = info->input_semantic_name;
 591                 index = info->input_semantic_index;
 592                 array_first = info->input_array_first;
 593         } else if (reg.Register.File == TGSI_FILE_OUTPUT) {
 594                 name = info->output_semantic_name;
 595                 index = info->output_semantic_index;
 596                 array_first = info->output_array_first;
 597         } else {
 598                 assert(0);
 599                 return NULL;
 600         }
 601
 602         if (reg.Register.Indirect) {
 603                 /* Add the relative address of the element. */
 604                 LLVMValueRef ind_index;
 605
 606                 if (reg.Indirect.ArrayID)
 607                         first = array_first[reg.Indirect.ArrayID];
 608                 else
 609                         first = reg.Register.Index;
 610
 611                 ind_index = get_indirect_index(ctx, &reg.Indirect,
 612                                            reg.Register.Index - first);
 613
 614                 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
 615                                     LLVMBuildMul(gallivm->builder, ind_index,
 616                                                  lp_build_const_int32(gallivm, 4), ""), "");
 617
 618                 param = si_shader_io_get_unique_index(name[first], index[first]);
 619         } else {
 620                 param = si_shader_io_get_unique_index(name[reg.Register.Index],
 621                                                       index[reg.Register.Index]);
 622         }
 623
 624         /* Add the base address of the element. */
 625         return LLVMBuildAdd(gallivm->builder, base_addr,
 626                             lp_build_const_int32(gallivm, param * 4), "");
 627 }
 628
 629 /* The offchip buffer layout for TCS->TES is
 630  *
 631  * - attribute 0 of patch 0 vertex 0
 632  * - attribute 0 of patch 0 vertex 1
 633  * - attribute 0 of patch 0 vertex 2
 634  *   ...
 635  * - attribute 0 of patch 1 vertex 0
 636  * - attribute 0 of patch 1 vertex 1
 637  *   ...
 638  * - attribute 1 of patch 0 vertex 0
 639  * - attribute 1 of patch 0 vertex 1
 640  *   ...
 641  * - per patch attribute 0 of patch 0
 642  * - per patch attribute 0 of patch 1
 643  *   ...
 644  *
 645  * Note that every attribute has 4 components.
 646  */
 647 static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
 648                                                LLVMValueRef vertex_index,
 649                                                LLVMValueRef param_index)
 650 {
 651         struct gallivm_state *gallivm = ctx->bld_base.base.gallivm;
 652         LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
 653         LLVMValueRef param_stride, constant16;
 654
 655         vertices_per_patch = unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 9, 6);
 656         num_patches = unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 0, 9);
 657         total_vertices = LLVMBuildMul(gallivm->builder, vertices_per_patch,
 658                                       num_patches, "");
 659
 660         constant16 = lp_build_const_int32(gallivm, 16);
 661         if (vertex_index) {
 662                 base_addr = LLVMBuildMul(gallivm->builder, get_rel_patch_id(ctx),
 663                                          vertices_per_patch, "");
 664
 665                 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
 666                                          vertex_index, "");
 667
 668                 param_stride = total_vertices;
 669         } else {
 670                 base_addr = get_rel_patch_id(ctx);
 671                 param_stride = num_patches;
 672         }
 673
 674         base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
 675                                  LLVMBuildMul(gallivm->builder, param_index,
 676                                               param_stride, ""), "");
 677
 678         base_addr = LLVMBuildMul(gallivm->builder, base_addr, constant16, "");
 679
 680         if (!vertex_index) {
 681                 LLVMValueRef patch_data_offset =
 682                            unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 16, 16);
 683
 684                 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
 685                                          patch_data_offset, "");
 686         }
 687         return base_addr;
 688 }
 689
 690 static LLVMValueRef get_tcs_tes_buffer_address_from_reg(
 691                                        struct si_shader_context *ctx,
 692                                        const struct tgsi_full_dst_register *dst,
 693                                        const struct tgsi_full_src_register *src)
 694 {
 695         struct gallivm_state *gallivm = ctx->bld_base.base.gallivm;
 696         struct tgsi_shader_info *info = &ctx->shader->selector->info;
 697         ubyte *name, *index, *array_first;
 698         struct tgsi_full_src_register reg;
 699         LLVMValueRef vertex_index = NULL;
 700         LLVMValueRef param_index = NULL;
 701         unsigned param_index_base, param_base;
 702
 703         reg = src ? *src : tgsi_full_src_register_from_dst(dst);
 704
 705         if (reg.Register.Dimension) {
 706
 707                 if (reg.Dimension.Indirect)
 708                         vertex_index = get_indirect_index(ctx, &reg.DimIndirect,
 709                                                           reg.Dimension.Index);
 710                 else
 711                         vertex_index = lp_build_const_int32(gallivm,
 712                                                             reg.Dimension.Index);
 713         }
 714
 715         /* Get information about the register. */
 716         if (reg.Register.File == TGSI_FILE_INPUT) {
 717                 name = info->input_semantic_name;
 718                 index = info->input_semantic_index;
 719                 array_first = info->input_array_first;
 720         } else if (reg.Register.File == TGSI_FILE_OUTPUT) {
 721                 name = info->output_semantic_name;
 722                 index = info->output_semantic_index;
 723                 array_first = info->output_array_first;
 724         } else {
 725                 assert(0);
 726                 return NULL;
 727         }
 728
 729         if (reg.Register.Indirect) {
 730                 if (reg.Indirect.ArrayID)
 731                         param_base = array_first[reg.Indirect.ArrayID];
 732                 else
 733                         param_base = reg.Register.Index;
 734
 735                 param_index = get_indirect_index(ctx, &reg.Indirect,
 736                                                  reg.Register.Index - param_base);
 737
 738         } else {
 739                 param_base = reg.Register.Index;
 740                 param_index = lp_build_const_int32(gallivm, 0);
 741         }
 742
 743         param_index_base = si_shader_io_get_unique_index(name[param_base],
 744                                                          index[param_base]);
 745
 746         param_index = LLVMBuildAdd(gallivm->builder, param_index,
 747                                    lp_build_const_int32(gallivm, param_index_base),
 748                                    "");
 749
 750         return get_tcs_tes_buffer_address(ctx, vertex_index, param_index);
 751 }
 752
 753 /* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4.
 754  * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2),
 755  * or v4i32 (num_channels=3,4). */
 756 static void build_tbuffer_store(struct si_shader_context *ctx,
 757                                 LLVMValueRef rsrc,
 758                                 LLVMValueRef vdata,
 759                                 unsigned num_channels,
 760                                 LLVMValueRef vaddr,
 761                                 LLVMValueRef soffset,
 762                                 unsigned inst_offset,
 763                                 unsigned dfmt,
 764                                 unsigned nfmt,
 765                                 unsigned offen,
 766                                 unsigned idxen,
 767                                 unsigned glc,
 768                                 unsigned slc,
 769                                 unsigned tfe)
 770 {
 771         struct gallivm_state *gallivm = &ctx->gallivm;
 772         LLVMValueRef args[] = {
 773                 rsrc,
 774                 vdata,
 775                 LLVMConstInt(ctx->i32, num_channels, 0),
 776                 vaddr,
 777                 soffset,
 778                 LLVMConstInt(ctx->i32, inst_offset, 0),
 779                 LLVMConstInt(ctx->i32, dfmt, 0),
 780                 LLVMConstInt(ctx->i32, nfmt, 0),
 781                 LLVMConstInt(ctx->i32, offen, 0),
 782                 LLVMConstInt(ctx->i32, idxen, 0),
 783                 LLVMConstInt(ctx->i32, glc, 0),
 784                 LLVMConstInt(ctx->i32, slc, 0),
 785                 LLVMConstInt(ctx->i32, tfe, 0)
 786         };
 787
 788         /* The instruction offset field has 12 bits */
 789         assert(offen || inst_offset < (1 << 12));
 790
 791         /* The intrinsic is overloaded, we need to add a type suffix for overloading to work. */
 792         unsigned func = CLAMP(num_channels, 1, 3) - 1;
 793         const char *types[] = {"i32", "v2i32", "v4i32"};
 794         char name[256];
 795         snprintf(name, sizeof(name), "llvm.SI.tbuffer.store.%s", types[func]);
 796
 797         lp_build_intrinsic(gallivm->builder, name, ctx->voidt,
 798                            args, ARRAY_SIZE(args), 0);
 799 }
 800
 801 static void build_tbuffer_store_dwords(struct si_shader_context *ctx,
 802                                      LLVMValueRef rsrc,
 803                                      LLVMValueRef vdata,
 804                                      unsigned num_channels,
 805                                      LLVMValueRef vaddr,
 806                                      LLVMValueRef soffset,
 807                                      unsigned inst_offset)
 808 {
 809         static unsigned dfmt[] = {
 810                 V_008F0C_BUF_DATA_FORMAT_32,
 811                 V_008F0C_BUF_DATA_FORMAT_32_32,
 812                 V_008F0C_BUF_DATA_FORMAT_32_32_32,
 813                 V_008F0C_BUF_DATA_FORMAT_32_32_32_32
 814         };
 815         assert(num_channels >= 1 && num_channels <= 4);
 816
 817         build_tbuffer_store(ctx, rsrc, vdata, num_channels, vaddr, soffset,
 818                             inst_offset, dfmt[num_channels-1],
 819                             V_008F0C_BUF_NUM_FORMAT_UINT, 1, 0, 1, 1, 0);
 820 }
 821
 822 static LLVMValueRef build_buffer_load(struct si_shader_context *ctx,
 823                                       LLVMValueRef rsrc,
 824                                       int num_channels,
 825                                       LLVMValueRef vindex,
 826                                       LLVMValueRef voffset,
 827                                       LLVMValueRef soffset,
 828                                       unsigned inst_offset,
 829                                       unsigned glc,
 830                                       unsigned slc)
 831 {
 832         struct gallivm_state *gallivm = &ctx->gallivm;
 833         unsigned func = CLAMP(num_channels, 1, 3) - 1;
 834
 835         if (HAVE_LLVM >= 0x309) {
 836                 LLVMValueRef args[] = {
 837                         LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v4i32, ""),
 838                         vindex ? vindex : LLVMConstInt(ctx->i32, 0, 0),
 839                         LLVMConstInt(ctx->i32, inst_offset, 0),
 840                         LLVMConstInt(ctx->i1, glc, 0),
 841                         LLVMConstInt(ctx->i1, slc, 0)
 842                 };
 843
 844                 LLVMTypeRef types[] = {ctx->f32, LLVMVectorType(ctx->f32, 2),
 845                                        ctx->v4f32};
 846                 const char *type_names[] = {"f32", "v2f32", "v4f32"};
 847                 char name[256];
 848
 849                 if (voffset) {
 850                         args[2] = LLVMBuildAdd(gallivm->builder, args[2], voffset,
 851                                                "");
 852                 }
 853
 854                 if (soffset) {
 855                         args[2] = LLVMBuildAdd(gallivm->builder, args[2], soffset,
 856                                                "");
 857                 }
 858
 859                 snprintf(name, sizeof(name), "llvm.amdgcn.buffer.load.%s",
 860                          type_names[func]);
 861
 862                 return lp_build_intrinsic(gallivm->builder, name, types[func], args,
 863                                           ARRAY_SIZE(args), LP_FUNC_ATTR_READONLY);
 864         } else {
 865                 LLVMValueRef args[] = {
 866                         LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v16i8, ""),
 867                         voffset ? voffset : vindex,
 868                         soffset,
 869                         LLVMConstInt(ctx->i32, inst_offset, 0),
 870                         LLVMConstInt(ctx->i32, voffset ? 1 : 0, 0), // offen
 871                         LLVMConstInt(ctx->i32, vindex ? 1 : 0, 0), //idxen
 872                         LLVMConstInt(ctx->i32, glc, 0),
 873                         LLVMConstInt(ctx->i32, slc, 0),
 874                         LLVMConstInt(ctx->i32, 0, 0), // TFE
 875                 };
 876
 877                 LLVMTypeRef types[] = {ctx->i32, LLVMVectorType(ctx->i32, 2),
 878                                        ctx->v4i32};
 879                 const char *type_names[] = {"i32", "v2i32", "v4i32"};
 880                 const char *arg_type = "i32";
 881                 char name[256];
 882
 883                 if (voffset && vindex) {
 884                         LLVMValueRef vaddr[] = {vindex, voffset};
 885
 886                         arg_type = "v2i32";
 887                         args[1] = lp_build_gather_values(gallivm, vaddr, 2);
 888                 }
 889
 890                 snprintf(name, sizeof(name), "llvm.SI.buffer.load.dword.%s.%s",
 891                          type_names[func], arg_type);
 892
 893                 return lp_build_intrinsic(gallivm->builder, name, types[func], args,
 894                                           ARRAY_SIZE(args), LP_FUNC_ATTR_READONLY);
 895         }
 896 }
 897
 898 static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base,
 899                                 enum tgsi_opcode_type type, unsigned swizzle,
 900                                 LLVMValueRef buffer, LLVMValueRef offset,
 901                                 LLVMValueRef base)
 902 {
 903         struct si_shader_context *ctx = si_shader_context(bld_base);
 904         struct gallivm_state *gallivm = bld_base->base.gallivm;
 905         LLVMValueRef value, value2;
 906         LLVMTypeRef llvm_type = tgsi2llvmtype(bld_base, type);
 907         LLVMTypeRef vec_type = LLVMVectorType(llvm_type, 4);
 908
 909         if (swizzle == ~0) {
 910                 value = build_buffer_load(ctx, buffer, 4, NULL, base, offset,
 911                                           0, 1, 0);
 912
 913                 return LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
 914         }
 915
 916         if (!tgsi_type_is_64bit(type)) {
 917                 value = build_buffer_load(ctx, buffer, 4, NULL, base, offset,
 918                                           0, 1, 0);
 919
 920                 value = LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
 921                 return LLVMBuildExtractElement(gallivm->builder, value,
 922                                     lp_build_const_int32(gallivm, swizzle), "");
 923         }
 924
 925         value = build_buffer_load(ctx, buffer, 1, NULL, base, offset,
 926                                   swizzle * 4, 1, 0);
 927
 928         value2 = build_buffer_load(ctx, buffer, 1, NULL, base, offset,
 929                                    swizzle * 4 + 4, 1, 0);
 930
 931         return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
 932 }
 933
 934 /**
 935  * Load from LDS.
 936  *
 937  * \param type          output value type
 938  * \param swizzle       offset (typically 0..3); it can be ~0, which loads a vec4
 939  * \param dw_addr       address in dwords
 940  */
 941 static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
 942                              enum tgsi_opcode_type type, unsigned swizzle,
 943                              LLVMValueRef dw_addr)
 944 {
 945         struct si_shader_context *ctx = si_shader_context(bld_base);
 946         struct gallivm_state *gallivm = bld_base->base.gallivm;
 947         LLVMValueRef value;
 948
 949         if (swizzle == ~0) {
 950                 LLVMValueRef values[TGSI_NUM_CHANNELS];
 951
 952                 for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++)
 953                         values[chan] = lds_load(bld_base, type, chan, dw_addr);
 954
 955                 return lp_build_gather_values(bld_base->base.gallivm, values,
 956                                               TGSI_NUM_CHANNELS);
 957         }
 958
 959         dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
 960                             lp_build_const_int32(gallivm, swizzle));
 961
 962         value = ac_build_indexed_load(&ctx->ac, ctx->lds, dw_addr, false);
 963         if (tgsi_type_is_64bit(type)) {
 964                 LLVMValueRef value2;
 965                 dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
 966                                        lp_build_const_int32(gallivm, 1));
 967                 value2 = ac_build_indexed_load(&ctx->ac, ctx->lds, dw_addr, false);
 968                 return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
 969         }
 970
 971         return LLVMBuildBitCast(gallivm->builder, value,
 972                                 tgsi2llvmtype(bld_base, type), "");
 973 }
 974
 975 /**
 976  * Store to LDS.
 977  *
 978  * \param swizzle       offset (typically 0..3)
 979  * \param dw_addr       address in dwords
 980  * \param value         value to store
 981  */
 982 static void lds_store(struct lp_build_tgsi_context *bld_base,
 983                       unsigned swizzle, LLVMValueRef dw_addr,
 984                       LLVMValueRef value)
 985 {
 986         struct si_shader_context *ctx = si_shader_context(bld_base);
 987         struct gallivm_state *gallivm = bld_base->base.gallivm;
 988
 989         dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
 990                             lp_build_const_int32(gallivm, swizzle));
 991
 992         value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
 993         ac_build_indexed_store(&ctx->ac, ctx->lds,
 994                                dw_addr, value);
 995 }
 996
 997 static LLVMValueRef fetch_input_tcs(
 998         struct lp_build_tgsi_context *bld_base,
 999         const struct tgsi_full_src_register *reg,
1000         enum tgsi_opcode_type type, unsigned swizzle)
1001 {
1002         struct si_shader_context *ctx = si_shader_context(bld_base);
1003         LLVMValueRef dw_addr, stride;
1004
1005         stride = unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 13, 8);
1006         dw_addr = get_tcs_in_current_patch_offset(ctx);
1007         dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
1008
1009         return lds_load(bld_base, type, swizzle, dw_addr);
1010 }
1011
1012 static LLVMValueRef fetch_output_tcs(
1013                 struct lp_build_tgsi_context *bld_base,
1014                 const struct tgsi_full_src_register *reg,
1015                 enum tgsi_opcode_type type, unsigned swizzle)
1016 {
1017         struct si_shader_context *ctx = si_shader_context(bld_base);
1018         LLVMValueRef dw_addr, stride;
1019
1020         if (reg->Register.Dimension) {
1021                 stride = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
1022                 dw_addr = get_tcs_out_current_patch_offset(ctx);
1023                 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
1024         } else {
1025                 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
1026                 dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr);
1027         }
1028
1029         return lds_load(bld_base, type, swizzle, dw_addr);
1030 }
1031
1032 static LLVMValueRef fetch_input_tes(
1033         struct lp_build_tgsi_context *bld_base,
1034         const struct tgsi_full_src_register *reg,
1035         enum tgsi_opcode_type type, unsigned swizzle)
1036 {
1037         struct si_shader_context *ctx = si_shader_context(bld_base);
1038         struct gallivm_state *gallivm = bld_base->base.gallivm;
1039         LLVMValueRef rw_buffers, buffer, base, addr;
1040
1041         rw_buffers = LLVMGetParam(ctx->main_fn,
1042                                   SI_PARAM_RW_BUFFERS);
1043         buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
1044                         lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));
1045
1046         base = LLVMGetParam(ctx->main_fn, ctx->param_oc_lds);
1047         addr = get_tcs_tes_buffer_address_from_reg(ctx, NULL, reg);
1048
1049         return buffer_load(bld_base, type, swizzle, buffer, base, addr);
1050 }
1051
1052 static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
1053                              const struct tgsi_full_instruction *inst,
1054                              const struct tgsi_opcode_info *info,
1055                              LLVMValueRef dst[4])
1056 {
1057         struct si_shader_context *ctx = si_shader_context(bld_base);
1058         struct gallivm_state *gallivm = bld_base->base.gallivm;
1059         const struct tgsi_full_dst_register *reg = &inst->Dst[0];
1060         unsigned chan_index;
1061         LLVMValueRef dw_addr, stride;
1062         LLVMValueRef rw_buffers, buffer, base, buf_addr;
1063         LLVMValueRef values[4];
1064
1065         /* Only handle per-patch and per-vertex outputs here.
1066          * Vectors will be lowered to scalars and this function will be called again.
1067          */
1068         if (reg->Register.File != TGSI_FILE_OUTPUT ||
1069             (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) {
1070                 si_llvm_emit_store(bld_base, inst, info, dst);
1071                 return;
1072         }
1073
1074         if (reg->Register.Dimension) {
1075                 stride = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
1076                 dw_addr = get_tcs_out_current_patch_offset(ctx);
1077                 dw_addr = get_dw_address(ctx, reg, NULL, stride, dw_addr);
1078         } else {
1079                 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
1080                 dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr);
1081         }
1082
1083         rw_buffers = LLVMGetParam(ctx->main_fn,
1084                                   SI_PARAM_RW_BUFFERS);
1085         buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
1086                         lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));
1087
1088         base = LLVMGetParam(ctx->main_fn, ctx->param_oc_lds);
1089         buf_addr = get_tcs_tes_buffer_address_from_reg(ctx, reg, NULL);
1090
1091
1092         TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan_index) {
1093                 LLVMValueRef value = dst[chan_index];
1094
1095                 if (inst->Instruction.Saturate)
1096                         value = si_llvm_saturate(bld_base, value);
1097
1098                 lds_store(bld_base, chan_index, dw_addr, value);
1099
1100                 value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
1101                 values[chan_index] = value;
1102
1103                 if (inst->Dst[0].Register.WriteMask != 0xF) {
1104                         build_tbuffer_store_dwords(ctx, buffer, value, 1,
1105                                                    buf_addr, base,
1106                                                    4 * chan_index);
1107                 }
1108         }
1109
1110         if (inst->Dst[0].Register.WriteMask == 0xF) {
1111                 LLVMValueRef value = lp_build_gather_values(bld_base->base.gallivm,
1112                                                             values, 4);
1113                 build_tbuffer_store_dwords(ctx, buffer, value, 4, buf_addr,
1114                                            base, 0);
1115         }
1116 }
1117
1118 static LLVMValueRef fetch_input_gs(
1119         struct lp_build_tgsi_context *bld_base,
1120         const struct tgsi_full_src_register *reg,
1121         enum tgsi_opcode_type type,
1122         unsigned swizzle)
1123 {
1124         struct lp_build_context *base = &bld_base->base;
1125         struct si_shader_context *ctx = si_shader_context(bld_base);
1126         struct si_shader *shader = ctx->shader;
1127         struct lp_build_context *uint = &ctx->bld_base.uint_bld;
1128         struct gallivm_state *gallivm = base->gallivm;
1129         LLVMValueRef vtx_offset;
1130         LLVMValueRef args[9];
1131         unsigned vtx_offset_param;
1132         struct tgsi_shader_info *info = &shader->selector->info;
1133         unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
1134         unsigned semantic_index = info->input_semantic_index[reg->Register.Index];
1135         unsigned param;
1136         LLVMValueRef value;
1137
1138         if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
1139                 return get_primitive_id(bld_base, swizzle);
1140
1141         if (!reg->Register.Dimension)
1142                 return NULL;
1143
1144         if (swizzle == ~0) {
1145                 LLVMValueRef values[TGSI_NUM_CHANNELS];
1146                 unsigned chan;
1147                 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1148                         values[chan] = fetch_input_gs(bld_base, reg, type, chan);
1149                 }
1150                 return lp_build_gather_values(bld_base->base.gallivm, values,
1151                                               TGSI_NUM_CHANNELS);
1152         }
1153
1154         /* Get the vertex offset parameter */
1155         vtx_offset_param = reg->Dimension.Index;
1156         if (vtx_offset_param < 2) {
1157                 vtx_offset_param += SI_PARAM_VTX0_OFFSET;
1158         } else {
1159                 assert(vtx_offset_param < 6);
1160                 vtx_offset_param += SI_PARAM_VTX2_OFFSET - 2;
1161         }
1162         vtx_offset = lp_build_mul_imm(uint,
1163                                       LLVMGetParam(ctx->main_fn,
1164                                                    vtx_offset_param),
1165                                       4);
1166
1167         param = si_shader_io_get_unique_index(semantic_name, semantic_index);
1168         args[0] = ctx->esgs_ring;
1169         args[1] = vtx_offset;
1170         args[2] = lp_build_const_int32(gallivm, (param * 4 + swizzle) * 256);
1171         args[3] = uint->zero;
1172         args[4] = uint->one;  /* OFFEN */
1173         args[5] = uint->zero; /* IDXEN */
1174         args[6] = uint->one;  /* GLC */
1175         args[7] = uint->zero; /* SLC */
1176         args[8] = uint->zero; /* TFE */
1177
1178         value = lp_build_intrinsic(gallivm->builder,
1179                                    "llvm.SI.buffer.load.dword.i32.i32",
1180                                    ctx->i32, args, 9,
1181                                    LP_FUNC_ATTR_READONLY);
1182         if (tgsi_type_is_64bit(type)) {
1183                 LLVMValueRef value2;
1184                 args[2] = lp_build_const_int32(gallivm, (param * 4 + swizzle + 1) * 256);
1185                 value2 = lp_build_intrinsic(gallivm->builder,
1186                                             "llvm.SI.buffer.load.dword.i32.i32",
1187                                             ctx->i32, args, 9,
1188                                             LP_FUNC_ATTR_READONLY);
1189                 return si_llvm_emit_fetch_64bit(bld_base, type,
1190                                                 value, value2);
1191         }
1192         return LLVMBuildBitCast(gallivm->builder,
1193                                 value,
1194                                 tgsi2llvmtype(bld_base, type), "");
1195 }
1196
1197 static int lookup_interp_param_index(unsigned interpolate, unsigned location)
1198 {
1199         switch (interpolate) {
1200         case TGSI_INTERPOLATE_CONSTANT:
1201                 return 0;
1202
1203         case TGSI_INTERPOLATE_LINEAR:
1204                 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1205                         return SI_PARAM_LINEAR_SAMPLE;
1206                 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1207                         return SI_PARAM_LINEAR_CENTROID;
1208                 else
1209                         return SI_PARAM_LINEAR_CENTER;
1210                 break;
1211         case TGSI_INTERPOLATE_COLOR:
1212         case TGSI_INTERPOLATE_PERSPECTIVE:
1213                 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1214                         return SI_PARAM_PERSP_SAMPLE;
1215                 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1216                         return SI_PARAM_PERSP_CENTROID;
1217                 else
1218                         return SI_PARAM_PERSP_CENTER;
1219                 break;
1220         default:
1221                 fprintf(stderr, "Warning: Unhandled interpolation mode.\n");
1222                 return -1;
1223         }
1224 }
1225
1226 /**
1227  * Interpolate a fragment shader input.
1228  *
1229  * @param ctx           context
1230  * @param input_index           index of the input in hardware
1231  * @param semantic_name         TGSI_SEMANTIC_*
1232  * @param semantic_index        semantic index
1233  * @param num_interp_inputs     number of all interpolated inputs (= BCOLOR offset)
1234  * @param colors_read_mask      color components read (4 bits for each color, 8 bits in total)
1235  * @param interp_param          interpolation weights (i,j)
1236  * @param prim_mask             SI_PARAM_PRIM_MASK
1237  * @param face                  SI_PARAM_FRONT_FACE
1238  * @param result                the return value (4 components)
1239  */
1240 static void interp_fs_input(struct si_shader_context *ctx,
1241                             unsigned input_index,
1242                             unsigned semantic_name,
1243                             unsigned semantic_index,
1244                             unsigned num_interp_inputs,
1245                             unsigned colors_read_mask,
1246                             LLVMValueRef interp_param,
1247                             LLVMValueRef prim_mask,
1248                             LLVMValueRef face,
1249                             LLVMValueRef result[4])
1250 {
1251         struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
1252         struct lp_build_context *base = &bld_base->base;
1253         struct lp_build_context *uint = &bld_base->uint_bld;
1254         struct gallivm_state *gallivm = base->gallivm;
1255         LLVMValueRef attr_number;
1256         LLVMValueRef i, j;
1257
1258         unsigned chan;
1259
1260         /* fs.constant returns the param from the middle vertex, so it's not
1261          * really useful for flat shading. It's meant to be used for custom
1262          * interpolation (but the intrinsic can't fetch from the other two
1263          * vertices).
1264          *
1265          * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
1266          * to do the right thing. The only reason we use fs.constant is that
1267          * fs.interp cannot be used on integers, because they can be equal
1268          * to NaN.
1269          *
1270          * When interp is false we will use fs.constant or for newer llvm,
1271          * amdgcn.interp.mov.
1272          */
1273         bool interp = interp_param != NULL;
1274
1275         attr_number = lp_build_const_int32(gallivm, input_index);
1276
1277         if (interp) {
1278                 interp_param = LLVMBuildBitCast(gallivm->builder, interp_param,
1279                                                 LLVMVectorType(ctx->f32, 2), "");
1280
1281                 i = LLVMBuildExtractElement(gallivm->builder, interp_param,
1282                                                 uint->zero, "");
1283                 j = LLVMBuildExtractElement(gallivm->builder, interp_param,
1284                                                 uint->one, "");
1285         }
1286
1287         if (semantic_name == TGSI_SEMANTIC_COLOR &&
1288             ctx->shader->key.part.ps.prolog.color_two_side) {
1289                 LLVMValueRef is_face_positive;
1290                 LLVMValueRef back_attr_number;
1291
1292                 /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
1293                  * otherwise it's at offset "num_inputs".
1294                  */
1295                 unsigned back_attr_offset = num_interp_inputs;
1296                 if (semantic_index == 1 && colors_read_mask & 0xf)
1297                         back_attr_offset += 1;
1298
1299                 back_attr_number = lp_build_const_int32(gallivm, back_attr_offset);
1300
1301                 is_face_positive = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
1302                                                  face, uint->zero, "");
1303
1304                 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1305                         LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
1306                         LLVMValueRef front, back;
1307
1308                         if (interp) {
1309                                 front = ac_build_fs_interp(&ctx->ac, llvm_chan,
1310                                                         attr_number, prim_mask,
1311                                                         i, j);
1312                                 back = ac_build_fs_interp(&ctx->ac, llvm_chan,
1313                                                         back_attr_number, prim_mask,
1314                                                         i, j);
1315                         } else {
1316                                 front = ac_build_fs_interp_mov(&ctx->ac,
1317                                         lp_build_const_int32(gallivm, 2), /* P0 */
1318                                         llvm_chan, attr_number, prim_mask);
1319                                 back = ac_build_fs_interp_mov(&ctx->ac,
1320                                         lp_build_const_int32(gallivm, 2), /* P0 */
1321                                         llvm_chan, back_attr_number, prim_mask);
1322                         }
1323
1324                         result[chan] = LLVMBuildSelect(gallivm->builder,
1325                                                 is_face_positive,
1326                                                 front,
1327                                                 back,
1328                                                 "");
1329                 }
1330         } else if (semantic_name == TGSI_SEMANTIC_FOG) {
1331                 if (interp) {
1332                         result[0] = ac_build_fs_interp(&ctx->ac, uint->zero,
1333                                                        attr_number, prim_mask, i, j);
1334                 } else {
1335                         result[0] = ac_build_fs_interp_mov(&ctx->ac, uint->zero,
1336                                                            lp_build_const_int32(gallivm, 2), /* P0 */
1337                                                            attr_number, prim_mask);
1338                 }
1339                 result[1] =
1340                 result[2] = lp_build_const_float(gallivm, 0.0f);
1341                 result[3] = lp_build_const_float(gallivm, 1.0f);
1342         } else {
1343                 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1344                         LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
1345
1346                         if (interp) {
1347                                 result[chan] = ac_build_fs_interp(&ctx->ac,
1348                                         llvm_chan, attr_number, prim_mask, i, j);
1349                         } else {
1350                                 result[chan] = ac_build_fs_interp_mov(&ctx->ac,
1351                                         lp_build_const_int32(gallivm, 2), /* P0 */
1352                                         llvm_chan, attr_number, prim_mask);
1353                         }
1354                 }
1355         }
1356 }
1357
1358 static void declare_input_fs(
1359         struct si_shader_context *radeon_bld,
1360         unsigned input_index,
1361         const struct tgsi_full_declaration *decl,
1362         LLVMValueRef out[4])
1363 {
1364         struct lp_build_context *base = &radeon_bld->bld_base.base;
1365         struct si_shader_context *ctx =
1366                 si_shader_context(&radeon_bld->bld_base);
1367         struct si_shader *shader = ctx->shader;
1368         LLVMValueRef main_fn = radeon_bld->main_fn;
1369         LLVMValueRef interp_param = NULL;
1370         int interp_param_idx;
1371
1372         /* Get colors from input VGPRs (set by the prolog). */
1373         if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR) {
1374                 unsigned i = decl->Semantic.Index;
1375                 unsigned colors_read = shader->selector->info.colors_read;
1376                 unsigned mask = colors_read >> (i * 4);
1377                 unsigned offset = SI_PARAM_POS_FIXED_PT + 1 +
1378                                   (i ? util_bitcount(colors_read & 0xf) : 0);
1379
1380                 out[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef;
1381                 out[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef;
1382                 out[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef;
1383                 out[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef;
1384                 return;
1385         }
1386
1387         interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate,
1388                                                      decl->Interp.Location);
1389         if (interp_param_idx == -1)
1390                 return;
1391         else if (interp_param_idx) {
1392                 interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
1393         }
1394
1395         if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR &&
1396             decl->Interp.Interpolate == TGSI_INTERPOLATE_COLOR &&
1397             ctx->shader->key.part.ps.prolog.flatshade_colors)
1398                 interp_param = NULL; /* load the constant color */
1399
1400         interp_fs_input(ctx, input_index, decl->Semantic.Name,
1401                         decl->Semantic.Index, shader->selector->info.num_inputs,
1402                         shader->selector->info.colors_read, interp_param,
1403                         LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK),
1404                         LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE),
1405                         &out[0]);
1406 }
1407
1408 static LLVMValueRef get_sample_id(struct si_shader_context *radeon_bld)
1409 {
1410         return unpack_param(si_shader_context(&radeon_bld->bld_base),
1411                             SI_PARAM_ANCILLARY, 8, 4);
1412 }
1413
1414 /**
1415  * Set range metadata on an instruction.  This can only be used on load and
1416  * call instructions.  If you know an instruction can only produce the values
1417  * 0, 1, 2, you would do set_range_metadata(value, 0, 3);
1418  * \p lo is the minimum value inclusive.
1419  * \p hi is the maximum value exclusive.
1420  */
1421 static void set_range_metadata(struct si_shader_context *ctx,
1422                                LLVMValueRef value, unsigned lo, unsigned hi)
1423 {
1424         LLVMValueRef range_md, md_args[2];
1425         LLVMTypeRef type = LLVMTypeOf(value);
1426         LLVMContextRef context = LLVMGetTypeContext(type);
1427
1428         md_args[0] = LLVMConstInt(type, lo, false);
1429         md_args[1] = LLVMConstInt(type, hi, false);
1430         range_md = LLVMMDNodeInContext(context, md_args, 2);
1431         LLVMSetMetadata(value, ctx->range_md_kind, range_md);
1432 }
1433
1434 static LLVMValueRef get_thread_id(struct si_shader_context *ctx)
1435 {
1436         struct gallivm_state *gallivm = &ctx->gallivm;
1437         LLVMValueRef tid;
1438
1439         if (HAVE_LLVM < 0x0308) {
1440                 tid = lp_build_intrinsic(gallivm->builder, "llvm.SI.tid",
1441                                 ctx->i32,   NULL, 0, LP_FUNC_ATTR_READNONE);
1442         } else {
1443                 LLVMValueRef tid_args[2];
1444                 tid_args[0] = lp_build_const_int32(gallivm, 0xffffffff);
1445                 tid_args[1] = lp_build_const_int32(gallivm, 0);
1446                 tid_args[1] = lp_build_intrinsic(gallivm->builder,
1447                                         "llvm.amdgcn.mbcnt.lo", ctx->i32,
1448                                         tid_args, 2, LP_FUNC_ATTR_READNONE);
1449
1450                 tid = lp_build_intrinsic(gallivm->builder,
1451                                         "llvm.amdgcn.mbcnt.hi", ctx->i32,
1452                                         tid_args, 2, LP_FUNC_ATTR_READNONE);
1453         }
1454         set_range_metadata(ctx, tid, 0, 64);
1455         return tid;
1456 }
1457
1458 /**
1459  * Load a dword from a constant buffer.
1460  */
1461 static LLVMValueRef buffer_load_const(struct si_shader_context *ctx,
1462                                       LLVMValueRef resource,
1463                                       LLVMValueRef offset)
1464 {
1465         LLVMBuilderRef builder = ctx->gallivm.builder;
1466         LLVMValueRef args[2] = {resource, offset};
1467
1468         return lp_build_intrinsic(builder, "llvm.SI.load.const", ctx->f32, args, 2,
1469                                LP_FUNC_ATTR_READNONE);
1470 }
1471
1472 static LLVMValueRef load_sample_position(struct si_shader_context *radeon_bld, LLVMValueRef sample_id)
1473 {
1474         struct si_shader_context *ctx =
1475                 si_shader_context(&radeon_bld->bld_base);
1476         struct lp_build_context *uint_bld = &radeon_bld->bld_base.uint_bld;
1477         struct gallivm_state *gallivm = &radeon_bld->gallivm;
1478         LLVMBuilderRef builder = gallivm->builder;
1479         LLVMValueRef desc = LLVMGetParam(ctx->main_fn, SI_PARAM_RW_BUFFERS);
1480         LLVMValueRef buf_index = lp_build_const_int32(gallivm, SI_PS_CONST_SAMPLE_POSITIONS);
1481         LLVMValueRef resource = ac_build_indexed_load_const(&ctx->ac, desc, buf_index);
1482
1483         /* offset = sample_id * 8  (8 = 2 floats containing samplepos.xy) */
1484         LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, sample_id, 8);
1485         LLVMValueRef offset1 = LLVMBuildAdd(builder, offset0, lp_build_const_int32(gallivm, 4), "");
1486
1487         LLVMValueRef pos[4] = {
1488                 buffer_load_const(ctx, resource, offset0),
1489                 buffer_load_const(ctx, resource, offset1),
1490                 lp_build_const_float(gallivm, 0),
1491                 lp_build_const_float(gallivm, 0)
1492         };
1493
1494         return lp_build_gather_values(gallivm, pos, 4);
1495 }
1496
1497 static void declare_system_value(
1498         struct si_shader_context *radeon_bld,
1499         unsigned index,
1500         const struct tgsi_full_declaration *decl)
1501 {
1502         struct si_shader_context *ctx =
1503                 si_shader_context(&radeon_bld->bld_base);
1504         struct lp_build_context *bld = &radeon_bld->bld_base.base;
1505         struct gallivm_state *gallivm = &radeon_bld->gallivm;
1506         LLVMValueRef value = 0;
1507
1508         switch (decl->Semantic.Name) {
1509         case TGSI_SEMANTIC_INSTANCEID:
1510                 value = LLVMGetParam(radeon_bld->main_fn,
1511                                      ctx->param_instance_id);
1512                 break;
1513
1514         case TGSI_SEMANTIC_VERTEXID:
1515                 value = LLVMBuildAdd(gallivm->builder,
1516                                      LLVMGetParam(radeon_bld->main_fn,
1517                                                   ctx->param_vertex_id),
1518                                      LLVMGetParam(radeon_bld->main_fn,
1519                                                   SI_PARAM_BASE_VERTEX), "");
1520                 break;
1521
1522         case TGSI_SEMANTIC_VERTEXID_NOBASE:
1523                 value = LLVMGetParam(radeon_bld->main_fn,
1524                                      ctx->param_vertex_id);
1525                 break;
1526
1527         case TGSI_SEMANTIC_BASEVERTEX:
1528                 value = LLVMGetParam(radeon_bld->main_fn,
1529                                      SI_PARAM_BASE_VERTEX);
1530                 break;
1531
1532         case TGSI_SEMANTIC_BASEINSTANCE:
1533                 value = LLVMGetParam(radeon_bld->main_fn,
1534                                      SI_PARAM_START_INSTANCE);
1535                 break;
1536
1537         case TGSI_SEMANTIC_DRAWID:
1538                 value = LLVMGetParam(radeon_bld->main_fn,
1539                                      SI_PARAM_DRAWID);
1540                 break;
1541
1542         case TGSI_SEMANTIC_INVOCATIONID:
1543                 if (ctx->type == PIPE_SHADER_TESS_CTRL)
1544                         value = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);
1545                 else if (ctx->type == PIPE_SHADER_GEOMETRY)
1546                         value = LLVMGetParam(radeon_bld->main_fn,
1547                                              SI_PARAM_GS_INSTANCE_ID);
1548                 else
1549                         assert(!"INVOCATIONID not implemented");
1550                 break;
1551
1552         case TGSI_SEMANTIC_POSITION:
1553         {
1554                 LLVMValueRef pos[4] = {
1555                         LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_X_FLOAT),
1556                         LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Y_FLOAT),
1557                         LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Z_FLOAT),
1558                         lp_build_emit_llvm_unary(&radeon_bld->bld_base, TGSI_OPCODE_RCP,
1559                                                  LLVMGetParam(radeon_bld->main_fn,
1560                                                               SI_PARAM_POS_W_FLOAT)),
1561                 };
1562                 value = lp_build_gather_values(gallivm, pos, 4);
1563                 break;
1564         }
1565
1566         case TGSI_SEMANTIC_FACE:
1567                 value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_FRONT_FACE);
1568                 break;
1569
1570         case TGSI_SEMANTIC_SAMPLEID:
1571                 value = get_sample_id(radeon_bld);
1572                 break;
1573
1574         case TGSI_SEMANTIC_SAMPLEPOS: {
1575                 LLVMValueRef pos[4] = {
1576                         LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_X_FLOAT),
1577                         LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Y_FLOAT),
1578                         lp_build_const_float(gallivm, 0),
1579                         lp_build_const_float(gallivm, 0)
1580                 };
1581                 pos[0] = lp_build_emit_llvm_unary(&radeon_bld->bld_base,
1582                                                   TGSI_OPCODE_FRC, pos[0]);
1583                 pos[1] = lp_build_emit_llvm_unary(&radeon_bld->bld_base,
1584                                                   TGSI_OPCODE_FRC, pos[1]);
1585                 value = lp_build_gather_values(gallivm, pos, 4);
1586                 break;
1587         }
1588
1589         case TGSI_SEMANTIC_SAMPLEMASK:
1590                 /* This can only occur with the OpenGL Core profile, which
1591                  * doesn't support smoothing.
1592                  */
1593                 value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_SAMPLE_COVERAGE);
1594                 break;
1595
1596         case TGSI_SEMANTIC_TESSCOORD:
1597         {
1598                 LLVMValueRef coord[4] = {
1599                         LLVMGetParam(radeon_bld->main_fn, ctx->param_tes_u),
1600                         LLVMGetParam(radeon_bld->main_fn, ctx->param_tes_v),
1601                         bld->zero,
1602                         bld->zero
1603                 };
1604
1605                 /* For triangles, the vector should be (u, v, 1-u-v). */
1606                 if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
1607                     PIPE_PRIM_TRIANGLES)
1608                         coord[2] = lp_build_sub(bld, bld->one,
1609                                                 lp_build_add(bld, coord[0], coord[1]));
1610
1611                 value = lp_build_gather_values(gallivm, coord, 4);
1612                 break;
1613         }
1614
1615         case TGSI_SEMANTIC_VERTICESIN:
1616                 if (ctx->type == PIPE_SHADER_TESS_CTRL)
1617                         value = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 26, 6);
1618                 else if (ctx->type == PIPE_SHADER_TESS_EVAL)
1619                         value = unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 9, 7);
1620                 else
1621                         assert(!"invalid shader stage for TGSI_SEMANTIC_VERTICESIN");
1622                 break;
1623
1624         case TGSI_SEMANTIC_TESSINNER:
1625         case TGSI_SEMANTIC_TESSOUTER:
1626         {
1627                 LLVMValueRef rw_buffers, buffer, base, addr;
1628                 int param = si_shader_io_get_unique_index(decl->Semantic.Name, 0);
1629
1630                 rw_buffers = LLVMGetParam(ctx->main_fn,
1631                                         SI_PARAM_RW_BUFFERS);
1632                 buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
1633                         lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));
1634
1635                 base = LLVMGetParam(ctx->main_fn, ctx->param_oc_lds);
1636                 addr = get_tcs_tes_buffer_address(ctx, NULL,
1637                                           lp_build_const_int32(gallivm, param));
1638
1639                 value = buffer_load(&radeon_bld->bld_base, TGSI_TYPE_FLOAT,
1640                                     ~0, buffer, base, addr);
1641
1642                 break;
1643         }
1644
1645         case TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI:
1646         case TGSI_SEMANTIC_DEFAULT_TESSINNER_SI:
1647         {
1648                 LLVMValueRef buf, slot, val[4];
1649                 int i, offset;
1650
1651                 slot = lp_build_const_int32(gallivm, SI_HS_CONST_DEFAULT_TESS_LEVELS);
1652                 buf = LLVMGetParam(ctx->main_fn, SI_PARAM_RW_BUFFERS);
1653                 buf = ac_build_indexed_load_const(&ctx->ac, buf, slot);
1654                 offset = decl->Semantic.Name == TGSI_SEMANTIC_DEFAULT_TESSINNER_SI ? 4 : 0;
1655
1656                 for (i = 0; i < 4; i++)
1657                         val[i] = buffer_load_const(ctx, buf,
1658                                                    lp_build_const_int32(gallivm, (offset + i) * 4));
1659                 value = lp_build_gather_values(gallivm, val, 4);
1660                 break;
1661         }
1662
1663         case TGSI_SEMANTIC_PRIMID:
1664                 value = get_primitive_id(&radeon_bld->bld_base, 0);
1665                 break;
1666
1667         case TGSI_SEMANTIC_GRID_SIZE:
1668                 value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_GRID_SIZE);
1669                 break;
1670
1671         case TGSI_SEMANTIC_BLOCK_SIZE:
1672         {
1673                 LLVMValueRef values[3];
1674                 unsigned i;
1675                 unsigned *properties = ctx->shader->selector->info.properties;
1676
1677                 if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) {
1678                         unsigned sizes[3] = {
1679                                 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
1680                                 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
1681                                 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
1682                         };
1683
1684                         for (i = 0; i < 3; ++i)
1685                                 values[i] = lp_build_const_int32(gallivm, sizes[i]);
1686
1687                         value = lp_build_gather_values(gallivm, values, 3);
1688                 } else {
1689                         value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_BLOCK_SIZE);
1690                 }
1691                 break;
1692         }
1693
1694         case TGSI_SEMANTIC_BLOCK_ID:
1695                 value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_BLOCK_ID);
1696                 break;
1697
1698         case TGSI_SEMANTIC_THREAD_ID:
1699                 value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_THREAD_ID);
1700                 break;
1701
1702         case TGSI_SEMANTIC_HELPER_INVOCATION:
1703                 if (HAVE_LLVM >= 0x0309) {
1704                         value = lp_build_intrinsic(gallivm->builder,
1705                                                    "llvm.amdgcn.ps.live",
1706                                                    ctx->i1, NULL, 0,
1707                                                    LP_FUNC_ATTR_READNONE);
1708                         value = LLVMBuildNot(gallivm->builder, value, "");
1709                         value = LLVMBuildSExt(gallivm->builder, value, ctx->i32, "");
1710                 } else {
1711                         assert(!"TGSI_SEMANTIC_HELPER_INVOCATION unsupported");
1712                         return;
1713                 }
1714                 break;
1715
1716         default:
1717                 assert(!"unknown system value");
1718                 return;
1719         }
1720
1721         radeon_bld->system_values[index] = value;
1722 }
1723
1724 static void declare_compute_memory(struct si_shader_context *radeon_bld,
1725                                    const struct tgsi_full_declaration *decl)
1726 {
1727         struct si_shader_context *ctx =
1728                 si_shader_context(&radeon_bld->bld_base);
1729         struct si_shader_selector *sel = ctx->shader->selector;
1730         struct gallivm_state *gallivm = &radeon_bld->gallivm;
1731
1732         LLVMTypeRef i8p = LLVMPointerType(ctx->i8, LOCAL_ADDR_SPACE);
1733         LLVMValueRef var;
1734
1735         assert(decl->Declaration.MemType == TGSI_MEMORY_TYPE_SHARED);
1736         assert(decl->Range.First == decl->Range.Last);
1737         assert(!ctx->shared_memory);
1738
1739         var = LLVMAddGlobalInAddressSpace(gallivm->module,
1740                                           LLVMArrayType(ctx->i8, sel->local_size),
1741                                           "compute_lds",
1742                                           LOCAL_ADDR_SPACE);
1743         LLVMSetAlignment(var, 4);
1744
1745         ctx->shared_memory = LLVMBuildBitCast(gallivm->builder, var, i8p, "");
1746 }
1747
1748 static LLVMValueRef load_const_buffer_desc(struct si_shader_context *ctx, int i)
1749 {
1750         LLVMValueRef list_ptr = LLVMGetParam(ctx->main_fn,
1751                                              SI_PARAM_CONST_BUFFERS);
1752
1753         return ac_build_indexed_load_const(&ctx->ac, list_ptr,
1754                                         LLVMConstInt(ctx->i32, i, 0));
1755 }
1756
1757 static LLVMValueRef fetch_constant(
1758         struct lp_build_tgsi_context *bld_base,
1759         const struct tgsi_full_src_register *reg,
1760         enum tgsi_opcode_type type,
1761         unsigned swizzle)
1762 {
1763         struct si_shader_context *ctx = si_shader_context(bld_base);
1764         struct lp_build_context *base = &bld_base->base;
1765         const struct tgsi_ind_register *ireg = &reg->Indirect;
1766         unsigned buf, idx;
1767
1768         LLVMValueRef addr, bufp;
1769         LLVMValueRef result;
1770
1771         if (swizzle == LP_CHAN_ALL) {
1772                 unsigned chan;
1773                 LLVMValueRef values[4];
1774                 for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
1775                         values[chan] = fetch_constant(bld_base, reg, type, chan);
1776
1777                 return lp_build_gather_values(bld_base->base.gallivm, values, 4);
1778         }
1779
1780         buf = reg->Register.Dimension ? reg->Dimension.Index : 0;
1781         idx = reg->Register.Index * 4 + swizzle;
1782
1783         if (reg->Register.Dimension && reg->Dimension.Indirect) {
1784                 LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, SI_PARAM_CONST_BUFFERS);
1785                 LLVMValueRef index;
1786                 index = get_bounded_indirect_index(ctx, &reg->DimIndirect,
1787                                                    reg->Dimension.Index,
1788                                                    SI_NUM_CONST_BUFFERS);
1789                 bufp = ac_build_indexed_load_const(&ctx->ac, ptr, index);
1790         } else
1791                 bufp = load_const_buffer_desc(ctx, buf);
1792
1793         if (reg->Register.Indirect) {
1794                 addr = ctx->addrs[ireg->Index][ireg->Swizzle];
1795                 addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg");
1796                 addr = lp_build_mul_imm(&bld_base->uint_bld, addr, 16);
1797                 addr = lp_build_add(&bld_base->uint_bld, addr,
1798                                     lp_build_const_int32(base->gallivm, idx * 4));
1799         } else {
1800                 addr = LLVMConstInt(ctx->i32, idx * 4, 0);
1801         }
1802
1803         result = buffer_load_const(ctx, bufp, addr);
1804
1805         if (!tgsi_type_is_64bit(type))
1806                 result = bitcast(bld_base, type, result);
1807         else {
1808                 LLVMValueRef addr2, result2;
1809
1810                 addr2 = lp_build_add(&bld_base->uint_bld, addr,
1811                                      LLVMConstInt(ctx->i32, 4, 0));
1812                 result2 = buffer_load_const(ctx, bufp, addr2);
1813
1814                 result = si_llvm_emit_fetch_64bit(bld_base, type,
1815                                                   result, result2);
1816         }
1817         return result;
1818 }
1819
1820 /* Upper 16 bits must be zero. */
1821 static LLVMValueRef si_llvm_pack_two_int16(struct gallivm_state *gallivm,
1822                                            LLVMValueRef val[2])
1823 {
1824         return LLVMBuildOr(gallivm->builder, val[0],
1825                            LLVMBuildShl(gallivm->builder, val[1],
1826                                         lp_build_const_int32(gallivm, 16),
1827                                         ""), "");
1828 }
1829
1830 /* Upper 16 bits are ignored and will be dropped. */
1831 static LLVMValueRef si_llvm_pack_two_int32_as_int16(struct gallivm_state *gallivm,
1832                                                     LLVMValueRef val[2])
1833 {
1834         LLVMValueRef v[2] = {
1835                 LLVMBuildAnd(gallivm->builder, val[0],
1836                              lp_build_const_int32(gallivm, 0xffff), ""),
1837                 val[1],
1838         };
1839         return si_llvm_pack_two_int16(gallivm, v);
1840 }
1841
1842 /* Initialize arguments for the shader export intrinsic */
1843 static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
1844                                      LLVMValueRef *values,
1845                                      unsigned target,
1846                                      LLVMValueRef *args)
1847 {
1848         struct si_shader_context *ctx = si_shader_context(bld_base);
1849         struct lp_build_context *uint = &ctx->bld_base.uint_bld;
1850         struct lp_build_context *base = &bld_base->base;
1851         struct gallivm_state *gallivm = base->gallivm;
1852         LLVMBuilderRef builder = base->gallivm->builder;
1853         LLVMValueRef val[4];
1854         unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR;
1855         unsigned chan;
1856         bool is_int8;
1857
1858         /* Default is 0xf. Adjusted below depending on the format. */
1859         args[0] = lp_build_const_int32(base->gallivm, 0xf); /* writemask */
1860
1861         /* Specify whether the EXEC mask represents the valid mask */
1862         args[1] = uint->zero;
1863
1864         /* Specify whether this is the last export */
1865         args[2] = uint->zero;
1866
1867         /* Specify the target we are exporting */
1868         args[3] = lp_build_const_int32(base->gallivm, target);
1869
1870         if (ctx->type == PIPE_SHADER_FRAGMENT) {
1871                 const struct si_shader_key *key = &ctx->shader->key;
1872                 unsigned col_formats = key->part.ps.epilog.spi_shader_col_format;
1873                 int cbuf = target - V_008DFC_SQ_EXP_MRT;
1874
1875                 assert(cbuf >= 0 && cbuf < 8);
1876                 spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
1877                 is_int8 = (key->part.ps.epilog.color_is_int8 >> cbuf) & 0x1;
1878         }
1879
1880         args[4] = uint->zero; /* COMPR flag */
1881         args[5] = base->undef;
1882         args[6] = base->undef;
1883         args[7] = base->undef;
1884         args[8] = base->undef;
1885
1886         switch (spi_shader_col_format) {
1887         case V_028714_SPI_SHADER_ZERO:
1888                 args[0] = uint->zero; /* writemask */
1889                 args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_NULL);
1890                 break;
1891
1892         case V_028714_SPI_SHADER_32_R:
1893                 args[0] = uint->one; /* writemask */
1894                 args[5] = values[0];
1895                 break;
1896
1897         case V_028714_SPI_SHADER_32_GR:
1898                 args[0] = lp_build_const_int32(base->gallivm, 0x3); /* writemask */
1899                 args[5] = values[0];
1900                 args[6] = values[1];
1901                 break;
1902
1903         case V_028714_SPI_SHADER_32_AR:
1904                 args[0] = lp_build_const_int32(base->gallivm, 0x9); /* writemask */
1905                 args[5] = values[0];
1906                 args[8] = values[3];
1907                 break;
1908
1909         case V_028714_SPI_SHADER_FP16_ABGR:
1910                 args[4] = uint->one; /* COMPR flag */
1911
1912                 for (chan = 0; chan < 2; chan++) {
1913                         LLVMValueRef pack_args[2] = {
1914                                 values[2 * chan],
1915                                 values[2 * chan + 1]
1916                         };
1917                         LLVMValueRef packed;
1918
1919                         packed = lp_build_intrinsic(base->gallivm->builder,
1920                                                     "llvm.SI.packf16",
1921                                                     ctx->i32, pack_args, 2,
1922                                                     LP_FUNC_ATTR_READNONE);
1923                         args[chan + 5] =
1924                                 LLVMBuildBitCast(base->gallivm->builder,
1925                                                  packed, ctx->f32, "");
1926                 }
1927                 break;
1928
1929         case V_028714_SPI_SHADER_UNORM16_ABGR:
1930                 for (chan = 0; chan < 4; chan++) {
1931                         val[chan] = si_llvm_saturate(bld_base, values[chan]);
1932                         val[chan] = LLVMBuildFMul(builder, val[chan],
1933                                                   lp_build_const_float(gallivm, 65535), "");
1934                         val[chan] = LLVMBuildFAdd(builder, val[chan],
1935                                                   lp_build_const_float(gallivm, 0.5), "");
1936                         val[chan] = LLVMBuildFPToUI(builder, val[chan],
1937                                                     ctx->i32, "");
1938                 }
1939
1940                 args[4] = uint->one; /* COMPR flag */
1941                 args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1942                                   si_llvm_pack_two_int16(gallivm, val));
1943                 args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1944                                   si_llvm_pack_two_int16(gallivm, val+2));
1945                 break;
1946
1947         case V_028714_SPI_SHADER_SNORM16_ABGR:
1948                 for (chan = 0; chan < 4; chan++) {
1949                         /* Clamp between [-1, 1]. */
1950                         val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MIN,
1951                                                               values[chan],
1952                                                               lp_build_const_float(gallivm, 1));
1953                         val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MAX,
1954                                                               val[chan],
1955                                                               lp_build_const_float(gallivm, -1));
1956                         /* Convert to a signed integer in [-32767, 32767]. */
1957                         val[chan] = LLVMBuildFMul(builder, val[chan],
1958                                                   lp_build_const_float(gallivm, 32767), "");
1959                         /* If positive, add 0.5, else add -0.5. */
1960                         val[chan] = LLVMBuildFAdd(builder, val[chan],
1961                                         LLVMBuildSelect(builder,
1962                                                 LLVMBuildFCmp(builder, LLVMRealOGE,
1963                                                               val[chan], base->zero, ""),
1964                                                 lp_build_const_float(gallivm, 0.5),
1965                                                 lp_build_const_float(gallivm, -0.5), ""), "");
1966                         val[chan] = LLVMBuildFPToSI(builder, val[chan], ctx->i32, "");
1967                 }
1968
1969                 args[4] = uint->one; /* COMPR flag */
1970                 args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1971                                   si_llvm_pack_two_int32_as_int16(gallivm, val));
1972                 args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1973                                   si_llvm_pack_two_int32_as_int16(gallivm, val+2));
1974                 break;
1975
1976         case V_028714_SPI_SHADER_UINT16_ABGR: {
1977                 LLVMValueRef max = lp_build_const_int32(gallivm, is_int8 ?
1978                                                         255 : 65535);
1979                 /* Clamp. */
1980                 for (chan = 0; chan < 4; chan++) {
1981                         val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
1982                         val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_UMIN,
1983                                                               val[chan], max);
1984                 }
1985
1986                 args[4] = uint->one; /* COMPR flag */
1987                 args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1988                                   si_llvm_pack_two_int16(gallivm, val));
1989                 args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1990                                   si_llvm_pack_two_int16(gallivm, val+2));
1991                 break;
1992         }
1993
1994         case V_028714_SPI_SHADER_SINT16_ABGR: {
1995                 LLVMValueRef max = lp_build_const_int32(gallivm, is_int8 ?
1996                                                         127 : 32767);
1997                 LLVMValueRef min = lp_build_const_int32(gallivm, is_int8 ?
1998                                                         -128 : -32768);
1999                 /* Clamp. */
2000                 for (chan = 0; chan < 4; chan++) {
2001                         val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
2002                         val[chan] = lp_build_emit_llvm_binary(bld_base,
2003                                                               TGSI_OPCODE_IMIN,
2004                                                               val[chan], max);
2005                         val[chan] = lp_build_emit_llvm_binary(bld_base,
2006                                                               TGSI_OPCODE_IMAX,
2007                                                               val[chan], min);
2008                 }
2009
2010                 args[4] = uint->one; /* COMPR flag */
2011                 args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2012                                   si_llvm_pack_two_int32_as_int16(gallivm, val));
2013                 args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2014                                   si_llvm_pack_two_int32_as_int16(gallivm, val+2));
2015                 break;
2016         }
2017
2018         case V_028714_SPI_SHADER_32_ABGR:
2019                 memcpy(&args[5], values, sizeof(values[0]) * 4);
2020                 break;
2021         }
2022 }
2023
2024 static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
2025                           LLVMValueRef alpha)
2026 {
2027         struct si_shader_context *ctx = si_shader_context(bld_base);
2028         struct gallivm_state *gallivm = bld_base->base.gallivm;
2029
2030         if (ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_NEVER) {
2031                 LLVMValueRef alpha_ref = LLVMGetParam(ctx->main_fn,
2032                                 SI_PARAM_ALPHA_REF);
2033
2034                 LLVMValueRef alpha_pass =
2035                         lp_build_cmp(&bld_base->base,
2036                                      ctx->shader->key.part.ps.epilog.alpha_func,
2037                                      alpha, alpha_ref);
2038                 LLVMValueRef arg =
2039                         lp_build_select(&bld_base->base,
2040                                         alpha_pass,
2041                                         lp_build_const_float(gallivm, 1.0f),
2042                                         lp_build_const_float(gallivm, -1.0f));
2043
2044                 lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
2045                                    ctx->voidt, &arg, 1, 0);
2046         } else {
2047                 lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kilp",
2048                                    ctx->voidt, NULL, 0, 0);
2049         }
2050 }
2051
2052 static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
2053                                                   LLVMValueRef alpha,
2054                                                   unsigned samplemask_param)
2055 {
2056         struct si_shader_context *ctx = si_shader_context(bld_base);
2057         struct gallivm_state *gallivm = bld_base->base.gallivm;
2058         LLVMValueRef coverage;
2059
2060         /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
2061         coverage = LLVMGetParam(ctx->main_fn,
2062                                 samplemask_param);
2063         coverage = bitcast(bld_base, TGSI_TYPE_SIGNED, coverage);
2064
2065         coverage = lp_build_intrinsic(gallivm->builder, "llvm.ctpop.i32",
2066                                    ctx->i32,
2067                                    &coverage, 1, LP_FUNC_ATTR_READNONE);
2068
2069         coverage = LLVMBuildUIToFP(gallivm->builder, coverage,
2070                                    ctx->f32, "");
2071
2072         coverage = LLVMBuildFMul(gallivm->builder, coverage,
2073                                  lp_build_const_float(gallivm,
2074                                         1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
2075
2076         return LLVMBuildFMul(gallivm->builder, alpha, coverage, "");
2077 }
2078
2079 static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context *bld_base,
2080                                     LLVMValueRef (*pos)[9], LLVMValueRef *out_elts)
2081 {
2082         struct si_shader_context *ctx = si_shader_context(bld_base);
2083         struct lp_build_context *base = &bld_base->base;
2084         struct lp_build_context *uint = &ctx->bld_base.uint_bld;
2085         unsigned reg_index;
2086         unsigned chan;
2087         unsigned const_chan;
2088         LLVMValueRef base_elt;
2089         LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, SI_PARAM_RW_BUFFERS);
2090         LLVMValueRef constbuf_index = lp_build_const_int32(base->gallivm,
2091                                                            SI_VS_CONST_CLIP_PLANES);
2092         LLVMValueRef const_resource = ac_build_indexed_load_const(&ctx->ac, ptr, constbuf_index);
2093
2094         for (reg_index = 0; reg_index < 2; reg_index ++) {
2095                 LLVMValueRef *args = pos[2 + reg_index];
2096
2097                 args[5] =
2098                 args[6] =
2099                 args[7] =
2100                 args[8] = lp_build_const_float(base->gallivm, 0.0f);
2101
2102                 /* Compute dot products of position and user clip plane vectors */
2103                 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2104                         for (const_chan = 0; const_chan < TGSI_NUM_CHANNELS; const_chan++) {
2105                                 args[1] = lp_build_const_int32(base->gallivm,
2106                                                                ((reg_index * 4 + chan) * 4 +
2107                                                                 const_chan) * 4);
2108                                 base_elt = buffer_load_const(ctx, const_resource,
2109                                                              args[1]);
2110                                 args[5 + chan] =
2111                                         lp_build_add(base, args[5 + chan],
2112                                                      lp_build_mul(base, base_elt,
2113                                                                   out_elts[const_chan]));
2114                         }
2115                 }
2116
2117                 args[0] = lp_build_const_int32(base->gallivm, 0xf);
2118                 args[1] = uint->zero;
2119                 args[2] = uint->zero;
2120                 args[3] = lp_build_const_int32(base->gallivm,
2121                                                V_008DFC_SQ_EXP_POS + 2 + reg_index);
2122                 args[4] = uint->zero;
2123         }
2124 }
2125
2126 static void si_dump_streamout(struct pipe_stream_output_info *so)
2127 {
2128         unsigned i;
2129
2130         if (so->num_outputs)
2131                 fprintf(stderr, "STREAMOUT\n");
2132
2133         for (i = 0; i < so->num_outputs; i++) {
2134                 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
2135                                 so->output[i].start_component;
2136                 fprintf(stderr, "  %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
2137                         i, so->output[i].output_buffer,
2138                         so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
2139                         so->output[i].register_index,
2140                         mask & 1 ? "x" : "",
2141                         mask & 2 ? "y" : "",
2142                         mask & 4 ? "z" : "",
2143                         mask & 8 ? "w" : "");
2144         }
2145 }
2146
2147 static void emit_streamout_output(struct si_shader_context *ctx,
2148                                   LLVMValueRef const *so_buffers,
2149                                   LLVMValueRef const *so_write_offsets,
2150                                   struct pipe_stream_output *stream_out,
2151                                   struct si_shader_output_values *shader_out)
2152 {
2153         struct gallivm_state *gallivm = &ctx->gallivm;
2154         LLVMBuilderRef builder = gallivm->builder;
2155         unsigned buf_idx = stream_out->output_buffer;
2156         unsigned start = stream_out->start_component;
2157         unsigned num_comps = stream_out->num_components;
2158         LLVMValueRef out[4];
2159
2160         assert(num_comps && num_comps <= 4);
2161         if (!num_comps || num_comps > 4)
2162                 return;
2163
2164         /* Load the output as int. */
2165         for (int j = 0; j < num_comps; j++) {
2166                 assert(stream_out->stream == shader_out->vertex_stream[start + j]);
2167
2168                 out[j] = LLVMBuildBitCast(builder,
2169                                           shader_out->values[start + j],
2170                                 ctx->i32, "");
2171         }
2172
2173         /* Pack the output. */
2174         LLVMValueRef vdata = NULL;
2175
2176         switch (num_comps) {
2177         case 1: /* as i32 */
2178                 vdata = out[0];
2179                 break;
2180         case 2: /* as v2i32 */
2181         case 3: /* as v4i32 (aligned to 4) */
2182         case 4: /* as v4i32 */
2183                 vdata = LLVMGetUndef(LLVMVectorType(ctx->i32, util_next_power_of_two(num_comps)));
2184                 for (int j = 0; j < num_comps; j++) {
2185                         vdata = LLVMBuildInsertElement(builder, vdata, out[j],
2186                                                        LLVMConstInt(ctx->i32, j, 0), "");
2187                 }
2188                 break;
2189         }
2190
2191         build_tbuffer_store_dwords(ctx, so_buffers[buf_idx],
2192                                    vdata, num_comps,
2193                                    so_write_offsets[buf_idx],
2194                                    LLVMConstInt(ctx->i32, 0, 0),
2195                                    stream_out->dst_offset * 4);
2196 }
2197
2198 /**
2199  * Write streamout data to buffers for vertex stream @p stream (different
2200  * vertex streams can occur for GS copy shaders).
2201  */
2202 static void si_llvm_emit_streamout(struct si_shader_context *ctx,
2203                                    struct si_shader_output_values *outputs,
2204                                    unsigned noutput, unsigned stream)
2205 {
2206         struct si_shader_selector *sel = ctx->shader->selector;
2207         struct pipe_stream_output_info *so = &sel->so;
2208         struct gallivm_state *gallivm = &ctx->gallivm;
2209         LLVMBuilderRef builder = gallivm->builder;
2210         int i;
2211         struct lp_build_if_state if_ctx;
2212
2213         /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
2214         LLVMValueRef so_vtx_count =
2215                 unpack_param(ctx, ctx->param_streamout_config, 16, 7);
2216
2217         LLVMValueRef tid = get_thread_id(ctx);
2218
2219         /* can_emit = tid < so_vtx_count; */
2220         LLVMValueRef can_emit =
2221                 LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");
2222
2223         /* Emit the streamout code conditionally. This actually avoids
2224          * out-of-bounds buffer access. The hw tells us via the SGPR
2225          * (so_vtx_count) which threads are allowed to emit streamout data. */
2226         lp_build_if(&if_ctx, gallivm, can_emit);
2227         {
2228                 /* The buffer offset is computed as follows:
2229                  *   ByteOffset = streamout_offset[buffer_id]*4 +
2230                  *                (streamout_write_index + thread_id)*stride[buffer_id] +
2231                  *                attrib_offset
2232                  */
2233
2234                 LLVMValueRef so_write_index =
2235                         LLVMGetParam(ctx->main_fn,
2236                                      ctx->param_streamout_write_index);
2237
2238                 /* Compute (streamout_write_index + thread_id). */
2239                 so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");
2240
2241                 /* Load the descriptor and compute the write offset for each
2242                  * enabled buffer. */
2243                 LLVMValueRef so_write_offset[4] = {};
2244                 LLVMValueRef so_buffers[4];
2245                 LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
2246                                                     SI_PARAM_RW_BUFFERS);
2247
2248                 for (i = 0; i < 4; i++) {
2249                         if (!so->stride[i])
2250                                 continue;
2251
2252                         LLVMValueRef offset = lp_build_const_int32(gallivm,
2253                                                                    SI_VS_STREAMOUT_BUF0 + i);
2254
2255                         so_buffers[i] = ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
2256
2257                         LLVMValueRef so_offset = LLVMGetParam(ctx->main_fn,
2258                                                               ctx->param_streamout_offset[i]);
2259                         so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), "");
2260
2261                         so_write_offset[i] = LLVMBuildMul(builder, so_write_index,
2262                                                           LLVMConstInt(ctx->i32, so->stride[i]*4, 0), "");
2263                         so_write_offset[i] = LLVMBuildAdd(builder, so_write_offset[i], so_offset, "");
2264                 }
2265
2266                 /* Write streamout data. */
2267                 for (i = 0; i < so->num_outputs; i++) {
2268                         unsigned reg = so->output[i].register_index;
2269
2270                         if (reg >= noutput)
2271                                 continue;
2272
2273                         if (stream != so->output[i].stream)
2274                                 continue;
2275
2276                         emit_streamout_output(ctx, so_buffers, so_write_offset,
2277                                               &so->output[i], &outputs[reg]);
2278                 }
2279         }
2280         lp_build_endif(&if_ctx);
2281 }
2282
2283
2284 /* Generate export instructions for hardware VS shader stage */
2285 static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base,
2286                               struct si_shader_output_values *outputs,
2287                               unsigned noutput)
2288 {
2289         struct si_shader_context *ctx = si_shader_context(bld_base);
2290         struct si_shader *shader = ctx->shader;
2291         struct lp_build_context *base = &bld_base->base;
2292         struct lp_build_context *uint = &ctx->bld_base.uint_bld;
2293         LLVMValueRef args[9];
2294         LLVMValueRef pos_args[4][9] = { { 0 } };
2295         LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
2296         unsigned semantic_name, semantic_index;
2297         unsigned target;
2298         unsigned param_count = 0;
2299         unsigned pos_idx;
2300         int i;
2301
2302         for (i = 0; i < noutput; i++) {
2303                 semantic_name = outputs[i].semantic_name;
2304                 semantic_index = outputs[i].semantic_index;
2305                 bool export_param = true;
2306
2307                 switch (semantic_name) {
2308                 case TGSI_SEMANTIC_POSITION: /* ignore these */
2309                 case TGSI_SEMANTIC_PSIZE:
2310                 case TGSI_SEMANTIC_CLIPVERTEX:
2311                 case TGSI_SEMANTIC_EDGEFLAG:
2312                         break;
2313                 case TGSI_SEMANTIC_GENERIC:
2314                 case TGSI_SEMANTIC_CLIPDIST:
2315                         if (shader->key.opt.hw_vs.kill_outputs &
2316                             (1ull << si_shader_io_get_unique_index(semantic_name, semantic_index)))
2317                                 export_param = false;
2318                         break;
2319                 default:
2320                         if (shader->key.opt.hw_vs.kill_outputs2 &
2321                             (1u << si_shader_io_get_unique_index2(semantic_name, semantic_index)))
2322                                 export_param = false;
2323                         break;
2324                 }
2325
2326                 if (outputs[i].vertex_stream[0] != 0 &&
2327                     outputs[i].vertex_stream[1] != 0 &&
2328                     outputs[i].vertex_stream[2] != 0 &&
2329                     outputs[i].vertex_stream[3] != 0)
2330                         export_param = false;
2331
2332 handle_semantic:
2333                 /* Select the correct target */
2334                 switch(semantic_name) {
2335                 case TGSI_SEMANTIC_PSIZE:
2336                         psize_value = outputs[i].values[0];
2337                         continue;
2338                 case TGSI_SEMANTIC_EDGEFLAG:
2339                         edgeflag_value = outputs[i].values[0];
2340                         continue;
2341                 case TGSI_SEMANTIC_LAYER:
2342                         layer_value = outputs[i].values[0];
2343                         semantic_name = TGSI_SEMANTIC_GENERIC;
2344                         goto handle_semantic;
2345                 case TGSI_SEMANTIC_VIEWPORT_INDEX:
2346                         viewport_index_value = outputs[i].values[0];
2347                         semantic_name = TGSI_SEMANTIC_GENERIC;
2348                         goto handle_semantic;
2349                 case TGSI_SEMANTIC_POSITION:
2350                         target = V_008DFC_SQ_EXP_POS;
2351                         break;
2352                 case TGSI_SEMANTIC_CLIPDIST:
2353                         if (shader->key.opt.hw_vs.clip_disable) {
2354                                 semantic_name = TGSI_SEMANTIC_GENERIC;
2355                                 goto handle_semantic;
2356                         }
2357                         target = V_008DFC_SQ_EXP_POS + 2 + semantic_index;
2358                         break;
2359                 case TGSI_SEMANTIC_CLIPVERTEX:
2360                         if (shader->key.opt.hw_vs.clip_disable)
2361                                 continue;
2362                         si_llvm_emit_clipvertex(bld_base, pos_args, outputs[i].values);
2363                         continue;
2364                 case TGSI_SEMANTIC_COLOR:
2365                 case TGSI_SEMANTIC_BCOLOR:
2366                 case TGSI_SEMANTIC_PRIMID:
2367                 case TGSI_SEMANTIC_FOG:
2368                 case TGSI_SEMANTIC_TEXCOORD:
2369                 case TGSI_SEMANTIC_GENERIC:
2370                         if (!export_param)
2371                                 continue;
2372                         target = V_008DFC_SQ_EXP_PARAM + param_count;
2373                         assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
2374                         shader->info.vs_output_param_offset[i] = param_count;
2375                         param_count++;
2376                         break;
2377                 default:
2378                         target = 0;
2379                         fprintf(stderr,
2380                                 "Warning: SI unhandled vs output type:%d\n",
2381                                 semantic_name);
2382                 }
2383
2384                 si_llvm_init_export_args(bld_base, outputs[i].values, target, args);
2385
2386                 if (target >= V_008DFC_SQ_EXP_POS &&
2387                     target <= (V_008DFC_SQ_EXP_POS + 3)) {
2388                         memcpy(pos_args[target - V_008DFC_SQ_EXP_POS],
2389                                args, sizeof(args));
2390                 } else {
2391                         lp_build_intrinsic(base->gallivm->builder,
2392                                            "llvm.SI.export", ctx->voidt,
2393                                            args, 9, 0);
2394                 }
2395
2396                 if (semantic_name == TGSI_SEMANTIC_CLIPDIST) {
2397                         semantic_name = TGSI_SEMANTIC_GENERIC;
2398                         goto handle_semantic;
2399                 }
2400         }
2401
2402         shader->info.nr_param_exports = param_count;
2403
2404         /* We need to add the position output manually if it's missing. */
2405         if (!pos_args[0][0]) {
2406                 pos_args[0][0] = lp_build_const_int32(base->gallivm, 0xf); /* writemask */
2407                 pos_args[0][1] = uint->zero; /* EXEC mask */
2408                 pos_args[0][2] = uint->zero; /* last export? */
2409                 pos_args[0][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS);
2410                 pos_args[0][4] = uint->zero; /* COMPR flag */
2411                 pos_args[0][5] = base->zero; /* X */
2412                 pos_args[0][6] = base->zero; /* Y */
2413                 pos_args[0][7] = base->zero; /* Z */
2414                 pos_args[0][8] = base->one;  /* W */
2415         }
2416
2417         /* Write the misc vector (point size, edgeflag, layer, viewport). */
2418         if (shader->selector->info.writes_psize ||
2419             shader->selector->info.writes_edgeflag ||
2420             shader->selector->info.writes_viewport_index ||
2421             shader->selector->info.writes_layer) {
2422                 pos_args[1][0] = lp_build_const_int32(base->gallivm, /* writemask */
2423                                                       shader->selector->info.writes_psize |
2424                                                       (shader->selector->info.writes_edgeflag << 1) |
2425                                                       (shader->selector->info.writes_layer << 2) |
2426                                                       (shader->selector->info.writes_viewport_index << 3));
2427                 pos_args[1][1] = uint->zero; /* EXEC mask */
2428                 pos_args[1][2] = uint->zero; /* last export? */
2429                 pos_args[1][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS + 1);
2430                 pos_args[1][4] = uint->zero; /* COMPR flag */
2431                 pos_args[1][5] = base->zero; /* X */
2432                 pos_args[1][6] = base->zero; /* Y */
2433                 pos_args[1][7] = base->zero; /* Z */
2434                 pos_args[1][8] = base->zero; /* W */
2435
2436                 if (shader->selector->info.writes_psize)
2437                         pos_args[1][5] = psize_value;
2438
2439                 if (shader->selector->info.writes_edgeflag) {
2440                         /* The output is a float, but the hw expects an integer
2441                          * with the first bit containing the edge flag. */
2442                         edgeflag_value = LLVMBuildFPToUI(base->gallivm->builder,
2443                                                          edgeflag_value,
2444                                                          ctx->i32, "");
2445                         edgeflag_value = lp_build_min(&bld_base->int_bld,
2446                                                       edgeflag_value,
2447                                                       bld_base->int_bld.one);
2448
2449                         /* The LLVM intrinsic expects a float. */
2450                         pos_args[1][6] = LLVMBuildBitCast(base->gallivm->builder,
2451                                                           edgeflag_value,
2452                                                           ctx->f32, "");
2453                 }
2454
2455                 if (shader->selector->info.writes_layer)
2456                         pos_args[1][7] = layer_value;
2457
2458                 if (shader->selector->info.writes_viewport_index)
2459                         pos_args[1][8] = viewport_index_value;
2460         }
2461
2462         for (i = 0; i < 4; i++)
2463                 if (pos_args[i][0])
2464                         shader->info.nr_pos_exports++;
2465
2466         pos_idx = 0;
2467         for (i = 0; i < 4; i++) {
2468                 if (!pos_args[i][0])
2469                         continue;
2470
2471                 /* Specify the target we are exporting */
2472                 pos_args[i][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS + pos_idx++);
2473
2474                 if (pos_idx == shader->info.nr_pos_exports)
2475                         /* Specify that this is the last export */
2476                         pos_args[i][2] = uint->one;
2477
2478                 lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
2479                                    ctx->voidt, pos_args[i], 9, 0);
2480         }
2481 }
2482
2483 /**
2484  * Forward all outputs from the vertex shader to the TES. This is only used
2485  * for the fixed function TCS.
2486  */
2487 static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
2488 {
2489         struct si_shader_context *ctx = si_shader_context(bld_base);
2490         struct gallivm_state *gallivm = bld_base->base.gallivm;
2491         LLVMValueRef invocation_id, rw_buffers, buffer, buffer_offset;
2492         LLVMValueRef lds_vertex_stride, lds_vertex_offset, lds_base;
2493         uint64_t inputs;
2494
2495         invocation_id = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);
2496
2497         rw_buffers = LLVMGetParam(ctx->main_fn, SI_PARAM_RW_BUFFERS);
2498         buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
2499                         lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));
2500
2501         buffer_offset = LLVMGetParam(ctx->main_fn, ctx->param_oc_lds);
2502
2503         lds_vertex_stride = unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 13, 8);
2504         lds_vertex_offset = LLVMBuildMul(gallivm->builder, invocation_id,
2505                                          lds_vertex_stride, "");
2506         lds_base = get_tcs_in_current_patch_offset(ctx);
2507         lds_base = LLVMBuildAdd(gallivm->builder, lds_base, lds_vertex_offset, "");
2508
2509         inputs = ctx->shader->key.mono.tcs.inputs_to_copy;
2510         while (inputs) {
2511                 unsigned i = u_bit_scan64(&inputs);
2512
2513                 LLVMValueRef lds_ptr = LLVMBuildAdd(gallivm->builder, lds_base,
2514                                             lp_build_const_int32(gallivm, 4 * i),
2515                                              "");
2516
2517                 LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx,
2518                                               invocation_id,
2519                                               lp_build_const_int32(gallivm, i));
2520
2521                 LLVMValueRef value = lds_load(bld_base, TGSI_TYPE_SIGNED, ~0,
2522                                               lds_ptr);
2523
2524                 build_tbuffer_store_dwords(ctx, buffer, value, 4, buffer_addr,
2525                                            buffer_offset, 0);
2526         }
2527 }
2528
2529 static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
2530                                   LLVMValueRef rel_patch_id,
2531                                   LLVMValueRef invocation_id,
2532                                   LLVMValueRef tcs_out_current_patch_data_offset)
2533 {
2534         struct si_shader_context *ctx = si_shader_context(bld_base);
2535         struct gallivm_state *gallivm = bld_base->base.gallivm;
2536         struct si_shader *shader = ctx->shader;
2537         unsigned tess_inner_index, tess_outer_index;
2538         LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
2539         LLVMValueRef out[6], vec0, vec1, rw_buffers, tf_base;
2540         unsigned stride, outer_comps, inner_comps, i;
2541         struct lp_build_if_state if_ctx, inner_if_ctx;
2542
2543         si_llvm_emit_barrier(NULL, bld_base, NULL);
2544
2545         /* Do this only for invocation 0, because the tess levels are per-patch,
2546          * not per-vertex.
2547          *
2548          * This can't jump, because invocation 0 executes this. It should
2549          * at least mask out the loads and stores for other invocations.
2550          */
2551         lp_build_if(&if_ctx, gallivm,
2552                     LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
2553                                   invocation_id, bld_base->uint_bld.zero, ""));
2554
2555         /* Determine the layout of one tess factor element in the buffer. */
2556         switch (shader->key.part.tcs.epilog.prim_mode) {
2557         case PIPE_PRIM_LINES:
2558                 stride = 2; /* 2 dwords, 1 vec2 store */
2559                 outer_comps = 2;
2560                 inner_comps = 0;
2561                 break;
2562         case PIPE_PRIM_TRIANGLES:
2563                 stride = 4; /* 4 dwords, 1 vec4 store */
2564                 outer_comps = 3;
2565                 inner_comps = 1;
2566                 break;
2567         case PIPE_PRIM_QUADS:
2568                 stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
2569                 outer_comps = 4;
2570                 inner_comps = 2;
2571                 break;
2572         default:
2573                 assert(0);
2574                 return;
2575         }
2576
2577         /* Load tess_inner and tess_outer from LDS.
2578          * Any invocation can write them, so we can't get them from a temporary.
2579          */
2580         tess_inner_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSINNER, 0);
2581         tess_outer_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSOUTER, 0);
2582
2583         lds_base = tcs_out_current_patch_data_offset;
2584         lds_inner = LLVMBuildAdd(gallivm->builder, lds_base,
2585                                  lp_build_const_int32(gallivm,
2586                                                       tess_inner_index * 4), "");
2587         lds_outer = LLVMBuildAdd(gallivm->builder, lds_base,
2588                                  lp_build_const_int32(gallivm,
2589                                                       tess_outer_index * 4), "");
2590
2591         if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) {
2592                 /* For isolines, the hardware expects tess factors in the
2593                  * reverse order from what GLSL / TGSI specify.
2594                  */
2595                 out[0] = lds_load(bld_base, TGSI_TYPE_SIGNED, 1, lds_outer);
2596                 out[1] = lds_load(bld_base, TGSI_TYPE_SIGNED, 0, lds_outer);
2597         } else {
2598                 for (i = 0; i < outer_comps; i++)
2599                         out[i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer);
2600                 for (i = 0; i < inner_comps; i++)
2601                         out[outer_comps+i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_inner);
2602         }
2603
2604         /* Convert the outputs to vectors for stores. */
2605         vec0 = lp_build_gather_values(gallivm, out, MIN2(stride, 4));
2606         vec1 = NULL;
2607
2608         if (stride > 4)
2609                 vec1 = lp_build_gather_values(gallivm, out+4, stride - 4);
2610
2611         /* Get the buffer. */
2612         rw_buffers = LLVMGetParam(ctx->main_fn,
2613                                   SI_PARAM_RW_BUFFERS);
2614         buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
2615                         lp_build_const_int32(gallivm, SI_HS_RING_TESS_FACTOR));
2616
2617         /* Get the offset. */
2618         tf_base = LLVMGetParam(ctx->main_fn,
2619                                SI_PARAM_TESS_FACTOR_OFFSET);
2620         byteoffset = LLVMBuildMul(gallivm->builder, rel_patch_id,
2621                                   lp_build_const_int32(gallivm, 4 * stride), "");
2622
2623         lp_build_if(&inner_if_ctx, gallivm,
2624                     LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
2625                                   rel_patch_id, bld_base->uint_bld.zero, ""));
2626
2627         /* Store the dynamic HS control word. */
2628         build_tbuffer_store_dwords(ctx, buffer,
2629                                    lp_build_const_int32(gallivm, 0x80000000),
2630                                    1, lp_build_const_int32(gallivm, 0), tf_base, 0);
2631
2632         lp_build_endif(&inner_if_ctx);
2633
2634         /* Store the tessellation factors. */
2635         build_tbuffer_store_dwords(ctx, buffer, vec0,
2636                                    MIN2(stride, 4), byteoffset, tf_base, 4);
2637         if (vec1)
2638                 build_tbuffer_store_dwords(ctx, buffer, vec1,
2639                                            stride - 4, byteoffset, tf_base, 20);
2640         lp_build_endif(&if_ctx);
2641 }
2642
2643 /* This only writes the tessellation factor levels. */
2644 static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
2645 {
2646         struct si_shader_context *ctx = si_shader_context(bld_base);
2647         LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;
2648
2649         si_copy_tcs_inputs(bld_base);
2650
2651         rel_patch_id = get_rel_patch_id(ctx);
2652         invocation_id = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);
2653         tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);
2654
2655         /* Return epilog parameters from this function. */
2656         LLVMBuilderRef builder = bld_base->base.gallivm->builder;
2657         LLVMValueRef ret = ctx->return_value;
2658         LLVMValueRef rw_buffers, rw0, rw1, tf_soffset;
2659         unsigned vgpr;
2660
2661         /* RW_BUFFERS pointer */
2662         rw_buffers = LLVMGetParam(ctx->main_fn,
2663                                   SI_PARAM_RW_BUFFERS);
2664         rw_buffers = LLVMBuildPtrToInt(builder, rw_buffers, ctx->i64, "");
2665         rw_buffers = LLVMBuildBitCast(builder, rw_buffers, ctx->v2i32, "");
2666         rw0 = LLVMBuildExtractElement(builder, rw_buffers,
2667                                       bld_base->uint_bld.zero, "");
2668         rw1 = LLVMBuildExtractElement(builder, rw_buffers,
2669                                       bld_base->uint_bld.one, "");
2670         ret = LLVMBuildInsertValue(builder, ret, rw0, 0, "");
2671         ret = LLVMBuildInsertValue(builder, ret, rw1, 1, "");
2672
2673         /* Tess factor buffer soffset is after user SGPRs. */
2674         tf_soffset = LLVMGetParam(ctx->main_fn,
2675                                   SI_PARAM_TESS_FACTOR_OFFSET);
2676         ret = LLVMBuildInsertValue(builder, ret, tf_soffset,
2677                                    SI_TCS_NUM_USER_SGPR + 1, "");
2678
2679         /* VGPRs */
2680         rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id);
2681         invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id);
2682         tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset);
2683
2684         vgpr = SI_TCS_NUM_USER_SGPR + 2;
2685         ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
2686         ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
2687         ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
2688         ctx->return_value = ret;
2689 }
2690
2691 static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base)
2692 {
2693         struct si_shader_context *ctx = si_shader_context(bld_base);
2694         struct si_shader *shader = ctx->shader;
2695         struct tgsi_shader_info *info = &shader->selector->info;
2696         struct gallivm_state *gallivm = bld_base->base.gallivm;
2697         unsigned i, chan;
2698         LLVMValueRef vertex_id = LLVMGetParam(ctx->main_fn,
2699                                               ctx->param_rel_auto_id);
2700         LLVMValueRef vertex_dw_stride =
2701                 unpack_param(ctx, SI_PARAM_LS_OUT_LAYOUT, 13, 8);
2702         LLVMValueRef base_dw_addr = LLVMBuildMul(gallivm->builder, vertex_id,
2703                                                  vertex_dw_stride, "");
2704
2705         /* Write outputs to LDS. The next shader (TCS aka HS) will read
2706          * its inputs from it. */
2707         for (i = 0; i < info->num_outputs; i++) {
2708                 LLVMValueRef *out_ptr = ctx->outputs[i];
2709                 unsigned name = info->output_semantic_name[i];
2710                 unsigned index = info->output_semantic_index[i];
2711                 int param = si_shader_io_get_unique_index(name, index);
2712                 LLVMValueRef dw_addr = LLVMBuildAdd(gallivm->builder, base_dw_addr,
2713                                         lp_build_const_int32(gallivm, param * 4), "");
2714
2715                 for (chan = 0; chan < 4; chan++) {
2716                         lds_store(bld_base, chan, dw_addr,
2717                                   LLVMBuildLoad(gallivm->builder, out_ptr[chan], ""));
2718                 }
2719         }
2720 }
2721
2722 static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context *bld_base)
2723 {
2724         struct si_shader_context *ctx = si_shader_context(bld_base);
2725         struct gallivm_state *gallivm = bld_base->base.gallivm;
2726         struct si_shader *es = ctx->shader;
2727         struct tgsi_shader_info *info = &es->selector->info;
2728         LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
2729                                             ctx->param_es2gs_offset);
2730         unsigned chan;
2731         int i;
2732
2733         for (i = 0; i < info->num_outputs; i++) {
2734                 LLVMValueRef *out_ptr = ctx->outputs[i];
2735                 int param_index;
2736
2737                 if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
2738                     info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
2739                         continue;
2740
2741                 param_index = si_shader_io_get_unique_index(info->output_semantic_name[i],
2742                                                             info->output_semantic_index[i]);
2743
2744                 for (chan = 0; chan < 4; chan++) {
2745                         LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
2746                         out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");
2747
2748                         build_tbuffer_store(ctx,
2749                                             ctx->esgs_ring,
2750                                             out_val, 1,
2751                                             LLVMGetUndef(ctx->i32), soffset,
2752                                             (4 * param_index + chan) * 4,
2753                                             V_008F0C_BUF_DATA_FORMAT_32,
2754                                             V_008F0C_BUF_NUM_FORMAT_UINT,
2755                                             0, 0, 1, 1, 0);
2756                 }
2757         }
2758 }
2759
2760 static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
2761 {
2762         struct si_shader_context *ctx = si_shader_context(bld_base);
2763         struct gallivm_state *gallivm = bld_base->base.gallivm;
2764         LLVMValueRef args[2];
2765
2766         args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_NOP | SENDMSG_GS_DONE);
2767         args[1] = LLVMGetParam(ctx->main_fn, SI_PARAM_GS_WAVE_ID);
2768         lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
2769                            ctx->voidt, args, 2, 0);
2770 }
2771
2772 static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base)
2773 {
2774         struct si_shader_context *ctx = si_shader_context(bld_base);
2775         struct gallivm_state *gallivm = bld_base->base.gallivm;
2776         struct tgsi_shader_info *info = &ctx->shader->selector->info;
2777         struct si_shader_output_values *outputs = NULL;
2778         int i,j;
2779
2780         assert(!ctx->shader->is_gs_copy_shader);
2781
2782         outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));
2783
2784         /* Vertex color clamping.
2785          *
2786          * This uses a state constant loaded in a user data SGPR and
2787          * an IF statement is added that clamps all colors if the constant
2788          * is true.
2789          */
2790         if (ctx->type == PIPE_SHADER_VERTEX) {
2791                 struct lp_build_if_state if_ctx;
2792                 LLVMValueRef cond = NULL;
2793                 LLVMValueRef addr, val;
2794
2795                 for (i = 0; i < info->num_outputs; i++) {
2796                         if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR &&
2797                             info->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR)
2798                                 continue;
2799
2800                         /* We've found a color. */
2801                         if (!cond) {
2802                                 /* The state is in the first bit of the user SGPR. */
2803                                 cond = LLVMGetParam(ctx->main_fn,
2804                                                     SI_PARAM_VS_STATE_BITS);
2805                                 cond = LLVMBuildTrunc(gallivm->builder, cond,
2806                                                       ctx->i1, "");
2807                                 lp_build_if(&if_ctx, gallivm, cond);
2808                         }
2809
2810                         for (j = 0; j < 4; j++) {
2811                                 addr = ctx->outputs[i][j];
2812                                 val = LLVMBuildLoad(gallivm->builder, addr, "");
2813                                 val = si_llvm_saturate(bld_base, val);
2814                                 LLVMBuildStore(gallivm->builder, val, addr);
2815                         }
2816                 }
2817
2818                 if (cond)
2819                         lp_build_endif(&if_ctx);
2820         }
2821
2822         for (i = 0; i < info->num_outputs; i++) {
2823                 outputs[i].semantic_name = info->output_semantic_name[i];
2824                 outputs[i].semantic_index = info->output_semantic_index[i];
2825
2826                 for (j = 0; j < 4; j++) {
2827                         outputs[i].values[j] =
2828                                 LLVMBuildLoad(gallivm->builder,
2829                                               ctx->outputs[i][j],
2830                                               "");
2831                         outputs[i].vertex_stream[j] =
2832                                 (info->output_streams[i] >> (2 * j)) & 3;
2833                 }
2834
2835         }
2836
2837         /* Return the primitive ID from the LLVM function. */
2838         ctx->return_value =
2839                 LLVMBuildInsertValue(gallivm->builder,
2840                                      ctx->return_value,
2841                                      bitcast(bld_base, TGSI_TYPE_FLOAT,
2842                                              get_primitive_id(bld_base, 0)),
2843                                      VS_EPILOG_PRIMID_LOC, "");
2844
2845         if (ctx->shader->selector->so.num_outputs)
2846                 si_llvm_emit_streamout(ctx, outputs, i, 0);
2847         si_llvm_export_vs(bld_base, outputs, i);
2848         FREE(outputs);
2849 }
2850
/**
 * Queue of pending pixel shader exports.
 *
 * si_export_mrt_z() and si_export_mrt_color() append argument rows with
 * exp->args[exp->num++]; si_emit_ps_exports() later turns each row into one
 * "llvm.SI.export" call.
 */
struct si_ps_exports {
        /* Number of rows of args[] that are filled in. */
        unsigned num;
        /* Up to 10 queued exports, 9 intrinsic arguments each. */
        LLVMValueRef args[10][9];
};
2855
2856 unsigned si_get_spi_shader_z_format(bool writes_z, bool writes_stencil,
2857                                     bool writes_samplemask)
2858 {
2859         if (writes_z) {
2860                 /* Z needs 32 bits. */
2861                 if (writes_samplemask)
2862                         return V_028710_SPI_SHADER_32_ABGR;
2863                 else if (writes_stencil)
2864                         return V_028710_SPI_SHADER_32_GR;
2865                 else
2866                         return V_028710_SPI_SHADER_32_R;
2867         } else if (writes_stencil || writes_samplemask) {
2868                 /* Both stencil and sample mask need only 16 bits. */
2869                 return V_028710_SPI_SHADER_UINT16_ABGR;
2870         } else {
2871                 return V_028710_SPI_SHADER_ZERO;
2872         }
2873 }
2874
2875 static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
2876                             LLVMValueRef depth, LLVMValueRef stencil,
2877                             LLVMValueRef samplemask, struct si_ps_exports *exp)
2878 {
2879         struct si_shader_context *ctx = si_shader_context(bld_base);
2880         struct lp_build_context *base = &bld_base->base;
2881         struct lp_build_context *uint = &bld_base->uint_bld;
2882         LLVMValueRef args[9];
2883         unsigned mask = 0;
2884         unsigned format = si_get_spi_shader_z_format(depth != NULL,
2885                                                      stencil != NULL,
2886                                                      samplemask != NULL);
2887
2888         assert(depth || stencil || samplemask);
2889
2890         args[1] = uint->one; /* whether the EXEC mask is valid */
2891         args[2] = uint->one; /* DONE bit */
2892
2893         /* Specify the target we are exporting */
2894         args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_MRTZ);
2895
2896         args[4] = uint->zero; /* COMP flag */
2897         args[5] = base->undef; /* R, depth */
2898         args[6] = base->undef; /* G, stencil test value[0:7], stencil op value[8:15] */
2899         args[7] = base->undef; /* B, sample mask */
2900         args[8] = base->undef; /* A, alpha to mask */
2901
2902         if (format == V_028710_SPI_SHADER_UINT16_ABGR) {
2903                 assert(!depth);
2904                 args[4] = uint->one; /* COMPR flag */
2905
2906                 if (stencil) {
2907                         /* Stencil should be in X[23:16]. */
2908                         stencil = bitcast(bld_base, TGSI_TYPE_UNSIGNED, stencil);
2909                         stencil = LLVMBuildShl(base->gallivm->builder, stencil,
2910                                                LLVMConstInt(ctx->i32, 16, 0), "");
2911                         args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT, stencil);
2912                         mask |= 0x3;
2913                 }
2914                 if (samplemask) {
2915                         /* SampleMask should be in Y[15:0]. */
2916                         args[6] = samplemask;
2917                         mask |= 0xc;
2918                 }
2919         } else {
2920                 if (depth) {
2921                         args[5] = depth;
2922                         mask |= 0x1;
2923                 }
2924                 if (stencil) {
2925                         args[6] = stencil;
2926                         mask |= 0x2;
2927                 }
2928                 if (samplemask) {
2929                         args[7] = samplemask;
2930                         mask |= 0x4;
2931                 }
2932         }
2933
2934         /* SI (except OLAND and HAINAN) has a bug that it only looks
2935          * at the X writemask component. */
2936         if (ctx->screen->b.chip_class == SI &&
2937             ctx->screen->b.family != CHIP_OLAND &&
2938             ctx->screen->b.family != CHIP_HAINAN)
2939                 mask |= 0x1;
2940
2941         /* Specify which components to enable */
2942         args[0] = lp_build_const_int32(base->gallivm, mask);
2943
2944         memcpy(exp->args[exp->num++], args, sizeof(args));
2945 }
2946
2947 static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
2948                                 LLVMValueRef *color, unsigned index,
2949                                 unsigned samplemask_param,
2950                                 bool is_last, struct si_ps_exports *exp)
2951 {
2952         struct si_shader_context *ctx = si_shader_context(bld_base);
2953         struct lp_build_context *base = &bld_base->base;
2954         int i;
2955
2956         /* Clamp color */
2957         if (ctx->shader->key.part.ps.epilog.clamp_color)
2958                 for (i = 0; i < 4; i++)
2959                         color[i] = si_llvm_saturate(bld_base, color[i]);
2960
2961         /* Alpha to one */
2962         if (ctx->shader->key.part.ps.epilog.alpha_to_one)
2963                 color[3] = base->one;
2964
2965         /* Alpha test */
2966         if (index == 0 &&
2967             ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
2968                 si_alpha_test(bld_base, color[3]);
2969
2970         /* Line & polygon smoothing */
2971         if (ctx->shader->key.part.ps.epilog.poly_line_smoothing)
2972                 color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3],
2973                                                          samplemask_param);
2974
2975         /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
2976         if (ctx->shader->key.part.ps.epilog.last_cbuf > 0) {
2977                 LLVMValueRef args[8][9];
2978                 int c, last = -1;
2979
2980                 /* Get the export arguments, also find out what the last one is. */
2981                 for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
2982                         si_llvm_init_export_args(bld_base, color,
2983                                                  V_008DFC_SQ_EXP_MRT + c, args[c]);
2984                         if (args[c][0] != bld_base->uint_bld.zero)
2985                                 last = c;
2986                 }
2987
2988                 /* Emit all exports. */
2989                 for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
2990                         if (is_last && last == c) {
2991                                 args[c][1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */
2992                                 args[c][2] = bld_base->uint_bld.one; /* DONE bit */
2993                         } else if (args[c][0] == bld_base->uint_bld.zero)
2994                                 continue; /* unnecessary NULL export */
2995
2996                         memcpy(exp->args[exp->num++], args[c], sizeof(args[c]));
2997                 }
2998         } else {
2999                 LLVMValueRef args[9];
3000
3001                 /* Export */
3002                 si_llvm_init_export_args(bld_base, color, V_008DFC_SQ_EXP_MRT + index,
3003                                          args);
3004                 if (is_last) {
3005                         args[1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */
3006                         args[2] = bld_base->uint_bld.one; /* DONE bit */
3007                 } else if (args[0] == bld_base->uint_bld.zero)
3008                         return; /* unnecessary NULL export */
3009
3010                 memcpy(exp->args[exp->num++], args, sizeof(args));
3011         }
3012 }
3013
3014 static void si_emit_ps_exports(struct si_shader_context *ctx,
3015                                struct si_ps_exports *exp)
3016 {
3017         for (unsigned i = 0; i < exp->num; i++)
3018                 lp_build_intrinsic(ctx->gallivm.builder,
3019                                    "llvm.SI.export", ctx->voidt,
3020                                    exp->args[i], 9, 0);
3021 }
3022
3023 static void si_export_null(struct lp_build_tgsi_context *bld_base)
3024 {
3025         struct si_shader_context *ctx = si_shader_context(bld_base);
3026         struct lp_build_context *base = &bld_base->base;
3027         struct lp_build_context *uint = &bld_base->uint_bld;
3028         LLVMValueRef args[9];
3029
3030         args[0] = lp_build_const_int32(base->gallivm, 0x0); /* enabled channels */
3031         args[1] = uint->one; /* whether the EXEC mask is valid */
3032         args[2] = uint->one; /* DONE bit */
3033         args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_NULL);
3034         args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */
3035         args[5] = base->undef; /* R */
3036         args[6] = base->undef; /* G */
3037         args[7] = base->undef; /* B */
3038         args[8] = base->undef; /* A */
3039
3040         lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
3041                            ctx->voidt, args, 9, 0);
3042 }
3043
3044 /**
3045  * Return PS outputs in this order:
3046  *
3047  * v[0:3] = color0.xyzw
3048  * v[4:7] = color1.xyzw
3049  * ...
3050  * vN+0 = Depth
3051  * vN+1 = Stencil
3052  * vN+2 = SampleMask
3053  * vN+3 = SampleMaskIn (used for OpenGL smoothing)
3054  *
3055  * The alpha-ref SGPR is returned via its original location.
3056  */
3057 static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base)
3058 {
3059         struct si_shader_context *ctx = si_shader_context(bld_base);
3060         struct si_shader *shader = ctx->shader;
3061         struct lp_build_context *base = &bld_base->base;
3062         struct tgsi_shader_info *info = &shader->selector->info;
3063         LLVMBuilderRef builder = base->gallivm->builder;
3064         unsigned i, j, first_vgpr, vgpr;
3065
3066         LLVMValueRef color[8][4] = {};
3067         LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
3068         LLVMValueRef ret;
3069
3070         /* Read the output values. */
3071         for (i = 0; i < info->num_outputs; i++) {
3072                 unsigned semantic_name = info->output_semantic_name[i];
3073                 unsigned semantic_index = info->output_semantic_index[i];
3074
3075                 switch (semantic_name) {
3076                 case TGSI_SEMANTIC_COLOR:
3077                         assert(semantic_index < 8);
3078                         for (j = 0; j < 4; j++) {
3079                                 LLVMValueRef ptr = ctx->outputs[i][j];
3080                                 LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
3081                                 color[semantic_index][j] = result;
3082                         }
3083                         break;
3084                 case TGSI_SEMANTIC_POSITION:
3085                         depth = LLVMBuildLoad(builder,
3086                                               ctx->outputs[i][2], "");
3087                         break;
3088                 case TGSI_SEMANTIC_STENCIL:
3089                         stencil = LLVMBuildLoad(builder,
3090                                                 ctx->outputs[i][1], "");
3091                         break;
3092                 case TGSI_SEMANTIC_SAMPLEMASK:
3093                         samplemask = LLVMBuildLoad(builder,
3094                                                    ctx->outputs[i][0], "");
3095                         break;
3096                 default:
3097                         fprintf(stderr, "Warning: SI unhandled fs output type:%d\n",
3098                                 semantic_name);
3099                 }
3100         }
3101
3102         /* Fill the return structure. */
3103         ret = ctx->return_value;
3104
3105         /* Set SGPRs. */
3106         ret = LLVMBuildInsertValue(builder, ret,
3107                                    bitcast(bld_base, TGSI_TYPE_SIGNED,
3108                                            LLVMGetParam(ctx->main_fn,
3109                                                         SI_PARAM_ALPHA_REF)),
3110                                    SI_SGPR_ALPHA_REF, "");
3111
3112         /* Set VGPRs */
3113         first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
3114         for (i = 0; i < ARRAY_SIZE(color); i++) {
3115                 if (!color[i][0])
3116                         continue;
3117
3118                 for (j = 0; j < 4; j++)
3119                         ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
3120         }
3121         if (depth)
3122                 ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
3123         if (stencil)
3124                 ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
3125         if (samplemask)
3126                 ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");
3127
3128         /* Add the input sample mask for smoothing at the end. */
3129         if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
3130                 vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
3131         ret = LLVMBuildInsertValue(builder, ret,
3132                                    LLVMGetParam(ctx->main_fn,
3133                                                 SI_PARAM_SAMPLE_COVERAGE), vgpr++, "");
3134
3135         ctx->return_value = ret;
3136 }
3137
3138 /**
3139  * Given a v8i32 resource descriptor for a buffer, extract the size of the
3140  * buffer in number of elements and return it as an i32.
3141  */
3142 static LLVMValueRef get_buffer_size(
3143         struct lp_build_tgsi_context *bld_base,
3144         LLVMValueRef descriptor)
3145 {
3146         struct si_shader_context *ctx = si_shader_context(bld_base);
3147         struct gallivm_state *gallivm = bld_base->base.gallivm;
3148         LLVMBuilderRef builder = gallivm->builder;
3149         LLVMValueRef size =
3150                 LLVMBuildExtractElement(builder, descriptor,
3151                                         lp_build_const_int32(gallivm, 2), "");
3152
3153         if (ctx->screen->b.chip_class >= VI) {
3154                 /* On VI, the descriptor contains the size in bytes,
3155                  * but TXQ must return the size in elements.
3156                  * The stride is always non-zero for resources using TXQ.
3157                  */
3158                 LLVMValueRef stride =
3159                         LLVMBuildExtractElement(builder, descriptor,
3160                                                 lp_build_const_int32(gallivm, 1), "");
3161                 stride = LLVMBuildLShr(builder, stride,
3162                                        lp_build_const_int32(gallivm, 16), "");
3163                 stride = LLVMBuildAnd(builder, stride,
3164                                       lp_build_const_int32(gallivm, 0x3FFF), "");
3165
3166                 size = LLVMBuildUDiv(builder, size, stride, "");
3167         }
3168
3169         return size;
3170 }
3171
3172 /**
3173  * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
3174  * intrinsic names).
3175  */
3176 static void build_type_name_for_intr(
3177         LLVMTypeRef type,
3178         char *buf, unsigned bufsize)
3179 {
3180         LLVMTypeRef elem_type = type;
3181
3182         assert(bufsize >= 8);
3183
3184         if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
3185                 int ret = snprintf(buf, bufsize, "v%u",
3186                                         LLVMGetVectorSize(type));
3187                 if (ret < 0) {
3188                         char *type_name = LLVMPrintTypeToString(type);
3189                         fprintf(stderr, "Error building type name for: %s\n",
3190                                 type_name);
3191                         return;
3192                 }
3193                 elem_type = LLVMGetElementType(type);
3194                 buf += ret;
3195                 bufsize -= ret;
3196         }
3197         switch (LLVMGetTypeKind(elem_type)) {
3198         default: break;
3199         case LLVMIntegerTypeKind:
3200                 snprintf(buf, bufsize, "i%d", LLVMGetIntTypeWidth(elem_type));
3201                 break;
3202         case LLVMFloatTypeKind:
3203                 snprintf(buf, bufsize, "f32");
3204                 break;
3205         case LLVMDoubleTypeKind:
3206                 snprintf(buf, bufsize, "f64");
3207                 break;
3208         }
3209 }
3210
/* Forward declaration of a static function that is defined elsewhere in
 * this file, so that it can be referenced before its definition. */
static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
                                struct lp_build_tgsi_context *bld_base,
                                struct lp_build_emit_data *emit_data);
3214
/* Prevent optimizations (at least of memory accesses) across the current
 * point in the program by emitting empty inline assembly that is marked as
 * having side effects.
 *
 * The function is compiled out with #if 0 because it has no callers at the
 * moment.
 */
#if 0 /* unused currently */
static void emit_optimization_barrier(struct si_shader_context *ctx)
{
        LLVMBuilderRef builder = ctx->gallivm.builder;
        /* void() function type for the inline asm "callee" */
        LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
        /* empty asm string, empty constraints, side effects = true,
         * align stack = false */
        LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, "", "", true, false);
        LLVMBuildCall(builder, inlineasm, NULL, 0, "");
}
#endif
3228
/* Immediate operands for emit_waitcnt().
 *
 * NOOP_WAITCNT has every bit of the mask set; LGKM_CNT and VM_CNT are the
 * same value with the top nibble (bits 11:8) respectively the bottom nibble
 * (bits 3:0) cleared. Because a counter is selected by clearing bits,
 * combine these with & instead of |.
 *
 * NOTE(review): presumably the cleared fields are the lgkmcnt and vmcnt
 * fields of the s_waitcnt immediate, i.e. "wait until that counter is 0";
 * confirm against the ISA documentation.
 */
#define NOOP_WAITCNT 0xf7f
#define LGKM_CNT 0x07f
#define VM_CNT 0xf70
3233
3234 static void emit_waitcnt(struct si_shader_context *ctx, unsigned simm16)
3235 {
3236         struct gallivm_state *gallivm = &ctx->gallivm;
3237         LLVMBuilderRef builder = gallivm->builder;
3238         LLVMValueRef args[1] = {
3239                 lp_build_const_int32(gallivm, simm16)
3240         };
3241         lp_build_intrinsic(builder, "llvm.amdgcn.s.waitcnt",
3242                            ctx->voidt, args, 1, 0);
3243 }
3244
3245 static void membar_emit(
3246                 const struct lp_build_tgsi_action *action,
3247                 struct lp_build_tgsi_context *bld_base,
3248                 struct lp_build_emit_data *emit_data)
3249 {
3250         struct si_shader_context *ctx = si_shader_context(bld_base);
3251         LLVMValueRef src0 = lp_build_emit_fetch(bld_base, emit_data->inst, 0, 0);
3252         unsigned flags = LLVMConstIntGetZExtValue(src0);
3253         unsigned waitcnt = NOOP_WAITCNT;
3254
3255         if (flags & TGSI_MEMBAR_THREAD_GROUP)
3256                 waitcnt &= VM_CNT & LGKM_CNT;
3257
3258         if (flags & (TGSI_MEMBAR_ATOMIC_BUFFER |
3259                      TGSI_MEMBAR_SHADER_BUFFER |
3260                      TGSI_MEMBAR_SHADER_IMAGE))
3261                 waitcnt &= VM_CNT;
3262
3263         if (flags & TGSI_MEMBAR_SHARED)
3264                 waitcnt &= LGKM_CNT;
3265
3266         if (waitcnt != NOOP_WAITCNT)
3267                 emit_waitcnt(ctx, waitcnt);
3268 }
3269
3270 static LLVMValueRef
3271 shader_buffer_fetch_rsrc(struct si_shader_context *ctx,
3272                          const struct tgsi_full_src_register *reg)
3273 {
3274         LLVMValueRef index;
3275         LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn,
3276                                              SI_PARAM_SHADER_BUFFERS);
3277
3278         if (!reg->Register.Indirect)
3279                 index = LLVMConstInt(ctx->i32, reg->Register.Index, 0);
3280         else
3281                 index = get_bounded_indirect_index(ctx, &reg->Indirect,
3282                                                    reg->Register.Index,
3283                                                    SI_NUM_SHADER_BUFFERS);
3284
3285         return ac_build_indexed_load_const(&ctx->ac, rsrc_ptr, index);
3286 }
3287
3288 static bool tgsi_is_array_sampler(unsigned target)
3289 {
3290         return target == TGSI_TEXTURE_1D_ARRAY ||
3291                target == TGSI_TEXTURE_SHADOW1D_ARRAY ||
3292                target == TGSI_TEXTURE_2D_ARRAY ||
3293                target == TGSI_TEXTURE_SHADOW2D_ARRAY ||
3294                target == TGSI_TEXTURE_CUBE_ARRAY ||
3295                target == TGSI_TEXTURE_SHADOWCUBE_ARRAY ||
3296                target == TGSI_TEXTURE_2D_ARRAY_MSAA;
3297 }
3298
3299 static bool tgsi_is_array_image(unsigned target)
3300 {
3301         return target == TGSI_TEXTURE_3D ||
3302                target == TGSI_TEXTURE_CUBE ||
3303                target == TGSI_TEXTURE_1D_ARRAY ||
3304                target == TGSI_TEXTURE_2D_ARRAY ||
3305                target == TGSI_TEXTURE_CUBE_ARRAY ||
3306                target == TGSI_TEXTURE_2D_ARRAY_MSAA;
3307 }
3308
3309 /**
3310  * Given a 256-bit resource descriptor, force the DCC enable bit to off.
3311  *
3312  * At least on Tonga, executing image stores on images with DCC enabled and
3313  * non-trivial can eventually lead to lockups. This can occur when an
3314  * application binds an image as read-only but then uses a shader that writes
3315  * to it. The OpenGL spec allows almost arbitrarily bad behavior (including
3316  * program termination) in this case, but it doesn't cost much to be a bit
3317  * nicer: disabling DCC in the shader still leads to undefined results but
3318  * avoids the lockup.
3319  */
3320 static LLVMValueRef force_dcc_off(struct si_shader_context *ctx,
3321                                   LLVMValueRef rsrc)
3322 {
3323         if (ctx->screen->b.chip_class <= CIK) {
3324                 return rsrc;
3325         } else {
3326                 LLVMBuilderRef builder = ctx->gallivm.builder;
3327                 LLVMValueRef i32_6 = LLVMConstInt(ctx->i32, 6, 0);
3328                 LLVMValueRef i32_C = LLVMConstInt(ctx->i32, C_008F28_COMPRESSION_EN, 0);
3329                 LLVMValueRef tmp;
3330
3331                 tmp = LLVMBuildExtractElement(builder, rsrc, i32_6, "");
3332                 tmp = LLVMBuildAnd(builder, tmp, i32_C, "");
3333                 return LLVMBuildInsertElement(builder, rsrc, tmp, i32_6, "");
3334         }
3335 }
3336
3337 static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements)
3338 {
3339         return LLVMPointerType(LLVMArrayType(elem_type, num_elements),
3340                                CONST_ADDR_SPACE);
3341 }
3342
3343 /**
3344  * Load the resource descriptor for \p image.
3345  */
3346 static void
3347 image_fetch_rsrc(
3348         struct lp_build_tgsi_context *bld_base,
3349         const struct tgsi_full_src_register *image,
3350         bool is_store, unsigned target,
3351         LLVMValueRef *rsrc)
3352 {
3353         struct si_shader_context *ctx = si_shader_context(bld_base);
3354         LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn,
3355                                              SI_PARAM_IMAGES);
3356         LLVMValueRef index, tmp;
3357         bool dcc_off = target != TGSI_TEXTURE_BUFFER && is_store;
3358
3359         assert(image->Register.File == TGSI_FILE_IMAGE);
3360
3361         if (!image->Register.Indirect) {
3362                 const struct tgsi_shader_info *info = bld_base->info;
3363
3364                 index = LLVMConstInt(ctx->i32, image->Register.Index, 0);
3365
3366                 if (info->images_writemask & (1 << image->Register.Index) &&
3367                     target != TGSI_TEXTURE_BUFFER)
3368                         dcc_off = true;
3369         } else {
3370                 /* From the GL_ARB_shader_image_load_store extension spec:
3371                  *
3372                  *    If a shader performs an image load, store, or atomic
3373                  *    operation using an image variable declared as an array,
3374                  *    and if the index used to select an individual element is
3375                  *    negative or greater than or equal to the size of the
3376                  *    array, the results of the operation are undefined but may
3377                  *    not lead to termination.
3378                  */
3379                 index = get_bounded_indirect_index(ctx, &image->Indirect,
3380                                                    image->Register.Index,
3381                                                    SI_NUM_IMAGES);
3382         }
3383
3384         if (target == TGSI_TEXTURE_BUFFER) {
3385                 LLVMBuilderRef builder = ctx->gallivm.builder;
3386
3387                 rsrc_ptr = LLVMBuildPointerCast(builder, rsrc_ptr,
3388                                                 const_array(ctx->v4i32, 0), "");
3389                 index = LLVMBuildMul(builder, index,
3390                                      LLVMConstInt(ctx->i32, 2, 0), "");
3391                 index = LLVMBuildAdd(builder, index,
3392                                      LLVMConstInt(ctx->i32, 1, 0), "");
3393                 *rsrc = ac_build_indexed_load_const(&ctx->ac, rsrc_ptr, index);
3394                 return;
3395         }
3396
3397         tmp = ac_build_indexed_load_const(&ctx->ac, rsrc_ptr, index);
3398         if (dcc_off)
3399                 tmp = force_dcc_off(ctx, tmp);
3400         *rsrc = tmp;
3401 }
3402
3403 static LLVMValueRef image_fetch_coords(
3404                 struct lp_build_tgsi_context *bld_base,
3405                 const struct tgsi_full_instruction *inst,
3406                 unsigned src)
3407 {
3408         struct gallivm_state *gallivm = bld_base->base.gallivm;
3409         LLVMBuilderRef builder = gallivm->builder;
3410         unsigned target = inst->Memory.Texture;
3411         unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
3412         LLVMValueRef coords[4];
3413         LLVMValueRef tmp;
3414         int chan;
3415
3416         for (chan = 0; chan < num_coords; ++chan) {
3417                 tmp = lp_build_emit_fetch(bld_base, inst, src, chan);
3418                 tmp = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
3419                 coords[chan] = tmp;
3420         }
3421
3422         if (num_coords == 1)
3423                 return coords[0];
3424
3425         if (num_coords == 3) {
3426                 /* LLVM has difficulties lowering 3-element vectors. */
3427                 coords[3] = bld_base->uint_bld.undef;
3428                 num_coords = 4;
3429         }
3430
3431         return lp_build_gather_values(gallivm, coords, num_coords);
3432 }
3433
3434 /**
3435  * Append the extra mode bits that are used by image load and store.
3436  */
3437 static void image_append_args(
3438                 struct si_shader_context *ctx,
3439                 struct lp_build_emit_data * emit_data,
3440                 unsigned target,
3441                 bool atomic,
3442                 bool force_glc)
3443 {
3444         const struct tgsi_full_instruction *inst = emit_data->inst;
3445         LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
3446         LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
3447         LLVMValueRef r128 = i1false;
3448         LLVMValueRef da = tgsi_is_array_image(target) ? i1true : i1false;
3449         LLVMValueRef glc =
3450                 force_glc ||
3451                 inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
3452                 i1true : i1false;
3453         LLVMValueRef slc = i1false;
3454         LLVMValueRef lwe = i1false;
3455
3456         if (atomic || (HAVE_LLVM <= 0x0309)) {
3457                 emit_data->args[emit_data->arg_count++] = r128;
3458                 emit_data->args[emit_data->arg_count++] = da;
3459                 if (!atomic) {
3460                         emit_data->args[emit_data->arg_count++] = glc;
3461                 }
3462                 emit_data->args[emit_data->arg_count++] = slc;
3463                 return;
3464         }
3465
3466         /* HAVE_LLVM >= 0x0400 */
3467         emit_data->args[emit_data->arg_count++] = glc;
3468         emit_data->args[emit_data->arg_count++] = slc;
3469         emit_data->args[emit_data->arg_count++] = lwe;
3470         emit_data->args[emit_data->arg_count++] = da;
3471 }
3472
3473 /**
3474  * Append the resource and indexing arguments for buffer intrinsics.
3475  *
3476  * \param rsrc the v4i32 buffer resource
3477  * \param index index into the buffer (stride-based)
3478  * \param offset byte offset into the buffer
3479  */
3480 static void buffer_append_args(
3481                 struct si_shader_context *ctx,
3482                 struct lp_build_emit_data *emit_data,
3483                 LLVMValueRef rsrc,
3484                 LLVMValueRef index,
3485                 LLVMValueRef offset,
3486                 bool atomic,
3487                 bool force_glc)
3488 {
3489         const struct tgsi_full_instruction *inst = emit_data->inst;
3490         LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
3491         LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
3492
3493         emit_data->args[emit_data->arg_count++] = rsrc;
3494         emit_data->args[emit_data->arg_count++] = index; /* vindex */
3495         emit_data->args[emit_data->arg_count++] = offset; /* voffset */
3496         if (!atomic) {
3497                 emit_data->args[emit_data->arg_count++] =
3498                         force_glc ||
3499                         inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
3500                         i1true : i1false; /* glc */
3501         }
3502         emit_data->args[emit_data->arg_count++] = i1false; /* slc */
3503 }
3504
3505 static void load_fetch_args(
3506                 struct lp_build_tgsi_context * bld_base,
3507                 struct lp_build_emit_data * emit_data)
3508 {
3509         struct si_shader_context *ctx = si_shader_context(bld_base);
3510         struct gallivm_state *gallivm = bld_base->base.gallivm;
3511         const struct tgsi_full_instruction * inst = emit_data->inst;
3512         unsigned target = inst->Memory.Texture;
3513         LLVMValueRef rsrc;
3514
3515         emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
3516
3517         if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
3518                 LLVMBuilderRef builder = gallivm->builder;
3519                 LLVMValueRef offset;
3520                 LLVMValueRef tmp;
3521
3522                 rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);
3523
3524                 tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
3525                 offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
3526
3527                 buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
3528                                    offset, false, false);
3529         } else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
3530                 LLVMValueRef coords;
3531
3532                 image_fetch_rsrc(bld_base, &inst->Src[0], false, target, &rsrc);
3533                 coords = image_fetch_coords(bld_base, inst, 1);
3534
3535                 if (target == TGSI_TEXTURE_BUFFER) {
3536                         buffer_append_args(ctx, emit_data, rsrc, coords,
3537                                            bld_base->uint_bld.zero, false, false);
3538                 } else {
3539                         emit_data->args[0] = coords;
3540                         emit_data->args[1] = rsrc;
3541                         emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */
3542                         emit_data->arg_count = 3;
3543
3544                         image_append_args(ctx, emit_data, target, false, false);
3545                 }
3546         }
3547 }
3548
3549 static void load_emit_buffer(struct si_shader_context *ctx,
3550                              struct lp_build_emit_data *emit_data)
3551 {
3552         const struct tgsi_full_instruction *inst = emit_data->inst;
3553         struct gallivm_state *gallivm = &ctx->gallivm;
3554         LLVMBuilderRef builder = gallivm->builder;
3555         uint writemask = inst->Dst[0].Register.WriteMask;
3556         uint count = util_last_bit(writemask);
3557         const char *intrinsic_name;
3558         LLVMTypeRef dst_type;
3559
3560         switch (count) {
3561         case 1:
3562                 intrinsic_name = "llvm.amdgcn.buffer.load.f32";
3563                 dst_type = ctx->f32;
3564                 break;
3565         case 2:
3566                 intrinsic_name = "llvm.amdgcn.buffer.load.v2f32";
3567                 dst_type = LLVMVectorType(ctx->f32, 2);
3568                 break;
3569         default: // 3 & 4
3570                 intrinsic_name = "llvm.amdgcn.buffer.load.v4f32";
3571                 dst_type = ctx->v4f32;
3572                 count = 4;
3573         }
3574
3575         emit_data->output[emit_data->chan] = lp_build_intrinsic(
3576                         builder, intrinsic_name, dst_type,
3577                         emit_data->args, emit_data->arg_count,
3578                         LP_FUNC_ATTR_READONLY);
3579 }
3580
3581 static LLVMValueRef get_memory_ptr(struct si_shader_context *ctx,
3582                                    const struct tgsi_full_instruction *inst,
3583                                    LLVMTypeRef type, int arg)
3584 {
3585         struct gallivm_state *gallivm = &ctx->gallivm;
3586         LLVMBuilderRef builder = gallivm->builder;
3587         LLVMValueRef offset, ptr;
3588         int addr_space;
3589
3590         offset = lp_build_emit_fetch(&ctx->bld_base, inst, arg, 0);
3591         offset = LLVMBuildBitCast(builder, offset, ctx->i32, "");
3592
3593         ptr = ctx->shared_memory;
3594         ptr = LLVMBuildGEP(builder, ptr, &offset, 1, "");
3595         addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
3596         ptr = LLVMBuildBitCast(builder, ptr, LLVMPointerType(type, addr_space), "");
3597
3598         return ptr;
3599 }
3600
3601 static void load_emit_memory(
3602                 struct si_shader_context *ctx,
3603                 struct lp_build_emit_data *emit_data)
3604 {
3605         const struct tgsi_full_instruction *inst = emit_data->inst;
3606         struct lp_build_context *base = &ctx->bld_base.base;
3607         struct gallivm_state *gallivm = &ctx->gallivm;
3608         LLVMBuilderRef builder = gallivm->builder;
3609         unsigned writemask = inst->Dst[0].Register.WriteMask;
3610         LLVMValueRef channels[4], ptr, derived_ptr, index;
3611         int chan;
3612
3613         ptr = get_memory_ptr(ctx, inst, base->elem_type, 1);
3614
3615         for (chan = 0; chan < 4; ++chan) {
3616                 if (!(writemask & (1 << chan))) {
3617                         channels[chan] = LLVMGetUndef(base->elem_type);
3618                         continue;
3619                 }
3620
3621                 index = lp_build_const_int32(gallivm, chan);
3622                 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
3623                 channels[chan] = LLVMBuildLoad(builder, derived_ptr, "");
3624         }
3625         emit_data->output[emit_data->chan] = lp_build_gather_values(gallivm, channels, 4);
3626 }
3627
3628 static void get_image_intr_name(const char *base_name,
3629                                 LLVMTypeRef data_type,
3630                                 LLVMTypeRef coords_type,
3631                                 LLVMTypeRef rsrc_type,
3632                                 char *out_name, unsigned out_len)
3633 {
3634         char coords_type_name[8];
3635
3636         build_type_name_for_intr(coords_type, coords_type_name,
3637                             sizeof(coords_type_name));
3638
3639         if (HAVE_LLVM <= 0x0309) {
3640                 snprintf(out_name, out_len, "%s.%s", base_name, coords_type_name);
3641         } else {
3642                 char data_type_name[8];
3643                 char rsrc_type_name[8];
3644
3645                 build_type_name_for_intr(data_type, data_type_name,
3646                                         sizeof(data_type_name));
3647                 build_type_name_for_intr(rsrc_type, rsrc_type_name,
3648                                         sizeof(rsrc_type_name));
3649                 snprintf(out_name, out_len, "%s.%s.%s.%s", base_name,
3650                          data_type_name, coords_type_name, rsrc_type_name);
3651         }
3652 }
3653
3654 static void load_emit(
3655                 const struct lp_build_tgsi_action *action,
3656                 struct lp_build_tgsi_context *bld_base,
3657                 struct lp_build_emit_data *emit_data)
3658 {
3659         struct si_shader_context *ctx = si_shader_context(bld_base);
3660         struct gallivm_state *gallivm = bld_base->base.gallivm;
3661         LLVMBuilderRef builder = gallivm->builder;
3662         const struct tgsi_full_instruction * inst = emit_data->inst;
3663         char intrinsic_name[64];
3664
3665         if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
3666                 load_emit_memory(ctx, emit_data);
3667                 return;
3668         }
3669
3670         if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
3671                 emit_waitcnt(ctx, VM_CNT);
3672
3673         if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
3674                 load_emit_buffer(ctx, emit_data);
3675                 return;
3676         }
3677
3678         if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
3679                 emit_data->output[emit_data->chan] =
3680                         lp_build_intrinsic(
3681                                 builder, "llvm.amdgcn.buffer.load.format.v4f32", emit_data->dst_type,
3682                                 emit_data->args, emit_data->arg_count,
3683                                 LP_FUNC_ATTR_READONLY);
3684         } else {
3685                 get_image_intr_name("llvm.amdgcn.image.load",
3686                                 emit_data->dst_type,            /* vdata */
3687                                 LLVMTypeOf(emit_data->args[0]), /* coords */
3688                                 LLVMTypeOf(emit_data->args[1]), /* rsrc */
3689                                 intrinsic_name, sizeof(intrinsic_name));
3690
3691                 emit_data->output[emit_data->chan] =
3692                         lp_build_intrinsic(
3693                                 builder, intrinsic_name, emit_data->dst_type,
3694                                 emit_data->args, emit_data->arg_count,
3695                                 LP_FUNC_ATTR_READONLY);
3696         }
3697 }
3698
3699 static void store_fetch_args(
3700                 struct lp_build_tgsi_context * bld_base,
3701                 struct lp_build_emit_data * emit_data)
3702 {
3703         struct si_shader_context *ctx = si_shader_context(bld_base);
3704         struct gallivm_state *gallivm = bld_base->base.gallivm;
3705         LLVMBuilderRef builder = gallivm->builder;
3706         const struct tgsi_full_instruction * inst = emit_data->inst;
3707         struct tgsi_full_src_register memory;
3708         LLVMValueRef chans[4];
3709         LLVMValueRef data;
3710         LLVMValueRef rsrc;
3711         unsigned chan;
3712
3713         emit_data->dst_type = LLVMVoidTypeInContext(gallivm->context);
3714
3715         for (chan = 0; chan < 4; ++chan) {
3716                 chans[chan] = lp_build_emit_fetch(bld_base, inst, 1, chan);
3717         }
3718         data = lp_build_gather_values(gallivm, chans, 4);
3719
3720         emit_data->args[emit_data->arg_count++] = data;
3721
3722         memory = tgsi_full_src_register_from_dst(&inst->Dst[0]);
3723
3724         if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
3725                 LLVMValueRef offset;
3726                 LLVMValueRef tmp;
3727
3728                 rsrc = shader_buffer_fetch_rsrc(ctx, &memory);
3729
3730                 tmp = lp_build_emit_fetch(bld_base, inst, 0, 0);
3731                 offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
3732
3733                 buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
3734                                    offset, false, false);
3735         } else if (inst->Dst[0].Register.File == TGSI_FILE_IMAGE) {
3736                 unsigned target = inst->Memory.Texture;
3737                 LLVMValueRef coords;
3738
3739                 /* 8bit/16bit TC L1 write corruption bug on SI.
3740                  * All store opcodes not aligned to a dword are affected.
3741                  *
3742                  * The only way to get unaligned stores in radeonsi is through
3743                  * shader images.
3744                  */
3745                 bool force_glc = ctx->screen->b.chip_class == SI;
3746
3747                 coords = image_fetch_coords(bld_base, inst, 0);
3748
3749                 if (target == TGSI_TEXTURE_BUFFER) {
3750                         image_fetch_rsrc(bld_base, &memory, true, target, &rsrc);
3751                         buffer_append_args(ctx, emit_data, rsrc, coords,
3752                                            bld_base->uint_bld.zero, false, force_glc);
3753                 } else {
3754                         emit_data->args[1] = coords;
3755                         image_fetch_rsrc(bld_base, &memory, true, target,
3756                                          &emit_data->args[2]);
3757                         emit_data->args[3] = lp_build_const_int32(gallivm, 15); /* dmask */
3758                         emit_data->arg_count = 4;
3759
3760                         image_append_args(ctx, emit_data, target, false, force_glc);
3761                 }
3762         }
3763 }
3764
3765 static void store_emit_buffer(
3766                 struct si_shader_context *ctx,
3767                 struct lp_build_emit_data *emit_data)
3768 {
3769         const struct tgsi_full_instruction *inst = emit_data->inst;
3770         struct gallivm_state *gallivm = &ctx->gallivm;
3771         LLVMBuilderRef builder = gallivm->builder;
3772         struct lp_build_context *uint_bld = &ctx->bld_base.uint_bld;
3773         LLVMValueRef base_data = emit_data->args[0];
3774         LLVMValueRef base_offset = emit_data->args[3];
3775         unsigned writemask = inst->Dst[0].Register.WriteMask;
3776
3777         while (writemask) {
3778                 int start, count;
3779                 const char *intrinsic_name;
3780                 LLVMValueRef data;
3781                 LLVMValueRef offset;
3782                 LLVMValueRef tmp;
3783
3784                 u_bit_scan_consecutive_range(&writemask, &start, &count);
3785
3786                 /* Due to an LLVM limitation, split 3-element writes
3787                  * into a 2-element and a 1-element write. */
3788                 if (count == 3) {
3789                         writemask |= 1 << (start + 2);
3790                         count = 2;
3791                 }
3792
3793                 if (count == 4) {
3794                         data = base_data;
3795                         intrinsic_name = "llvm.amdgcn.buffer.store.v4f32";
3796                 } else if (count == 2) {
3797                         LLVMTypeRef v2f32 = LLVMVectorType(ctx->f32, 2);
3798
3799                         tmp = LLVMBuildExtractElement(
3800                                 builder, base_data,
3801                                 lp_build_const_int32(gallivm, start), "");
3802                         data = LLVMBuildInsertElement(
3803                                 builder, LLVMGetUndef(v2f32), tmp,
3804                                 uint_bld->zero, "");
3805
3806                         tmp = LLVMBuildExtractElement(
3807                                 builder, base_data,
3808                                 lp_build_const_int32(gallivm, start + 1), "");
3809                         data = LLVMBuildInsertElement(
3810                                 builder, data, tmp, uint_bld->one, "");
3811
3812                         intrinsic_name = "llvm.amdgcn.buffer.store.v2f32";
3813                 } else {
3814                         assert(count == 1);
3815                         data = LLVMBuildExtractElement(
3816                                 builder, base_data,
3817                                 lp_build_const_int32(gallivm, start), "");
3818                         intrinsic_name = "llvm.amdgcn.buffer.store.f32";
3819                 }
3820
3821                 offset = base_offset;
3822                 if (start != 0) {
3823                         offset = LLVMBuildAdd(
3824                                 builder, offset,
3825                                 lp_build_const_int32(gallivm, start * 4), "");
3826                 }
3827
3828                 emit_data->args[0] = data;
3829                 emit_data->args[3] = offset;
3830
3831                 lp_build_intrinsic(
3832                         builder, intrinsic_name, emit_data->dst_type,
3833                         emit_data->args, emit_data->arg_count, 0);
3834         }
3835 }
3836
3837 static void store_emit_memory(
3838                 struct si_shader_context *ctx,
3839                 struct lp_build_emit_data *emit_data)
3840 {
3841         const struct tgsi_full_instruction *inst = emit_data->inst;
3842         struct gallivm_state *gallivm = &ctx->gallivm;
3843         struct lp_build_context *base = &ctx->bld_base.base;
3844         LLVMBuilderRef builder = gallivm->builder;
3845         unsigned writemask = inst->Dst[0].Register.WriteMask;
3846         LLVMValueRef ptr, derived_ptr, data, index;
3847         int chan;
3848
3849         ptr = get_memory_ptr(ctx, inst, base->elem_type, 0);
3850
3851         for (chan = 0; chan < 4; ++chan) {
3852                 if (!(writemask & (1 << chan))) {
3853                         continue;
3854                 }
3855                 data = lp_build_emit_fetch(&ctx->bld_base, inst, 1, chan);
3856                 index = lp_build_const_int32(gallivm, chan);
3857                 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
3858                 LLVMBuildStore(builder, data, derived_ptr);
3859         }
3860 }
3861
3862 static void store_emit(
3863                 const struct lp_build_tgsi_action *action,
3864                 struct lp_build_tgsi_context *bld_base,
3865                 struct lp_build_emit_data *emit_data)
3866 {
3867         struct si_shader_context *ctx = si_shader_context(bld_base);
3868         struct gallivm_state *gallivm = bld_base->base.gallivm;
3869         LLVMBuilderRef builder = gallivm->builder;
3870         const struct tgsi_full_instruction * inst = emit_data->inst;
3871         unsigned target = inst->Memory.Texture;
3872         char intrinsic_name[64];
3873
3874         if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY) {
3875                 store_emit_memory(ctx, emit_data);
3876                 return;
3877         }
3878
3879         if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
3880                 emit_waitcnt(ctx, VM_CNT);
3881
3882         if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
3883                 store_emit_buffer(ctx, emit_data);
3884                 return;
3885         }
3886
3887         if (target == TGSI_TEXTURE_BUFFER) {
3888                 emit_data->output[emit_data->chan] = lp_build_intrinsic(
3889                         builder, "llvm.amdgcn.buffer.store.format.v4f32",
3890                         emit_data->dst_type, emit_data->args,
3891                         emit_data->arg_count, 0);
3892         } else {
3893                 get_image_intr_name("llvm.amdgcn.image.store",
3894                                 LLVMTypeOf(emit_data->args[0]), /* vdata */
3895                                 LLVMTypeOf(emit_data->args[1]), /* coords */
3896                                 LLVMTypeOf(emit_data->args[2]), /* rsrc */
3897                                 intrinsic_name, sizeof(intrinsic_name));
3898
3899                 emit_data->output[emit_data->chan] =
3900                         lp_build_intrinsic(
3901                                 builder, intrinsic_name, emit_data->dst_type,
3902                                 emit_data->args, emit_data->arg_count, 0);
3903         }
3904 }
3905
3906 static void atomic_fetch_args(
3907                 struct lp_build_tgsi_context * bld_base,
3908                 struct lp_build_emit_data * emit_data)
3909 {
3910         struct si_shader_context *ctx = si_shader_context(bld_base);
3911         struct gallivm_state *gallivm = bld_base->base.gallivm;
3912         LLVMBuilderRef builder = gallivm->builder;
3913         const struct tgsi_full_instruction * inst = emit_data->inst;
3914         LLVMValueRef data1, data2;
3915         LLVMValueRef rsrc;
3916         LLVMValueRef tmp;
3917
3918         emit_data->dst_type = bld_base->base.elem_type;
3919
3920         tmp = lp_build_emit_fetch(bld_base, inst, 2, 0);
3921         data1 = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
3922
3923         if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
3924                 tmp = lp_build_emit_fetch(bld_base, inst, 3, 0);
3925                 data2 = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
3926         }
3927
3928         /* llvm.amdgcn.image/buffer.atomic.cmpswap reflect the hardware order
3929          * of arguments, which is reversed relative to TGSI (and GLSL)
3930          */
3931         if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
3932                 emit_data->args[emit_data->arg_count++] = data2;
3933         emit_data->args[emit_data->arg_count++] = data1;
3934
3935         if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
3936                 LLVMValueRef offset;
3937
3938                 rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);
3939
3940                 tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
3941                 offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
3942
3943                 buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
3944                                    offset, true, false);
3945         } else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
3946                 unsigned target = inst->Memory.Texture;
3947                 LLVMValueRef coords;
3948
3949                 image_fetch_rsrc(bld_base, &inst->Src[0], true, target, &rsrc);
3950                 coords = image_fetch_coords(bld_base, inst, 1);
3951
3952                 if (target == TGSI_TEXTURE_BUFFER) {
3953                         buffer_append_args(ctx, emit_data, rsrc, coords,
3954                                            bld_base->uint_bld.zero, true, false);
3955                 } else {
3956                         emit_data->args[emit_data->arg_count++] = coords;
3957                         emit_data->args[emit_data->arg_count++] = rsrc;
3958
3959                         image_append_args(ctx, emit_data, target, true, false);
3960                 }
3961         }
3962 }
3963
3964 static void atomic_emit_memory(struct si_shader_context *ctx,
3965                                struct lp_build_emit_data *emit_data) {
3966         struct gallivm_state *gallivm = &ctx->gallivm;
3967         LLVMBuilderRef builder = gallivm->builder;
3968         const struct tgsi_full_instruction * inst = emit_data->inst;
3969         LLVMValueRef ptr, result, arg;
3970
3971         ptr = get_memory_ptr(ctx, inst, ctx->i32, 1);
3972
3973         arg = lp_build_emit_fetch(&ctx->bld_base, inst, 2, 0);
3974         arg = LLVMBuildBitCast(builder, arg, ctx->i32, "");
3975
3976         if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
3977                 LLVMValueRef new_data;
3978                 new_data = lp_build_emit_fetch(&ctx->bld_base,
3979                                                inst, 3, 0);
3980
3981                 new_data = LLVMBuildBitCast(builder, new_data, ctx->i32, "");
3982
3983 #if HAVE_LLVM >= 0x309
3984                 result = LLVMBuildAtomicCmpXchg(builder, ptr, arg, new_data,
3985                                        LLVMAtomicOrderingSequentiallyConsistent,
3986                                        LLVMAtomicOrderingSequentiallyConsistent,
3987                                        false);
3988 #endif
3989
3990                 result = LLVMBuildExtractValue(builder, result, 0, "");
3991         } else {
3992                 LLVMAtomicRMWBinOp op;
3993
3994                 switch(inst->Instruction.Opcode) {
3995                         case TGSI_OPCODE_ATOMUADD:
3996                                 op = LLVMAtomicRMWBinOpAdd;
3997                                 break;
3998                         case TGSI_OPCODE_ATOMXCHG:
3999                                 op = LLVMAtomicRMWBinOpXchg;
4000                                 break;
4001                         case TGSI_OPCODE_ATOMAND:
4002                                 op = LLVMAtomicRMWBinOpAnd;
4003                                 break;
4004                         case TGSI_OPCODE_ATOMOR:
4005                                 op = LLVMAtomicRMWBinOpOr;
4006                                 break;
4007                         case TGSI_OPCODE_ATOMXOR:
4008                                 op = LLVMAtomicRMWBinOpXor;
4009                                 break;
4010                         case TGSI_OPCODE_ATOMUMIN:
4011                                 op = LLVMAtomicRMWBinOpUMin;
4012                                 break;
4013                         case TGSI_OPCODE_ATOMUMAX:
4014                                 op = LLVMAtomicRMWBinOpUMax;
4015                                 break;
4016                         case TGSI_OPCODE_ATOMIMIN:
4017                                 op = LLVMAtomicRMWBinOpMin;
4018                                 break;
4019                         case TGSI_OPCODE_ATOMIMAX:
4020                                 op = LLVMAtomicRMWBinOpMax;
4021                                 break;
4022                         default:
4023                                 unreachable("unknown atomic opcode");
4024                 }
4025
4026                 result = LLVMBuildAtomicRMW(builder, op, ptr, arg,
4027                                        LLVMAtomicOrderingSequentiallyConsistent,
4028                                        false);
4029         }
4030         emit_data->output[emit_data->chan] = LLVMBuildBitCast(builder, result, emit_data->dst_type, "");
4031 }
4032
4033 static void atomic_emit(
4034                 const struct lp_build_tgsi_action *action,
4035                 struct lp_build_tgsi_context *bld_base,
4036                 struct lp_build_emit_data *emit_data)
4037 {
4038         struct si_shader_context *ctx = si_shader_context(bld_base);
4039         struct gallivm_state *gallivm = bld_base->base.gallivm;
4040         LLVMBuilderRef builder = gallivm->builder;
4041         const struct tgsi_full_instruction * inst = emit_data->inst;
4042         char intrinsic_name[40];
4043         LLVMValueRef tmp;
4044
4045         if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
4046                 atomic_emit_memory(ctx, emit_data);
4047                 return;
4048         }
4049
4050         if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
4051             inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
4052                 snprintf(intrinsic_name, sizeof(intrinsic_name),
4053                          "llvm.amdgcn.buffer.atomic.%s", action->intr_name);
4054         } else {
4055                 LLVMValueRef coords;
4056                 char coords_type[8];
4057
4058                 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
4059                         coords = emit_data->args[2];
4060                 else
4061                         coords = emit_data->args[1];
4062
4063                 build_type_name_for_intr(LLVMTypeOf(coords), coords_type, sizeof(coords_type));
4064                 snprintf(intrinsic_name, sizeof(intrinsic_name),
4065                          "llvm.amdgcn.image.atomic.%s.%s",
4066                          action->intr_name, coords_type);
4067         }
4068
4069         tmp = lp_build_intrinsic(
4070                 builder, intrinsic_name, bld_base->uint_bld.elem_type,
4071                 emit_data->args, emit_data->arg_count, 0);
4072         emit_data->output[emit_data->chan] =
4073                 LLVMBuildBitCast(builder, tmp, bld_base->base.elem_type, "");
4074 }
4075
4076 static void resq_fetch_args(
4077                 struct lp_build_tgsi_context * bld_base,
4078                 struct lp_build_emit_data * emit_data)
4079 {
4080         struct si_shader_context *ctx = si_shader_context(bld_base);
4081         struct gallivm_state *gallivm = bld_base->base.gallivm;
4082         const struct tgsi_full_instruction *inst = emit_data->inst;
4083         const struct tgsi_full_src_register *reg = &inst->Src[0];
4084
4085         emit_data->dst_type = ctx->v4i32;
4086
4087         if (reg->Register.File == TGSI_FILE_BUFFER) {
4088                 emit_data->args[0] = shader_buffer_fetch_rsrc(ctx, reg);
4089                 emit_data->arg_count = 1;
4090         } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
4091                 image_fetch_rsrc(bld_base, reg, false, inst->Memory.Texture,
4092                                  &emit_data->args[0]);
4093                 emit_data->arg_count = 1;
4094         } else {
4095                 emit_data->args[0] = bld_base->uint_bld.zero; /* mip level */
4096                 image_fetch_rsrc(bld_base, reg, false, inst->Memory.Texture,
4097                                  &emit_data->args[1]);
4098                 emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */
4099                 emit_data->args[3] = bld_base->uint_bld.zero; /* unorm */
4100                 emit_data->args[4] = bld_base->uint_bld.zero; /* r128 */
4101                 emit_data->args[5] = tgsi_is_array_image(inst->Memory.Texture) ?
4102                         bld_base->uint_bld.one : bld_base->uint_bld.zero; /* da */
4103                 emit_data->args[6] = bld_base->uint_bld.zero; /* glc */
4104                 emit_data->args[7] = bld_base->uint_bld.zero; /* slc */
4105                 emit_data->args[8] = bld_base->uint_bld.zero; /* tfe */
4106                 emit_data->args[9] = bld_base->uint_bld.zero; /* lwe */
4107                 emit_data->arg_count = 10;
4108         }
4109 }
4110
4111 static void resq_emit(
4112                 const struct lp_build_tgsi_action *action,
4113                 struct lp_build_tgsi_context *bld_base,
4114                 struct lp_build_emit_data *emit_data)
4115 {
4116         struct gallivm_state *gallivm = bld_base->base.gallivm;
4117         LLVMBuilderRef builder = gallivm->builder;
4118         const struct tgsi_full_instruction *inst = emit_data->inst;
4119         LLVMValueRef out;
4120
4121         if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
4122                 out = LLVMBuildExtractElement(builder, emit_data->args[0],
4123                                               lp_build_const_int32(gallivm, 2), "");
4124         } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
4125                 out = get_buffer_size(bld_base, emit_data->args[0]);
4126         } else {
4127                 out = lp_build_intrinsic(
4128                         builder, "llvm.SI.getresinfo.i32", emit_data->dst_type,
4129                         emit_data->args, emit_data->arg_count,
4130                         LP_FUNC_ATTR_READNONE);
4131
4132                 /* Divide the number of layers by 6 to get the number of cubes. */
4133                 if (inst->Memory.Texture == TGSI_TEXTURE_CUBE_ARRAY) {
4134                         LLVMValueRef imm2 = lp_build_const_int32(gallivm, 2);
4135                         LLVMValueRef imm6 = lp_build_const_int32(gallivm, 6);
4136
4137                         LLVMValueRef z = LLVMBuildExtractElement(builder, out, imm2, "");
4138                         z = LLVMBuildSDiv(builder, z, imm6, "");
4139                         out = LLVMBuildInsertElement(builder, out, z, imm2, "");
4140                 }
4141         }
4142
4143         emit_data->output[emit_data->chan] = out;
4144 }
4145
4146 static void set_tex_fetch_args(struct si_shader_context *ctx,
4147                                struct lp_build_emit_data *emit_data,
4148                                unsigned opcode, unsigned target,
4149                                LLVMValueRef res_ptr, LLVMValueRef samp_ptr,
4150                                LLVMValueRef *param, unsigned count,
4151                                unsigned dmask)
4152 {
4153         struct gallivm_state *gallivm = &ctx->gallivm;
4154         unsigned num_args;
4155         unsigned is_rect = target == TGSI_TEXTURE_RECT;
4156
4157         /* Pad to power of two vector */
4158         while (count < util_next_power_of_two(count))
4159                 param[count++] = LLVMGetUndef(ctx->i32);
4160
4161         /* Texture coordinates. */
4162         if (count > 1)
4163                 emit_data->args[0] = lp_build_gather_values(gallivm, param, count);
4164         else
4165                 emit_data->args[0] = param[0];
4166
4167         /* Resource. */
4168         emit_data->args[1] = res_ptr;
4169         num_args = 2;
4170
4171         if (opcode == TGSI_OPCODE_TXF || opcode == TGSI_OPCODE_TXQ)
4172                 emit_data->dst_type = ctx->v4i32;
4173         else {
4174                 emit_data->dst_type = ctx->v4f32;
4175
4176                 emit_data->args[num_args++] = samp_ptr;
4177         }
4178
4179         emit_data->args[num_args++] = lp_build_const_int32(gallivm, dmask);
4180         emit_data->args[num_args++] = lp_build_const_int32(gallivm, is_rect); /* unorm */
4181         emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* r128 */
4182         emit_data->args[num_args++] = lp_build_const_int32(gallivm,
4183                                         tgsi_is_array_sampler(target)); /* da */
4184         emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* glc */
4185         emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* slc */
4186         emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* tfe */
4187         emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* lwe */
4188
4189         emit_data->arg_count = num_args;
4190 }
4191
/* Declared ahead of its use: tex_fetch_args() passes &tex_action to
 * build_tex_intrinsic() when it emits the FMASK fetch.
 */
static const struct lp_build_tgsi_action tex_action;

/* Selects which descriptor load_sampler_desc_custom() loads for a slot. */
enum desc_type {
        DESC_IMAGE,   /* image view, at [0:7] */
        DESC_BUFFER,  /* buffer view, in [4:7] */
        DESC_FMASK,   /* FMASK view, at [8:15] */
        DESC_SAMPLER, /* sampler state, at [12:15] */
};
4200
4201 /**
4202  * Load an image view, fmask view. or sampler state descriptor.
4203  */
4204 static LLVMValueRef load_sampler_desc_custom(struct si_shader_context *ctx,
4205                                              LLVMValueRef list, LLVMValueRef index,
4206                                              enum desc_type type)
4207 {
4208         struct gallivm_state *gallivm = &ctx->gallivm;
4209         LLVMBuilderRef builder = gallivm->builder;
4210
4211         switch (type) {
4212         case DESC_IMAGE:
4213                 /* The image is at [0:7]. */
4214                 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
4215                 break;
4216         case DESC_BUFFER:
4217                 /* The buffer is in [4:7]. */
4218                 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
4219                 index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 1, 0), "");
4220                 list = LLVMBuildPointerCast(builder, list,
4221                                             const_array(ctx->v4i32, 0), "");
4222                 break;
4223         case DESC_FMASK:
4224                 /* The FMASK is at [8:15]. */
4225                 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
4226                 index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 1, 0), "");
4227                 break;
4228         case DESC_SAMPLER:
4229                 /* The sampler state is at [12:15]. */
4230                 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
4231                 index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 3, 0), "");
4232                 list = LLVMBuildPointerCast(builder, list,
4233                                             const_array(ctx->v4i32, 0), "");
4234                 break;
4235         }
4236
4237         return ac_build_indexed_load_const(&ctx->ac, list, index);
4238 }
4239
4240 static LLVMValueRef load_sampler_desc(struct si_shader_context *ctx,
4241                                      LLVMValueRef index, enum desc_type type)
4242 {
4243         LLVMValueRef list = LLVMGetParam(ctx->main_fn,
4244                                          SI_PARAM_SAMPLERS);
4245
4246         return load_sampler_desc_custom(ctx, list, index, type);
4247 }
4248
4249 /* Disable anisotropic filtering if BASE_LEVEL == LAST_LEVEL.
4250  *
4251  * SI-CI:
4252  *   If BASE_LEVEL == LAST_LEVEL, the shader must disable anisotropic
4253  *   filtering manually. The driver sets img7 to a mask clearing
4254  *   MAX_ANISO_RATIO if BASE_LEVEL == LAST_LEVEL. The shader must do:
4255  *     s_and_b32 samp0, samp0, img7
4256  *
4257  * VI:
4258  *   The ANISO_OVERRIDE sampler field enables this fix in TA.
4259  */
4260 static LLVMValueRef sici_fix_sampler_aniso(struct si_shader_context *ctx,
4261                                            LLVMValueRef res, LLVMValueRef samp)
4262 {
4263         LLVMBuilderRef builder = ctx->gallivm.builder;
4264         LLVMValueRef img7, samp0;
4265
4266         if (ctx->screen->b.chip_class >= VI)
4267                 return samp;
4268
4269         img7 = LLVMBuildExtractElement(builder, res,
4270                                        LLVMConstInt(ctx->i32, 7, 0), "");
4271         samp0 = LLVMBuildExtractElement(builder, samp,
4272                                         LLVMConstInt(ctx->i32, 0, 0), "");
4273         samp0 = LLVMBuildAnd(builder, samp0, img7, "");
4274         return LLVMBuildInsertElement(builder, samp, samp0,
4275                                       LLVMConstInt(ctx->i32, 0, 0), "");
4276 }
4277
4278 static void tex_fetch_ptrs(
4279         struct lp_build_tgsi_context *bld_base,
4280         struct lp_build_emit_data *emit_data,
4281         LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr, LLVMValueRef *fmask_ptr)
4282 {
4283         struct si_shader_context *ctx = si_shader_context(bld_base);
4284         const struct tgsi_full_instruction *inst = emit_data->inst;
4285         unsigned target = inst->Texture.Texture;
4286         unsigned sampler_src;
4287         unsigned sampler_index;
4288         LLVMValueRef index;
4289
4290         sampler_src = emit_data->inst->Instruction.NumSrcRegs - 1;
4291         sampler_index = emit_data->inst->Src[sampler_src].Register.Index;
4292
4293         if (emit_data->inst->Src[sampler_src].Register.Indirect) {
4294                 const struct tgsi_full_src_register *reg = &emit_data->inst->Src[sampler_src];
4295
4296                 index = get_bounded_indirect_index(ctx,
4297                                                    &reg->Indirect,
4298                                                    reg->Register.Index,
4299                                                    SI_NUM_SAMPLERS);
4300         } else {
4301                 index = LLVMConstInt(ctx->i32, sampler_index, 0);
4302         }
4303
4304         if (target == TGSI_TEXTURE_BUFFER)
4305                 *res_ptr = load_sampler_desc(ctx, index, DESC_BUFFER);
4306         else
4307                 *res_ptr = load_sampler_desc(ctx, index, DESC_IMAGE);
4308
4309         if (samp_ptr)
4310                 *samp_ptr = NULL;
4311         if (fmask_ptr)
4312                 *fmask_ptr = NULL;
4313
4314         if (target == TGSI_TEXTURE_2D_MSAA ||
4315             target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
4316                 if (fmask_ptr)
4317                         *fmask_ptr = load_sampler_desc(ctx, index, DESC_FMASK);
4318         } else if (target != TGSI_TEXTURE_BUFFER) {
4319                 if (samp_ptr) {
4320                         *samp_ptr = load_sampler_desc(ctx, index, DESC_SAMPLER);
4321                         *samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr);
4322                 }
4323         }
4324 }
4325
4326 static void txq_fetch_args(
4327         struct lp_build_tgsi_context *bld_base,
4328         struct lp_build_emit_data *emit_data)
4329 {
4330         struct si_shader_context *ctx = si_shader_context(bld_base);
4331         const struct tgsi_full_instruction *inst = emit_data->inst;
4332         unsigned target = inst->Texture.Texture;
4333         LLVMValueRef res_ptr;
4334         LLVMValueRef address;
4335
4336         tex_fetch_ptrs(bld_base, emit_data, &res_ptr, NULL, NULL);
4337
4338         if (target == TGSI_TEXTURE_BUFFER) {
4339                 /* Read the size from the buffer descriptor directly. */
4340                 emit_data->args[0] = get_buffer_size(bld_base, res_ptr);
4341                 return;
4342         }
4343
4344         /* Textures - set the mip level. */
4345         address = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_X);
4346
4347         set_tex_fetch_args(ctx, emit_data, TGSI_OPCODE_TXQ, target, res_ptr,
4348                            NULL, &address, 1, 0xf);
4349 }
4350
4351 static void txq_emit(const struct lp_build_tgsi_action *action,
4352                      struct lp_build_tgsi_context *bld_base,
4353                      struct lp_build_emit_data *emit_data)
4354 {
4355         struct lp_build_context *base = &bld_base->base;
4356         unsigned target = emit_data->inst->Texture.Texture;
4357
4358         if (target == TGSI_TEXTURE_BUFFER) {
4359                 /* Just return the buffer size. */
4360                 emit_data->output[emit_data->chan] = emit_data->args[0];
4361                 return;
4362         }
4363
4364         emit_data->output[emit_data->chan] = lp_build_intrinsic(
4365                 base->gallivm->builder, "llvm.SI.getresinfo.i32",
4366                 emit_data->dst_type, emit_data->args, emit_data->arg_count,
4367                 LP_FUNC_ATTR_READNONE);
4368
4369         /* Divide the number of layers by 6 to get the number of cubes. */
4370         if (target == TGSI_TEXTURE_CUBE_ARRAY ||
4371             target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
4372                 LLVMBuilderRef builder = bld_base->base.gallivm->builder;
4373                 LLVMValueRef two = lp_build_const_int32(bld_base->base.gallivm, 2);
4374                 LLVMValueRef six = lp_build_const_int32(bld_base->base.gallivm, 6);
4375
4376                 LLVMValueRef v4 = emit_data->output[emit_data->chan];
4377                 LLVMValueRef z = LLVMBuildExtractElement(builder, v4, two, "");
4378                 z = LLVMBuildSDiv(builder, z, six, "");
4379
4380                 emit_data->output[emit_data->chan] =
4381                         LLVMBuildInsertElement(builder, v4, z, two, "");
4382         }
4383 }
4384
/**
 * Build the argument list for a texture sampling/fetch instruction.
 *
 * The resource, sampler and FMASK descriptors are loaded through
 * tex_fetch_ptrs(). Buffer textures take a short path that only sets three
 * arguments. For every other target the address dwords are collected in
 * this order: packed offsets, LOD bias, depth-comparison value, user
 * derivatives, coordinates, then LOD or sample index. After that the MSAA
 * sample index is remapped through FMASK, TXF offsets are added to the
 * integer coordinates, and TG4 selects its component through dmask, before
 * everything is handed to set_tex_fetch_args().
 */
static void tex_fetch_args(
        struct lp_build_tgsi_context *bld_base,
        struct lp_build_emit_data *emit_data)
{
        struct si_shader_context *ctx = si_shader_context(bld_base);
        struct gallivm_state *gallivm = bld_base->base.gallivm;
        const struct tgsi_full_instruction *inst = emit_data->inst;
        unsigned opcode = inst->Instruction.Opcode;
        unsigned target = inst->Texture.Texture;
        LLVMValueRef coords[5], derivs[6];
        LLVMValueRef address[16];
        unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
        int ref_pos = tgsi_util_get_shadow_ref_src_index(target);
        unsigned count = 0;
        unsigned chan;
        unsigned num_deriv_channels = 0;
        bool has_offset = inst->Texture.NumOffsets > 0;
        LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
        unsigned dmask = 0xf;

        tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);

        /* Buffer textures: the arguments are the descriptor (as v16i8),
         * a zero, and src0.x.
         */
        if (target == TGSI_TEXTURE_BUFFER) {
                emit_data->dst_type = ctx->v4f32;
                emit_data->args[0] = LLVMBuildBitCast(gallivm->builder, res_ptr,
                                                      ctx->v16i8, "");
                emit_data->args[1] = bld_base->uint_bld.zero;
                emit_data->args[2] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
                emit_data->arg_count = 3;
                return;
        }

        /* Fetch and project texture coordinates */
        coords[3] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_W);
        for (chan = 0; chan < 3; chan++ ) {
                coords[chan] = lp_build_emit_fetch(bld_base,
                                                   emit_data->inst, 0,
                                                   chan);
                if (opcode == TGSI_OPCODE_TXP)
                        coords[chan] = lp_build_emit_llvm_binary(bld_base,
                                                                 TGSI_OPCODE_DIV,
                                                                 coords[chan],
                                                                 coords[3]);
        }

        /* X, Y and Z were divided by W above, so W itself becomes 1. */
        if (opcode == TGSI_OPCODE_TXP)
                coords[3] = bld_base->base.one;

        /* Pack offsets. */
        if (has_offset && opcode != TGSI_OPCODE_TXF) {
                /* The offsets are six-bit signed integers packed like this:
                 *   X=[5:0], Y=[13:8], and Z=[21:16].
                 */
                LLVMValueRef offset[3], pack;

                assert(inst->Texture.NumOffsets == 1);

                for (chan = 0; chan < 3; chan++) {
                        offset[chan] = lp_build_emit_fetch_texoffset(bld_base,
                                                                     emit_data->inst, 0, chan);
                        offset[chan] = LLVMBuildAnd(gallivm->builder, offset[chan],
                                                    lp_build_const_int32(gallivm, 0x3f), "");
                        if (chan)
                                offset[chan] = LLVMBuildShl(gallivm->builder, offset[chan],
                                                            lp_build_const_int32(gallivm, chan*8), "");
                }

                pack = LLVMBuildOr(gallivm->builder, offset[0], offset[1], "");
                pack = LLVMBuildOr(gallivm->builder, pack, offset[2], "");
                address[count++] = pack;
        }

        /* Pack LOD bias value */
        if (opcode == TGSI_OPCODE_TXB)
                address[count++] = coords[3];
        if (opcode == TGSI_OPCODE_TXB2)
                address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);

        /* Pack depth comparison value */
        if (tgsi_is_shadow_target(target) && opcode != TGSI_OPCODE_LODQ) {
                LLVMValueRef z;

                if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
                        z = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
                } else {
                        assert(ref_pos >= 0);
                        z = coords[ref_pos];
                }

                /* TC-compatible HTILE promotes Z16 and Z24 to Z32_FLOAT,
                 * so the depth comparison value isn't clamped for Z16 and
                 * Z24 anymore. Do it manually here.
                 *
                 * It's unnecessary if the original texture format was
                 * Z32_FLOAT, but we don't know that here.
                 */
                if (ctx->screen->b.chip_class == VI)
                        z = si_llvm_saturate(bld_base, z);

                address[count++] = z;
        }

        /* Pack user derivatives */
        if (opcode == TGSI_OPCODE_TXD) {
                int param, num_src_deriv_channels;

                /* num_src_deriv_channels: channels fetched per derivative;
                 * num_deriv_channels: channels per derivative that end up
                 * in the address.
                 */
                switch (target) {
                case TGSI_TEXTURE_3D:
                        num_src_deriv_channels = 3;
                        num_deriv_channels = 3;
                        break;
                case TGSI_TEXTURE_2D:
                case TGSI_TEXTURE_SHADOW2D:
                case TGSI_TEXTURE_RECT:
                case TGSI_TEXTURE_SHADOWRECT:
                case TGSI_TEXTURE_2D_ARRAY:
                case TGSI_TEXTURE_SHADOW2D_ARRAY:
                        num_src_deriv_channels = 2;
                        num_deriv_channels = 2;
                        break;
                case TGSI_TEXTURE_CUBE:
                case TGSI_TEXTURE_SHADOWCUBE:
                case TGSI_TEXTURE_CUBE_ARRAY:
                case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
                        /* Cube derivatives will be converted to 2D. */
                        num_src_deriv_channels = 3;
                        num_deriv_channels = 2;
                        break;
                case TGSI_TEXTURE_1D:
                case TGSI_TEXTURE_SHADOW1D:
                case TGSI_TEXTURE_1D_ARRAY:
                case TGSI_TEXTURE_SHADOW1D_ARRAY:
                        num_src_deriv_channels = 1;
                        num_deriv_channels = 1;
                        break;
                default:
                        unreachable("invalid target");
                }

                /* The derivatives come from source operands 1 and 2. */
                for (param = 0; param < 2; param++)
                        for (chan = 0; chan < num_src_deriv_channels; chan++)
                                derivs[param * num_src_deriv_channels + chan] =
                                        lp_build_emit_fetch(bld_base, inst, param+1, chan);
        }

        if (target == TGSI_TEXTURE_CUBE ||
            target == TGSI_TEXTURE_CUBE_ARRAY ||
            target == TGSI_TEXTURE_SHADOWCUBE ||
            target == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
                ac_prepare_cube_coords(&ctx->ac,
                                       opcode == TGSI_OPCODE_TXD,
                                       target == TGSI_TEXTURE_CUBE_ARRAY ||
                                       target == TGSI_TEXTURE_SHADOWCUBE_ARRAY,
                                       coords, derivs);

        if (opcode == TGSI_OPCODE_TXD)
                for (int i = 0; i < num_deriv_channels * 2; i++)
                        address[count++] = derivs[i];

        /* Pack texture coordinates */
        address[count++] = coords[0];
        if (num_coords > 1)
                address[count++] = coords[1];
        if (num_coords > 2)
                address[count++] = coords[2];

        /* Pack LOD or sample index */
        if (opcode == TGSI_OPCODE_TXL || opcode == TGSI_OPCODE_TXF)
                address[count++] = coords[3];
        else if (opcode == TGSI_OPCODE_TXL2)
                address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);

        /* NOTE(review): this check runs after all writes to address[] above,
         * so it cannot prevent an overflow of the 16-entry array; it only
         * limits how many entries are used from here on.
         */
        if (count > 16) {
                assert(!"Cannot handle more than 16 texture address parameters");
                count = 16;
        }

        /* All address components are passed as i32. */
        for (chan = 0; chan < count; chan++ ) {
                address[chan] = LLVMBuildBitCast(gallivm->builder,
                                                 address[chan], ctx->i32, "");
        }

        /* Adjust the sample index according to FMASK.
         *
         * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
         * which is the identity mapping. Each nibble says which physical sample
         * should be fetched to get that sample.
         *
         * For example, 0x11111100 means there are only 2 samples stored and
         * the second sample covers 3/4 of the pixel. When reading samples 0
         * and 1, return physical sample 0 (determined by the first two 0s
         * in FMASK), otherwise return physical sample 1.
         *
         * The sample index should be adjusted as follows:
         *   sample_index = (fmask >> (sample_index * 4)) & 0xF;
         */
        if (target == TGSI_TEXTURE_2D_MSAA ||
            target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
                struct lp_build_context *uint_bld = &bld_base->uint_bld;
                struct lp_build_emit_data txf_emit_data = *emit_data;
                LLVMValueRef txf_address[4];
                unsigned txf_count = count;
                /* NOTE: shadows the function-level 'inst' inside this block. */
                struct tgsi_full_instruction inst = {};

                /* Start from the real address, then zero the sample-index
                 * slot (element 2 for non-array MSAA, element 3 otherwise).
                 */
                memcpy(txf_address, address, sizeof(txf_address));

                if (target == TGSI_TEXTURE_2D_MSAA) {
                        txf_address[2] = bld_base->uint_bld.zero;
                }
                txf_address[3] = bld_base->uint_bld.zero;

                /* Read FMASK using TXF. */
                inst.Instruction.Opcode = TGSI_OPCODE_TXF;
                inst.Texture.Texture = target;
                txf_emit_data.inst = &inst;
                txf_emit_data.chan = 0;
                set_tex_fetch_args(ctx, &txf_emit_data, TGSI_OPCODE_TXF,
                                   target, fmask_ptr, NULL,
                                   txf_address, txf_count, 0xf);
                build_tex_intrinsic(&tex_action, bld_base, &txf_emit_data);

                /* Initialize some constants. */
                LLVMValueRef four = LLVMConstInt(ctx->i32, 4, 0);
                LLVMValueRef F = LLVMConstInt(ctx->i32, 0xF, 0);

                /* Apply the formula. */
                LLVMValueRef fmask =
                        LLVMBuildExtractElement(gallivm->builder,
                                                txf_emit_data.output[0],
                                                uint_bld->zero, "");

                unsigned sample_chan = target == TGSI_TEXTURE_2D_MSAA ? 2 : 3;

                LLVMValueRef sample_index4 =
                        LLVMBuildMul(gallivm->builder, address[sample_chan], four, "");

                LLVMValueRef shifted_fmask =
                        LLVMBuildLShr(gallivm->builder, fmask, sample_index4, "");

                LLVMValueRef final_sample =
                        LLVMBuildAnd(gallivm->builder, shifted_fmask, F, "");

                /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
                 * resource descriptor is 0 (invalid).
                 */
                LLVMValueRef fmask_desc =
                        LLVMBuildBitCast(gallivm->builder, fmask_ptr,
                                         ctx->v8i32, "");

                LLVMValueRef fmask_word1 =
                        LLVMBuildExtractElement(gallivm->builder, fmask_desc,
                                                uint_bld->one, "");

                LLVMValueRef word1_is_nonzero =
                        LLVMBuildICmp(gallivm->builder, LLVMIntNE,
                                      fmask_word1, uint_bld->zero, "");

                /* Replace the MSAA sample index. */
                address[sample_chan] =
                        LLVMBuildSelect(gallivm->builder, word1_is_nonzero,
                                        final_sample, address[sample_chan], "");
        }

        if (opcode == TGSI_OPCODE_TXF) {
                /* add tex offsets */
                if (inst->Texture.NumOffsets) {
                        struct lp_build_context *uint_bld = &bld_base->uint_bld;
                        const struct tgsi_texture_offset *off = inst->TexOffsets;

                        assert(inst->Texture.NumOffsets == 1);

                        /* The cases fall through so that a target with N
                         * offsettable coordinates updates address[N-1..0].
                         */
                        switch (target) {
                        case TGSI_TEXTURE_3D:
                                address[2] = lp_build_add(uint_bld, address[2],
                                                ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleZ]);
                                /* fall through */
                        case TGSI_TEXTURE_2D:
                        case TGSI_TEXTURE_SHADOW2D:
                        case TGSI_TEXTURE_RECT:
                        case TGSI_TEXTURE_SHADOWRECT:
                        case TGSI_TEXTURE_2D_ARRAY:
                        case TGSI_TEXTURE_SHADOW2D_ARRAY:
                                address[1] =
                                        lp_build_add(uint_bld, address[1],
                                                ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleY]);
                                /* fall through */
                        case TGSI_TEXTURE_1D:
                        case TGSI_TEXTURE_SHADOW1D:
                        case TGSI_TEXTURE_1D_ARRAY:
                        case TGSI_TEXTURE_SHADOW1D_ARRAY:
                                address[0] =
                                        lp_build_add(uint_bld, address[0],
                                                ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleX]);
                                break;
                                /* texture offsets do not apply to other texture targets */
                        }
                }
        }

        if (opcode == TGSI_OPCODE_TG4) {
                unsigned gather_comp = 0;

                /* DMASK was repurposed for GATHER4. 4 components are always
                 * returned and DMASK works like a swizzle - it selects
                 * the component to fetch. The only valid DMASK values are
                 * 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
                 * (red,red,red,red) etc.) The ISA document doesn't mention
                 * this.
                 */

                /* Get the component index from src1.x for Gather4. */
                if (!tgsi_is_shadow_target(target)) {
                        LLVMValueRef comp_imm;
                        struct tgsi_src_register src1 = inst->Src[1].Register;

                        assert(src1.File == TGSI_FILE_IMMEDIATE);

                        comp_imm = ctx->imms[src1.Index * TGSI_NUM_CHANNELS + src1.SwizzleX];
                        gather_comp = LLVMConstIntGetZExtValue(comp_imm);
                        gather_comp = CLAMP(gather_comp, 0, 3);
                }

                dmask = 1 << gather_comp;
        }

        set_tex_fetch_args(ctx, emit_data, opcode, target, res_ptr,
                           samp_ptr, address, count, dmask);
}
4713
4714 /* Gather4 should follow the same rules as bilinear filtering, but the hardware
4715  * incorrectly forces nearest filtering if the texture format is integer.
4716  * The only effect it has on Gather4, which always returns 4 texels for
4717  * bilinear filtering, is that the final coordinates are off by 0.5 of
4718  * the texel size.
4719  *
4720  * The workaround is to subtract 0.5 from the unnormalized coordinates,
4721  * or (0.5 / size) from the normalized coordinates.
4722  */
4723 static void si_lower_gather4_integer(struct si_shader_context *ctx,
4724                                      struct lp_build_emit_data *emit_data,
4725                                      const char *intr_name,
4726                                      unsigned coord_vgpr_index)
4727 {
4728         LLVMBuilderRef builder = ctx->gallivm.builder;
4729         LLVMValueRef coord = emit_data->args[0];
4730         LLVMValueRef half_texel[2];
4731         int c;
4732
4733         if (emit_data->inst->Texture.Texture == TGSI_TEXTURE_RECT ||
4734             emit_data->inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT) {
4735                 half_texel[0] = half_texel[1] = LLVMConstReal(ctx->f32, -0.5);
4736         } else {
4737                 struct tgsi_full_instruction txq_inst = {};
4738                 struct lp_build_emit_data txq_emit_data = {};
4739
4740                 /* Query the texture size. */
4741                 txq_inst.Texture.Texture = emit_data->inst->Texture.Texture;
4742                 txq_emit_data.inst = &txq_inst;
4743                 txq_emit_data.dst_type = ctx->v4i32;
4744                 set_tex_fetch_args(ctx, &txq_emit_data, TGSI_OPCODE_TXQ,
4745                                    txq_inst.Texture.Texture,
4746                                    emit_data->args[1], NULL,
4747                                    &ctx->bld_base.uint_bld.zero,
4748                                    1, 0xf);
4749                 txq_emit(NULL, &ctx->bld_base, &txq_emit_data);
4750
4751                 /* Compute -0.5 / size. */
4752                 for (c = 0; c < 2; c++) {
4753                         half_texel[c] =
4754                                 LLVMBuildExtractElement(builder, txq_emit_data.output[0],
4755                                                         LLVMConstInt(ctx->i32, c, 0), "");
4756                         half_texel[c] = LLVMBuildUIToFP(builder, half_texel[c], ctx->f32, "");
4757                         half_texel[c] =
4758                                 lp_build_emit_llvm_unary(&ctx->bld_base,
4759                                                          TGSI_OPCODE_RCP, half_texel[c]);
4760                         half_texel[c] = LLVMBuildFMul(builder, half_texel[c],
4761                                                       LLVMConstReal(ctx->f32, -0.5), "");
4762                 }
4763         }
4764
4765         for (c = 0; c < 2; c++) {
4766                 LLVMValueRef tmp;
4767                 LLVMValueRef index = LLVMConstInt(ctx->i32, coord_vgpr_index + c, 0);
4768
4769                 tmp = LLVMBuildExtractElement(builder, coord, index, "");
4770                 tmp = LLVMBuildBitCast(builder, tmp, ctx->f32, "");
4771                 tmp = LLVMBuildFAdd(builder, tmp, half_texel[c], "");
4772                 tmp = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
4773                 coord = LLVMBuildInsertElement(builder, coord, tmp, index, "");
4774         }
4775
4776         emit_data->args[0] = coord;
4777         emit_data->output[emit_data->chan] =
4778                 lp_build_intrinsic(builder, intr_name, emit_data->dst_type,
4779                                    emit_data->args, emit_data->arg_count,
4780                                    LP_FUNC_ATTR_READNONE);
4781 }
4782
4783 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
4784                                 struct lp_build_tgsi_context *bld_base,
4785                                 struct lp_build_emit_data *emit_data)
4786 {
4787         struct si_shader_context *ctx = si_shader_context(bld_base);
4788         struct lp_build_context *base = &bld_base->base;
4789         const struct tgsi_full_instruction *inst = emit_data->inst;
4790         unsigned opcode = inst->Instruction.Opcode;
4791         unsigned target = inst->Texture.Texture;
4792         char intr_name[127];
4793         bool has_offset = inst->Texture.NumOffsets > 0;
4794         bool is_shadow = tgsi_is_shadow_target(target);
4795         char type[64];
4796         const char *name = "llvm.SI.image.sample";
4797         const char *infix = "";
4798
4799         if (target == TGSI_TEXTURE_BUFFER) {
4800                 emit_data->output[emit_data->chan] = lp_build_intrinsic(
4801                         base->gallivm->builder,
4802                         "llvm.SI.vs.load.input", emit_data->dst_type,
4803                         emit_data->args, emit_data->arg_count,
4804                         LP_FUNC_ATTR_READNONE);
4805                 return;
4806         }
4807
4808         switch (opcode) {
4809         case TGSI_OPCODE_TXF:
4810                 name = target == TGSI_TEXTURE_2D_MSAA ||
4811                        target == TGSI_TEXTURE_2D_ARRAY_MSAA ?
4812                                "llvm.SI.image.load" :
4813                                "llvm.SI.image.load.mip";
4814                 is_shadow = false;
4815                 has_offset = false;
4816                 break;
4817         case TGSI_OPCODE_LODQ:
4818                 name = "llvm.SI.getlod";
4819                 is_shadow = false;
4820                 has_offset = false;
4821                 break;
4822         case TGSI_OPCODE_TEX:
4823         case TGSI_OPCODE_TEX2:
4824         case TGSI_OPCODE_TXP:
4825                 if (ctx->type != PIPE_SHADER_FRAGMENT)
4826                         infix = ".lz";
4827                 break;
4828         case TGSI_OPCODE_TXB:
4829         case TGSI_OPCODE_TXB2:
4830                 assert(ctx->type == PIPE_SHADER_FRAGMENT);
4831                 infix = ".b";
4832                 break;
4833         case TGSI_OPCODE_TXL:
4834         case TGSI_OPCODE_TXL2:
4835                 infix = ".l";
4836                 break;
4837         case TGSI_OPCODE_TXD:
4838                 infix = ".d";
4839                 break;
4840         case TGSI_OPCODE_TG4:
4841                 name = "llvm.SI.gather4";
4842                 infix = ".lz";
4843                 break;
4844         default:
4845                 assert(0);
4846                 return;
4847         }
4848
4849         /* Add the type and suffixes .c, .o if needed. */
4850         build_type_name_for_intr(LLVMTypeOf(emit_data->args[0]), type, sizeof(type));
4851         sprintf(intr_name, "%s%s%s%s.%s",
4852                 name, is_shadow ? ".c" : "", infix,
4853                 has_offset ? ".o" : "", type);
4854
4855         /* The hardware needs special lowering for Gather4 with integer formats. */
4856         if (opcode == TGSI_OPCODE_TG4) {
4857                 struct tgsi_shader_info *info = &ctx->shader->selector->info;
4858                 /* This will also work with non-constant indexing because of how
4859                  * glsl_to_tgsi works and we intent to preserve that behavior.
4860                  */
4861                 const unsigned src_idx = 2;
4862                 unsigned sampler = inst->Src[src_idx].Register.Index;
4863
4864                 assert(inst->Src[src_idx].Register.File == TGSI_FILE_SAMPLER);
4865
4866                 if (info->sampler_type[sampler] == TGSI_RETURN_TYPE_SINT ||
4867                     info->sampler_type[sampler] == TGSI_RETURN_TYPE_UINT) {
4868                         /* Texture coordinates start after:
4869                          *   {offset, bias, z-compare, derivatives}
4870                          * Only the offset and z-compare can occur here.
4871                          */
4872                         si_lower_gather4_integer(ctx, emit_data, intr_name,
4873                                                  (int)has_offset + (int)is_shadow);
4874                         return;
4875                 }
4876         }
4877
4878         emit_data->output[emit_data->chan] = lp_build_intrinsic(
4879                 base->gallivm->builder, intr_name, emit_data->dst_type,
4880                 emit_data->args, emit_data->arg_count,
4881                 LP_FUNC_ATTR_READNONE);
4882 }
4883
4884 static void si_llvm_emit_txqs(
4885         const struct lp_build_tgsi_action *action,
4886         struct lp_build_tgsi_context *bld_base,
4887         struct lp_build_emit_data *emit_data)
4888 {
4889         struct si_shader_context *ctx = si_shader_context(bld_base);
4890         struct gallivm_state *gallivm = bld_base->base.gallivm;
4891         LLVMBuilderRef builder = gallivm->builder;
4892         LLVMValueRef res, samples;
4893         LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
4894
4895         tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
4896
4897
4898         /* Read the samples from the descriptor directly. */
4899         res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, "");
4900         samples = LLVMBuildExtractElement(
4901                 builder, res,
4902                 lp_build_const_int32(gallivm, 3), "");
4903         samples = LLVMBuildLShr(builder, samples,
4904                                 lp_build_const_int32(gallivm, 16), "");
4905         samples = LLVMBuildAnd(builder, samples,
4906                                lp_build_const_int32(gallivm, 0xf), "");
4907         samples = LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1),
4908                                samples, "");
4909
4910         emit_data->output[emit_data->chan] = samples;
4911 }
4912
4913 /*
4914  * SI implements derivatives using the local data store (LDS)
4915  * All writes to the LDS happen in all executing threads at
4916  * the same time. TID is the Thread ID for the current
4917  * thread and is a value between 0 and 63, representing
4918  * the thread's position in the wavefront.
4919  *
4920  * For the pixel shader threads are grouped into quads of four pixels.
4921  * The TIDs of the pixels of a quad are:
4922  *
4923  *  +------+------+
4924  *  |4n + 0|4n + 1|
4925  *  +------+------+
4926  *  |4n + 2|4n + 3|
4927  *  +------+------+
4928  *
4929  * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
4930  * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
4931  * the current pixel's column, and masking with 0xfffffffe yields the TID
4932  * of the left pixel of the current pixel's row.
4933  *
4934  * Adding 1 yields the TID of the pixel to the right of the left pixel, and
4935  * adding 2 yields the TID of the pixel below the top pixel.
4936  */
4937 /* masks for thread ID. */
4938 #define TID_MASK_TOP_LEFT 0xfffffffc
4939 #define TID_MASK_TOP      0xfffffffd
4940 #define TID_MASK_LEFT     0xfffffffe
4941
4942 static void si_llvm_emit_ddxy(
4943         const struct lp_build_tgsi_action *action,
4944         struct lp_build_tgsi_context *bld_base,
4945         struct lp_build_emit_data *emit_data)
4946 {
4947         struct si_shader_context *ctx = si_shader_context(bld_base);
4948         struct gallivm_state *gallivm = bld_base->base.gallivm;
4949         unsigned opcode = emit_data->info->opcode;
4950         LLVMValueRef thread_id, tl, trbl, tl_tid, trbl_tid, val, args[2];
4951         int idx;
4952         unsigned mask;
4953
4954         thread_id = get_thread_id(ctx);
4955
4956         if (opcode == TGSI_OPCODE_DDX_FINE)
4957                 mask = TID_MASK_LEFT;
4958         else if (opcode == TGSI_OPCODE_DDY_FINE)
4959                 mask = TID_MASK_TOP;
4960         else
4961                 mask = TID_MASK_TOP_LEFT;
4962
4963         tl_tid = LLVMBuildAnd(gallivm->builder, thread_id,
4964                                 lp_build_const_int32(gallivm, mask), "");
4965
4966         /* for DDX we want to next X pixel, DDY next Y pixel. */
4967         idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;
4968         trbl_tid = LLVMBuildAdd(gallivm->builder, tl_tid,
4969                                   lp_build_const_int32(gallivm, idx), "");
4970
4971         val = LLVMBuildBitCast(gallivm->builder, emit_data->args[0], ctx->i32, "");
4972
4973         if (ctx->screen->has_ds_bpermute) {
4974                 args[0] = LLVMBuildMul(gallivm->builder, tl_tid,
4975                                        lp_build_const_int32(gallivm, 4), "");
4976                 args[1] = val;
4977                 tl = lp_build_intrinsic(gallivm->builder,
4978                                         "llvm.amdgcn.ds.bpermute", ctx->i32,
4979                                         args, 2, LP_FUNC_ATTR_READNONE);
4980
4981                 args[0] = LLVMBuildMul(gallivm->builder, trbl_tid,
4982                                        lp_build_const_int32(gallivm, 4), "");
4983                 trbl = lp_build_intrinsic(gallivm->builder,
4984                                           "llvm.amdgcn.ds.bpermute", ctx->i32,
4985                                           args, 2, LP_FUNC_ATTR_READNONE);
4986         } else {
4987                 LLVMValueRef store_ptr, load_ptr0, load_ptr1;
4988
4989                 store_ptr = ac_build_gep0(&ctx->ac, ctx->lds, thread_id);
4990                 load_ptr0 = ac_build_gep0(&ctx->ac, ctx->lds, tl_tid);
4991                 load_ptr1 = ac_build_gep0(&ctx->ac, ctx->lds, trbl_tid);
4992
4993                 LLVMBuildStore(gallivm->builder, val, store_ptr);
4994                 tl = LLVMBuildLoad(gallivm->builder, load_ptr0, "");
4995                 trbl = LLVMBuildLoad(gallivm->builder, load_ptr1, "");
4996         }
4997
4998         tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");
4999         trbl = LLVMBuildBitCast(gallivm->builder, trbl, ctx->f32, "");
5000
5001         emit_data->output[emit_data->chan] =
5002                 LLVMBuildFSub(gallivm->builder, trbl, tl, "");
5003 }
5004
5005 /*
5006  * this takes an I,J coordinate pair,
5007  * and works out the X and Y derivatives.
5008  * it returns DDX(I), DDX(J), DDY(I), DDY(J).
5009  */
5010 static LLVMValueRef si_llvm_emit_ddxy_interp(
5011         struct lp_build_tgsi_context *bld_base,
5012         LLVMValueRef interp_ij)
5013 {
5014         struct si_shader_context *ctx = si_shader_context(bld_base);
5015         struct gallivm_state *gallivm = bld_base->base.gallivm;
5016         LLVMValueRef result[4], a;
5017         unsigned i;
5018
5019         for (i = 0; i < 2; i++) {
5020                 a = LLVMBuildExtractElement(gallivm->builder, interp_ij,
5021                                             LLVMConstInt(ctx->i32, i, 0), "");
5022                 result[i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDX, a);
5023                 result[2+i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDY, a);
5024         }
5025
5026         return lp_build_gather_values(gallivm, result, 4);
5027 }
5028
5029 static void interp_fetch_args(
5030         struct lp_build_tgsi_context *bld_base,
5031         struct lp_build_emit_data *emit_data)
5032 {
5033         struct si_shader_context *ctx = si_shader_context(bld_base);
5034         struct gallivm_state *gallivm = bld_base->base.gallivm;
5035         const struct tgsi_full_instruction *inst = emit_data->inst;
5036
5037         if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
5038                 /* offset is in second src, first two channels */
5039                 emit_data->args[0] = lp_build_emit_fetch(bld_base,
5040                                                          emit_data->inst, 1,
5041                                                          TGSI_CHAN_X);
5042                 emit_data->args[1] = lp_build_emit_fetch(bld_base,
5043                                                          emit_data->inst, 1,
5044                                                          TGSI_CHAN_Y);
5045                 emit_data->arg_count = 2;
5046         } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
5047                 LLVMValueRef sample_position;
5048                 LLVMValueRef sample_id;
5049                 LLVMValueRef halfval = lp_build_const_float(gallivm, 0.5f);
5050
5051                 /* fetch sample ID, then fetch its sample position,
5052                  * and place into first two channels.
5053                  */
5054                 sample_id = lp_build_emit_fetch(bld_base,
5055                                                 emit_data->inst, 1, TGSI_CHAN_X);
5056                 sample_id = LLVMBuildBitCast(gallivm->builder, sample_id,
5057                                              ctx->i32, "");
5058                 sample_position = load_sample_position(ctx, sample_id);
5059
5060                 emit_data->args[0] = LLVMBuildExtractElement(gallivm->builder,
5061                                                              sample_position,
5062                                                              lp_build_const_int32(gallivm, 0), "");
5063
5064                 emit_data->args[0] = LLVMBuildFSub(gallivm->builder, emit_data->args[0], halfval, "");
5065                 emit_data->args[1] = LLVMBuildExtractElement(gallivm->builder,
5066                                                              sample_position,
5067                                                              lp_build_const_int32(gallivm, 1), "");
5068                 emit_data->args[1] = LLVMBuildFSub(gallivm->builder, emit_data->args[1], halfval, "");
5069                 emit_data->arg_count = 2;
5070         }
5071 }
5072
5073 static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
5074                                 struct lp_build_tgsi_context *bld_base,
5075                                 struct lp_build_emit_data *emit_data)
5076 {
5077         struct si_shader_context *ctx = si_shader_context(bld_base);
5078         struct si_shader *shader = ctx->shader;
5079         struct gallivm_state *gallivm = bld_base->base.gallivm;
5080         struct lp_build_context *uint = &bld_base->uint_bld;
5081         LLVMValueRef interp_param;
5082         const struct tgsi_full_instruction *inst = emit_data->inst;
5083         int input_index = inst->Src[0].Register.Index;
5084         int chan;
5085         int i;
5086         LLVMValueRef attr_number;
5087         LLVMValueRef params = LLVMGetParam(ctx->main_fn, SI_PARAM_PRIM_MASK);
5088         int interp_param_idx;
5089         unsigned interp = shader->selector->info.input_interpolate[input_index];
5090         unsigned location;
5091
5092         assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);
5093
5094         if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
5095             inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE)
5096                 location = TGSI_INTERPOLATE_LOC_CENTER;
5097         else
5098                 location = TGSI_INTERPOLATE_LOC_CENTROID;
5099
5100         interp_param_idx = lookup_interp_param_index(interp, location);
5101         if (interp_param_idx == -1)
5102                 return;
5103         else if (interp_param_idx)
5104                 interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
5105         else
5106                 interp_param = NULL;
5107
5108         attr_number = lp_build_const_int32(gallivm, input_index);
5109
5110         if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
5111             inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
5112                 LLVMValueRef ij_out[2];
5113                 LLVMValueRef ddxy_out = si_llvm_emit_ddxy_interp(bld_base, interp_param);
5114
5115                 /*
5116                  * take the I then J parameters, and the DDX/Y for it, and
5117                  * calculate the IJ inputs for the interpolator.
5118                  * temp1 = ddx * offset/sample.x + I;
5119                  * interp_param.I = ddy * offset/sample.y + temp1;
5120                  * temp1 = ddx * offset/sample.x + J;
5121                  * interp_param.J = ddy * offset/sample.y + temp1;
5122                  */
5123                 for (i = 0; i < 2; i++) {
5124                         LLVMValueRef ix_ll = lp_build_const_int32(gallivm, i);
5125                         LLVMValueRef iy_ll = lp_build_const_int32(gallivm, i + 2);
5126                         LLVMValueRef ddx_el = LLVMBuildExtractElement(gallivm->builder,
5127                                                                       ddxy_out, ix_ll, "");
5128                         LLVMValueRef ddy_el = LLVMBuildExtractElement(gallivm->builder,
5129                                                                       ddxy_out, iy_ll, "");
5130                         LLVMValueRef interp_el = LLVMBuildExtractElement(gallivm->builder,
5131                                                                          interp_param, ix_ll, "");
5132                         LLVMValueRef temp1, temp2;
5133
5134                         interp_el = LLVMBuildBitCast(gallivm->builder, interp_el,
5135                                                      ctx->f32, "");
5136
5137                         temp1 = LLVMBuildFMul(gallivm->builder, ddx_el, emit_data->args[0], "");
5138
5139                         temp1 = LLVMBuildFAdd(gallivm->builder, temp1, interp_el, "");
5140
5141                         temp2 = LLVMBuildFMul(gallivm->builder, ddy_el, emit_data->args[1], "");
5142
5143                         ij_out[i] = LLVMBuildFAdd(gallivm->builder, temp2, temp1, "");
5144                 }
5145                 interp_param = lp_build_gather_values(bld_base->base.gallivm, ij_out, 2);
5146         }
5147
5148         for (chan = 0; chan < 4; chan++) {
5149                 LLVMValueRef llvm_chan;
5150                 unsigned schan;
5151
5152                 schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan);
5153                 llvm_chan = lp_build_const_int32(gallivm, schan);
5154
5155                 if (interp_param) {
5156                         interp_param = LLVMBuildBitCast(gallivm->builder,
5157                                 interp_param, LLVMVectorType(ctx->f32, 2), "");
5158                         LLVMValueRef i = LLVMBuildExtractElement(
5159                                 gallivm->builder, interp_param, uint->zero, "");
5160                         LLVMValueRef j = LLVMBuildExtractElement(
5161                                 gallivm->builder, interp_param, uint->one, "");
5162                         emit_data->output[chan] = ac_build_fs_interp(&ctx->ac,
5163                                 llvm_chan, attr_number, params,
5164                                 i, j);
5165                 } else {
5166                         emit_data->output[chan] = ac_build_fs_interp_mov(&ctx->ac,
5167                                 lp_build_const_int32(gallivm, 2), /* P0 */
5168                                 llvm_chan, attr_number, params);
5169                 }
5170         }
5171 }
5172
5173 static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base,
5174                                        struct lp_build_emit_data *emit_data)
5175 {
5176         struct si_shader_context *ctx = si_shader_context(bld_base);
5177         struct tgsi_src_register src0 = emit_data->inst->Src[0].Register;
5178         LLVMValueRef imm;
5179         unsigned stream;
5180
5181         assert(src0.File == TGSI_FILE_IMMEDIATE);
5182
5183         imm = ctx->imms[src0.Index * TGSI_NUM_CHANNELS + src0.SwizzleX];
5184         stream = LLVMConstIntGetZExtValue(imm) & 0x3;
5185         return stream;
5186 }
5187
5188 /* Emit one vertex from the geometry shader */
5189 static void si_llvm_emit_vertex(
5190         const struct lp_build_tgsi_action *action,
5191         struct lp_build_tgsi_context *bld_base,
5192         struct lp_build_emit_data *emit_data)
5193 {
5194         struct si_shader_context *ctx = si_shader_context(bld_base);
5195         struct lp_build_context *uint = &bld_base->uint_bld;
5196         struct si_shader *shader = ctx->shader;
5197         struct tgsi_shader_info *info = &shader->selector->info;
5198         struct gallivm_state *gallivm = bld_base->base.gallivm;
5199         struct lp_build_if_state if_state;
5200         LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
5201                                             SI_PARAM_GS2VS_OFFSET);
5202         LLVMValueRef gs_next_vertex;
5203         LLVMValueRef can_emit, kill;
5204         LLVMValueRef args[2];
5205         unsigned chan, offset;
5206         int i;
5207         unsigned stream;
5208
5209         stream = si_llvm_get_stream(bld_base, emit_data);
5210
5211         /* Write vertex attribute values to GSVS ring */
5212         gs_next_vertex = LLVMBuildLoad(gallivm->builder,
5213                                        ctx->gs_next_vertex[stream],
5214                                        "");
5215
5216         /* If this thread has already emitted the declared maximum number of
5217          * vertices, skip the write: excessive vertex emissions are not
5218          * supposed to have any effect.
5219          *
5220          * If the shader has no writes to memory, kill it instead. This skips
5221          * further memory loads and may allow LLVM to skip to the end
5222          * altogether.
5223          */
5224         can_emit = LLVMBuildICmp(gallivm->builder, LLVMIntULT, gs_next_vertex,
5225                                  lp_build_const_int32(gallivm,
5226                                                       shader->selector->gs_max_out_vertices), "");
5227
5228         bool use_kill = !info->writes_memory;
5229         if (use_kill) {
5230                 kill = lp_build_select(&bld_base->base, can_emit,
5231                                        lp_build_const_float(gallivm, 1.0f),
5232                                        lp_build_const_float(gallivm, -1.0f));
5233
5234                 lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
5235                                    ctx->voidt, &kill, 1, 0);
5236         } else {
5237                 lp_build_if(&if_state, gallivm, can_emit);
5238         }
5239
5240         offset = 0;
5241         for (i = 0; i < info->num_outputs; i++) {
5242                 LLVMValueRef *out_ptr = ctx->outputs[i];
5243
5244                 for (chan = 0; chan < 4; chan++) {
5245                         if (!(info->output_usagemask[i] & (1 << chan)) ||
5246                             ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
5247                                 continue;
5248
5249                         LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
5250                         LLVMValueRef voffset =
5251                                 lp_build_const_int32(gallivm, offset *
5252                                                      shader->selector->gs_max_out_vertices);
5253                         offset++;
5254
5255                         voffset = lp_build_add(uint, voffset, gs_next_vertex);
5256                         voffset = lp_build_mul_imm(uint, voffset, 4);
5257
5258                         out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");
5259
5260                         build_tbuffer_store(ctx,
5261                                             ctx->gsvs_ring[stream],
5262                                             out_val, 1,
5263                                             voffset, soffset, 0,
5264                                             V_008F0C_BUF_DATA_FORMAT_32,
5265                                             V_008F0C_BUF_NUM_FORMAT_UINT,
5266                                             1, 0, 1, 1, 0);
5267                 }
5268         }
5269
5270         gs_next_vertex = lp_build_add(uint, gs_next_vertex,
5271                                       lp_build_const_int32(gallivm, 1));
5272
5273         LLVMBuildStore(gallivm->builder, gs_next_vertex, ctx->gs_next_vertex[stream]);
5274
5275         /* Signal vertex emission */
5276         args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_EMIT | SENDMSG_GS | (stream << 8));
5277         args[1] = LLVMGetParam(ctx->main_fn, SI_PARAM_GS_WAVE_ID);
5278         lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
5279                            ctx->voidt, args, 2, 0);
5280
5281         if (!use_kill)
5282                 lp_build_endif(&if_state);
5283 }
5284
5285 /* Cut one primitive from the geometry shader */
5286 static void si_llvm_emit_primitive(
5287         const struct lp_build_tgsi_action *action,
5288         struct lp_build_tgsi_context *bld_base,
5289         struct lp_build_emit_data *emit_data)
5290 {
5291         struct si_shader_context *ctx = si_shader_context(bld_base);
5292         struct gallivm_state *gallivm = bld_base->base.gallivm;
5293         LLVMValueRef args[2];
5294         unsigned stream;
5295
5296         /* Signal primitive cut */
5297         stream = si_llvm_get_stream(bld_base, emit_data);
5298         args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_CUT | SENDMSG_GS | (stream << 8));
5299         args[1] = LLVMGetParam(ctx->main_fn, SI_PARAM_GS_WAVE_ID);
5300         lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
5301                            ctx->voidt, args, 2, 0);
5302 }
5303
5304 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
5305                                  struct lp_build_tgsi_context *bld_base,
5306                                  struct lp_build_emit_data *emit_data)
5307 {
5308         struct si_shader_context *ctx = si_shader_context(bld_base);
5309         struct gallivm_state *gallivm = bld_base->base.gallivm;
5310
5311         /* SI only (thanks to a hw bug workaround):
5312          * The real barrier instruction isn’t needed, because an entire patch
5313          * always fits into a single wave.
5314          */
5315         if (HAVE_LLVM >= 0x0309 &&
5316             ctx->screen->b.chip_class == SI &&
5317             ctx->type == PIPE_SHADER_TESS_CTRL) {
5318                 emit_waitcnt(ctx, LGKM_CNT & VM_CNT);
5319                 return;
5320         }
5321
5322         lp_build_intrinsic(gallivm->builder,
5323                            HAVE_LLVM >= 0x0309 ? "llvm.amdgcn.s.barrier"
5324                                                : "llvm.AMDGPU.barrier.local",
5325                            ctx->voidt, NULL, 0, 0);
5326 }
5327
5328 static const struct lp_build_tgsi_action tex_action = {
5329         .fetch_args = tex_fetch_args,
5330         .emit = build_tex_intrinsic,
5331 };
5332
5333 static const struct lp_build_tgsi_action interp_action = {
5334         .fetch_args = interp_fetch_args,
5335         .emit = build_interp_intrinsic,
5336 };
5337
5338 static void si_create_function(struct si_shader_context *ctx,
5339                                const char *name,
5340                                LLVMTypeRef *returns, unsigned num_returns,
5341                                LLVMTypeRef *params, unsigned num_params,
5342                                int last_sgpr)
5343 {
5344         int i;
5345
5346         si_llvm_create_func(ctx, name, returns, num_returns,
5347                             params, num_params);
5348         si_llvm_shader_type(ctx->main_fn, ctx->type);
5349         ctx->return_value = LLVMGetUndef(ctx->return_type);
5350
5351         for (i = 0; i <= last_sgpr; ++i) {
5352                 LLVMValueRef P = LLVMGetParam(ctx->main_fn, i);
5353
5354                 /* The combination of:
5355                  * - ByVal
5356                  * - dereferenceable
5357                  * - invariant.load
5358                  * allows the optimization passes to move loads and reduces
5359                  * SGPR spilling significantly.
5360                  */
5361                 if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
5362                         lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_BYVAL);
5363                         lp_add_attr_dereferenceable(P, UINT64_MAX);
5364                 } else
5365                         lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_INREG);
5366         }
5367
5368         if (ctx->screen->b.debug_flags & DBG_UNSAFE_MATH) {
5369                 /* These were copied from some LLVM test. */
5370                 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
5371                                                    "less-precise-fpmad",
5372                                                    "true");
5373                 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
5374                                                    "no-infs-fp-math",
5375                                                    "true");
5376                 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
5377                                                    "no-nans-fp-math",
5378                                                    "true");
5379                 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
5380                                                    "unsafe-fp-math",
5381                                                    "true");
5382         }
5383 }
5384
5385 static void create_meta_data(struct si_shader_context *ctx)
5386 {
5387         struct gallivm_state *gallivm = ctx->bld_base.base.gallivm;
5388
5389         ctx->range_md_kind = LLVMGetMDKindIDInContext(gallivm->context,
5390                                                      "range", 5);
5391 }
5392
5393 static void declare_streamout_params(struct si_shader_context *ctx,
5394                                      struct pipe_stream_output_info *so,
5395                                      LLVMTypeRef *params, LLVMTypeRef i32,
5396                                      unsigned *num_params)
5397 {
5398         int i;
5399
5400         /* Streamout SGPRs. */
5401         if (so->num_outputs) {
5402                 if (ctx->type != PIPE_SHADER_TESS_EVAL)
5403                         params[ctx->param_streamout_config = (*num_params)++] = i32;
5404                 else
5405                         ctx->param_streamout_config = *num_params - 1;
5406
5407                 params[ctx->param_streamout_write_index = (*num_params)++] = i32;
5408         }
5409         /* A streamout buffer offset is loaded if the stride is non-zero. */
5410         for (i = 0; i < 4; i++) {
5411                 if (!so->stride[i])
5412                         continue;
5413
5414                 params[ctx->param_streamout_offset[i] = (*num_params)++] = i32;
5415         }
5416 }
5417
5418 static unsigned llvm_get_type_size(LLVMTypeRef type)
5419 {
5420         LLVMTypeKind kind = LLVMGetTypeKind(type);
5421
5422         switch (kind) {
5423         case LLVMIntegerTypeKind:
5424                 return LLVMGetIntTypeWidth(type) / 8;
5425         case LLVMFloatTypeKind:
5426                 return 4;
5427         case LLVMPointerTypeKind:
5428                 return 8;
5429         case LLVMVectorTypeKind:
5430                 return LLVMGetVectorSize(type) *
5431                        llvm_get_type_size(LLVMGetElementType(type));
5432         case LLVMArrayTypeKind:
5433                 return LLVMGetArrayLength(type) *
5434                        llvm_get_type_size(LLVMGetElementType(type));
5435         default:
5436                 assert(0);
5437                 return 0;
5438         }
5439 }
5440
5441 static void declare_tess_lds(struct si_shader_context *ctx)
5442 {
5443         struct gallivm_state *gallivm = &ctx->gallivm;
5444         struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
5445         struct lp_build_context *uint = &bld_base->uint_bld;
5446
5447         unsigned lds_size = ctx->screen->b.chip_class >= CIK ? 65536 : 32768;
5448         ctx->lds = LLVMBuildIntToPtr(gallivm->builder, uint->zero,
5449                 LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), LOCAL_ADDR_SPACE),
5450                 "tess_lds");
5451 }
5452
5453 static unsigned si_get_max_workgroup_size(struct si_shader *shader)
5454 {
5455         const unsigned *properties = shader->selector->info.properties;
5456         unsigned max_work_group_size =
5457                        properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
5458                        properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
5459                        properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
5460
5461         if (!max_work_group_size) {
5462                 /* This is a variable group size compute shader,
5463                  * compile it for the maximum possible group size.
5464                  */
5465                 max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
5466         }
5467         return max_work_group_size;
5468 }
5469
5470 static void create_function(struct si_shader_context *ctx)
5471 {
5472         struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
5473         struct gallivm_state *gallivm = bld_base->base.gallivm;
5474         struct si_shader *shader = ctx->shader;
5475         LLVMTypeRef params[SI_NUM_PARAMS + SI_NUM_VERTEX_BUFFERS], v3i32;
5476         LLVMTypeRef returns[16+32*4];
5477         unsigned i, last_sgpr, num_params, num_return_sgprs;
5478         unsigned num_returns = 0;
5479         unsigned num_prolog_vgprs = 0;
5480
5481         v3i32 = LLVMVectorType(ctx->i32, 3);
5482
5483         params[SI_PARAM_RW_BUFFERS] = const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
5484         params[SI_PARAM_CONST_BUFFERS] = const_array(ctx->v16i8, SI_NUM_CONST_BUFFERS);
5485         params[SI_PARAM_SAMPLERS] = const_array(ctx->v8i32, SI_NUM_SAMPLERS);
5486         params[SI_PARAM_IMAGES] = const_array(ctx->v8i32, SI_NUM_IMAGES);
5487         params[SI_PARAM_SHADER_BUFFERS] = const_array(ctx->v4i32, SI_NUM_SHADER_BUFFERS);
5488
5489         switch (ctx->type) {
5490         case PIPE_SHADER_VERTEX:
5491                 params[SI_PARAM_VERTEX_BUFFERS] = const_array(ctx->v16i8, SI_NUM_VERTEX_BUFFERS);
5492                 params[SI_PARAM_BASE_VERTEX] = ctx->i32;
5493                 params[SI_PARAM_START_INSTANCE] = ctx->i32;
5494                 params[SI_PARAM_DRAWID] = ctx->i32;
5495                 num_params = SI_PARAM_DRAWID+1;
5496
5497                 if (shader->key.as_es) {
5498                         params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
5499                 } else if (shader->key.as_ls) {
5500                         params[SI_PARAM_LS_OUT_LAYOUT] = ctx->i32;
5501                         num_params = SI_PARAM_LS_OUT_LAYOUT+1;
5502                 } else {
5503                         if (shader->is_gs_copy_shader) {
5504                                 num_params = SI_PARAM_RW_BUFFERS+1;
5505                         } else {
5506                                 params[SI_PARAM_VS_STATE_BITS] = ctx->i32;
5507                                 num_params = SI_PARAM_VS_STATE_BITS+1;
5508                         }
5509
5510                         /* The locations of the other parameters are assigned dynamically. */
5511                         declare_streamout_params(ctx, &shader->selector->so,
5512                                                  params, ctx->i32, &num_params);
5513                 }
5514
5515                 last_sgpr = num_params-1;
5516
5517                 /* VGPRs */
5518                 params[ctx->param_vertex_id = num_params++] = ctx->i32;
5519                 params[ctx->param_rel_auto_id = num_params++] = ctx->i32;
5520                 params[ctx->param_vs_prim_id = num_params++] = ctx->i32;
5521                 params[ctx->param_instance_id = num_params++] = ctx->i32;
5522
5523                 if (!shader->is_gs_copy_shader) {
5524                         /* Vertex load indices. */
5525                         ctx->param_vertex_index0 = num_params;
5526
5527                         for (i = 0; i < shader->selector->info.num_inputs; i++)
5528                                 params[num_params++] = ctx->i32;
5529
5530                         num_prolog_vgprs += shader->selector->info.num_inputs;
5531
5532                         /* PrimitiveID output. */
5533                         if (!shader->key.as_es && !shader->key.as_ls)
5534                                 for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
5535                                         returns[num_returns++] = ctx->f32;
5536                 }
5537                 break;
5538
5539         case PIPE_SHADER_TESS_CTRL:
5540                 params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx->i32;
5541                 params[SI_PARAM_TCS_OUT_OFFSETS] = ctx->i32;
5542                 params[SI_PARAM_TCS_OUT_LAYOUT] = ctx->i32;
5543                 params[SI_PARAM_TCS_IN_LAYOUT] = ctx->i32;
5544                 params[ctx->param_oc_lds = SI_PARAM_TCS_OC_LDS] = ctx->i32;
5545                 params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx->i32;
5546                 last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET;
5547
5548                 /* VGPRs */
5549                 params[SI_PARAM_PATCH_ID] = ctx->i32;
5550                 params[SI_PARAM_REL_IDS] = ctx->i32;
5551                 num_params = SI_PARAM_REL_IDS+1;
5552
5553                 /* SI_PARAM_TCS_OC_LDS and PARAM_TESS_FACTOR_OFFSET are
5554                  * placed after the user SGPRs.
5555                  */
5556                 for (i = 0; i < SI_TCS_NUM_USER_SGPR + 2; i++)
5557                         returns[num_returns++] = ctx->i32; /* SGPRs */
5558
5559                 for (i = 0; i < 3; i++)
5560                         returns[num_returns++] = ctx->f32; /* VGPRs */
5561                 break;
5562
5563         case PIPE_SHADER_TESS_EVAL:
5564                 params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx->i32;
5565                 num_params = SI_PARAM_TCS_OFFCHIP_LAYOUT+1;
5566
5567                 if (shader->key.as_es) {
5568                         params[ctx->param_oc_lds = num_params++] = ctx->i32;
5569                         params[num_params++] = ctx->i32;
5570                         params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
5571                 } else {
5572                         params[num_params++] = ctx->i32;
5573                         declare_streamout_params(ctx, &shader->selector->so,
5574                                                  params, ctx->i32, &num_params);
5575                         params[ctx->param_oc_lds = num_params++] = ctx->i32;
5576                 }
5577                 last_sgpr = num_params - 1;
5578
5579                 /* VGPRs */
5580                 params[ctx->param_tes_u = num_params++] = ctx->f32;
5581                 params[ctx->param_tes_v = num_params++] = ctx->f32;
5582                 params[ctx->param_tes_rel_patch_id = num_params++] = ctx->i32;
5583                 params[ctx->param_tes_patch_id = num_params++] = ctx->i32;
5584
5585                 /* PrimitiveID output. */
5586                 if (!shader->key.as_es)
5587                         for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
5588                                 returns[num_returns++] = ctx->f32;
5589                 break;
5590
5591         case PIPE_SHADER_GEOMETRY:
5592                 params[SI_PARAM_GS2VS_OFFSET] = ctx->i32;
5593                 params[SI_PARAM_GS_WAVE_ID] = ctx->i32;
5594                 last_sgpr = SI_PARAM_GS_WAVE_ID;
5595
5596                 /* VGPRs */
5597                 params[SI_PARAM_VTX0_OFFSET] = ctx->i32;
5598                 params[SI_PARAM_VTX1_OFFSET] = ctx->i32;
5599                 params[SI_PARAM_PRIMITIVE_ID] = ctx->i32;
5600                 params[SI_PARAM_VTX2_OFFSET] = ctx->i32;
5601                 params[SI_PARAM_VTX3_OFFSET] = ctx->i32;
5602                 params[SI_PARAM_VTX4_OFFSET] = ctx->i32;
5603                 params[SI_PARAM_VTX5_OFFSET] = ctx->i32;
5604                 params[SI_PARAM_GS_INSTANCE_ID] = ctx->i32;
5605                 num_params = SI_PARAM_GS_INSTANCE_ID+1;
5606                 break;
5607
5608         case PIPE_SHADER_FRAGMENT:
5609                 params[SI_PARAM_ALPHA_REF] = ctx->f32;
5610                 params[SI_PARAM_PRIM_MASK] = ctx->i32;
5611                 last_sgpr = SI_PARAM_PRIM_MASK;
5612                 params[SI_PARAM_PERSP_SAMPLE] = ctx->v2i32;
5613                 params[SI_PARAM_PERSP_CENTER] = ctx->v2i32;
5614                 params[SI_PARAM_PERSP_CENTROID] = ctx->v2i32;
5615                 params[SI_PARAM_PERSP_PULL_MODEL] = v3i32;
5616                 params[SI_PARAM_LINEAR_SAMPLE] = ctx->v2i32;
5617                 params[SI_PARAM_LINEAR_CENTER] = ctx->v2i32;
5618                 params[SI_PARAM_LINEAR_CENTROID] = ctx->v2i32;
5619                 params[SI_PARAM_LINE_STIPPLE_TEX] = ctx->f32;
5620                 params[SI_PARAM_POS_X_FLOAT] = ctx->f32;
5621                 params[SI_PARAM_POS_Y_FLOAT] = ctx->f32;
5622                 params[SI_PARAM_POS_Z_FLOAT] = ctx->f32;
5623                 params[SI_PARAM_POS_W_FLOAT] = ctx->f32;
5624                 params[SI_PARAM_FRONT_FACE] = ctx->i32;
5625                 shader->info.face_vgpr_index = 20;
5626                 params[SI_PARAM_ANCILLARY] = ctx->i32;
5627                 params[SI_PARAM_SAMPLE_COVERAGE] = ctx->f32;
5628                 params[SI_PARAM_POS_FIXED_PT] = ctx->i32;
5629                 num_params = SI_PARAM_POS_FIXED_PT+1;
5630
5631                 /* Color inputs from the prolog. */
5632                 if (shader->selector->info.colors_read) {
5633                         unsigned num_color_elements =
5634                                 util_bitcount(shader->selector->info.colors_read);
5635
5636                         assert(num_params + num_color_elements <= ARRAY_SIZE(params));
5637                         for (i = 0; i < num_color_elements; i++)
5638                                 params[num_params++] = ctx->f32;
5639
5640                         num_prolog_vgprs += num_color_elements;
5641                 }
5642
5643                 /* Outputs for the epilog. */
5644                 num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
5645                 num_returns =
5646                         num_return_sgprs +
5647                         util_bitcount(shader->selector->info.colors_written) * 4 +
5648                         shader->selector->info.writes_z +
5649                         shader->selector->info.writes_stencil +
5650                         shader->selector->info.writes_samplemask +
5651                         1 /* SampleMaskIn */;
5652
5653                 num_returns = MAX2(num_returns,
5654                                    num_return_sgprs +
5655                                    PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
5656
5657                 for (i = 0; i < num_return_sgprs; i++)
5658                         returns[i] = ctx->i32;
5659                 for (; i < num_returns; i++)
5660                         returns[i] = ctx->f32;
5661                 break;
5662
5663         case PIPE_SHADER_COMPUTE:
5664                 params[SI_PARAM_GRID_SIZE] = v3i32;
5665                 params[SI_PARAM_BLOCK_SIZE] = v3i32;
5666                 params[SI_PARAM_BLOCK_ID] = v3i32;
5667                 last_sgpr = SI_PARAM_BLOCK_ID;
5668
5669                 params[SI_PARAM_THREAD_ID] = v3i32;
5670                 num_params = SI_PARAM_THREAD_ID + 1;
5671                 break;
5672         default:
5673                 assert(0 && "unimplemented shader");
5674                 return;
5675         }
5676
5677         assert(num_params <= ARRAY_SIZE(params));
5678
5679         si_create_function(ctx, "main", returns, num_returns, params,
5680                            num_params, last_sgpr);
5681
5682         /* Reserve register locations for VGPR inputs the PS prolog may need. */
5683         if (ctx->type == PIPE_SHADER_FRAGMENT &&
5684             ctx->separate_prolog) {
5685                 si_llvm_add_attribute(ctx->main_fn,
5686                                       "InitialPSInputAddr",
5687                                       S_0286D0_PERSP_SAMPLE_ENA(1) |
5688                                       S_0286D0_PERSP_CENTER_ENA(1) |
5689                                       S_0286D0_PERSP_CENTROID_ENA(1) |
5690                                       S_0286D0_LINEAR_SAMPLE_ENA(1) |
5691                                       S_0286D0_LINEAR_CENTER_ENA(1) |
5692                                       S_0286D0_LINEAR_CENTROID_ENA(1) |
5693                                       S_0286D0_FRONT_FACE_ENA(1) |
5694                                       S_0286D0_POS_FIXED_PT_ENA(1));
5695         } else if (ctx->type == PIPE_SHADER_COMPUTE) {
5696                 si_llvm_add_attribute(ctx->main_fn,
5697                                       "amdgpu-max-work-group-size",
5698                                       si_get_max_workgroup_size(shader));
5699         }
5700
5701         shader->info.num_input_sgprs = 0;
5702         shader->info.num_input_vgprs = 0;
5703
5704         for (i = 0; i <= last_sgpr; ++i)
5705                 shader->info.num_input_sgprs += llvm_get_type_size(params[i]) / 4;
5706
5707         for (; i < num_params; ++i)
5708                 shader->info.num_input_vgprs += llvm_get_type_size(params[i]) / 4;
5709
5710         assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
5711         shader->info.num_input_vgprs -= num_prolog_vgprs;
5712
5713         if (!ctx->screen->has_ds_bpermute &&
5714             bld_base->info &&
5715             (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 ||
5716              bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 ||
5717              bld_base->info->opcode_count[TGSI_OPCODE_DDX_FINE] > 0 ||
5718              bld_base->info->opcode_count[TGSI_OPCODE_DDY_FINE] > 0 ||
5719              bld_base->info->opcode_count[TGSI_OPCODE_INTERP_OFFSET] > 0 ||
5720              bld_base->info->opcode_count[TGSI_OPCODE_INTERP_SAMPLE] > 0))
5721                 ctx->lds =
5722                         LLVMAddGlobalInAddressSpace(gallivm->module,
5723                                                     LLVMArrayType(ctx->i32, 64),
5724                                                     "ddxy_lds",
5725                                                     LOCAL_ADDR_SPACE);
5726
5727         if ((ctx->type == PIPE_SHADER_VERTEX && shader->key.as_ls) ||
5728             ctx->type == PIPE_SHADER_TESS_CTRL)
5729                 declare_tess_lds(ctx);
5730 }
5731
5732 /**
5733  * Load ESGS and GSVS ring buffer resource descriptors and save the variables
5734  * for later use.
5735  */
5736 static void preload_ring_buffers(struct si_shader_context *ctx)
5737 {
5738         struct gallivm_state *gallivm = ctx->bld_base.base.gallivm;
5739         LLVMBuilderRef builder = gallivm->builder;
5740
5741         LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
5742                                             SI_PARAM_RW_BUFFERS);
5743
5744         if ((ctx->type == PIPE_SHADER_VERTEX &&
5745              ctx->shader->key.as_es) ||
5746             (ctx->type == PIPE_SHADER_TESS_EVAL &&
5747              ctx->shader->key.as_es) ||
5748             ctx->type == PIPE_SHADER_GEOMETRY) {
5749                 unsigned ring =
5750                         ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS
5751                                                              : SI_ES_RING_ESGS;
5752                 LLVMValueRef offset = lp_build_const_int32(gallivm, ring);
5753
5754                 ctx->esgs_ring =
5755                         ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
5756         }
5757
5758         if (ctx->shader->is_gs_copy_shader) {
5759                 LLVMValueRef offset = lp_build_const_int32(gallivm, SI_RING_GSVS);
5760
5761                 ctx->gsvs_ring[0] =
5762                         ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
5763         } else if (ctx->type == PIPE_SHADER_GEOMETRY) {
5764                 const struct si_shader_selector *sel = ctx->shader->selector;
5765                 struct lp_build_context *uint = &ctx->bld_base.uint_bld;
5766                 LLVMValueRef offset = lp_build_const_int32(gallivm, SI_RING_GSVS);
5767                 LLVMValueRef base_ring;
5768
5769                 base_ring = ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
5770
5771                 /* The conceptual layout of the GSVS ring is
5772                  *   v0c0 .. vLv0 v0c1 .. vLc1 ..
5773                  * but the real memory layout is swizzled across
5774                  * threads:
5775                  *   t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
5776                  *   t16v0c0 ..
5777                  * Override the buffer descriptor accordingly.
5778                  */
5779                 LLVMTypeRef v2i64 = LLVMVectorType(ctx->i64, 2);
5780                 uint64_t stream_offset = 0;
5781
5782                 for (unsigned stream = 0; stream < 4; ++stream) {
5783                         unsigned num_components;
5784                         unsigned stride;
5785                         unsigned num_records;
5786                         LLVMValueRef ring, tmp;
5787
5788                         num_components = sel->info.num_stream_output_components[stream];
5789                         if (!num_components)
5790                                 continue;
5791
5792                         stride = 4 * num_components * sel->gs_max_out_vertices;
5793
5794                         /* Limit on the stride field for <= CIK. */
5795                         assert(stride < (1 << 14));
5796
5797                         num_records = 64;
5798
5799                         ring = LLVMBuildBitCast(builder, base_ring, v2i64, "");
5800                         tmp = LLVMBuildExtractElement(builder, ring, uint->zero, "");
5801                         tmp = LLVMBuildAdd(builder, tmp,
5802                                            LLVMConstInt(ctx->i64,
5803                                                         stream_offset, 0), "");
5804                         stream_offset += stride * 64;
5805
5806                         ring = LLVMBuildInsertElement(builder, ring, tmp, uint->zero, "");
5807                         ring = LLVMBuildBitCast(builder, ring, ctx->v4i32, "");
5808                         tmp = LLVMBuildExtractElement(builder, ring, uint->one, "");
5809                         tmp = LLVMBuildOr(builder, tmp,
5810                                 LLVMConstInt(ctx->i32,
5811                                              S_008F04_STRIDE(stride) |
5812                                              S_008F04_SWIZZLE_ENABLE(1), 0), "");
5813                         ring = LLVMBuildInsertElement(builder, ring, tmp, uint->one, "");
5814                         ring = LLVMBuildInsertElement(builder, ring,
5815                                         LLVMConstInt(ctx->i32, num_records, 0),
5816                                         LLVMConstInt(ctx->i32, 2, 0), "");
5817                         ring = LLVMBuildInsertElement(builder, ring,
5818                                 LLVMConstInt(ctx->i32,
5819                                              S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
5820                                              S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
5821                                              S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
5822                                              S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
5823                                              S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
5824                                              S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
5825                                              S_008F0C_ELEMENT_SIZE(1) | /* element_size = 4 (bytes) */
5826                                              S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
5827                                              S_008F0C_ADD_TID_ENABLE(1),
5828                                              0),
5829                                 LLVMConstInt(ctx->i32, 3, 0), "");
5830                         ring = LLVMBuildBitCast(builder, ring, ctx->v16i8, "");
5831
5832                         ctx->gsvs_ring[stream] = ring;
5833                 }
5834         }
5835 }
5836
5837 static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
5838                                          LLVMValueRef param_rw_buffers,
5839                                          unsigned param_pos_fixed_pt)
5840 {
5841         struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
5842         struct gallivm_state *gallivm = bld_base->base.gallivm;
5843         LLVMBuilderRef builder = gallivm->builder;
5844         LLVMValueRef slot, desc, offset, row, bit, address[2];
5845
5846         /* Use the fixed-point gl_FragCoord input.
5847          * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
5848          * per coordinate to get the repeating effect.
5849          */
5850         address[0] = unpack_param(ctx, param_pos_fixed_pt, 0, 5);
5851         address[1] = unpack_param(ctx, param_pos_fixed_pt, 16, 5);
5852
5853         /* Load the buffer descriptor. */
5854         slot = lp_build_const_int32(gallivm, SI_PS_CONST_POLY_STIPPLE);
5855         desc = ac_build_indexed_load_const(&ctx->ac, param_rw_buffers, slot);
5856
5857         /* The stipple pattern is 32x32, each row has 32 bits. */
5858         offset = LLVMBuildMul(builder, address[1],
5859                               LLVMConstInt(ctx->i32, 4, 0), "");
5860         row = buffer_load_const(ctx, desc, offset);
5861         row = LLVMBuildBitCast(builder, row, ctx->i32, "");
5862         bit = LLVMBuildLShr(builder, row, address[0], "");
5863         bit = LLVMBuildTrunc(builder, bit, ctx->i1, "");
5864
5865         /* The intrinsic kills the thread if arg < 0. */
5866         bit = LLVMBuildSelect(builder, bit, LLVMConstReal(ctx->f32, 0),
5867                               LLVMConstReal(ctx->f32, -1), "");
5868         lp_build_intrinsic(builder, "llvm.AMDGPU.kill", ctx->voidt, &bit, 1, 0);
5869 }
5870
5871 void si_shader_binary_read_config(struct radeon_shader_binary *binary,
5872                                   struct si_shader_config *conf,
5873                                   unsigned symbol_offset)
5874 {
5875         unsigned i;
5876         const unsigned char *config =
5877                 radeon_shader_binary_config_start(binary, symbol_offset);
5878         bool really_needs_scratch = false;
5879
5880         /* LLVM adds SGPR spills to the scratch size.
5881          * Find out if we really need the scratch buffer.
5882          */
5883         for (i = 0; i < binary->reloc_count; i++) {
5884                 const struct radeon_shader_reloc *reloc = &binary->relocs[i];
5885
5886                 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
5887                     !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
5888                         really_needs_scratch = true;
5889                         break;
5890                 }
5891         }
5892
5893         /* XXX: We may be able to emit some of these values directly rather than
5894          * extracting fields to be emitted later.
5895          */
5896
5897         for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
5898                 unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
5899                 unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
5900                 switch (reg) {
5901                 case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
5902                 case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
5903                 case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
5904                 case R_00B848_COMPUTE_PGM_RSRC1:
5905                         conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
5906                         conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
5907                         conf->float_mode =  G_00B028_FLOAT_MODE(value);
5908                         conf->rsrc1 = value;
5909                         break;
5910                 case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
5911                         conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
5912                         break;
5913                 case R_00B84C_COMPUTE_PGM_RSRC2:
5914                         conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
5915                         conf->rsrc2 = value;
5916                         break;
5917                 case R_0286CC_SPI_PS_INPUT_ENA:
5918                         conf->spi_ps_input_ena = value;
5919                         break;
5920                 case R_0286D0_SPI_PS_INPUT_ADDR:
5921                         conf->spi_ps_input_addr = value;
5922                         break;
5923                 case R_0286E8_SPI_TMPRING_SIZE:
5924                 case R_00B860_COMPUTE_TMPRING_SIZE:
5925                         /* WAVESIZE is in units of 256 dwords. */
5926                         if (really_needs_scratch)
5927                                 conf->scratch_bytes_per_wave =
5928                                         G_00B860_WAVESIZE(value) * 256 * 4;
5929                         break;
5930                 case 0x4: /* SPILLED_SGPRS */
5931                         conf->spilled_sgprs = value;
5932                         break;
5933                 case 0x8: /* SPILLED_VGPRS */
5934                         conf->spilled_vgprs = value;
5935                         break;
5936                 default:
5937                         {
5938                                 static bool printed;
5939
5940                                 if (!printed) {
5941                                         fprintf(stderr, "Warning: LLVM emitted unknown "
5942                                                 "config register: 0x%x\n", reg);
5943                                         printed = true;
5944                                 }
5945                         }
5946                         break;
5947                 }
5948         }
5949
5950         if (!conf->spi_ps_input_addr)
5951                 conf->spi_ps_input_addr = conf->spi_ps_input_ena;
5952 }
5953
5954 void si_shader_apply_scratch_relocs(struct si_context *sctx,
5955                         struct si_shader *shader,
5956                         struct si_shader_config *config,
5957                         uint64_t scratch_va)
5958 {
5959         unsigned i;
5960         uint32_t scratch_rsrc_dword0 = scratch_va;
5961         uint32_t scratch_rsrc_dword1 =
5962                 S_008F04_BASE_ADDRESS_HI(scratch_va >> 32);
5963
5964         /* Enable scratch coalescing if LLVM sets ELEMENT_SIZE & INDEX_STRIDE
5965          * correctly.
5966          */
5967         if (HAVE_LLVM >= 0x0309)
5968                 scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1);
5969         else
5970                 scratch_rsrc_dword1 |=
5971                         S_008F04_STRIDE(config->scratch_bytes_per_wave / 64);
5972
5973         for (i = 0 ; i < shader->binary.reloc_count; i++) {
5974                 const struct radeon_shader_reloc *reloc =
5975                                         &shader->binary.relocs[i];
5976                 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
5977                         util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
5978                         &scratch_rsrc_dword0, 4);
5979                 } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
5980                         util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
5981                         &scratch_rsrc_dword1, 4);
5982                 }
5983         }
5984 }
5985
5986 static unsigned si_get_shader_binary_size(struct si_shader *shader)
5987 {
5988         unsigned size = shader->binary.code_size;
5989
5990         if (shader->prolog)
5991                 size += shader->prolog->binary.code_size;
5992         if (shader->epilog)
5993                 size += shader->epilog->binary.code_size;
5994         return size;
5995 }
5996
5997 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
5998 {
5999         const struct radeon_shader_binary *prolog =
6000                 shader->prolog ? &shader->prolog->binary : NULL;
6001         const struct radeon_shader_binary *epilog =
6002                 shader->epilog ? &shader->epilog->binary : NULL;
6003         const struct radeon_shader_binary *mainb = &shader->binary;
6004         unsigned bo_size = si_get_shader_binary_size(shader) +
6005                            (!epilog ? mainb->rodata_size : 0);
6006         unsigned char *ptr;
6007
6008         assert(!prolog || !prolog->rodata_size);
6009         assert((!prolog && !epilog) || !mainb->rodata_size);
6010         assert(!epilog || !epilog->rodata_size);
6011
6012         r600_resource_reference(&shader->bo, NULL);
6013         shader->bo = (struct r600_resource*)
6014                      pipe_buffer_create(&sscreen->b.b, 0,
6015                                         PIPE_USAGE_IMMUTABLE, bo_size);
6016         if (!shader->bo)
6017                 return -ENOMEM;
6018
6019         /* Upload. */
6020         ptr = sscreen->b.ws->buffer_map(shader->bo->buf, NULL,
6021                                         PIPE_TRANSFER_READ_WRITE);
6022
6023         if (prolog) {
6024                 util_memcpy_cpu_to_le32(ptr, prolog->code, prolog->code_size);
6025                 ptr += prolog->code_size;
6026         }
6027
6028         util_memcpy_cpu_to_le32(ptr, mainb->code, mainb->code_size);
6029         ptr += mainb->code_size;
6030
6031         if (epilog)
6032                 util_memcpy_cpu_to_le32(ptr, epilog->code, epilog->code_size);
6033         else if (mainb->rodata_size > 0)
6034                 util_memcpy_cpu_to_le32(ptr, mainb->rodata, mainb->rodata_size);
6035
6036         sscreen->b.ws->buffer_unmap(shader->bo->buf);
6037         return 0;
6038 }
6039
6040 static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary,
6041                                        struct pipe_debug_callback *debug,
6042                                        const char *name, FILE *file)
6043 {
6044         char *line, *p;
6045         unsigned i, count;
6046
6047         if (binary->disasm_string) {
6048                 fprintf(file, "Shader %s disassembly:\n", name);
6049                 fprintf(file, "%s", binary->disasm_string);
6050
6051                 if (debug && debug->debug_message) {
6052                         /* Very long debug messages are cut off, so send the
6053                          * disassembly one line at a time. This causes more
6054                          * overhead, but on the plus side it simplifies
6055                          * parsing of resulting logs.
6056                          */
6057                         pipe_debug_message(debug, SHADER_INFO,
6058                                            "Shader Disassembly Begin");
6059
6060                         line = binary->disasm_string;
6061                         while (*line) {
6062                                 p = util_strchrnul(line, '\n');
6063                                 count = p - line;
6064
6065                                 if (count) {
6066                                         pipe_debug_message(debug, SHADER_INFO,
6067                                                            "%.*s", count, line);
6068                                 }
6069
6070                                 if (!*p)
6071                                         break;
6072                                 line = p + 1;
6073                         }
6074
6075                         pipe_debug_message(debug, SHADER_INFO,
6076                                            "Shader Disassembly End");
6077                 }
6078         } else {
6079                 fprintf(file, "Shader %s binary:\n", name);
6080                 for (i = 0; i < binary->code_size; i += 4) {
6081                         fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i,
6082                                 binary->code[i + 3], binary->code[i + 2],
6083                                 binary->code[i + 1], binary->code[i]);
6084                 }
6085         }
6086 }
6087
6088 static void si_shader_dump_stats(struct si_screen *sscreen,
6089                                  struct si_shader *shader,
6090                                  struct pipe_debug_callback *debug,
6091                                  unsigned processor,
6092                                  FILE *file,
6093                                  bool check_debug_option)
6094 {
6095         struct si_shader_config *conf = &shader->config;
6096         unsigned num_inputs = shader->selector ? shader->selector->info.num_inputs : 0;
6097         unsigned code_size = si_get_shader_binary_size(shader);
6098         unsigned lds_increment = sscreen->b.chip_class >= CIK ? 512 : 256;
6099         unsigned lds_per_wave = 0;
6100         unsigned max_simd_waves = 10;
6101
6102         /* Compute LDS usage for PS. */
6103         switch (processor) {
6104         case PIPE_SHADER_FRAGMENT:
6105                 /* The minimum usage per wave is (num_inputs * 48). The maximum
6106                  * usage is (num_inputs * 48 * 16).
6107                  * We can get anything in between and it varies between waves.
6108                  *
6109                  * The 48 bytes per input for a single primitive is equal to
6110                  * 4 bytes/component * 4 components/input * 3 points.
6111                  *
6112                  * Other stages don't know the size at compile time or don't
6113                  * allocate LDS per wave, but instead they do it per thread group.
6114                  */
6115                 lds_per_wave = conf->lds_size * lds_increment +
6116                                align(num_inputs * 48, lds_increment);
6117                 break;
6118         case PIPE_SHADER_COMPUTE:
6119                 if (shader->selector) {
6120                         unsigned max_workgroup_size =
6121                                 si_get_max_workgroup_size(shader);
6122                         lds_per_wave = (conf->lds_size * lds_increment) /
6123                                        DIV_ROUND_UP(max_workgroup_size, 64);
6124                 }
6125                 break;
6126         }
6127
6128         /* Compute the per-SIMD wave counts. */
6129         if (conf->num_sgprs) {
6130                 if (sscreen->b.chip_class >= VI)
6131                         max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs);
6132                 else
6133                         max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs);
6134         }
6135
6136         if (conf->num_vgprs)
6137                 max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs);
6138
6139         /* LDS is 64KB per CU (4 SIMDs), which is 16KB per SIMD (usage above
6140          * 16KB makes some SIMDs unoccupied). */
6141         if (lds_per_wave)
6142                 max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);
6143
6144         if (!check_debug_option ||
6145             r600_can_dump_shader(&sscreen->b, processor)) {
6146                 if (processor == PIPE_SHADER_FRAGMENT) {
6147                         fprintf(file, "*** SHADER CONFIG ***\n"
6148                                 "SPI_PS_INPUT_ADDR = 0x%04x\n"
6149                                 "SPI_PS_INPUT_ENA  = 0x%04x\n",
6150                                 conf->spi_ps_input_addr, conf->spi_ps_input_ena);
6151                 }
6152
6153                 fprintf(file, "*** SHADER STATS ***\n"
6154                         "SGPRS: %d\n"
6155                         "VGPRS: %d\n"
6156                         "Spilled SGPRs: %d\n"
6157                         "Spilled VGPRs: %d\n"
6158                         "Private memory VGPRs: %d\n"
6159                         "Code Size: %d bytes\n"
6160                         "LDS: %d blocks\n"
6161                         "Scratch: %d bytes per wave\n"
6162                         "Max Waves: %d\n"
6163                         "********************\n\n\n",
6164                         conf->num_sgprs, conf->num_vgprs,
6165                         conf->spilled_sgprs, conf->spilled_vgprs,
6166                         conf->private_mem_vgprs, code_size,
6167                         conf->lds_size, conf->scratch_bytes_per_wave,
6168                         max_simd_waves);
6169         }
6170
6171         pipe_debug_message(debug, SHADER_INFO,
6172                            "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
6173                            "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
6174                            "Spilled VGPRs: %d PrivMem VGPRs: %d",
6175                            conf->num_sgprs, conf->num_vgprs, code_size,
6176                            conf->lds_size, conf->scratch_bytes_per_wave,
6177                            max_simd_waves, conf->spilled_sgprs,
6178                            conf->spilled_vgprs, conf->private_mem_vgprs);
6179 }
6180
6181 static const char *si_get_shader_name(struct si_shader *shader,
6182                                       unsigned processor)
6183 {
6184         switch (processor) {
6185         case PIPE_SHADER_VERTEX:
6186                 if (shader->key.as_es)
6187                         return "Vertex Shader as ES";
6188                 else if (shader->key.as_ls)
6189                         return "Vertex Shader as LS";
6190                 else
6191                         return "Vertex Shader as VS";
6192         case PIPE_SHADER_TESS_CTRL:
6193                 return "Tessellation Control Shader";
6194         case PIPE_SHADER_TESS_EVAL:
6195                 if (shader->key.as_es)
6196                         return "Tessellation Evaluation Shader as ES";
6197                 else
6198                         return "Tessellation Evaluation Shader as VS";
6199         case PIPE_SHADER_GEOMETRY:
6200                 if (shader->is_gs_copy_shader)
6201                         return "GS Copy Shader as VS";
6202                 else
6203                         return "Geometry Shader";
6204         case PIPE_SHADER_FRAGMENT:
6205                 return "Pixel Shader";
6206         case PIPE_SHADER_COMPUTE:
6207                 return "Compute Shader";
6208         default:
6209                 return "Unknown Shader";
6210         }
6211 }
6212
6213 void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
6214                     struct pipe_debug_callback *debug, unsigned processor,
6215                     FILE *file, bool check_debug_option)
6216 {
6217         if (!check_debug_option ||
6218             r600_can_dump_shader(&sscreen->b, processor))
6219                 si_dump_shader_key(processor, &shader->key, file);
6220
6221         if (!check_debug_option && shader->binary.llvm_ir_string) {
6222                 fprintf(file, "\n%s - main shader part - LLVM IR:\n\n",
6223                         si_get_shader_name(shader, processor));
6224                 fprintf(file, "%s\n", shader->binary.llvm_ir_string);
6225         }
6226
6227         if (!check_debug_option ||
6228             (r600_can_dump_shader(&sscreen->b, processor) &&
6229              !(sscreen->b.debug_flags & DBG_NO_ASM))) {
6230                 fprintf(file, "\n%s:\n", si_get_shader_name(shader, processor));
6231
6232                 if (shader->prolog)
6233                         si_shader_dump_disassembly(&shader->prolog->binary,
6234                                                    debug, "prolog", file);
6235
6236                 si_shader_dump_disassembly(&shader->binary, debug, "main", file);
6237
6238                 if (shader->epilog)
6239                         si_shader_dump_disassembly(&shader->epilog->binary,
6240                                                    debug, "epilog", file);
6241                 fprintf(file, "\n");
6242         }
6243
6244         si_shader_dump_stats(sscreen, shader, debug, processor, file,
6245                              check_debug_option);
6246 }
6247
6248 int si_compile_llvm(struct si_screen *sscreen,
6249                     struct radeon_shader_binary *binary,
6250                     struct si_shader_config *conf,
6251                     LLVMTargetMachineRef tm,
6252                     LLVMModuleRef mod,
6253                     struct pipe_debug_callback *debug,
6254                     unsigned processor,
6255                     const char *name)
6256 {
6257         int r = 0;
6258         unsigned count = p_atomic_inc_return(&sscreen->b.num_compilations);
6259
6260         if (r600_can_dump_shader(&sscreen->b, processor)) {
6261                 fprintf(stderr, "radeonsi: Compiling shader %d\n", count);
6262
6263                 if (!(sscreen->b.debug_flags & (DBG_NO_IR | DBG_PREOPT_IR))) {
6264                         fprintf(stderr, "%s LLVM IR:\n\n", name);
6265                         ac_dump_module(mod);
6266                         fprintf(stderr, "\n");
6267                 }
6268         }
6269
6270         if (sscreen->record_llvm_ir) {
6271                 char *ir = LLVMPrintModuleToString(mod);
6272                 binary->llvm_ir_string = strdup(ir);
6273                 LLVMDisposeMessage(ir);
6274         }
6275
6276         if (!si_replace_shader(count, binary)) {
6277                 r = si_llvm_compile(mod, binary, tm, debug);
6278                 if (r)
6279                         return r;
6280         }
6281
6282         si_shader_binary_read_config(binary, conf, 0);
6283
6284         /* Enable 64-bit and 16-bit denormals, because there is no performance
6285          * cost.
6286          *
6287          * If denormals are enabled, all floating-point output modifiers are
6288          * ignored.
6289          *
6290          * Don't enable denormals for 32-bit floats, because:
6291          * - Floating-point output modifiers would be ignored by the hw.
6292          * - Some opcodes don't support denormals, such as v_mad_f32. We would
6293          *   have to stop using those.
6294          * - SI & CI would be very slow.
6295          */
6296         conf->float_mode |= V_00B028_FP_64_DENORMS;
6297
6298         FREE(binary->config);
6299         FREE(binary->global_symbol_offsets);
6300         binary->config = NULL;
6301         binary->global_symbol_offsets = NULL;
6302
6303         /* Some shaders can't have rodata because their binaries can be
6304          * concatenated.
6305          */
6306         if (binary->rodata_size &&
6307             (processor == PIPE_SHADER_VERTEX ||
6308              processor == PIPE_SHADER_TESS_CTRL ||
6309              processor == PIPE_SHADER_TESS_EVAL ||
6310              processor == PIPE_SHADER_FRAGMENT)) {
6311                 fprintf(stderr, "radeonsi: The shader can't have rodata.");
6312                 return -EINVAL;
6313         }
6314
6315         return r;
6316 }
6317
6318 static void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret)
6319 {
6320         if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
6321                 LLVMBuildRetVoid(ctx->gallivm.builder);
6322         else
6323                 LLVMBuildRet(ctx->gallivm.builder, ret);
6324 }
6325
6326 /* Generate code for the hardware VS shader stage to go with a geometry shader */
6327 struct si_shader *
6328 si_generate_gs_copy_shader(struct si_screen *sscreen,
6329                            LLVMTargetMachineRef tm,
6330                            struct si_shader_selector *gs_selector,
6331                            struct pipe_debug_callback *debug)
6332 {
6333         struct si_shader_context ctx;
6334         struct si_shader *shader;
6335         struct gallivm_state *gallivm = &ctx.gallivm;
6336         LLVMBuilderRef builder;
6337         struct lp_build_tgsi_context *bld_base = &ctx.bld_base;
6338         struct lp_build_context *uint = &bld_base->uint_bld;
6339         struct si_shader_output_values *outputs;
6340         struct tgsi_shader_info *gsinfo = &gs_selector->info;
6341         LLVMValueRef args[9];
6342         int i, r;
6343
6344         outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0]));
6345
6346         if (!outputs)
6347                 return NULL;
6348
6349         shader = CALLOC_STRUCT(si_shader);
6350         if (!shader) {
6351                 FREE(outputs);
6352                 return NULL;
6353         }
6354
6355
6356         shader->selector = gs_selector;
6357         shader->is_gs_copy_shader = true;
6358
6359         si_init_shader_ctx(&ctx, sscreen, shader, tm);
6360         ctx.type = PIPE_SHADER_VERTEX;
6361
6362         builder = gallivm->builder;
6363
6364         create_meta_data(&ctx);
6365         create_function(&ctx);
6366         preload_ring_buffers(&ctx);
6367
6368         args[0] = ctx.gsvs_ring[0];
6369         args[1] = lp_build_mul_imm(uint,
6370                                    LLVMGetParam(ctx.main_fn,
6371                                                 ctx.param_vertex_id),
6372                                    4);
6373         args[3] = uint->zero;
6374         args[4] = uint->one;  /* OFFEN */
6375         args[5] = uint->zero; /* IDXEN */
6376         args[6] = uint->one;  /* GLC */
6377         args[7] = uint->one;  /* SLC */
6378         args[8] = uint->zero; /* TFE */
6379
6380         /* Fetch the vertex stream ID.*/
6381         LLVMValueRef stream_id;
6382
6383         if (gs_selector->so.num_outputs)
6384                 stream_id = unpack_param(&ctx, ctx.param_streamout_config, 24, 2);
6385         else
6386                 stream_id = uint->zero;
6387
6388         /* Fill in output information. */
6389         for (i = 0; i < gsinfo->num_outputs; ++i) {
6390                 outputs[i].semantic_name = gsinfo->output_semantic_name[i];
6391                 outputs[i].semantic_index = gsinfo->output_semantic_index[i];
6392
6393                 for (int chan = 0; chan < 4; chan++) {
6394                         outputs[i].vertex_stream[chan] =
6395                                 (gsinfo->output_streams[i] >> (2 * chan)) & 3;
6396                 }
6397         }
6398
6399         LLVMBasicBlockRef end_bb;
6400         LLVMValueRef switch_inst;
6401
6402         end_bb = LLVMAppendBasicBlockInContext(gallivm->context, ctx.main_fn, "end");
6403         switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4);
6404
6405         for (int stream = 0; stream < 4; stream++) {
6406                 LLVMBasicBlockRef bb;
6407                 unsigned offset;
6408
6409                 if (!gsinfo->num_stream_output_components[stream])
6410                         continue;
6411
6412                 if (stream > 0 && !gs_selector->so.num_outputs)
6413                         continue;
6414
6415                 bb = LLVMInsertBasicBlockInContext(gallivm->context, end_bb, "out");
6416                 LLVMAddCase(switch_inst, lp_build_const_int32(gallivm, stream), bb);
6417                 LLVMPositionBuilderAtEnd(builder, bb);
6418
6419                 /* Fetch vertex data from GSVS ring */
6420                 offset = 0;
6421                 for (i = 0; i < gsinfo->num_outputs; ++i) {
6422                         for (unsigned chan = 0; chan < 4; chan++) {
6423                                 if (!(gsinfo->output_usagemask[i] & (1 << chan)) ||
6424                                     outputs[i].vertex_stream[chan] != stream) {
6425                                         outputs[i].values[chan] = ctx.bld_base.base.undef;
6426                                         continue;
6427                                 }
6428
6429                                 args[2] = lp_build_const_int32(
6430                                         gallivm,
6431                                         offset * gs_selector->gs_max_out_vertices * 16 * 4);
6432                                 offset++;
6433
6434                                 outputs[i].values[chan] =
6435                                         LLVMBuildBitCast(gallivm->builder,
6436                                                  lp_build_intrinsic(gallivm->builder,
6437                                                                  "llvm.SI.buffer.load.dword.i32.i32",
6438                                                                  ctx.i32, args, 9,
6439                                                                  LP_FUNC_ATTR_READONLY),
6440                                                  ctx.f32, "");
6441                         }
6442                 }
6443
6444                 /* Streamout and exports. */
6445                 if (gs_selector->so.num_outputs) {
6446                         si_llvm_emit_streamout(&ctx, outputs,
6447                                                gsinfo->num_outputs,
6448                                                stream);
6449                 }
6450
6451                 if (stream == 0)
6452                         si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs);
6453
6454                 LLVMBuildBr(builder, end_bb);
6455         }
6456
6457         LLVMPositionBuilderAtEnd(builder, end_bb);
6458
6459         LLVMBuildRetVoid(gallivm->builder);
6460
6461         /* Dump LLVM IR before any optimization passes */
6462         if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
6463             r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
6464                 ac_dump_module(bld_base->base.gallivm->module);
6465
6466         si_llvm_finalize_module(&ctx,
6467                 r600_extra_shader_checks(&sscreen->b, PIPE_SHADER_GEOMETRY));
6468
6469         r = si_compile_llvm(sscreen, &ctx.shader->binary,
6470                             &ctx.shader->config, ctx.tm,
6471                             bld_base->base.gallivm->module,
6472                             debug, PIPE_SHADER_GEOMETRY,
6473                             "GS Copy Shader");
6474         if (!r) {
6475                 if (r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
6476                         fprintf(stderr, "GS Copy Shader:\n");
6477                 si_shader_dump(sscreen, ctx.shader, debug,
6478                                PIPE_SHADER_GEOMETRY, stderr, true);
6479                 r = si_shader_binary_upload(sscreen, ctx.shader);
6480         }
6481
6482         si_llvm_dispose(&ctx);
6483
6484         FREE(outputs);
6485
6486         if (r != 0) {
6487                 FREE(shader);
6488                 shader = NULL;
6489         }
6490         return shader;
6491 }
6492
6493 static void si_dump_shader_key(unsigned shader, struct si_shader_key *key,
6494                                FILE *f)
6495 {
6496         int i;
6497
6498         fprintf(f, "SHADER KEY\n");
6499
6500         switch (shader) {
6501         case PIPE_SHADER_VERTEX:
6502                 fprintf(f, "  part.vs.prolog.instance_divisors = {");
6503                 for (i = 0; i < ARRAY_SIZE(key->part.vs.prolog.instance_divisors); i++)
6504                         fprintf(f, !i ? "%u" : ", %u",
6505                                 key->part.vs.prolog.instance_divisors[i]);
6506                 fprintf(f, "}\n");
6507                 fprintf(f, "  part.vs.epilog.export_prim_id = %u\n", key->part.vs.epilog.export_prim_id);
6508                 fprintf(f, "  as_es = %u\n", key->as_es);
6509                 fprintf(f, "  as_ls = %u\n", key->as_ls);
6510                 fprintf(f, "  mono.vs.fix_fetch = 0x%"PRIx64"\n", key->mono.vs.fix_fetch);
6511                 break;
6512
6513         case PIPE_SHADER_TESS_CTRL:
6514                 fprintf(f, "  part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode);
6515                 fprintf(f, "  mono.tcs.inputs_to_copy = 0x%"PRIx64"\n", key->mono.tcs.inputs_to_copy);
6516                 break;
6517
6518         case PIPE_SHADER_TESS_EVAL:
6519                 fprintf(f, "  part.tes.epilog.export_prim_id = %u\n", key->part.tes.epilog.export_prim_id);
6520                 fprintf(f, "  as_es = %u\n", key->as_es);
6521                 break;
6522
6523         case PIPE_SHADER_GEOMETRY:
6524                 fprintf(f, "  part.gs.prolog.tri_strip_adj_fix = %u\n", key->part.gs.prolog.tri_strip_adj_fix);
6525                 break;
6526
6527         case PIPE_SHADER_COMPUTE:
6528                 break;
6529
6530         case PIPE_SHADER_FRAGMENT:
6531                 fprintf(f, "  part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side);
6532                 fprintf(f, "  part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors);
6533                 fprintf(f, "  part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple);
6534                 fprintf(f, "  part.ps.prolog.force_persp_sample_interp = %u\n", key->part.ps.prolog.force_persp_sample_interp);
6535                 fprintf(f, "  part.ps.prolog.force_linear_sample_interp = %u\n", key->part.ps.prolog.force_linear_sample_interp);
6536                 fprintf(f, "  part.ps.prolog.force_persp_center_interp = %u\n", key->part.ps.prolog.force_persp_center_interp);
6537                 fprintf(f, "  part.ps.prolog.force_linear_center_interp = %u\n", key->part.ps.prolog.force_linear_center_interp);
6538                 fprintf(f, "  part.ps.prolog.bc_optimize_for_persp = %u\n", key->part.ps.prolog.bc_optimize_for_persp);
6539                 fprintf(f, "  part.ps.prolog.bc_optimize_for_linear = %u\n", key->part.ps.prolog.bc_optimize_for_linear);
6540                 fprintf(f, "  part.ps.epilog.spi_shader_col_format = 0x%x\n", key->part.ps.epilog.spi_shader_col_format);
6541                 fprintf(f, "  part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8);
6542                 fprintf(f, "  part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf);
6543                 fprintf(f, "  part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func);
6544                 fprintf(f, "  part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one);
6545                 fprintf(f, "  part.ps.epilog.poly_line_smoothing = %u\n", key->part.ps.epilog.poly_line_smoothing);
6546                 fprintf(f, "  part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color);
6547                 break;
6548
6549         default:
6550                 assert(0);
6551         }
6552
6553         if ((shader == PIPE_SHADER_GEOMETRY ||
6554              shader == PIPE_SHADER_TESS_EVAL ||
6555              shader == PIPE_SHADER_VERTEX) &&
6556             !key->as_es && !key->as_ls) {
6557                 fprintf(f, "  opt.hw_vs.kill_outputs = 0x%"PRIx64"\n", key->opt.hw_vs.kill_outputs);
6558                 fprintf(f, "  opt.hw_vs.kill_outputs2 = 0x%x\n", key->opt.hw_vs.kill_outputs2);
6559                 fprintf(f, "  opt.hw_vs.clip_disable = %u\n", key->opt.hw_vs.clip_disable);
6560         }
6561 }
6562
6563 static void si_init_shader_ctx(struct si_shader_context *ctx,
6564                                struct si_screen *sscreen,
6565                                struct si_shader *shader,
6566                                LLVMTargetMachineRef tm)
6567 {
6568         struct lp_build_tgsi_context *bld_base;
6569         struct lp_build_tgsi_action tmpl = {};
6570
6571         si_llvm_context_init(ctx, sscreen, shader, tm,
6572                 (shader && shader->selector) ? &shader->selector->info : NULL,
6573                 (shader && shader->selector) ? shader->selector->tokens : NULL);
6574
6575         bld_base = &ctx->bld_base;
6576         bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;
6577
6578         bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action;
6579         bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE] = interp_action;
6580         bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET] = interp_action;
6581
6582         bld_base->op_actions[TGSI_OPCODE_TEX] = tex_action;
6583         bld_base->op_actions[TGSI_OPCODE_TEX2] = tex_action;
6584         bld_base->op_actions[TGSI_OPCODE_TXB] = tex_action;
6585         bld_base->op_actions[TGSI_OPCODE_TXB2] = tex_action;
6586         bld_base->op_actions[TGSI_OPCODE_TXD] = tex_action;
6587         bld_base->op_actions[TGSI_OPCODE_TXF] = tex_action;
6588         bld_base->op_actions[TGSI_OPCODE_TXL] = tex_action;
6589         bld_base->op_actions[TGSI_OPCODE_TXL2] = tex_action;
6590         bld_base->op_actions[TGSI_OPCODE_TXP] = tex_action;
6591         bld_base->op_actions[TGSI_OPCODE_TXQ].fetch_args = txq_fetch_args;
6592         bld_base->op_actions[TGSI_OPCODE_TXQ].emit = txq_emit;
6593         bld_base->op_actions[TGSI_OPCODE_TG4] = tex_action;
6594         bld_base->op_actions[TGSI_OPCODE_LODQ] = tex_action;
6595         bld_base->op_actions[TGSI_OPCODE_TXQS].emit = si_llvm_emit_txqs;
6596
6597         bld_base->op_actions[TGSI_OPCODE_LOAD].fetch_args = load_fetch_args;
6598         bld_base->op_actions[TGSI_OPCODE_LOAD].emit = load_emit;
6599         bld_base->op_actions[TGSI_OPCODE_STORE].fetch_args = store_fetch_args;
6600         bld_base->op_actions[TGSI_OPCODE_STORE].emit = store_emit;
6601         bld_base->op_actions[TGSI_OPCODE_RESQ].fetch_args = resq_fetch_args;
6602         bld_base->op_actions[TGSI_OPCODE_RESQ].emit = resq_emit;
6603
6604         tmpl.fetch_args = atomic_fetch_args;
6605         tmpl.emit = atomic_emit;
6606         bld_base->op_actions[TGSI_OPCODE_ATOMUADD] = tmpl;
6607         bld_base->op_actions[TGSI_OPCODE_ATOMUADD].intr_name = "add";
6608         bld_base->op_actions[TGSI_OPCODE_ATOMXCHG] = tmpl;
6609         bld_base->op_actions[TGSI_OPCODE_ATOMXCHG].intr_name = "swap";
6610         bld_base->op_actions[TGSI_OPCODE_ATOMCAS] = tmpl;
6611         bld_base->op_actions[TGSI_OPCODE_ATOMCAS].intr_name = "cmpswap";
6612         bld_base->op_actions[TGSI_OPCODE_ATOMAND] = tmpl;
6613         bld_base->op_actions[TGSI_OPCODE_ATOMAND].intr_name = "and";
6614         bld_base->op_actions[TGSI_OPCODE_ATOMOR] = tmpl;
6615         bld_base->op_actions[TGSI_OPCODE_ATOMOR].intr_name = "or";
6616         bld_base->op_actions[TGSI_OPCODE_ATOMXOR] = tmpl;
6617         bld_base->op_actions[TGSI_OPCODE_ATOMXOR].intr_name = "xor";
6618         bld_base->op_actions[TGSI_OPCODE_ATOMUMIN] = tmpl;
6619         bld_base->op_actions[TGSI_OPCODE_ATOMUMIN].intr_name = "umin";
6620         bld_base->op_actions[TGSI_OPCODE_ATOMUMAX] = tmpl;
6621         bld_base->op_actions[TGSI_OPCODE_ATOMUMAX].intr_name = "umax";
6622         bld_base->op_actions[TGSI_OPCODE_ATOMIMIN] = tmpl;
6623         bld_base->op_actions[TGSI_OPCODE_ATOMIMIN].intr_name = "smin";
6624         bld_base->op_actions[TGSI_OPCODE_ATOMIMAX] = tmpl;
6625         bld_base->op_actions[TGSI_OPCODE_ATOMIMAX].intr_name = "smax";
6626
6627         bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit;
6628
6629         bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy;
6630         bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy;
6631         bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy;
6632         bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy;
6633
6634         bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_llvm_emit_vertex;
6635         bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive;
6636         bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier;
6637 }
6638
6639 /* Return true if the PARAM export has been eliminated. */
6640 static bool si_eliminate_const_output(struct si_shader_context *ctx,
6641                                       LLVMValueRef inst, unsigned offset)
6642 {
6643         struct si_shader *shader = ctx->shader;
6644         unsigned num_outputs = shader->selector->info.num_outputs;
6645         unsigned i, default_val; /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL */
6646         bool is_zero[4] = {}, is_one[4] = {};
6647
6648         for (i = 0; i < 4; i++) {
6649                 LLVMBool loses_info;
6650                 LLVMValueRef p = LLVMGetOperand(inst, 5 + i);
6651
6652                 /* It's a constant expression. Undef outputs are eliminated too. */
6653                 if (LLVMIsUndef(p)) {
6654                         is_zero[i] = true;
6655                         is_one[i] = true;
6656                 } else if (LLVMIsAConstantFP(p)) {
6657                         double a = LLVMConstRealGetDouble(p, &loses_info);
6658
6659                         if (a == 0)
6660                                 is_zero[i] = true;
6661                         else if (a == 1)
6662                                 is_one[i] = true;
6663                         else
6664                                 return false; /* other constant */
6665                 } else
6666                         return false;
6667         }
6668
6669         /* Only certain combinations of 0 and 1 can be eliminated. */
6670         if (is_zero[0] && is_zero[1] && is_zero[2])
6671                 default_val = is_zero[3] ? 0 : 1;
6672         else if (is_one[0] && is_one[1] && is_one[2])
6673                 default_val = is_zero[3] ? 2 : 3;
6674         else
6675                 return false;
6676
6677         /* The PARAM export can be represented as DEFAULT_VAL. Kill it. */
6678         LLVMInstructionEraseFromParent(inst);
6679
6680         /* Change OFFSET to DEFAULT_VAL. */
6681         for (i = 0; i < num_outputs; i++) {
6682                 if (shader->info.vs_output_param_offset[i] == offset) {
6683                         shader->info.vs_output_param_offset[i] =
6684                                 EXP_PARAM_DEFAULT_VAL_0000 + default_val;
6685                         break;
6686                 }
6687         }
6688         return true;
6689 }
6690
/* List of PARAM export calls collected by si_eliminate_const_vs_outputs
 * that were not eliminated and may need to be renumbered.
 */
struct si_vs_exports {
        /* Number of valid entries in offset[] and inst[]. */
        unsigned num;
        /* PARAM slot of each export (export target minus V_008DFC_SQ_EXP_PARAM). */
        unsigned offset[SI_MAX_VS_OUTPUTS];
        /* The export call instruction of each entry. */
        LLVMValueRef inst[SI_MAX_VS_OUTPUTS];
};
6696
6697 static void si_eliminate_const_vs_outputs(struct si_shader_context *ctx)
6698 {
6699         struct si_shader *shader = ctx->shader;
6700         struct tgsi_shader_info *info = &shader->selector->info;
6701         LLVMBasicBlockRef bb;
6702         struct si_vs_exports exports;
6703         bool removed_any = false;
6704
6705         exports.num = 0;
6706
6707         if (ctx->type == PIPE_SHADER_FRAGMENT ||
6708             ctx->type == PIPE_SHADER_COMPUTE ||
6709             shader->key.as_es ||
6710             shader->key.as_ls)
6711                 return;
6712
6713         /* Process all LLVM instructions. */
6714         bb = LLVMGetFirstBasicBlock(ctx->main_fn);
6715         while (bb) {
6716                 LLVMValueRef inst = LLVMGetFirstInstruction(bb);
6717
6718                 while (inst) {
6719                         LLVMValueRef cur = inst;
6720                         inst = LLVMGetNextInstruction(inst);
6721
6722                         if (LLVMGetInstructionOpcode(cur) != LLVMCall)
6723                                 continue;
6724
6725                         LLVMValueRef callee = lp_get_called_value(cur);
6726
6727                         if (!lp_is_function(callee))
6728                                 continue;
6729
6730                         const char *name = LLVMGetValueName(callee);
6731                         unsigned num_args = LLVMCountParams(callee);
6732
6733                         /* Check if this is an export instruction. */
6734                         if (num_args != 9 || strcmp(name, "llvm.SI.export"))
6735                                 continue;
6736
6737                         LLVMValueRef arg = LLVMGetOperand(cur, 3);
6738                         unsigned target = LLVMConstIntGetZExtValue(arg);
6739
6740                         if (target < V_008DFC_SQ_EXP_PARAM)
6741                                 continue;
6742
6743                         target -= V_008DFC_SQ_EXP_PARAM;
6744
6745                         /* Eliminate constant value PARAM exports. */
6746                         if (si_eliminate_const_output(ctx, cur, target)) {
6747                                 removed_any = true;
6748                         } else {
6749                                 exports.offset[exports.num] = target;
6750                                 exports.inst[exports.num] = cur;
6751                                 exports.num++;
6752                         }
6753                 }
6754                 bb = LLVMGetNextBasicBlock(bb);
6755         }
6756
6757         /* Remove holes in export memory due to removed PARAM exports.
6758          * This is done by renumbering all PARAM exports.
6759          */
6760         if (removed_any) {
6761                 ubyte current_offset[SI_MAX_VS_OUTPUTS];
6762                 unsigned new_count = 0;
6763                 unsigned out, i;
6764
6765                 /* Make a copy of the offsets. We need the old version while
6766                  * we are modifying some of them. */
6767                 assert(sizeof(current_offset) ==
6768                        sizeof(shader->info.vs_output_param_offset));
6769                 memcpy(current_offset, shader->info.vs_output_param_offset,
6770                        sizeof(current_offset));
6771
6772                 for (i = 0; i < exports.num; i++) {
6773                         unsigned offset = exports.offset[i];
6774
6775                         for (out = 0; out < info->num_outputs; out++) {
6776                                 if (current_offset[out] != offset)
6777                                         continue;
6778
6779                                 LLVMSetOperand(exports.inst[i], 3,
6780                                                LLVMConstInt(ctx->i32,
6781                                                             V_008DFC_SQ_EXP_PARAM + new_count, 0));
6782                                 shader->info.vs_output_param_offset[out] = new_count;
6783                                 new_count++;
6784                                 break;
6785                         }
6786                 }
6787                 shader->info.nr_param_exports = new_count;
6788         }
6789 }
6790
6791 static void si_count_scratch_private_memory(struct si_shader_context *ctx)
6792 {
6793         ctx->shader->config.private_mem_vgprs = 0;
6794
6795         /* Process all LLVM instructions. */
6796         LLVMBasicBlockRef bb = LLVMGetFirstBasicBlock(ctx->main_fn);
6797         while (bb) {
6798                 LLVMValueRef next = LLVMGetFirstInstruction(bb);
6799
6800                 while (next) {
6801                         LLVMValueRef inst = next;
6802                         next = LLVMGetNextInstruction(next);
6803
6804                         if (LLVMGetInstructionOpcode(inst) != LLVMAlloca)
6805                                 continue;
6806
6807                         LLVMTypeRef type = LLVMGetElementType(LLVMTypeOf(inst));
6808                         /* No idea why LLVM aligns allocas to 4 elements. */
6809                         unsigned alignment = LLVMGetAlignment(inst);
6810                         unsigned dw_size = align(llvm_get_type_size(type) / 4, alignment);
6811                         ctx->shader->config.private_mem_vgprs += dw_size;
6812                 }
6813                 bb = LLVMGetNextBasicBlock(bb);
6814         }
6815 }
6816
6817 static bool si_compile_tgsi_main(struct si_shader_context *ctx,
6818                                  struct si_shader *shader)
6819 {
6820         struct si_shader_selector *sel = shader->selector;
6821         struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
6822
6823         switch (ctx->type) {
6824         case PIPE_SHADER_VERTEX:
6825                 ctx->load_input = declare_input_vs;
6826                 if (shader->key.as_ls)
6827                         bld_base->emit_epilogue = si_llvm_emit_ls_epilogue;
6828                 else if (shader->key.as_es)
6829                         bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
6830                 else
6831                         bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
6832                 break;
6833         case PIPE_SHADER_TESS_CTRL:
6834                 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs;
6835                 bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = fetch_output_tcs;
6836                 bld_base->emit_store = store_output_tcs;
6837                 bld_base->emit_epilogue = si_llvm_emit_tcs_epilogue;
6838                 break;
6839         case PIPE_SHADER_TESS_EVAL:
6840                 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes;
6841                 if (shader->key.as_es)
6842                         bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
6843                 else
6844                         bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
6845                 break;
6846         case PIPE_SHADER_GEOMETRY:
6847                 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs;
6848                 bld_base->emit_epilogue = si_llvm_emit_gs_epilogue;
6849                 break;
6850         case PIPE_SHADER_FRAGMENT:
6851                 ctx->load_input = declare_input_fs;
6852                 bld_base->emit_epilogue = si_llvm_return_fs_outputs;
6853                 break;
6854         case PIPE_SHADER_COMPUTE:
6855                 ctx->declare_memory_region = declare_compute_memory;
6856                 break;
6857         default:
6858                 assert(!"Unsupported shader type");
6859                 return false;
6860         }
6861
6862         create_meta_data(ctx);
6863         create_function(ctx);
6864         preload_ring_buffers(ctx);
6865
6866         if (ctx->type == PIPE_SHADER_GEOMETRY) {
6867                 int i;
6868                 for (i = 0; i < 4; i++) {
6869                         ctx->gs_next_vertex[i] =
6870                                 lp_build_alloca(bld_base->base.gallivm,
6871                                                 ctx->i32, "");
6872                 }
6873         }
6874
6875         if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) {
6876                 fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
6877                 return false;
6878         }
6879
6880         si_llvm_build_ret(ctx, ctx->return_value);
6881         return true;
6882 }
6883
6884 /**
6885  * Compute the VS prolog key, which contains all the information needed to
6886  * build the VS prolog function, and set shader->info bits where needed.
6887  */
6888 static void si_get_vs_prolog_key(struct si_shader *shader,
6889                                  union si_shader_part_key *key)
6890 {
6891         struct tgsi_shader_info *info = &shader->selector->info;
6892
6893         memset(key, 0, sizeof(*key));
6894         key->vs_prolog.states = shader->key.part.vs.prolog;
6895         key->vs_prolog.num_input_sgprs = shader->info.num_input_sgprs;
6896         key->vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
6897
6898         /* Set the instanceID flag. */
6899         for (unsigned i = 0; i < info->num_inputs; i++)
6900                 if (key->vs_prolog.states.instance_divisors[i])
6901                         shader->info.uses_instanceid = true;
6902 }
6903
6904 /**
6905  * Compute the VS epilog key, which contains all the information needed to
6906  * build the VS epilog function, and set the PrimitiveID output offset.
6907  */
6908 static void si_get_vs_epilog_key(struct si_shader *shader,
6909                                  struct si_vs_epilog_bits *states,
6910                                  union si_shader_part_key *key)
6911 {
6912         memset(key, 0, sizeof(*key));
6913         key->vs_epilog.states = *states;
6914
6915         /* Set up the PrimitiveID output. */
6916         if (shader->key.part.vs.epilog.export_prim_id) {
6917                 unsigned index = shader->selector->info.num_outputs;
6918                 unsigned offset = shader->info.nr_param_exports++;
6919
6920                 key->vs_epilog.prim_id_param_offset = offset;
6921                 assert(index < ARRAY_SIZE(shader->info.vs_output_param_offset));
6922                 shader->info.vs_output_param_offset[index] = offset;
6923         }
6924 }
6925
6926 /**
6927  * Compute the PS prolog key, which contains all the information needed to
6928  * build the PS prolog function, and set related bits in shader->config.
6929  */
6930 static void si_get_ps_prolog_key(struct si_shader *shader,
6931                                  union si_shader_part_key *key,
6932                                  bool separate_prolog)
6933 {
6934         struct tgsi_shader_info *info = &shader->selector->info;
6935
6936         memset(key, 0, sizeof(*key));
6937         key->ps_prolog.states = shader->key.part.ps.prolog;
6938         key->ps_prolog.colors_read = info->colors_read;
6939         key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
6940         key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
6941         key->ps_prolog.wqm = info->uses_derivatives &&
6942                 (key->ps_prolog.colors_read ||
6943                  key->ps_prolog.states.force_persp_sample_interp ||
6944                  key->ps_prolog.states.force_linear_sample_interp ||
6945                  key->ps_prolog.states.force_persp_center_interp ||
6946                  key->ps_prolog.states.force_linear_center_interp ||
6947                  key->ps_prolog.states.bc_optimize_for_persp ||
6948                  key->ps_prolog.states.bc_optimize_for_linear);
6949
6950         if (info->colors_read) {
6951                 unsigned *color = shader->selector->color_attr_index;
6952
6953                 if (shader->key.part.ps.prolog.color_two_side) {
6954                         /* BCOLORs are stored after the last input. */
6955                         key->ps_prolog.num_interp_inputs = info->num_inputs;
6956                         key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
6957                         shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
6958                 }
6959
6960                 for (unsigned i = 0; i < 2; i++) {
6961                         unsigned interp = info->input_interpolate[color[i]];
6962                         unsigned location = info->input_interpolate_loc[color[i]];
6963
6964                         if (!(info->colors_read & (0xf << i*4)))
6965                                 continue;
6966
6967                         key->ps_prolog.color_attr_index[i] = color[i];
6968
6969                         if (shader->key.part.ps.prolog.flatshade_colors &&
6970                             interp == TGSI_INTERPOLATE_COLOR)
6971                                 interp = TGSI_INTERPOLATE_CONSTANT;
6972
6973                         switch (interp) {
6974                         case TGSI_INTERPOLATE_CONSTANT:
6975                                 key->ps_prolog.color_interp_vgpr_index[i] = -1;
6976                                 break;
6977                         case TGSI_INTERPOLATE_PERSPECTIVE:
6978                         case TGSI_INTERPOLATE_COLOR:
6979                                 /* Force the interpolation location for colors here. */
6980                                 if (shader->key.part.ps.prolog.force_persp_sample_interp)
6981                                         location = TGSI_INTERPOLATE_LOC_SAMPLE;
6982                                 if (shader->key.part.ps.prolog.force_persp_center_interp)
6983                                         location = TGSI_INTERPOLATE_LOC_CENTER;
6984
6985                                 switch (location) {
6986                                 case TGSI_INTERPOLATE_LOC_SAMPLE:
6987                                         key->ps_prolog.color_interp_vgpr_index[i] = 0;
6988                                         shader->config.spi_ps_input_ena |=
6989                                                 S_0286CC_PERSP_SAMPLE_ENA(1);
6990                                         break;
6991                                 case TGSI_INTERPOLATE_LOC_CENTER:
6992                                         key->ps_prolog.color_interp_vgpr_index[i] = 2;
6993                                         shader->config.spi_ps_input_ena |=
6994                                                 S_0286CC_PERSP_CENTER_ENA(1);
6995                                         break;
6996                                 case TGSI_INTERPOLATE_LOC_CENTROID:
6997                                         key->ps_prolog.color_interp_vgpr_index[i] = 4;
6998                                         shader->config.spi_ps_input_ena |=
6999                                                 S_0286CC_PERSP_CENTROID_ENA(1);
7000                                         break;
7001                                 default:
7002                                         assert(0);
7003                                 }
7004                                 break;
7005                         case TGSI_INTERPOLATE_LINEAR:
7006                                 /* Force the interpolation location for colors here. */
7007                                 if (shader->key.part.ps.prolog.force_linear_sample_interp)
7008                                         location = TGSI_INTERPOLATE_LOC_SAMPLE;
7009                                 if (shader->key.part.ps.prolog.force_linear_center_interp)
7010                                         location = TGSI_INTERPOLATE_LOC_CENTER;
7011
7012                                 /* The VGPR assignment for non-monolithic shaders
7013                                  * works because InitialPSInputAddr is set on the
7014                                  * main shader and PERSP_PULL_MODEL is never used.
7015                                  */
7016                                 switch (location) {
7017                                 case TGSI_INTERPOLATE_LOC_SAMPLE:
7018                                         key->ps_prolog.color_interp_vgpr_index[i] =
7019                                                 separate_prolog ? 6 : 9;
7020                                         shader->config.spi_ps_input_ena |=
7021                                                 S_0286CC_LINEAR_SAMPLE_ENA(1);
7022                                         break;
7023                                 case TGSI_INTERPOLATE_LOC_CENTER:
7024                                         key->ps_prolog.color_interp_vgpr_index[i] =
7025                                                 separate_prolog ? 8 : 11;
7026                                         shader->config.spi_ps_input_ena |=
7027                                                 S_0286CC_LINEAR_CENTER_ENA(1);
7028                                         break;
7029                                 case TGSI_INTERPOLATE_LOC_CENTROID:
7030                                         key->ps_prolog.color_interp_vgpr_index[i] =
7031                                                 separate_prolog ? 10 : 13;
7032                                         shader->config.spi_ps_input_ena |=
7033                                                 S_0286CC_LINEAR_CENTROID_ENA(1);
7034                                         break;
7035                                 default:
7036                                         assert(0);
7037                                 }
7038                                 break;
7039                         default:
7040                                 assert(0);
7041                         }
7042                 }
7043         }
7044 }
7045
7046 /**
7047  * Check whether a PS prolog is required based on the key.
7048  */
7049 static bool si_need_ps_prolog(const union si_shader_part_key *key)
7050 {
7051         return key->ps_prolog.colors_read ||
7052                key->ps_prolog.states.force_persp_sample_interp ||
7053                key->ps_prolog.states.force_linear_sample_interp ||
7054                key->ps_prolog.states.force_persp_center_interp ||
7055                key->ps_prolog.states.force_linear_center_interp ||
7056                key->ps_prolog.states.bc_optimize_for_persp ||
7057                key->ps_prolog.states.bc_optimize_for_linear ||
7058                key->ps_prolog.states.poly_stipple;
7059 }
7060
7061 /**
7062  * Compute the PS epilog key, which contains all the information needed to
7063  * build the PS epilog function.
7064  */
7065 static void si_get_ps_epilog_key(struct si_shader *shader,
7066                                  union si_shader_part_key *key)
7067 {
7068         struct tgsi_shader_info *info = &shader->selector->info;
7069         memset(key, 0, sizeof(*key));
7070         key->ps_epilog.colors_written = info->colors_written;
7071         key->ps_epilog.writes_z = info->writes_z;
7072         key->ps_epilog.writes_stencil = info->writes_stencil;
7073         key->ps_epilog.writes_samplemask = info->writes_samplemask;
7074         key->ps_epilog.states = shader->key.part.ps.epilog;
7075 }
7076
7077 /**
7078  * Build the GS prolog function. Rotate the input vertices for triangle strips
7079  * with adjacency.
7080  */
7081 static void si_build_gs_prolog_function(struct si_shader_context *ctx,
7082                                         union si_shader_part_key *key)
7083 {
7084         const unsigned num_sgprs = SI_GS_NUM_USER_SGPR + 2;
7085         const unsigned num_vgprs = 8;
7086         struct gallivm_state *gallivm = &ctx->gallivm;
7087         LLVMBuilderRef builder = gallivm->builder;
7088         LLVMTypeRef params[32];
7089         LLVMTypeRef returns[32];
7090         LLVMValueRef func, ret;
7091
7092         for (unsigned i = 0; i < num_sgprs; ++i) {
7093                 params[i] = ctx->i32;
7094                 returns[i] = ctx->i32;
7095         }
7096
7097         for (unsigned i = 0; i < num_vgprs; ++i) {
7098                 params[num_sgprs + i] = ctx->i32;
7099                 returns[num_sgprs + i] = ctx->f32;
7100         }
7101
7102         /* Create the function. */
7103         si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs,
7104                            params, num_sgprs + num_vgprs, num_sgprs - 1);
7105         func = ctx->main_fn;
7106
7107         /* Copy inputs to outputs. This should be no-op, as the registers match,
7108          * but it will prevent the compiler from overwriting them unintentionally.
7109          */
7110         ret = ctx->return_value;
7111         for (unsigned i = 0; i < num_sgprs; i++) {
7112                 LLVMValueRef p = LLVMGetParam(func, i);
7113                 ret = LLVMBuildInsertValue(builder, ret, p, i, "");
7114         }
7115         for (unsigned i = 0; i < num_vgprs; i++) {
7116                 LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
7117                 p = LLVMBuildBitCast(builder, p, ctx->f32, "");
7118                 ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, "");
7119         }
7120
7121         if (key->gs_prolog.states.tri_strip_adj_fix) {
7122                 /* Remap the input vertices for every other primitive. */
7123                 const unsigned vtx_params[6] = {
7124                         num_sgprs,
7125                         num_sgprs + 1,
7126                         num_sgprs + 3,
7127                         num_sgprs + 4,
7128                         num_sgprs + 5,
7129                         num_sgprs + 6
7130                 };
7131                 LLVMValueRef prim_id, rotate;
7132
7133                 prim_id = LLVMGetParam(func, num_sgprs + 2);
7134                 rotate = LLVMBuildTrunc(builder, prim_id, ctx->i1, "");
7135
7136                 for (unsigned i = 0; i < 6; ++i) {
7137                         LLVMValueRef base, rotated, actual;
7138                         base = LLVMGetParam(func, vtx_params[i]);
7139                         rotated = LLVMGetParam(func, vtx_params[(i + 4) % 6]);
7140                         actual = LLVMBuildSelect(builder, rotate, rotated, base, "");
7141                         actual = LLVMBuildBitCast(builder, actual, ctx->f32, "");
7142                         ret = LLVMBuildInsertValue(builder, ret, actual, vtx_params[i], "");
7143                 }
7144         }
7145
7146         LLVMBuildRet(builder, ret);
7147 }
7148
7149 /**
7150  * Given a list of shader part functions, build a wrapper function that
7151  * runs them in sequence to form a monolithic shader.
7152  */
7153 static void si_build_wrapper_function(struct si_shader_context *ctx,
7154                                       LLVMValueRef *parts,
7155                                       unsigned num_parts,
7156                                       unsigned main_part)
7157 {
7158         struct gallivm_state *gallivm = &ctx->gallivm;
7159         LLVMBuilderRef builder = ctx->gallivm.builder;
7160         /* PS epilog has one arg per color component */
7161         LLVMTypeRef param_types[48];
7162         LLVMValueRef out[48];
7163         LLVMTypeRef function_type;
7164         unsigned num_params;
7165         unsigned num_out;
7166         MAYBE_UNUSED unsigned num_out_sgpr; /* used in debug checks */
7167         unsigned num_sgprs, num_vgprs;
7168         unsigned last_sgpr_param;
7169         unsigned gprs;
7170
7171         for (unsigned i = 0; i < num_parts; ++i) {
7172                 lp_add_function_attr(parts[i], -1, LP_FUNC_ATTR_ALWAYSINLINE);
7173                 LLVMSetLinkage(parts[i], LLVMPrivateLinkage);
7174         }
7175
7176         /* The parameters of the wrapper function correspond to those of the
7177          * first part in terms of SGPRs and VGPRs, but we use the types of the
7178          * main part to get the right types. This is relevant for the
7179          * dereferenceable attribute on descriptor table pointers.
7180          */
7181         num_sgprs = 0;
7182         num_vgprs = 0;
7183
7184         function_type = LLVMGetElementType(LLVMTypeOf(parts[0]));
7185         num_params = LLVMCountParamTypes(function_type);
7186
7187         for (unsigned i = 0; i < num_params; ++i) {
7188                 LLVMValueRef param = LLVMGetParam(parts[0], i);
7189
7190                 if (ac_is_sgpr_param(param)) {
7191                         assert(num_vgprs == 0);
7192                         num_sgprs += llvm_get_type_size(LLVMTypeOf(param)) / 4;
7193                 } else {
7194                         num_vgprs += llvm_get_type_size(LLVMTypeOf(param)) / 4;
7195                 }
7196         }
7197         assert(num_vgprs + num_sgprs <= ARRAY_SIZE(param_types));
7198
7199         num_params = 0;
7200         last_sgpr_param = 0;
7201         gprs = 0;
7202         while (gprs < num_sgprs + num_vgprs) {
7203                 LLVMValueRef param = LLVMGetParam(parts[main_part], num_params);
7204                 unsigned size;
7205
7206                 param_types[num_params] = LLVMTypeOf(param);
7207                 if (gprs < num_sgprs)
7208                         last_sgpr_param = num_params;
7209                 size = llvm_get_type_size(param_types[num_params]) / 4;
7210                 num_params++;
7211
7212                 assert(ac_is_sgpr_param(param) == (gprs < num_sgprs));
7213                 assert(gprs + size <= num_sgprs + num_vgprs &&
7214                        (gprs >= num_sgprs || gprs + size <= num_sgprs));
7215
7216                 gprs += size;
7217         }
7218
7219         si_create_function(ctx, "wrapper", NULL, 0, param_types, num_params, last_sgpr_param);
7220
7221         /* Record the arguments of the function as if they were an output of
7222          * a previous part.
7223          */
7224         num_out = 0;
7225         num_out_sgpr = 0;
7226
7227         for (unsigned i = 0; i < num_params; ++i) {
7228                 LLVMValueRef param = LLVMGetParam(ctx->main_fn, i);
7229                 LLVMTypeRef param_type = LLVMTypeOf(param);
7230                 LLVMTypeRef out_type = i <= last_sgpr_param ? ctx->i32 : ctx->f32;
7231                 unsigned size = llvm_get_type_size(param_type) / 4;
7232
7233                 if (size == 1) {
7234                         if (param_type != out_type)
7235                                 param = LLVMBuildBitCast(builder, param, out_type, "");
7236                         out[num_out++] = param;
7237                 } else {
7238                         LLVMTypeRef vector_type = LLVMVectorType(out_type, size);
7239
7240                         if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
7241                                 param = LLVMBuildPtrToInt(builder, param, ctx->i64, "");
7242                                 param_type = ctx->i64;
7243                         }
7244
7245                         if (param_type != vector_type)
7246                                 param = LLVMBuildBitCast(builder, param, vector_type, "");
7247
7248                         for (unsigned j = 0; j < size; ++j)
7249                                 out[num_out++] = LLVMBuildExtractElement(
7250                                         builder, param, LLVMConstInt(ctx->i32, j, 0), "");
7251                 }
7252
7253                 if (i <= last_sgpr_param)
7254                         num_out_sgpr = num_out;
7255         }
7256
7257         /* Now chain the parts. */
7258         for (unsigned part = 0; part < num_parts; ++part) {
7259                 LLVMValueRef in[48];
7260                 LLVMValueRef ret;
7261                 LLVMTypeRef ret_type;
7262                 unsigned out_idx = 0;
7263
7264                 num_params = LLVMCountParams(parts[part]);
7265                 assert(num_params <= ARRAY_SIZE(param_types));
7266
7267                 /* Derive arguments for the next part from outputs of the
7268                  * previous one.
7269                  */
7270                 for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) {
7271                         LLVMValueRef param;
7272                         LLVMTypeRef param_type;
7273                         bool is_sgpr;
7274                         unsigned param_size;
7275                         LLVMValueRef arg = NULL;
7276
7277                         param = LLVMGetParam(parts[part], param_idx);
7278                         param_type = LLVMTypeOf(param);
7279                         param_size = llvm_get_type_size(param_type) / 4;
7280                         is_sgpr = ac_is_sgpr_param(param);
7281
7282                         if (is_sgpr) {
7283 #if HAVE_LLVM < 0x0400
7284                                 LLVMRemoveAttribute(param, LLVMByValAttribute);
7285 #else
7286                                 unsigned kind_id = LLVMGetEnumAttributeKindForName("byval", 5);
7287                                 LLVMRemoveEnumAttributeAtIndex(parts[part], param_idx + 1, kind_id);
7288 #endif
7289                                 lp_add_function_attr(parts[part], param_idx + 1, LP_FUNC_ATTR_INREG);
7290                         }
7291
7292                         assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out));
7293                         assert(is_sgpr || out_idx >= num_out_sgpr);
7294
7295                         if (param_size == 1)
7296                                 arg = out[out_idx];
7297                         else
7298                                 arg = lp_build_gather_values(gallivm, &out[out_idx], param_size);
7299
7300                         if (LLVMTypeOf(arg) != param_type) {
7301                                 if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
7302                                         arg = LLVMBuildBitCast(builder, arg, ctx->i64, "");
7303                                         arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
7304                                 } else {
7305                                         arg = LLVMBuildBitCast(builder, arg, param_type, "");
7306                                 }
7307                         }
7308
7309                         in[param_idx] = arg;
7310                         out_idx += param_size;
7311                 }
7312
7313                 ret = LLVMBuildCall(builder, parts[part], in, num_params, "");
7314                 ret_type = LLVMTypeOf(ret);
7315
7316                 /* Extract the returned GPRs. */
7317                 num_out = 0;
7318                 num_out_sgpr = 0;
7319
7320                 if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) {
7321                         assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind);
7322
7323                         unsigned ret_size = LLVMCountStructElementTypes(ret_type);
7324
7325                         for (unsigned i = 0; i < ret_size; ++i) {
7326                                 LLVMValueRef val =
7327                                         LLVMBuildExtractValue(builder, ret, i, "");
7328
7329                                 out[num_out++] = val;
7330
7331                                 if (LLVMTypeOf(val) == ctx->i32) {
7332                                         assert(num_out_sgpr + 1 == num_out);
7333                                         num_out_sgpr = num_out;
7334                                 }
7335                         }
7336                 }
7337         }
7338
7339         LLVMBuildRetVoid(builder);
7340 }
7341
7342 int si_compile_tgsi_shader(struct si_screen *sscreen,
7343                            LLVMTargetMachineRef tm,
7344                            struct si_shader *shader,
7345                            bool is_monolithic,
7346                            struct pipe_debug_callback *debug)
7347 {
7348         struct si_shader_selector *sel = shader->selector;
7349         struct si_shader_context ctx;
7350         struct lp_build_tgsi_context *bld_base;
7351         LLVMModuleRef mod;
7352         int r = -1;
7353
7354         /* Dump TGSI code before doing TGSI->LLVM conversion in case the
7355          * conversion fails. */
7356         if (r600_can_dump_shader(&sscreen->b, sel->info.processor) &&
7357             !(sscreen->b.debug_flags & DBG_NO_TGSI)) {
7358                 tgsi_dump(sel->tokens, 0);
7359                 si_dump_streamout(&sel->so);
7360         }
7361
7362         si_init_shader_ctx(&ctx, sscreen, shader, tm);
7363         ctx.separate_prolog = !is_monolithic;
7364
7365         memset(shader->info.vs_output_param_offset, EXP_PARAM_UNDEFINED,
7366                sizeof(shader->info.vs_output_param_offset));
7367
7368         shader->info.uses_instanceid = sel->info.uses_instanceid;
7369
7370         bld_base = &ctx.bld_base;
7371         ctx.load_system_value = declare_system_value;
7372
7373         if (!si_compile_tgsi_main(&ctx, shader)) {
7374                 si_llvm_dispose(&ctx);
7375                 return -1;
7376         }
7377
7378         if (is_monolithic && ctx.type == PIPE_SHADER_VERTEX) {
7379                 LLVMValueRef parts[3];
7380                 bool need_prolog;
7381                 bool need_epilog;
7382
7383                 need_prolog = sel->info.num_inputs;
7384                 need_epilog = !shader->key.as_es && !shader->key.as_ls;
7385
7386                 parts[need_prolog ? 1 : 0] = ctx.main_fn;
7387
7388                 if (need_prolog) {
7389                         union si_shader_part_key prolog_key;
7390                         si_get_vs_prolog_key(shader, &prolog_key);
7391                         si_build_vs_prolog_function(&ctx, &prolog_key);
7392                         parts[0] = ctx.main_fn;
7393                 }
7394
7395                 if (need_epilog) {
7396                         union si_shader_part_key epilog_key;
7397                         si_get_vs_epilog_key(shader, &shader->key.part.vs.epilog, &epilog_key);
7398                         si_build_vs_epilog_function(&ctx, &epilog_key);
7399                         parts[need_prolog ? 2 : 1] = ctx.main_fn;
7400                 }
7401
7402                 si_build_wrapper_function(&ctx, parts, 1 + need_prolog + need_epilog,
7403                                           need_prolog ? 1 : 0);
7404         } else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
7405                 LLVMValueRef parts[2];
7406                 union si_shader_part_key epilog_key;
7407
7408                 parts[0] = ctx.main_fn;
7409
7410                 memset(&epilog_key, 0, sizeof(epilog_key));
7411                 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
7412                 si_build_tcs_epilog_function(&ctx, &epilog_key);
7413                 parts[1] = ctx.main_fn;
7414
7415                 si_build_wrapper_function(&ctx, parts, 2, 0);
7416         } else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_EVAL &&
7417                    !shader->key.as_es) {
7418                 LLVMValueRef parts[2];
7419                 union si_shader_part_key epilog_key;
7420
7421                 parts[0] = ctx.main_fn;
7422
7423                 si_get_vs_epilog_key(shader, &shader->key.part.tes.epilog, &epilog_key);
7424                 si_build_vs_epilog_function(&ctx, &epilog_key);
7425                 parts[1] = ctx.main_fn;
7426
7427                 si_build_wrapper_function(&ctx, parts, 2, 0);
7428         } else if (is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) {
7429                 LLVMValueRef parts[2];
7430                 union si_shader_part_key prolog_key;
7431
7432                 parts[1] = ctx.main_fn;
7433
7434                 memset(&prolog_key, 0, sizeof(prolog_key));
7435                 prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
7436                 si_build_gs_prolog_function(&ctx, &prolog_key);
7437                 parts[0] = ctx.main_fn;
7438
7439                 si_build_wrapper_function(&ctx, parts, 2, 1);
7440         } else if (is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) {
7441                 LLVMValueRef parts[3];
7442                 union si_shader_part_key prolog_key;
7443                 union si_shader_part_key epilog_key;
7444                 bool need_prolog;
7445
7446                 si_get_ps_prolog_key(shader, &prolog_key, false);
7447                 need_prolog = si_need_ps_prolog(&prolog_key);
7448
7449                 parts[need_prolog ? 1 : 0] = ctx.main_fn;
7450
7451                 if (need_prolog) {
7452                         si_build_ps_prolog_function(&ctx, &prolog_key);
7453                         parts[0] = ctx.main_fn;
7454                 }
7455
7456                 si_get_ps_epilog_key(shader, &epilog_key);
7457                 si_build_ps_epilog_function(&ctx, &epilog_key);
7458                 parts[need_prolog ? 2 : 1] = ctx.main_fn;
7459
7460                 si_build_wrapper_function(&ctx, parts, need_prolog ? 3 : 2, need_prolog ? 1 : 0);
7461         }
7462
7463         mod = bld_base->base.gallivm->module;
7464
7465         /* Dump LLVM IR before any optimization passes */
7466         if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
7467             r600_can_dump_shader(&sscreen->b, ctx.type))
7468                 ac_dump_module(mod);
7469
7470         si_llvm_finalize_module(&ctx,
7471                                     r600_extra_shader_checks(&sscreen->b, ctx.type));
7472
7473         /* Post-optimization transformations and analysis. */
7474         si_eliminate_const_vs_outputs(&ctx);
7475
7476         if ((debug && debug->debug_message) ||
7477             r600_can_dump_shader(&sscreen->b, ctx.type))
7478                 si_count_scratch_private_memory(&ctx);
7479
7480         /* Compile to bytecode. */
7481         r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm,
7482                             mod, debug, ctx.type, "TGSI shader");
7483         si_llvm_dispose(&ctx);
7484         if (r) {
7485                 fprintf(stderr, "LLVM failed to compile shader\n");
7486                 return r;
7487         }
7488
7489         /* Validate SGPR and VGPR usage for compute to detect compiler bugs.
7490          * LLVM 3.9svn has this bug.
7491          */
7492         if (sel->type == PIPE_SHADER_COMPUTE) {
7493                 unsigned wave_size = 64;
7494                 unsigned max_vgprs = 256;
7495                 unsigned max_sgprs = sscreen->b.chip_class >= VI ? 800 : 512;
7496                 unsigned max_sgprs_per_wave = 128;
7497                 unsigned max_block_threads = si_get_max_workgroup_size(shader);
7498                 unsigned min_waves_per_cu = DIV_ROUND_UP(max_block_threads, wave_size);
7499                 unsigned min_waves_per_simd = DIV_ROUND_UP(min_waves_per_cu, 4);
7500
7501                 max_vgprs = max_vgprs / min_waves_per_simd;
7502                 max_sgprs = MIN2(max_sgprs / min_waves_per_simd, max_sgprs_per_wave);
7503
7504                 if (shader->config.num_sgprs > max_sgprs ||
7505                     shader->config.num_vgprs > max_vgprs) {
7506                         fprintf(stderr, "LLVM failed to compile a shader correctly: "
7507                                 "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
7508                                 shader->config.num_sgprs, shader->config.num_vgprs,
7509                                 max_sgprs, max_vgprs);
7510
7511                         /* Just terminate the process, because dependent
7512                          * shaders can hang due to bad input data, but use
7513                          * the env var to allow shader-db to work.
7514                          */
7515                         if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
7516                                 abort();
7517                 }
7518         }
7519
7520         /* Add the scratch offset to input SGPRs. */
7521         if (shader->config.scratch_bytes_per_wave)
7522                 shader->info.num_input_sgprs += 1; /* scratch byte offset */
7523
7524         /* Calculate the number of fragment input VGPRs. */
7525         if (ctx.type == PIPE_SHADER_FRAGMENT) {
7526                 shader->info.num_input_vgprs = 0;
7527                 shader->info.face_vgpr_index = -1;
7528
7529                 if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr))
7530                         shader->info.num_input_vgprs += 2;
7531                 if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr))
7532                         shader->info.num_input_vgprs += 2;
7533                 if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr))
7534                         shader->info.num_input_vgprs += 2;
7535                 if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr))
7536                         shader->info.num_input_vgprs += 3;
7537                 if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr))
7538                         shader->info.num_input_vgprs += 2;
7539                 if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr))
7540                         shader->info.num_input_vgprs += 2;
7541                 if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr))
7542                         shader->info.num_input_vgprs += 2;
7543                 if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr))
7544                         shader->info.num_input_vgprs += 1;
7545                 if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr))
7546                         shader->info.num_input_vgprs += 1;
7547                 if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr))
7548                         shader->info.num_input_vgprs += 1;
7549                 if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr))
7550                         shader->info.num_input_vgprs += 1;
7551                 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr))
7552                         shader->info.num_input_vgprs += 1;
7553                 if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) {
7554                         shader->info.face_vgpr_index = shader->info.num_input_vgprs;
7555                         shader->info.num_input_vgprs += 1;
7556                 }
7557                 if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr))
7558                         shader->info.num_input_vgprs += 1;
7559                 if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr))
7560                         shader->info.num_input_vgprs += 1;
7561                 if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr))
7562                         shader->info.num_input_vgprs += 1;
7563         }
7564
7565         return 0;
7566 }
7567
7568 /**
7569  * Create, compile and return a shader part (prolog or epilog).
7570  *
7571  * \param sscreen       screen
7572  * \param list          list of shader parts of the same category
7573  * \param type          shader type
7574  * \param key           shader part key
7575  * \param prolog        whether the part being requested is a prolog
7576  * \param tm            LLVM target machine
7577  * \param debug         debug callback
7578  * \param build         the callback responsible for building the main function
7579  * \return              non-NULL on success
7580  */
7581 static struct si_shader_part *
7582 si_get_shader_part(struct si_screen *sscreen,
7583                    struct si_shader_part **list,
7584                    enum pipe_shader_type type,
7585                    bool prolog,
7586                    union si_shader_part_key *key,
7587                    LLVMTargetMachineRef tm,
7588                    struct pipe_debug_callback *debug,
7589                    void (*build)(struct si_shader_context *,
7590                                  union si_shader_part_key *),
7591                    const char *name)
7592 {
7593         struct si_shader_part *result;
7594
7595         pipe_mutex_lock(sscreen->shader_parts_mutex);
7596
7597         /* Find existing. */
7598         for (result = *list; result; result = result->next) {
7599                 if (memcmp(&result->key, key, sizeof(*key)) == 0) {
7600                         pipe_mutex_unlock(sscreen->shader_parts_mutex);
7601                         return result;
7602                 }
7603         }
7604
7605         /* Compile a new one. */
7606         result = CALLOC_STRUCT(si_shader_part);
7607         result->key = *key;
7608
7609         struct si_shader shader = {};
7610         struct si_shader_context ctx;
7611         struct gallivm_state *gallivm = &ctx.gallivm;
7612
7613         si_init_shader_ctx(&ctx, sscreen, &shader, tm);
7614         ctx.type = type;
7615
7616         switch (type) {
7617         case PIPE_SHADER_VERTEX:
7618                 break;
7619         case PIPE_SHADER_TESS_CTRL:
7620                 assert(!prolog);
7621                 shader.key.part.tcs.epilog = key->tcs_epilog.states;
7622                 break;
7623         case PIPE_SHADER_GEOMETRY:
7624                 assert(prolog);
7625                 break;
7626         case PIPE_SHADER_FRAGMENT:
7627                 if (prolog)
7628                         shader.key.part.ps.prolog = key->ps_prolog.states;
7629                 else
7630                         shader.key.part.ps.epilog = key->ps_epilog.states;
7631                 break;
7632         default:
7633                 unreachable("bad shader part");
7634         }
7635
7636         build(&ctx, key);
7637
7638         /* Compile. */
7639         si_llvm_finalize_module(&ctx,
7640                 r600_extra_shader_checks(&sscreen->b, PIPE_SHADER_FRAGMENT));
7641
7642         if (si_compile_llvm(sscreen, &result->binary, &result->config, tm,
7643                             gallivm->module, debug, ctx.type, name)) {
7644                 FREE(result);
7645                 result = NULL;
7646                 goto out;
7647         }
7648
7649         result->next = *list;
7650         *list = result;
7651
7652 out:
7653         si_llvm_dispose(&ctx);
7654         pipe_mutex_unlock(sscreen->shader_parts_mutex);
7655         return result;
7656 }
7657
7658 /**
7659  * Build the vertex shader prolog function.
7660  *
7661  * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
7662  * All inputs are returned unmodified. The vertex load indices are
7663  * stored after them, which will be used by the API VS for fetching inputs.
7664  *
7665  * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
7666  *   input_v0,
7667  *   input_v1,
7668  *   input_v2,
7669  *   input_v3,
7670  *   (VertexID + BaseVertex),
7671  *   (InstanceID + StartInstance),
7672  *   (InstanceID / 2 + StartInstance)
7673  */
7674 static void si_build_vs_prolog_function(struct si_shader_context *ctx,
7675                                         union si_shader_part_key *key)
7676 {
7677         struct gallivm_state *gallivm = &ctx->gallivm;
7678         LLVMTypeRef *params, *returns;
7679         LLVMValueRef ret, func;
7680         int last_sgpr, num_params, num_returns, i;
7681
7682         ctx->param_vertex_id = key->vs_prolog.num_input_sgprs;
7683         ctx->param_instance_id = key->vs_prolog.num_input_sgprs + 3;
7684
7685         /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
7686         params = alloca((key->vs_prolog.num_input_sgprs + 4) *
7687                         sizeof(LLVMTypeRef));
7688         returns = alloca((key->vs_prolog.num_input_sgprs + 4 +
7689                           key->vs_prolog.last_input + 1) *
7690                          sizeof(LLVMTypeRef));
7691         num_params = 0;
7692         num_returns = 0;
7693
7694         /* Declare input and output SGPRs. */
7695         num_params = 0;
7696         for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
7697                 params[num_params++] = ctx->i32;
7698                 returns[num_returns++] = ctx->i32;
7699         }
7700         last_sgpr = num_params - 1;
7701
7702         /* 4 preloaded VGPRs (outputs must be floats) */
7703         for (i = 0; i < 4; i++) {
7704                 params[num_params++] = ctx->i32;
7705                 returns[num_returns++] = ctx->f32;
7706         }
7707
7708         /* Vertex load indices. */
7709         for (i = 0; i <= key->vs_prolog.last_input; i++)
7710                 returns[num_returns++] = ctx->f32;
7711
7712         /* Create the function. */
7713         si_create_function(ctx, "vs_prolog", returns, num_returns, params,
7714                            num_params, last_sgpr);
7715         func = ctx->main_fn;
7716
7717         /* Copy inputs to outputs. This should be no-op, as the registers match,
7718          * but it will prevent the compiler from overwriting them unintentionally.
7719          */
7720         ret = ctx->return_value;
7721         for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
7722                 LLVMValueRef p = LLVMGetParam(func, i);
7723                 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
7724         }
7725         for (i = num_params - 4; i < num_params; i++) {
7726                 LLVMValueRef p = LLVMGetParam(func, i);
7727                 p = LLVMBuildBitCast(gallivm->builder, p, ctx->f32, "");
7728                 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
7729         }
7730
7731         /* Compute vertex load indices from instance divisors. */
7732         for (i = 0; i <= key->vs_prolog.last_input; i++) {
7733                 unsigned divisor = key->vs_prolog.states.instance_divisors[i];
7734                 LLVMValueRef index;
7735
7736                 if (divisor) {
7737                         /* InstanceID / Divisor + StartInstance */
7738                         index = get_instance_index_for_fetch(ctx,
7739                                                              SI_SGPR_START_INSTANCE,
7740                                                              divisor);
7741                 } else {
7742                         /* VertexID + BaseVertex */
7743                         index = LLVMBuildAdd(gallivm->builder,
7744                                              LLVMGetParam(func, ctx->param_vertex_id),
7745                                              LLVMGetParam(func, SI_SGPR_BASE_VERTEX), "");
7746                 }
7747
7748                 index = LLVMBuildBitCast(gallivm->builder, index, ctx->f32, "");
7749                 ret = LLVMBuildInsertValue(gallivm->builder, ret, index,
7750                                            num_params++, "");
7751         }
7752
7753         si_llvm_build_ret(ctx, ret);
7754 }
7755
7756 /**
7757  * Build the vertex shader epilog function. This is also used by the tessellation
7758  * evaluation shader compiled as VS.
7759  *
7760  * The input is PrimitiveID.
7761  *
7762  * If PrimitiveID is required by the pixel shader, export it.
7763  * Otherwise, do nothing.
7764  */
7765 static void si_build_vs_epilog_function(struct si_shader_context *ctx,
7766                                         union si_shader_part_key *key)
7767 {
7768         struct gallivm_state *gallivm = &ctx->gallivm;
7769         struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
7770         LLVMTypeRef params[5];
7771         int num_params, i;
7772
7773         /* Declare input VGPRs. */
7774         num_params = key->vs_epilog.states.export_prim_id ?
7775                            (VS_EPILOG_PRIMID_LOC + 1) : 0;
7776         assert(num_params <= ARRAY_SIZE(params));
7777
7778         for (i = 0; i < num_params; i++)
7779                 params[i] = ctx->f32;
7780
7781         /* Create the function. */
7782         si_create_function(ctx, "vs_epilog", NULL, 0, params, num_params, -1);
7783
7784         /* Emit exports. */
7785         if (key->vs_epilog.states.export_prim_id) {
7786                 struct lp_build_context *base = &bld_base->base;
7787                 struct lp_build_context *uint = &bld_base->uint_bld;
7788                 LLVMValueRef args[9];
7789
7790                 args[0] = lp_build_const_int32(base->gallivm, 0x0); /* enabled channels */
7791                 args[1] = uint->zero; /* whether the EXEC mask is valid */
7792                 args[2] = uint->zero; /* DONE bit */
7793                 args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_PARAM +
7794                                                key->vs_epilog.prim_id_param_offset);
7795                 args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */
7796                 args[5] = LLVMGetParam(ctx->main_fn,
7797                                        VS_EPILOG_PRIMID_LOC); /* X */
7798                 args[6] = base->undef; /* Y */
7799                 args[7] = base->undef; /* Z */
7800                 args[8] = base->undef; /* W */
7801
7802                 lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
7803                                    LLVMVoidTypeInContext(base->gallivm->context),
7804                                    args, 9, 0);
7805         }
7806
7807         LLVMBuildRetVoid(gallivm->builder);
7808 }
7809
7810 /**
7811  * Create & compile a vertex shader epilog. This a helper used by VS and TES.
7812  */
7813 static bool si_get_vs_epilog(struct si_screen *sscreen,
7814                              LLVMTargetMachineRef tm,
7815                              struct si_shader *shader,
7816                              struct pipe_debug_callback *debug,
7817                              struct si_vs_epilog_bits *states)
7818 {
7819         union si_shader_part_key epilog_key;
7820
7821         si_get_vs_epilog_key(shader, states, &epilog_key);
7822
7823         shader->epilog = si_get_shader_part(sscreen, &sscreen->vs_epilogs,
7824                                             PIPE_SHADER_VERTEX, true,
7825                                             &epilog_key, tm, debug,
7826                                             si_build_vs_epilog_function,
7827                                             "Vertex Shader Epilog");
7828         return shader->epilog != NULL;
7829 }
7830
7831 /**
7832  * Select and compile (or reuse) vertex shader parts (prolog & epilog).
7833  */
7834 static bool si_shader_select_vs_parts(struct si_screen *sscreen,
7835                                       LLVMTargetMachineRef tm,
7836                                       struct si_shader *shader,
7837                                       struct pipe_debug_callback *debug)
7838 {
7839         struct tgsi_shader_info *info = &shader->selector->info;
7840         union si_shader_part_key prolog_key;
7841
7842         /* Get the prolog. */
7843         si_get_vs_prolog_key(shader, &prolog_key);
7844
7845         /* The prolog is a no-op if there are no inputs. */
7846         if (info->num_inputs) {
7847                 shader->prolog =
7848                         si_get_shader_part(sscreen, &sscreen->vs_prologs,
7849                                            PIPE_SHADER_VERTEX, true,
7850                                            &prolog_key, tm, debug,
7851                                            si_build_vs_prolog_function,
7852                                            "Vertex Shader Prolog");
7853                 if (!shader->prolog)
7854                         return false;
7855         }
7856
7857         /* Get the epilog. */
7858         if (!shader->key.as_es && !shader->key.as_ls &&
7859             !si_get_vs_epilog(sscreen, tm, shader, debug,
7860                               &shader->key.part.vs.epilog))
7861                 return false;
7862
7863         return true;
7864 }
7865
7866 /**
7867  * Select and compile (or reuse) TES parts (epilog).
7868  */
7869 static bool si_shader_select_tes_parts(struct si_screen *sscreen,
7870                                        LLVMTargetMachineRef tm,
7871                                        struct si_shader *shader,
7872                                        struct pipe_debug_callback *debug)
7873 {
7874         if (shader->key.as_es)
7875                 return true;
7876
7877         /* TES compiled as VS. */
7878         return si_get_vs_epilog(sscreen, tm, shader, debug,
7879                                 &shader->key.part.tes.epilog);
7880 }
7881
7882 /**
7883  * Compile the TCS epilog function. This writes tesselation factors to memory
7884  * based on the output primitive type of the tesselator (determined by TES).
7885  */
7886 static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
7887                                          union si_shader_part_key *key)
7888 {
7889         struct gallivm_state *gallivm = &ctx->gallivm;
7890         struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
7891         LLVMTypeRef params[16];
7892         LLVMValueRef func;
7893         int last_sgpr, num_params;
7894
7895         /* Declare inputs. Only RW_BUFFERS and TESS_FACTOR_OFFSET are used. */
7896         params[SI_PARAM_RW_BUFFERS] = const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
7897         params[SI_PARAM_CONST_BUFFERS] = ctx->i64;
7898         params[SI_PARAM_SAMPLERS] = ctx->i64;
7899         params[SI_PARAM_IMAGES] = ctx->i64;
7900         params[SI_PARAM_SHADER_BUFFERS] = ctx->i64;
7901         params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx->i32;
7902         params[SI_PARAM_TCS_OUT_OFFSETS] = ctx->i32;
7903         params[SI_PARAM_TCS_OUT_LAYOUT] = ctx->i32;
7904         params[SI_PARAM_TCS_IN_LAYOUT] = ctx->i32;
7905         params[ctx->param_oc_lds = SI_PARAM_TCS_OC_LDS] = ctx->i32;
7906         params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx->i32;
7907         last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET;
7908         num_params = last_sgpr + 1;
7909
7910         params[num_params++] = ctx->i32; /* patch index within the wave (REL_PATCH_ID) */
7911         params[num_params++] = ctx->i32; /* invocation ID within the patch */
7912         params[num_params++] = ctx->i32; /* LDS offset where tess factors should be loaded from */
7913
7914         /* Create the function. */
7915         si_create_function(ctx, "tcs_epilog", NULL, 0, params, num_params, last_sgpr);
7916         declare_tess_lds(ctx);
7917         func = ctx->main_fn;
7918
7919         si_write_tess_factors(bld_base,
7920                               LLVMGetParam(func, last_sgpr + 1),
7921                               LLVMGetParam(func, last_sgpr + 2),
7922                               LLVMGetParam(func, last_sgpr + 3));
7923
7924         LLVMBuildRetVoid(gallivm->builder);
7925 }
7926
7927 /**
7928  * Select and compile (or reuse) TCS parts (epilog).
7929  */
7930 static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
7931                                        LLVMTargetMachineRef tm,
7932                                        struct si_shader *shader,
7933                                        struct pipe_debug_callback *debug)
7934 {
7935         union si_shader_part_key epilog_key;
7936
7937         /* Get the epilog. */
7938         memset(&epilog_key, 0, sizeof(epilog_key));
7939         epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
7940
7941         shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
7942                                             PIPE_SHADER_TESS_CTRL, false,
7943                                             &epilog_key, tm, debug,
7944                                             si_build_tcs_epilog_function,
7945                                             "Tessellation Control Shader Epilog");
7946         return shader->epilog != NULL;
7947 }
7948
7949 /**
7950  * Select and compile (or reuse) GS parts (prolog).
7951  */
7952 static bool si_shader_select_gs_parts(struct si_screen *sscreen,
7953                                       LLVMTargetMachineRef tm,
7954                                       struct si_shader *shader,
7955                                       struct pipe_debug_callback *debug)
7956 {
7957         union si_shader_part_key prolog_key;
7958
7959         if (!shader->key.part.gs.prolog.tri_strip_adj_fix)
7960                 return true;
7961
7962         memset(&prolog_key, 0, sizeof(prolog_key));
7963         prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
7964
7965         shader->prolog = si_get_shader_part(sscreen, &sscreen->gs_prologs,
7966                                             PIPE_SHADER_GEOMETRY, true,
7967                                             &prolog_key, tm, debug,
7968                                             si_build_gs_prolog_function,
7969                                             "Geometry Shader Prolog");
7970         return shader->prolog != NULL;
7971 }
7972
7973 /**
7974  * Build the pixel shader prolog function. This handles:
7975  * - two-side color selection and interpolation
7976  * - overriding interpolation parameters for the API PS
7977  * - polygon stippling
7978  *
7979  * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
7980  * overriden by other states. (e.g. per-sample interpolation)
7981  * Interpolated colors are stored after the preloaded VGPRs.
7982  */
7983 static void si_build_ps_prolog_function(struct si_shader_context *ctx,
7984                                         union si_shader_part_key *key)
7985 {
7986         struct gallivm_state *gallivm = &ctx->gallivm;
7987         LLVMTypeRef *params;
7988         LLVMValueRef ret, func;
7989         int last_sgpr, num_params, num_returns, i, num_color_channels;
7990
7991         assert(si_need_ps_prolog(key));
7992
7993         /* Number of inputs + 8 color elements. */
7994         params = alloca((key->ps_prolog.num_input_sgprs +
7995                          key->ps_prolog.num_input_vgprs + 8) *
7996                         sizeof(LLVMTypeRef));
7997
7998         /* Declare inputs. */
7999         num_params = 0;
8000         for (i = 0; i < key->ps_prolog.num_input_sgprs; i++)
8001                 params[num_params++] = ctx->i32;
8002         last_sgpr = num_params - 1;
8003
8004         for (i = 0; i < key->ps_prolog.num_input_vgprs; i++)
8005                 params[num_params++] = ctx->f32;
8006
8007         /* Declare outputs (same as inputs + add colors if needed) */
8008         num_returns = num_params;
8009         num_color_channels = util_bitcount(key->ps_prolog.colors_read);
8010         for (i = 0; i < num_color_channels; i++)
8011                 params[num_returns++] = ctx->f32;
8012
8013         /* Create the function. */
8014         si_create_function(ctx, "ps_prolog", params, num_returns, params,
8015                            num_params, last_sgpr);
8016         func = ctx->main_fn;
8017
8018         /* Copy inputs to outputs. This should be no-op, as the registers match,
8019          * but it will prevent the compiler from overwriting them unintentionally.
8020          */
8021         ret = ctx->return_value;
8022         for (i = 0; i < num_params; i++) {
8023                 LLVMValueRef p = LLVMGetParam(func, i);
8024                 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
8025         }
8026
8027         /* Polygon stippling. */
8028         if (key->ps_prolog.states.poly_stipple) {
8029                 /* POS_FIXED_PT is always last. */
8030                 unsigned pos = key->ps_prolog.num_input_sgprs +
8031                                key->ps_prolog.num_input_vgprs - 1;
8032                 LLVMValueRef ptr[2], list;
8033
8034                 /* Get the pointer to rw buffers. */
8035                 ptr[0] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS);
8036                 ptr[1] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS_HI);
8037                 list = lp_build_gather_values(gallivm, ptr, 2);
8038                 list = LLVMBuildBitCast(gallivm->builder, list, ctx->i64, "");
8039                 list = LLVMBuildIntToPtr(gallivm->builder, list,
8040                                           const_array(ctx->v16i8, SI_NUM_RW_BUFFERS), "");
8041
8042                 si_llvm_emit_polygon_stipple(ctx, list, pos);
8043         }
8044
8045         if (key->ps_prolog.states.bc_optimize_for_persp ||
8046             key->ps_prolog.states.bc_optimize_for_linear) {
8047                 unsigned i, base = key->ps_prolog.num_input_sgprs;
8048                 LLVMValueRef center[2], centroid[2], tmp, bc_optimize;
8049
8050                 /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
8051                  * The hw doesn't compute CENTROID if the whole wave only
8052                  * contains fully-covered quads.
8053                  *
8054                  * PRIM_MASK is after user SGPRs.
8055                  */
8056                 bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
8057                 bc_optimize = LLVMBuildLShr(gallivm->builder, bc_optimize,
8058                                             LLVMConstInt(ctx->i32, 31, 0), "");
8059                 bc_optimize = LLVMBuildTrunc(gallivm->builder, bc_optimize,
8060                                              ctx->i1, "");
8061
8062                 if (key->ps_prolog.states.bc_optimize_for_persp) {
8063                         /* Read PERSP_CENTER. */
8064                         for (i = 0; i < 2; i++)
8065                                 center[i] = LLVMGetParam(func, base + 2 + i);
8066                         /* Read PERSP_CENTROID. */
8067                         for (i = 0; i < 2; i++)
8068                                 centroid[i] = LLVMGetParam(func, base + 4 + i);
8069                         /* Select PERSP_CENTROID. */
8070                         for (i = 0; i < 2; i++) {
8071                                 tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
8072                                                       center[i], centroid[i], "");
8073                                 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8074                                                            tmp, base + 4 + i, "");
8075                         }
8076                 }
8077                 if (key->ps_prolog.states.bc_optimize_for_linear) {
8078                         /* Read LINEAR_CENTER. */
8079                         for (i = 0; i < 2; i++)
8080                                 center[i] = LLVMGetParam(func, base + 8 + i);
8081                         /* Read LINEAR_CENTROID. */
8082                         for (i = 0; i < 2; i++)
8083                                 centroid[i] = LLVMGetParam(func, base + 10 + i);
8084                         /* Select LINEAR_CENTROID. */
8085                         for (i = 0; i < 2; i++) {
8086                                 tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
8087                                                       center[i], centroid[i], "");
8088                                 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8089                                                            tmp, base + 10 + i, "");
8090                         }
8091                 }
8092         }
8093
8094         /* Force per-sample interpolation. */
8095         if (key->ps_prolog.states.force_persp_sample_interp) {
8096                 unsigned i, base = key->ps_prolog.num_input_sgprs;
8097                 LLVMValueRef persp_sample[2];
8098
8099                 /* Read PERSP_SAMPLE. */
8100                 for (i = 0; i < 2; i++)
8101                         persp_sample[i] = LLVMGetParam(func, base + i);
8102                 /* Overwrite PERSP_CENTER. */
8103                 for (i = 0; i < 2; i++)
8104                         ret = LLVMBuildInsertValue(gallivm->builder, ret,
8105                                                    persp_sample[i], base + 2 + i, "");
8106                 /* Overwrite PERSP_CENTROID. */
8107                 for (i = 0; i < 2; i++)
8108                         ret = LLVMBuildInsertValue(gallivm->builder, ret,
8109                                                    persp_sample[i], base + 4 + i, "");
8110         }
8111         if (key->ps_prolog.states.force_linear_sample_interp) {
8112                 unsigned i, base = key->ps_prolog.num_input_sgprs;
8113                 LLVMValueRef linear_sample[2];
8114
8115                 /* Read LINEAR_SAMPLE. */
8116                 for (i = 0; i < 2; i++)
8117                         linear_sample[i] = LLVMGetParam(func, base + 6 + i);
8118                 /* Overwrite LINEAR_CENTER. */
8119                 for (i = 0; i < 2; i++)
8120                         ret = LLVMBuildInsertValue(gallivm->builder, ret,
8121                                                    linear_sample[i], base + 8 + i, "");
8122                 /* Overwrite LINEAR_CENTROID. */
8123                 for (i = 0; i < 2; i++)
8124                         ret = LLVMBuildInsertValue(gallivm->builder, ret,
8125                                                    linear_sample[i], base + 10 + i, "");
8126         }
8127
8128         /* Force center interpolation. */
8129         if (key->ps_prolog.states.force_persp_center_interp) {
8130                 unsigned i, base = key->ps_prolog.num_input_sgprs;
8131                 LLVMValueRef persp_center[2];
8132
8133                 /* Read PERSP_CENTER. */
8134                 for (i = 0; i < 2; i++)
8135                         persp_center[i] = LLVMGetParam(func, base + 2 + i);
8136                 /* Overwrite PERSP_SAMPLE. */
8137                 for (i = 0; i < 2; i++)
8138                         ret = LLVMBuildInsertValue(gallivm->builder, ret,
8139                                                    persp_center[i], base + i, "");
8140                 /* Overwrite PERSP_CENTROID. */
8141                 for (i = 0; i < 2; i++)
8142                         ret = LLVMBuildInsertValue(gallivm->builder, ret,
8143                                                    persp_center[i], base + 4 + i, "");
8144         }
8145         if (key->ps_prolog.states.force_linear_center_interp) {
8146                 unsigned i, base = key->ps_prolog.num_input_sgprs;
8147                 LLVMValueRef linear_center[2];
8148
8149                 /* Read LINEAR_CENTER. */
8150                 for (i = 0; i < 2; i++)
8151                         linear_center[i] = LLVMGetParam(func, base + 8 + i);
8152                 /* Overwrite LINEAR_SAMPLE. */
8153                 for (i = 0; i < 2; i++)
8154                         ret = LLVMBuildInsertValue(gallivm->builder, ret,
8155                                                    linear_center[i], base + 6 + i, "");
8156                 /* Overwrite LINEAR_CENTROID. */
8157                 for (i = 0; i < 2; i++)
8158                         ret = LLVMBuildInsertValue(gallivm->builder, ret,
8159                                                    linear_center[i], base + 10 + i, "");
8160         }
8161
8162         /* Interpolate colors. */
8163         for (i = 0; i < 2; i++) {
8164                 unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
8165                 unsigned face_vgpr = key->ps_prolog.num_input_sgprs +
8166                                      key->ps_prolog.face_vgpr_index;
8167                 LLVMValueRef interp[2], color[4];
8168                 LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL;
8169
8170                 if (!writemask)
8171                         continue;
8172
8173                 /* If the interpolation qualifier is not CONSTANT (-1). */
8174                 if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
8175                         unsigned interp_vgpr = key->ps_prolog.num_input_sgprs +
8176                                                key->ps_prolog.color_interp_vgpr_index[i];
8177
8178                         /* Get the (i,j) updated by bc_optimize handling. */
8179                         interp[0] = LLVMBuildExtractValue(gallivm->builder, ret,
8180                                                           interp_vgpr, "");
8181                         interp[1] = LLVMBuildExtractValue(gallivm->builder, ret,
8182                                                           interp_vgpr + 1, "");
8183                         interp_ij = lp_build_gather_values(gallivm, interp, 2);
8184                 }
8185
8186                 /* Use the absolute location of the input. */
8187                 prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
8188
8189                 if (key->ps_prolog.states.color_two_side) {
8190                         face = LLVMGetParam(func, face_vgpr);
8191                         face = LLVMBuildBitCast(gallivm->builder, face, ctx->i32, "");
8192                 }
8193
8194                 interp_fs_input(ctx,
8195                                 key->ps_prolog.color_attr_index[i],
8196                                 TGSI_SEMANTIC_COLOR, i,
8197                                 key->ps_prolog.num_interp_inputs,
8198                                 key->ps_prolog.colors_read, interp_ij,
8199                                 prim_mask, face, color);
8200
8201                 while (writemask) {
8202                         unsigned chan = u_bit_scan(&writemask);
8203                         ret = LLVMBuildInsertValue(gallivm->builder, ret, color[chan],
8204                                                    num_params++, "");
8205                 }
8206         }
8207
8208         /* Tell LLVM to insert WQM instruction sequence when needed. */
8209         if (key->ps_prolog.wqm) {
8210                 LLVMAddTargetDependentFunctionAttr(func,
8211                                                    "amdgpu-ps-wqm-outputs", "");
8212         }
8213
8214         si_llvm_build_ret(ctx, ret);
8215 }
8216
8217 /**
8218  * Build the pixel shader epilog function. This handles everything that must be
8219  * emulated for pixel shader exports. (alpha-test, format conversions, etc)
8220  */
8221 static void si_build_ps_epilog_function(struct si_shader_context *ctx,
8222                                         union si_shader_part_key *key)
8223 {
8224         struct gallivm_state *gallivm = &ctx->gallivm;
8225         struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
8226         LLVMTypeRef params[16+8*4+3];
8227         LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
8228         int last_sgpr, num_params, i;
8229         struct si_ps_exports exp = {};
8230
8231         /* Declare input SGPRs. */
8232         params[SI_PARAM_RW_BUFFERS] = ctx->i64;
8233         params[SI_PARAM_CONST_BUFFERS] = ctx->i64;
8234         params[SI_PARAM_SAMPLERS] = ctx->i64;
8235         params[SI_PARAM_IMAGES] = ctx->i64;
8236         params[SI_PARAM_SHADER_BUFFERS] = ctx->i64;
8237         params[SI_PARAM_ALPHA_REF] = ctx->f32;
8238         last_sgpr = SI_PARAM_ALPHA_REF;
8239
8240         /* Declare input VGPRs. */
8241         num_params = (last_sgpr + 1) +
8242                      util_bitcount(key->ps_epilog.colors_written) * 4 +
8243                      key->ps_epilog.writes_z +
8244                      key->ps_epilog.writes_stencil +
8245                      key->ps_epilog.writes_samplemask;
8246
8247         num_params = MAX2(num_params,
8248                           last_sgpr + 1 + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
8249
8250         assert(num_params <= ARRAY_SIZE(params));
8251
8252         for (i = last_sgpr + 1; i < num_params; i++)
8253                 params[i] = ctx->f32;
8254
8255         /* Create the function. */
8256         si_create_function(ctx, "ps_epilog", NULL, 0, params, num_params, last_sgpr);
8257         /* Disable elimination of unused inputs. */
8258         si_llvm_add_attribute(ctx->main_fn,
8259                                   "InitialPSInputAddr", 0xffffff);
8260
8261         /* Process colors. */
8262         unsigned vgpr = last_sgpr + 1;
8263         unsigned colors_written = key->ps_epilog.colors_written;
8264         int last_color_export = -1;
8265
8266         /* Find the last color export. */
8267         if (!key->ps_epilog.writes_z &&
8268             !key->ps_epilog.writes_stencil &&
8269             !key->ps_epilog.writes_samplemask) {
8270                 unsigned spi_format = key->ps_epilog.states.spi_shader_col_format;
8271
8272                 /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
8273                 if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
8274                         /* Just set this if any of the colorbuffers are enabled. */
8275                         if (spi_format &
8276                             ((1llu << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
8277                                 last_color_export = 0;
8278                 } else {
8279                         for (i = 0; i < 8; i++)
8280                                 if (colors_written & (1 << i) &&
8281                                     (spi_format >> (i * 4)) & 0xf)
8282                                         last_color_export = i;
8283                 }
8284         }
8285
8286         while (colors_written) {
8287                 LLVMValueRef color[4];
8288                 int mrt = u_bit_scan(&colors_written);
8289
8290                 for (i = 0; i < 4; i++)
8291                         color[i] = LLVMGetParam(ctx->main_fn, vgpr++);
8292
8293                 si_export_mrt_color(bld_base, color, mrt,
8294                                     num_params - 1,
8295                                     mrt == last_color_export, &exp);
8296         }
8297
8298         /* Process depth, stencil, samplemask. */
8299         if (key->ps_epilog.writes_z)
8300                 depth = LLVMGetParam(ctx->main_fn, vgpr++);
8301         if (key->ps_epilog.writes_stencil)
8302                 stencil = LLVMGetParam(ctx->main_fn, vgpr++);
8303         if (key->ps_epilog.writes_samplemask)
8304                 samplemask = LLVMGetParam(ctx->main_fn, vgpr++);
8305
8306         if (depth || stencil || samplemask)
8307                 si_export_mrt_z(bld_base, depth, stencil, samplemask, &exp);
8308         else if (last_color_export == -1)
8309                 si_export_null(bld_base);
8310
8311         if (exp.num)
8312                 si_emit_ps_exports(ctx, &exp);
8313
8314         /* Compile. */
8315         LLVMBuildRetVoid(gallivm->builder);
8316 }
8317
8318 /**
8319  * Select and compile (or reuse) pixel shader parts (prolog & epilog).
8320  */
8321 static bool si_shader_select_ps_parts(struct si_screen *sscreen,
8322                                       LLVMTargetMachineRef tm,
8323                                       struct si_shader *shader,
8324                                       struct pipe_debug_callback *debug)
8325 {
8326         union si_shader_part_key prolog_key;
8327         union si_shader_part_key epilog_key;
8328
8329         /* Get the prolog. */
8330         si_get_ps_prolog_key(shader, &prolog_key, true);
8331
8332         /* The prolog is a no-op if these aren't set. */
8333         if (si_need_ps_prolog(&prolog_key)) {
8334                 shader->prolog =
8335                         si_get_shader_part(sscreen, &sscreen->ps_prologs,
8336                                            PIPE_SHADER_FRAGMENT, true,
8337                                            &prolog_key, tm, debug,
8338                                            si_build_ps_prolog_function,
8339                                            "Fragment Shader Prolog");
8340                 if (!shader->prolog)
8341                         return false;
8342         }
8343
8344         /* Get the epilog. */
8345         si_get_ps_epilog_key(shader, &epilog_key);
8346
8347         shader->epilog =
8348                 si_get_shader_part(sscreen, &sscreen->ps_epilogs,
8349                                    PIPE_SHADER_FRAGMENT, false,
8350                                    &epilog_key, tm, debug,
8351                                    si_build_ps_epilog_function,
8352                                    "Fragment Shader Epilog");
8353         if (!shader->epilog)
8354                 return false;
8355
8356         /* Enable POS_FIXED_PT if polygon stippling is enabled. */
8357         if (shader->key.part.ps.prolog.poly_stipple) {
8358                 shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
8359                 assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
8360         }
8361
8362         /* Set up the enable bits for per-sample shading if needed. */
8363         if (shader->key.part.ps.prolog.force_persp_sample_interp &&
8364             (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
8365              G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
8366                 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
8367                 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
8368                 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
8369         }
8370         if (shader->key.part.ps.prolog.force_linear_sample_interp &&
8371             (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
8372              G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
8373                 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
8374                 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
8375                 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
8376         }
8377         if (shader->key.part.ps.prolog.force_persp_center_interp &&
8378             (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
8379              G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
8380                 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
8381                 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
8382                 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
8383         }
8384         if (shader->key.part.ps.prolog.force_linear_center_interp &&
8385             (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
8386              G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
8387                 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
8388                 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
8389                 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
8390         }
8391
8392         /* POW_W_FLOAT requires that one of the perspective weights is enabled. */
8393         if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
8394             !(shader->config.spi_ps_input_ena & 0xf)) {
8395                 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
8396                 assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
8397         }
8398
8399         /* At least one pair of interpolation weights must be enabled. */
8400         if (!(shader->config.spi_ps_input_ena & 0x7f)) {
8401                 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
8402                 assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
8403         }
8404
8405         /* The sample mask input is always enabled, because the API shader always
8406          * passes it through to the epilog. Disable it here if it's unused.
8407          */
8408         if (!shader->key.part.ps.epilog.poly_line_smoothing &&
8409             !shader->selector->info.reads_samplemask)
8410                 shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;
8411
8412         return true;
8413 }
8414
8415 void si_multiwave_lds_size_workaround(struct si_screen *sscreen,
8416                                       unsigned *lds_size)
8417 {
8418         /* SPI barrier management bug:
8419          *   Make sure we have at least 4k of LDS in use to avoid the bug.
8420          *   It applies to workgroup sizes of more than one wavefront.
8421          */
8422         if (sscreen->b.family == CHIP_BONAIRE ||
8423             sscreen->b.family == CHIP_KABINI ||
8424             sscreen->b.family == CHIP_MULLINS)
8425                 *lds_size = MAX2(*lds_size, 8);
8426 }
8427
8428 static void si_fix_resource_usage(struct si_screen *sscreen,
8429                                   struct si_shader *shader)
8430 {
8431         unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
8432
8433         shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
8434
8435         if (shader->selector->type == PIPE_SHADER_COMPUTE &&
8436             si_get_max_workgroup_size(shader) > 64) {
8437                 si_multiwave_lds_size_workaround(sscreen,
8438                                                  &shader->config.lds_size);
8439         }
8440 }
8441
8442 int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
8443                      struct si_shader *shader,
8444                      struct pipe_debug_callback *debug)
8445 {
8446         struct si_shader_selector *sel = shader->selector;
8447         struct si_shader *mainp = sel->main_shader_part;
8448         int r;
8449
8450         /* LS, ES, VS are compiled on demand if the main part hasn't been
8451          * compiled for that stage.
8452          *
8453          * Vertex shaders are compiled on demand when a vertex fetch
8454          * workaround must be applied.
8455          */
8456         if (shader->is_monolithic) {
8457                 /* Monolithic shader (compiled as a whole, has many variants,
8458                  * may take a long time to compile).
8459                  */
8460                 r = si_compile_tgsi_shader(sscreen, tm, shader, true, debug);
8461                 if (r)
8462                         return r;
8463         } else {
8464                 /* The shader consists of 2-3 parts:
8465                  *
8466                  * - the middle part is the user shader, it has 1 variant only
8467                  *   and it was compiled during the creation of the shader
8468                  *   selector
8469                  * - the prolog part is inserted at the beginning
8470                  * - the epilog part is inserted at the end
8471                  *
8472                  * The prolog and epilog have many (but simple) variants.
8473                  */
8474
8475                 /* Copy the compiled TGSI shader data over. */
8476                 shader->is_binary_shared = true;
8477                 shader->binary = mainp->binary;
8478                 shader->config = mainp->config;
8479                 shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
8480                 shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
8481                 shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
8482                 memcpy(shader->info.vs_output_param_offset,
8483                        mainp->info.vs_output_param_offset,
8484                        sizeof(mainp->info.vs_output_param_offset));
8485                 shader->info.uses_instanceid = mainp->info.uses_instanceid;
8486                 shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
8487                 shader->info.nr_param_exports = mainp->info.nr_param_exports;
8488
8489                 /* Select prologs and/or epilogs. */
8490                 switch (sel->type) {
8491                 case PIPE_SHADER_VERTEX:
8492                         if (!si_shader_select_vs_parts(sscreen, tm, shader, debug))
8493                                 return -1;
8494                         break;
8495                 case PIPE_SHADER_TESS_CTRL:
8496                         if (!si_shader_select_tcs_parts(sscreen, tm, shader, debug))
8497                                 return -1;
8498                         break;
8499                 case PIPE_SHADER_TESS_EVAL:
8500                         if (!si_shader_select_tes_parts(sscreen, tm, shader, debug))
8501                                 return -1;
8502                         break;
8503                 case PIPE_SHADER_GEOMETRY:
8504                         if (!si_shader_select_gs_parts(sscreen, tm, shader, debug))
8505                                 return -1;
8506                         break;
8507                 case PIPE_SHADER_FRAGMENT:
8508                         if (!si_shader_select_ps_parts(sscreen, tm, shader, debug))
8509                                 return -1;
8510
8511                         /* Make sure we have at least as many VGPRs as there
8512                          * are allocated inputs.
8513                          */
8514                         shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
8515                                                         shader->info.num_input_vgprs);
8516                         break;
8517                 }
8518
8519                 /* Update SGPR and VGPR counts. */
8520                 if (shader->prolog) {
8521                         shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
8522                                                         shader->prolog->config.num_sgprs);
8523                         shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
8524                                                         shader->prolog->config.num_vgprs);
8525                 }
8526                 if (shader->epilog) {
8527                         shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
8528                                                         shader->epilog->config.num_sgprs);
8529                         shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
8530                                                         shader->epilog->config.num_vgprs);
8531                 }
8532         }
8533
8534         si_fix_resource_usage(sscreen, shader);
8535         si_shader_dump(sscreen, shader, debug, sel->info.processor,
8536                        stderr, true);
8537
8538         /* Upload. */
8539         r = si_shader_binary_upload(sscreen, shader);
8540         if (r) {
8541                 fprintf(stderr, "LLVM failed to upload shader\n");
8542                 return r;
8543         }
8544
8545         return 0;
8546 }
8547
8548 void si_shader_destroy(struct si_shader *shader)
8549 {
8550         if (shader->scratch_bo)
8551                 r600_resource_reference(&shader->scratch_bo, NULL);
8552
8553         r600_resource_reference(&shader->bo, NULL);
8554
8555         if (!shader->is_binary_shared)
8556                 radeon_shader_binary_clean(&shader->binary);
8557
8558         free(shader->shader_log);
8559 }