gallium/radeon: allocate temps array info in radeon_llvm_context_init
[mesa.git] / src / gallium / drivers / radeonsi / si_shader.c
1 /*
2 * Copyright 2012 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Tom Stellard <thomas.stellard@amd.com>
25 * Michel Dänzer <michel.daenzer@amd.com>
26 * Christian König <christian.koenig@amd.com>
27 */
28
29 #include "gallivm/lp_bld_const.h"
30 #include "gallivm/lp_bld_gather.h"
31 #include "gallivm/lp_bld_intr.h"
32 #include "gallivm/lp_bld_logic.h"
33 #include "gallivm/lp_bld_arit.h"
34 #include "gallivm/lp_bld_bitarit.h"
35 #include "gallivm/lp_bld_flow.h"
36 #include "gallivm/lp_bld_misc.h"
37 #include "radeon/r600_cs.h"
38 #include "radeon/radeon_llvm.h"
39 #include "radeon/radeon_elf_util.h"
40 #include "radeon/radeon_llvm_emit.h"
41 #include "util/u_memory.h"
42 #include "util/u_string.h"
43 #include "tgsi/tgsi_parse.h"
44 #include "tgsi/tgsi_build.h"
45 #include "tgsi/tgsi_util.h"
46 #include "tgsi/tgsi_dump.h"
47
48 #include "si_pipe.h"
49 #include "si_shader.h"
50 #include "sid.h"
51
52 #include <errno.h>
53
54 static const char *scratch_rsrc_dword0_symbol =
55 "SCRATCH_RSRC_DWORD0";
56
57 static const char *scratch_rsrc_dword1_symbol =
58 "SCRATCH_RSRC_DWORD1";
59
/* One shader output as gathered before export: the four component values
 * plus the TGSI semantic that identifies which output this is. */
struct si_shader_output_values
{
	LLVMValueRef values[4];	/* one LLVM value per vec4 component */
	unsigned name;		/* TGSI_SEMANTIC_* */
	unsigned sid;		/* semantic index */
};
66
struct si_shader_context
{
	/* NOTE: must be the first member — si_shader_context() casts an
	 * lp_build_tgsi_context pointer (radeon_bld.soa.bld_base) straight
	 * to this struct. */
	struct radeon_llvm_context radeon_bld;
	struct si_shader *shader;
	struct si_screen *screen;

	unsigned type; /* PIPE_SHADER_* specifies the type of shader. */
	bool is_gs_copy_shader;

	/* Whether to generate the optimized shader variant compiled as a whole
	 * (without a prolog and epilog)
	 */
	bool is_monolithic;

	/* Indices of main-function parameters, used with LLVMGetParam(). */
	int param_streamout_config;
	int param_streamout_write_index;
	int param_streamout_offset[4];
	int param_vertex_id;
	int param_rel_auto_id;
	int param_vs_prim_id;
	int param_instance_id;
	int param_vertex_index0;
	int param_tes_u;
	int param_tes_v;
	int param_tes_rel_patch_id;
	int param_tes_patch_id;
	int param_es2gs_offset;
	int param_oc_lds;

	/* Sets a bit if the dynamic HS control word was 0x80000000. The bit is
	 * 0x800000 for VS, 0x1 for ES.
	 */
	int param_tess_offchip;

	LLVMTargetMachineRef tm;

	/* Metadata kind IDs used to annotate loads (see build_indexed_load). */
	unsigned invariant_load_md_kind;
	unsigned range_md_kind;
	unsigned uniform_md_kind;
	LLVMValueRef empty_md;	/* empty metadata node attached with the kinds above */

	/* Cached descriptor/resource values for the current function. */
	LLVMValueRef const_buffers[SI_NUM_CONST_BUFFERS];
	LLVMValueRef lds;	/* pointer to the LDS array used by lds_load/lds_store */
	LLVMValueRef *constants[SI_NUM_CONST_BUFFERS];
	LLVMValueRef shader_buffers[SI_NUM_SHADER_BUFFERS];
	LLVMValueRef sampler_views[SI_NUM_SAMPLERS];
	LLVMValueRef sampler_states[SI_NUM_SAMPLERS];
	LLVMValueRef fmasks[SI_NUM_SAMPLERS];
	LLVMValueRef images[SI_NUM_IMAGES];
	LLVMValueRef so_buffers[4];
	LLVMValueRef esgs_ring;
	LLVMValueRef gsvs_ring[4];
	LLVMValueRef gs_next_vertex[4];
	LLVMValueRef return_value;

	/* Frequently used LLVM types, cached once per context. */
	LLVMTypeRef voidt;
	LLVMTypeRef i1;
	LLVMTypeRef i8;
	LLVMTypeRef i32;
	LLVMTypeRef i64;
	LLVMTypeRef i128;
	LLVMTypeRef f32;
	LLVMTypeRef v16i8;
	LLVMTypeRef v2i32;
	LLVMTypeRef v4i32;
	LLVMTypeRef v4f32;
	LLVMTypeRef v8i32;

	LLVMValueRef shared_memory;
};
137
/* Recover the enclosing si_shader_context from a TGSI build context.
 * Valid because radeon_bld (and its soa.bld_base) is the first member
 * of struct si_shader_context, so the pointers coincide. */
static struct si_shader_context *si_shader_context(
	struct lp_build_tgsi_context *bld_base)
{
	return (struct si_shader_context *)bld_base;
}
143
144 static void si_init_shader_ctx(struct si_shader_context *ctx,
145 struct si_screen *sscreen,
146 struct si_shader *shader,
147 LLVMTargetMachineRef tm);
148
149 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
150 struct lp_build_tgsi_context *bld_base,
151 struct lp_build_emit_data *emit_data);
152
153 static void si_dump_shader_key(unsigned shader, union si_shader_key *key,
154 FILE *f);
155
156 /* Ideally pass the sample mask input to the PS epilog as v13, which
157 * is its usual location, so that the shader doesn't have to add v_mov.
158 */
159 #define PS_EPILOG_SAMPLEMASK_MIN_LOC 13
160
161 /* The VS location of the PrimitiveID input is the same in the epilog,
162 * so that the main shader part doesn't have to move it.
163 */
164 #define VS_EPILOG_PRIMID_LOC 2
165
166 #define PERSPECTIVE_BASE 0
167 #define LINEAR_BASE 9
168
169 #define SAMPLE_OFFSET 0
170 #define CENTER_OFFSET 2
171 #define CENTROID_OFSET 4
172
173 #define USE_SGPR_MAX_SUFFIX_LEN 5
174 #define CONST_ADDR_SPACE 2
175 #define LOCAL_ADDR_SPACE 3
176 #define USER_SGPR_ADDR_SPACE 8
177
178
179 #define SENDMSG_GS 2
180 #define SENDMSG_GS_DONE 3
181
182 #define SENDMSG_GS_OP_NOP (0 << 4)
183 #define SENDMSG_GS_OP_CUT (1 << 4)
184 #define SENDMSG_GS_OP_EMIT (2 << 4)
185 #define SENDMSG_GS_OP_EMIT_CUT (3 << 4)
186
187 /**
188 * Returns a unique index for a semantic name and index. The index must be
189 * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
190 * calculated.
191 */
192 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
193 {
194 switch (semantic_name) {
195 case TGSI_SEMANTIC_POSITION:
196 return 0;
197 case TGSI_SEMANTIC_PSIZE:
198 return 1;
199 case TGSI_SEMANTIC_CLIPDIST:
200 assert(index <= 1);
201 return 2 + index;
202 case TGSI_SEMANTIC_GENERIC:
203 if (index <= 63-4)
204 return 4 + index;
205 else
206 /* same explanation as in the default statement,
207 * the only user hitting this is st/nine.
208 */
209 return 0;
210
211 /* patch indices are completely separate and thus start from 0 */
212 case TGSI_SEMANTIC_TESSOUTER:
213 return 0;
214 case TGSI_SEMANTIC_TESSINNER:
215 return 1;
216 case TGSI_SEMANTIC_PATCH:
217 return 2 + index;
218
219 default:
220 /* Don't fail here. The result of this function is only used
221 * for LS, TCS, TES, and GS, where legacy GL semantics can't
222 * occur, but this function is called for all vertex shaders
223 * before it's known whether LS will be compiled or not.
224 */
225 return 0;
226 }
227 }
228
229 /**
230 * Get the value of a shader input parameter and extract a bitfield.
231 */
232 static LLVMValueRef unpack_param(struct si_shader_context *ctx,
233 unsigned param, unsigned rshift,
234 unsigned bitwidth)
235 {
236 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
237 LLVMValueRef value = LLVMGetParam(ctx->radeon_bld.main_fn,
238 param);
239
240 if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
241 value = bitcast(&ctx->radeon_bld.soa.bld_base,
242 TGSI_TYPE_UNSIGNED, value);
243
244 if (rshift)
245 value = LLVMBuildLShr(gallivm->builder, value,
246 lp_build_const_int32(gallivm, rshift), "");
247
248 if (rshift + bitwidth < 32) {
249 unsigned mask = (1 << bitwidth) - 1;
250 value = LLVMBuildAnd(gallivm->builder, value,
251 lp_build_const_int32(gallivm, mask), "");
252 }
253
254 return value;
255 }
256
257 static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx)
258 {
259 switch (ctx->type) {
260 case PIPE_SHADER_TESS_CTRL:
261 return unpack_param(ctx, SI_PARAM_REL_IDS, 0, 8);
262
263 case PIPE_SHADER_TESS_EVAL:
264 return LLVMGetParam(ctx->radeon_bld.main_fn,
265 ctx->param_tes_rel_patch_id);
266
267 default:
268 assert(0);
269 return NULL;
270 }
271 }
272
273 /* Tessellation shaders pass outputs to the next shader using LDS.
274 *
275 * LS outputs = TCS inputs
276 * TCS outputs = TES inputs
277 *
278 * The LDS layout is:
279 * - TCS inputs for patch 0
280 * - TCS inputs for patch 1
281 * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2)
282 * - ...
283 * - TCS outputs for patch 0 = get_tcs_out_patch0_offset
284 * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset
285 * - TCS outputs for patch 1
286 * - Per-patch TCS outputs for patch 1
287 * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2)
288 * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
289 * - ...
290 *
291 * All three shaders VS(LS), TCS, TES share the same LDS space.
292 */
293
294 static LLVMValueRef
295 get_tcs_in_patch_stride(struct si_shader_context *ctx)
296 {
297 if (ctx->type == PIPE_SHADER_VERTEX)
298 return unpack_param(ctx, SI_PARAM_LS_OUT_LAYOUT, 0, 13);
299 else if (ctx->type == PIPE_SHADER_TESS_CTRL)
300 return unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 0, 13);
301 else {
302 assert(0);
303 return NULL;
304 }
305 }
306
/* LDS stride (in dwords) of one patch worth of TCS outputs:
 * bits [0:12] of SI_PARAM_TCS_OUT_LAYOUT. */
static LLVMValueRef
get_tcs_out_patch_stride(struct si_shader_context *ctx)
{
	return unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 0, 13);
}
312
313 static LLVMValueRef
314 get_tcs_out_patch0_offset(struct si_shader_context *ctx)
315 {
316 return lp_build_mul_imm(&ctx->radeon_bld.soa.bld_base.uint_bld,
317 unpack_param(ctx,
318 SI_PARAM_TCS_OUT_OFFSETS,
319 0, 16),
320 4);
321 }
322
323 static LLVMValueRef
324 get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
325 {
326 return lp_build_mul_imm(&ctx->radeon_bld.soa.bld_base.uint_bld,
327 unpack_param(ctx,
328 SI_PARAM_TCS_OUT_OFFSETS,
329 16, 16),
330 4);
331 }
332
333 static LLVMValueRef
334 get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
335 {
336 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
337 LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
338 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
339
340 return LLVMBuildMul(gallivm->builder, patch_stride, rel_patch_id, "");
341 }
342
343 static LLVMValueRef
344 get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
345 {
346 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
347 LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
348 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
349 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
350
351 return LLVMBuildAdd(gallivm->builder, patch0_offset,
352 LLVMBuildMul(gallivm->builder, patch_stride,
353 rel_patch_id, ""),
354 "");
355 }
356
357 static LLVMValueRef
358 get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
359 {
360 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
361 LLVMValueRef patch0_patch_data_offset =
362 get_tcs_out_patch0_patch_data_offset(ctx);
363 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
364 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
365
366 return LLVMBuildAdd(gallivm->builder, patch0_patch_data_offset,
367 LLVMBuildMul(gallivm->builder, patch_stride,
368 rel_patch_id, ""),
369 "");
370 }
371
372 static void build_indexed_store(struct si_shader_context *ctx,
373 LLVMValueRef base_ptr, LLVMValueRef index,
374 LLVMValueRef value)
375 {
376 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
377 struct gallivm_state *gallivm = bld_base->base.gallivm;
378 LLVMValueRef indices[2], pointer;
379
380 indices[0] = bld_base->uint_bld.zero;
381 indices[1] = index;
382
383 pointer = LLVMBuildGEP(gallivm->builder, base_ptr, indices, 2, "");
384 LLVMBuildStore(gallivm->builder, value, pointer);
385 }
386
387 /**
388 * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad.
389 * It's equivalent to doing a load from &base_ptr[index].
390 *
391 * \param base_ptr Where the array starts.
392 * \param index The element index into the array.
393 * \param uniform Whether the base_ptr and index can be assumed to be
394 * dynamically uniform
395 */
396 static LLVMValueRef build_indexed_load(struct si_shader_context *ctx,
397 LLVMValueRef base_ptr, LLVMValueRef index,
398 bool uniform)
399 {
400 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
401 struct gallivm_state *gallivm = bld_base->base.gallivm;
402 LLVMValueRef indices[2], pointer;
403
404 indices[0] = bld_base->uint_bld.zero;
405 indices[1] = index;
406
407 pointer = LLVMBuildGEP(gallivm->builder, base_ptr, indices, 2, "");
408 if (uniform)
409 LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md);
410 return LLVMBuildLoad(gallivm->builder, pointer, "");
411 }
412
413 /**
414 * Do a load from &base_ptr[index], but also add a flag that it's loading
415 * a constant from a dynamically uniform index.
416 */
417 static LLVMValueRef build_indexed_load_const(
418 struct si_shader_context *ctx,
419 LLVMValueRef base_ptr, LLVMValueRef index)
420 {
421 LLVMValueRef result = build_indexed_load(ctx, base_ptr, index, true);
422 LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md);
423 return result;
424 }
425
426 static LLVMValueRef get_instance_index_for_fetch(
427 struct radeon_llvm_context *radeon_bld,
428 unsigned param_start_instance, unsigned divisor)
429 {
430 struct si_shader_context *ctx =
431 si_shader_context(&radeon_bld->soa.bld_base);
432 struct gallivm_state *gallivm = radeon_bld->soa.bld_base.base.gallivm;
433
434 LLVMValueRef result = LLVMGetParam(radeon_bld->main_fn,
435 ctx->param_instance_id);
436
437 /* The division must be done before START_INSTANCE is added. */
438 if (divisor > 1)
439 result = LLVMBuildUDiv(gallivm->builder, result,
440 lp_build_const_int32(gallivm, divisor), "");
441
442 return LLVMBuildAdd(gallivm->builder, result,
443 LLVMGetParam(radeon_bld->main_fn, param_start_instance), "");
444 }
445
/* Declare one vertex shader input attribute: fetch it from the vertex
 * buffer with llvm.SI.vs.load.input and store the four components into
 * the SOA input slots.
 *
 * The fetch index comes from one of three places:
 * - the prolog-provided vertex index parameter (non-monolithic shaders),
 * - InstanceID/divisor + StartInstance (instanced attributes),
 * - BaseVertex + VertexID (ordinary per-vertex attributes).
 */
static void declare_input_vs(
	struct radeon_llvm_context *radeon_bld,
	unsigned input_index,
	const struct tgsi_full_declaration *decl)
{
	struct lp_build_context *base = &radeon_bld->soa.bld_base.base;
	struct gallivm_state *gallivm = base->gallivm;
	struct si_shader_context *ctx =
		si_shader_context(&radeon_bld->soa.bld_base);
	unsigned divisor =
		ctx->shader->key.vs.prolog.instance_divisors[input_index];

	unsigned chan;

	LLVMValueRef t_list_ptr;
	LLVMValueRef t_offset;
	LLVMValueRef t_list;
	LLVMValueRef attribute_offset;
	LLVMValueRef buffer_index;
	LLVMValueRef args[3];
	LLVMValueRef input;

	/* Load the T list */
	t_list_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_VERTEX_BUFFERS);

	t_offset = lp_build_const_int32(gallivm, input_index);

	t_list = build_indexed_load_const(ctx, t_list_ptr, t_offset);

	/* Build the attribute offset */
	attribute_offset = lp_build_const_int32(gallivm, 0);

	if (!ctx->is_monolithic) {
		/* The prolog computes one index parameter per attribute. */
		buffer_index = LLVMGetParam(radeon_bld->main_fn,
					    ctx->param_vertex_index0 +
					    input_index);
	} else if (divisor) {
		/* Build index from instance ID, start instance and divisor */
		ctx->shader->info.uses_instanceid = true;
		buffer_index = get_instance_index_for_fetch(&ctx->radeon_bld,
							    SI_PARAM_START_INSTANCE,
							    divisor);
	} else {
		/* Load the buffer index for vertices. */
		LLVMValueRef vertex_id = LLVMGetParam(ctx->radeon_bld.main_fn,
						      ctx->param_vertex_id);
		LLVMValueRef base_vertex = LLVMGetParam(radeon_bld->main_fn,
							SI_PARAM_BASE_VERTEX);
		buffer_index = LLVMBuildAdd(gallivm->builder, base_vertex, vertex_id, "");
	}

	args[0] = t_list;
	args[1] = attribute_offset;
	args[2] = buffer_index;
	input = lp_build_intrinsic(gallivm->builder,
				   "llvm.SI.vs.load.input", ctx->v4f32, args, 3,
				   LLVMReadNoneAttribute);

	/* Break up the vec4 into individual components */
	for (chan = 0; chan < 4; chan++) {
		LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
		/* XXX: Use a helper function for this. There is one in
		 * tgsi_llvm.c. */
		ctx->radeon_bld.inputs[radeon_llvm_reg_index_soa(input_index, chan)] =
			LLVMBuildExtractElement(gallivm->builder,
						input, llvm_chan, "");
	}
}
514
515 static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base,
516 unsigned swizzle)
517 {
518 struct si_shader_context *ctx = si_shader_context(bld_base);
519
520 if (swizzle > 0)
521 return bld_base->uint_bld.zero;
522
523 switch (ctx->type) {
524 case PIPE_SHADER_VERTEX:
525 return LLVMGetParam(ctx->radeon_bld.main_fn,
526 ctx->param_vs_prim_id);
527 case PIPE_SHADER_TESS_CTRL:
528 return LLVMGetParam(ctx->radeon_bld.main_fn,
529 SI_PARAM_PATCH_ID);
530 case PIPE_SHADER_TESS_EVAL:
531 return LLVMGetParam(ctx->radeon_bld.main_fn,
532 ctx->param_tes_patch_id);
533 case PIPE_SHADER_GEOMETRY:
534 return LLVMGetParam(ctx->radeon_bld.main_fn,
535 SI_PARAM_PRIMITIVE_ID);
536 default:
537 assert(0);
538 return bld_base->uint_bld.zero;
539 }
540 }
541
542 /**
543 * Return the value of tgsi_ind_register for indexing.
544 * This is the indirect index with the constant offset added to it.
545 */
546 static LLVMValueRef get_indirect_index(struct si_shader_context *ctx,
547 const struct tgsi_ind_register *ind,
548 int rel_index)
549 {
550 struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
551 LLVMValueRef result;
552
553 result = ctx->radeon_bld.soa.addr[ind->Index][ind->Swizzle];
554 result = LLVMBuildLoad(gallivm->builder, result, "");
555 result = LLVMBuildAdd(gallivm->builder, result,
556 lp_build_const_int32(gallivm, rel_index), "");
557 return result;
558 }
559
560 /**
561 * Like get_indirect_index, but restricts the return value to a (possibly
562 * undefined) value inside [0..num).
563 */
564 static LLVMValueRef get_bounded_indirect_index(struct si_shader_context *ctx,
565 const struct tgsi_ind_register *ind,
566 int rel_index, unsigned num)
567 {
568 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
569 LLVMBuilderRef builder = gallivm->builder;
570 LLVMValueRef result = get_indirect_index(ctx, ind, rel_index);
571 LLVMValueRef c_max = LLVMConstInt(ctx->i32, num - 1, 0);
572 LLVMValueRef cc;
573
574 /* LLVM 3.8: If indirect resource indexing is used:
575 * - SI & CIK hang
576 * - VI crashes
577 */
578 if (HAVE_LLVM <= 0x0308)
579 return LLVMGetUndef(ctx->i32);
580
581 if (util_is_power_of_two(num)) {
582 result = LLVMBuildAnd(builder, result, c_max, "");
583 } else {
584 /* In theory, this MAX pattern should result in code that is
585 * as good as the bit-wise AND above.
586 *
587 * In practice, LLVM generates worse code (at the time of
588 * writing), because its value tracking is not strong enough.
589 */
590 cc = LLVMBuildICmp(builder, LLVMIntULE, result, c_max, "");
591 result = LLVMBuildSelect(builder, cc, result, c_max, "");
592 }
593
594 return result;
595 }
596
597
/**
 * Calculate a dword address given an input or output register and a stride.
 *
 * \param dst  destination register (used when \p src is NULL)
 * \param src  source register; takes precedence over \p dst when non-NULL
 * \param vertex_dw_stride  dword stride between vertices of a 2D register
 * \param base_addr  starting dword address to add onto
 */
static LLVMValueRef get_dw_address(struct si_shader_context *ctx,
				   const struct tgsi_full_dst_register *dst,
				   const struct tgsi_full_src_register *src,
				   LLVMValueRef vertex_dw_stride,
				   LLVMValueRef base_addr)
{
	struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	ubyte *name, *index, *array_first;
	int first, param;
	struct tgsi_full_dst_register reg;

	/* Set the register description. The address computation is the same
	 * for sources and destinations. */
	if (src) {
		reg.Register.File = src->Register.File;
		reg.Register.Index = src->Register.Index;
		reg.Register.Indirect = src->Register.Indirect;
		reg.Register.Dimension = src->Register.Dimension;
		reg.Indirect = src->Indirect;
		reg.Dimension = src->Dimension;
		reg.DimIndirect = src->DimIndirect;
	} else
		reg = *dst;

	/* If the register is 2-dimensional (e.g. an array of vertices
	 * in a primitive), calculate the base address of the vertex. */
	if (reg.Register.Dimension) {
		LLVMValueRef index;

		if (reg.Dimension.Indirect)
			index = get_indirect_index(ctx, &reg.DimIndirect,
						   reg.Dimension.Index);
		else
			index = lp_build_const_int32(gallivm, reg.Dimension.Index);

		/* base_addr += vertex index * per-vertex stride */
		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 LLVMBuildMul(gallivm->builder, index,
						      vertex_dw_stride, ""), "");
	}

	/* Get information about the register. */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return NULL;
	}

	if (reg.Register.Indirect) {
		/* Add the relative address of the element. */
		LLVMValueRef ind_index;

		/* Indirect accesses into a declared array are relative to
		 * the array's first register; otherwise to the register
		 * itself. */
		if (reg.Indirect.ArrayID)
			first = array_first[reg.Indirect.ArrayID];
		else
			first = reg.Register.Index;

		ind_index = get_indirect_index(ctx, &reg.Indirect,
					       reg.Register.Index - first);

		/* Each register is 4 dwords (one vec4). */
		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 LLVMBuildMul(gallivm->builder, ind_index,
						      lp_build_const_int32(gallivm, 4), ""), "");

		param = si_shader_io_get_unique_index(name[first], index[first]);
	} else {
		param = si_shader_io_get_unique_index(name[reg.Register.Index],
						      index[reg.Register.Index]);
	}

	/* Add the base address of the element. */
	return LLVMBuildAdd(gallivm->builder, base_addr,
			    lp_build_const_int32(gallivm, param * 4), "");
}
682
683 /* The offchip buffer layout for TCS->TES is
684 *
685 * - attribute 0 of patch 0 vertex 0
686 * - attribute 0 of patch 0 vertex 1
687 * - attribute 0 of patch 0 vertex 2
688 * ...
689 * - attribute 0 of patch 1 vertex 0
690 * - attribute 0 of patch 1 vertex 1
691 * ...
692 * - attribute 1 of patch 0 vertex 0
693 * - attribute 1 of patch 0 vertex 1
694 * ...
695 * - per patch attribute 0 of patch 0
696 * - per patch attribute 0 of patch 1
697 * ...
698 *
699 * Note that every attribute has 4 components.
700 */
/* Compute a byte address into the TCS->TES offchip buffer (layout described
 * in the comment above).
 *
 * \param vertex_index  per-vertex attribute index, or NULL to address a
 *                      per-patch attribute
 * \param param_index   attribute index within the patch/vertex
 */
static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
					       LLVMValueRef vertex_index,
					       LLVMValueRef param_index)
{
	struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
	LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
	LLVMValueRef param_stride, constant16;

	/* Layout fields packed into SI_PARAM_TCS_OFFCHIP_LAYOUT. */
	vertices_per_patch = unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 9, 6);
	num_patches = unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 0, 9);
	total_vertices = LLVMBuildMul(gallivm->builder, vertices_per_patch,
				      num_patches, "");

	/* Every attribute is a vec4 = 16 bytes. */
	constant16 = lp_build_const_int32(gallivm, 16);
	if (vertex_index) {
		/* Per-vertex: base = RelPatchID * verts_per_patch + vertex,
		 * and consecutive attributes are total_vertices apart. */
		base_addr = LLVMBuildMul(gallivm->builder, get_rel_patch_id(ctx),
					 vertices_per_patch, "");

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 vertex_index, "");

		param_stride = total_vertices;
	} else {
		/* Per-patch: one slot per patch per attribute. */
		base_addr = get_rel_patch_id(ctx);
		param_stride = num_patches;
	}

	base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
				 LLVMBuildMul(gallivm->builder, param_index,
					      param_stride, ""), "");

	base_addr = LLVMBuildMul(gallivm->builder, base_addr, constant16, "");

	if (!vertex_index) {
		/* Per-patch attributes start after all per-vertex ones;
		 * the offset is in bits [16:31] of the layout word. */
		LLVMValueRef patch_data_offset =
			unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 16, 16);

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 patch_data_offset, "");
	}
	return base_addr;
}
743
/* Build a TCS->TES offchip buffer address from a TGSI register reference.
 * Exactly one of \p dst / \p src is used (src takes precedence). */
static LLVMValueRef get_tcs_tes_buffer_address_from_reg(
			struct si_shader_context *ctx,
			const struct tgsi_full_dst_register *dst,
			const struct tgsi_full_src_register *src)
{
	struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	ubyte *name, *index, *array_first;
	struct tgsi_full_src_register reg;
	LLVMValueRef vertex_index = NULL;
	LLVMValueRef param_index = NULL;
	unsigned param_index_base, param_base;

	reg = src ? *src : tgsi_full_src_register_from_dst(dst);

	/* A 2D register addresses a specific vertex within the patch. */
	if (reg.Register.Dimension) {

		if (reg.Dimension.Indirect)
			vertex_index = get_indirect_index(ctx, &reg.DimIndirect,
							  reg.Dimension.Index);
		else
			vertex_index = lp_build_const_int32(gallivm,
							    reg.Dimension.Index);
	}

	/* Get information about the register. */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return NULL;
	}

	if (reg.Register.Indirect) {
		/* Indirect accesses are relative to the array's first
		 * register if the register belongs to a declared array. */
		if (reg.Indirect.ArrayID)
			param_base = array_first[reg.Indirect.ArrayID];
		else
			param_base = reg.Register.Index;

		param_index = get_indirect_index(ctx, &reg.Indirect,
						 reg.Register.Index - param_base);

	} else {
		param_base = reg.Register.Index;
		param_index = lp_build_const_int32(gallivm, 0);
	}

	/* Translate the semantic into the unique attribute slot, then add
	 * the (possibly dynamic) relative index. */
	param_index_base = si_shader_io_get_unique_index(name[param_base],
							 index[param_base]);

	param_index = LLVMBuildAdd(gallivm->builder, param_index,
				   lp_build_const_int32(gallivm, param_index_base),
				   "");

	return get_tcs_tes_buffer_address(ctx, vertex_index, param_index);
}
806
/* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4.
 * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2),
 * or v4i32 (num_channels=3,4).
 *
 * The dfmt/nfmt arguments are V_008F0C_BUF_*_FORMAT_* values; offen/idxen/
 * glc/slc/tfe are the corresponding MTBUF instruction bits passed through
 * to the llvm.SI.tbuffer.store intrinsic. */
static void build_tbuffer_store(struct si_shader_context *ctx,
				LLVMValueRef rsrc,
				LLVMValueRef vdata,
				unsigned num_channels,
				LLVMValueRef vaddr,
				LLVMValueRef soffset,
				unsigned inst_offset,
				unsigned dfmt,
				unsigned nfmt,
				unsigned offen,
				unsigned idxen,
				unsigned glc,
				unsigned slc,
				unsigned tfe)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMValueRef args[] = {
		rsrc,
		vdata,
		LLVMConstInt(ctx->i32, num_channels, 0),
		vaddr,
		soffset,
		LLVMConstInt(ctx->i32, inst_offset, 0),
		LLVMConstInt(ctx->i32, dfmt, 0),
		LLVMConstInt(ctx->i32, nfmt, 0),
		LLVMConstInt(ctx->i32, offen, 0),
		LLVMConstInt(ctx->i32, idxen, 0),
		LLVMConstInt(ctx->i32, glc, 0),
		LLVMConstInt(ctx->i32, slc, 0),
		LLVMConstInt(ctx->i32, tfe, 0)
	};

	/* The instruction offset field has 12 bits */
	assert(offen || inst_offset < (1 << 12));

	/* The intrinsic is overloaded, we need to add a type suffix for overloading to work. */
	unsigned func = CLAMP(num_channels, 1, 3) - 1;
	const char *types[] = {"i32", "v2i32", "v4i32"};
	char name[256];
	snprintf(name, sizeof(name), "llvm.SI.tbuffer.store.%s", types[func]);

	lp_build_intrinsic(gallivm->builder, name, ctx->voidt,
			   args, ARRAY_SIZE(args), 0);
}
854
855 static void build_tbuffer_store_dwords(struct si_shader_context *ctx,
856 LLVMValueRef rsrc,
857 LLVMValueRef vdata,
858 unsigned num_channels,
859 LLVMValueRef vaddr,
860 LLVMValueRef soffset,
861 unsigned inst_offset)
862 {
863 static unsigned dfmt[] = {
864 V_008F0C_BUF_DATA_FORMAT_32,
865 V_008F0C_BUF_DATA_FORMAT_32_32,
866 V_008F0C_BUF_DATA_FORMAT_32_32_32,
867 V_008F0C_BUF_DATA_FORMAT_32_32_32_32
868 };
869 assert(num_channels >= 1 && num_channels <= 4);
870
871 build_tbuffer_store(ctx, rsrc, vdata, num_channels, vaddr, soffset,
872 inst_offset, dfmt[num_channels-1],
873 V_008F0C_BUF_NUM_FORMAT_UINT, 1, 0, 1, 1, 0);
874 }
875
/* Emit a buffer load of 1, 2, or 4 dwords.
 *
 * Uses llvm.amdgcn.buffer.load.* on LLVM >= 3.9 and the legacy
 * llvm.SI.buffer.load.dword.* intrinsic otherwise. The two intrinsics
 * have different ABIs: the new one folds voffset/soffset into a single
 * offset operand, the old one passes them separately plus explicit
 * offen/idxen/tfe flags.
 *
 * \param vindex   buffer element index, or NULL
 * \param voffset  additional dynamic byte offset, or NULL
 * \param soffset  scalar byte offset, or NULL (new path) / required (old path)
 */
static LLVMValueRef build_buffer_load(struct si_shader_context *ctx,
				      LLVMValueRef rsrc,
				      int num_channels,
				      LLVMValueRef vindex,
				      LLVMValueRef voffset,
				      LLVMValueRef soffset,
				      unsigned inst_offset,
				      unsigned glc,
				      unsigned slc)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	/* Maps num_channels 1/2/3+4 to the x/xy/xyzw intrinsic variant. */
	unsigned func = CLAMP(num_channels, 1, 3) - 1;

	if (HAVE_LLVM >= 0x309) {
		LLVMValueRef args[] = {
			LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v4i32, ""),
			vindex ? vindex : LLVMConstInt(ctx->i32, 0, 0),
			LLVMConstInt(ctx->i32, inst_offset, 0),
			LLVMConstInt(ctx->i1, glc, 0),
			LLVMConstInt(ctx->i1, slc, 0)
		};

		LLVMTypeRef types[] = {ctx->f32, LLVMVectorType(ctx->f32, 2),
				       ctx->v4f32};
		const char *type_names[] = {"f32", "v2f32", "v4f32"};
		char name[256];

		/* Fold dynamic offsets into the single offset operand. */
		if (voffset) {
			args[2] = LLVMBuildAdd(gallivm->builder, args[2], voffset,
					       "");
		}

		if (soffset) {
			args[2] = LLVMBuildAdd(gallivm->builder, args[2], soffset,
					       "");
		}

		snprintf(name, sizeof(name), "llvm.amdgcn.buffer.load.%s",
			 type_names[func]);

		return lp_build_intrinsic(gallivm->builder, name, types[func], args,
					  ARRAY_SIZE(args), LLVMReadOnlyAttribute);
	} else {
		LLVMValueRef args[] = {
			LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v16i8, ""),
			voffset ? voffset : vindex,
			soffset,
			LLVMConstInt(ctx->i32, inst_offset, 0),
			LLVMConstInt(ctx->i32, voffset ? 1 : 0, 0), // offen
			LLVMConstInt(ctx->i32, vindex ? 1 : 0, 0), //idxen
			LLVMConstInt(ctx->i32, glc, 0),
			LLVMConstInt(ctx->i32, slc, 0),
			LLVMConstInt(ctx->i32, 0, 0), // TFE
		};

		LLVMTypeRef types[] = {ctx->i32, LLVMVectorType(ctx->i32, 2),
				       ctx->v4i32};
		const char *type_names[] = {"i32", "v2i32", "v4i32"};
		const char *arg_type = "i32";
		char name[256];

		/* With both an index and an offset, the address is a v2i32
		 * pair and the intrinsic name gets a second type suffix. */
		if (voffset && vindex) {
			LLVMValueRef vaddr[] = {vindex, voffset};

			arg_type = "v2i32";
			args[1] = lp_build_gather_values(gallivm, vaddr, 2);
		}

		snprintf(name, sizeof(name), "llvm.SI.buffer.load.dword.%s.%s",
			 type_names[func], arg_type);

		return lp_build_intrinsic(gallivm->builder, name, types[func], args,
					  ARRAY_SIZE(args), LLVMReadOnlyAttribute);
	}
}
951
/* Load one component (or, with swizzle == ~0, a whole vec4) of the given
 * TGSI type from a buffer at base+offset. 64-bit types are assembled from
 * two adjacent dword loads. */
static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base,
				enum tgsi_opcode_type type, unsigned swizzle,
				LLVMValueRef buffer, LLVMValueRef offset,
				LLVMValueRef base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef value, value2;
	LLVMTypeRef llvm_type = tgsi2llvmtype(bld_base, type);
	LLVMTypeRef vec_type = LLVMVectorType(llvm_type, 4);

	if (swizzle == ~0) {
		/* Whole vec4: load 4 dwords and bitcast to the target type. */
		value = build_buffer_load(ctx, buffer, 4, NULL, base, offset,
					  0, 1, 0);

		return LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
	}

	if (!tgsi_type_is_64bit(type)) {
		/* 32-bit scalar: load the vec4, then extract the component. */
		value = build_buffer_load(ctx, buffer, 4, NULL, base, offset,
					  0, 1, 0);

		value = LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
		return LLVMBuildExtractElement(gallivm->builder, value,
					       lp_build_const_int32(gallivm, swizzle), "");
	}

	/* 64-bit scalar: load the two dwords at swizzle and swizzle+1
	 * and combine them. */
	value = build_buffer_load(ctx, buffer, 1, NULL, base, offset,
				  swizzle * 4, 1, 0);

	value2 = build_buffer_load(ctx, buffer, 1, NULL, base, offset,
				   swizzle * 4 + 4, 1, 0);

	return radeon_llvm_emit_fetch_64bit(bld_base, type, value, value2);
}
987
/**
 * Load from LDS.
 *
 * \param type		output value type
 * \param swizzle	offset (typically 0..3); it can be ~0, which loads a vec4
 * \param dw_addr	address in dwords
 */
static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
			     enum tgsi_opcode_type type, unsigned swizzle,
			     LLVMValueRef dw_addr)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef value;

	if (swizzle == ~0) {
		/* vec4: recurse once per channel and gather the results. */
		LLVMValueRef values[TGSI_NUM_CHANNELS];

		for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++)
			values[chan] = lds_load(bld_base, type, chan, dw_addr);

		return lp_build_gather_values(bld_base->base.gallivm, values,
					      TGSI_NUM_CHANNELS);
	}

	dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
			       lp_build_const_int32(gallivm, swizzle));

	value = build_indexed_load(ctx, ctx->lds, dw_addr, false);
	if (tgsi_type_is_64bit(type)) {
		/* 64-bit values span two consecutive dwords. */
		LLVMValueRef value2;
		dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
				       lp_build_const_int32(gallivm, swizzle + 1));
		value2 = build_indexed_load(ctx, ctx->lds, dw_addr, false);
		return radeon_llvm_emit_fetch_64bit(bld_base, type, value, value2);
	}

	return LLVMBuildBitCast(gallivm->builder, value,
				tgsi2llvmtype(bld_base, type), "");
}
1028
1029 /**
1030 * Store to LDS.
1031 *
1032 * \param swizzle offset (typically 0..3)
1033 * \param dw_addr address in dwords
1034 * \param value value to store
1035 */
1036 static void lds_store(struct lp_build_tgsi_context *bld_base,
1037 unsigned swizzle, LLVMValueRef dw_addr,
1038 LLVMValueRef value)
1039 {
1040 struct si_shader_context *ctx = si_shader_context(bld_base);
1041 struct gallivm_state *gallivm = bld_base->base.gallivm;
1042
1043 dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
1044 lp_build_const_int32(gallivm, swizzle));
1045
1046 value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
1047 build_indexed_store(ctx, ctx->lds,
1048 dw_addr, value);
1049 }
1050
1051 static LLVMValueRef fetch_input_tcs(
1052 struct lp_build_tgsi_context *bld_base,
1053 const struct tgsi_full_src_register *reg,
1054 enum tgsi_opcode_type type, unsigned swizzle)
1055 {
1056 struct si_shader_context *ctx = si_shader_context(bld_base);
1057 LLVMValueRef dw_addr, stride;
1058
1059 stride = unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 13, 8);
1060 dw_addr = get_tcs_in_current_patch_offset(ctx);
1061 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
1062
1063 return lds_load(bld_base, type, swizzle, dw_addr);
1064 }
1065
1066 static LLVMValueRef fetch_output_tcs(
1067 struct lp_build_tgsi_context *bld_base,
1068 const struct tgsi_full_src_register *reg,
1069 enum tgsi_opcode_type type, unsigned swizzle)
1070 {
1071 struct si_shader_context *ctx = si_shader_context(bld_base);
1072 LLVMValueRef dw_addr, stride;
1073
1074 if (reg->Register.Dimension) {
1075 stride = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
1076 dw_addr = get_tcs_out_current_patch_offset(ctx);
1077 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
1078 } else {
1079 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
1080 dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr);
1081 }
1082
1083 return lds_load(bld_base, type, swizzle, dw_addr);
1084 }
1085
1086 static LLVMValueRef fetch_input_tes(
1087 struct lp_build_tgsi_context *bld_base,
1088 const struct tgsi_full_src_register *reg,
1089 enum tgsi_opcode_type type, unsigned swizzle)
1090 {
1091 struct si_shader_context *ctx = si_shader_context(bld_base);
1092 struct gallivm_state *gallivm = bld_base->base.gallivm;
1093 LLVMValueRef rw_buffers, buffer, base, addr;
1094
1095 rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
1096 SI_PARAM_RW_BUFFERS);
1097 buffer = build_indexed_load_const(ctx, rw_buffers,
1098 lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));
1099
1100 base = LLVMGetParam(ctx->radeon_bld.main_fn, ctx->param_oc_lds);
1101 addr = get_tcs_tes_buffer_address_from_reg(ctx, NULL, reg);
1102
1103 return buffer_load(bld_base, type, swizzle, buffer, base, addr);
1104 }
1105
/**
 * Store a TCS output.
 *
 * Every scalar output is written twice: to LDS (so later TCS reads via
 * fetch_output_tcs see it) and to the off-chip TESS_OFFCHIP ring buffer
 * (for the TES). With a full xyzw writemask the buffer store is emitted
 * as one vec4 store instead of four dword stores.
 */
static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
			     const struct tgsi_full_instruction *inst,
			     const struct tgsi_opcode_info *info,
			     LLVMValueRef dst[4])
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_full_dst_register *reg = &inst->Dst[0];
	unsigned chan_index;
	LLVMValueRef dw_addr, stride;
	LLVMValueRef rw_buffers, buffer, base, buf_addr;
	LLVMValueRef values[4];

	/* Only handle per-patch and per-vertex outputs here.
	 * Vectors will be lowered to scalars and this function will be called again.
	 */
	if (reg->Register.File != TGSI_FILE_OUTPUT ||
	    (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) {
		radeon_llvm_emit_store(bld_base, inst, info, dst);
		return;
	}

	/* Compute the LDS address of the destination. */
	if (reg->Register.Dimension) {
		/* Per-vertex output: stride = output vertex size from
		 * TCS_OUT_LAYOUT (8 bits at bit 13). */
		stride = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
		dw_addr = get_tcs_out_current_patch_offset(ctx);
		dw_addr = get_dw_address(ctx, reg, NULL, stride, dw_addr);
	} else {
		/* Per-patch output: no per-vertex stride. */
		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
		dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr);
	}

	/* Fetch the TESS_OFFCHIP ring descriptor and the buffer address. */
	rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
				  SI_PARAM_RW_BUFFERS);
	buffer = build_indexed_load_const(ctx, rw_buffers,
			lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));

	base = LLVMGetParam(ctx->radeon_bld.main_fn, ctx->param_oc_lds);
	buf_addr = get_tcs_tes_buffer_address_from_reg(ctx, reg, NULL);


	TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan_index) {
		LLVMValueRef value = dst[chan_index];

		if (inst->Instruction.Saturate)
			value = radeon_llvm_saturate(bld_base, value);

		/* LDS copy for subsequent TCS-side reads. */
		lds_store(bld_base, chan_index, dw_addr, value);

		value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
		values[chan_index] = value;

		/* Partial writemask: store each enabled dword on its own. */
		if (inst->Dst[0].Register.WriteMask != 0xF) {
			build_tbuffer_store_dwords(ctx, buffer, value, 1,
						   buf_addr, base,
						   4 * chan_index);
		}
	}

	/* Full writemask: one vec4 store covers all four channels. */
	if (inst->Dst[0].Register.WriteMask == 0xF) {
		LLVMValueRef value = lp_build_gather_values(bld_base->base.gallivm,
							    values, 4);
		build_tbuffer_store_dwords(ctx, buffer, value, 4, buf_addr,
					   base, 0);
	}
}
1171
/**
 * Fetch a GS input from the ESGS ring.
 *
 * The per-vertex byte offset comes from one of the VTX*_OFFSET shader
 * parameters, selected by the register's dimension index; the attribute
 * itself lives at (param * 4 + channel) * 256 within the ring.
 * PRIMID is special-cased, and swizzle == ~0 fetches all four channels.
 */
static LLVMValueRef fetch_input_gs(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type,
	unsigned swizzle)
{
	struct lp_build_context *base = &bld_base->base;
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *uint =	&ctx->radeon_bld.soa.bld_base.uint_bld;
	struct gallivm_state *gallivm = base->gallivm;
	LLVMValueRef vtx_offset;
	LLVMValueRef args[9];
	unsigned vtx_offset_param;
	struct tgsi_shader_info *info = &shader->selector->info;
	unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
	unsigned semantic_index = info->input_semantic_index[reg->Register.Index];
	unsigned param;
	LLVMValueRef value;

	/* The primitive ID is not read from the ring. */
	if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
		return get_primitive_id(bld_base, swizzle);

	if (!reg->Register.Dimension)
		return NULL;

	if (swizzle == ~0) {
		/* Gather a full vec4 by fetching each channel separately. */
		LLVMValueRef values[TGSI_NUM_CHANNELS];
		unsigned chan;
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			values[chan] = fetch_input_gs(bld_base, reg, type, chan);
		}
		return lp_build_gather_values(bld_base->base.gallivm, values,
					      TGSI_NUM_CHANNELS);
	}

	/* Get the vertex offset parameter */
	vtx_offset_param = reg->Dimension.Index;
	if (vtx_offset_param < 2) {
		vtx_offset_param += SI_PARAM_VTX0_OFFSET;
	} else {
		assert(vtx_offset_param < 6);
		vtx_offset_param += SI_PARAM_VTX2_OFFSET - 2;
	}
	/* The parameter holds a dword offset; scale to bytes. */
	vtx_offset = lp_build_mul_imm(uint,
				      LLVMGetParam(ctx->radeon_bld.main_fn,
						   vtx_offset_param),
				      4);

	param = si_shader_io_get_unique_index(semantic_name, semantic_index);
	/* Raw buffer-load argument list: rsrc, vaddr, imm offset, soffset,
	 * then the addressing/cache-policy flags. */
	args[0] = ctx->esgs_ring;
	args[1] = vtx_offset;
	args[2] = lp_build_const_int32(gallivm, (param * 4 + swizzle) * 256);
	args[3] = uint->zero;
	args[4] = uint->one; /* OFFEN */
	args[5] = uint->zero; /* IDXEN */
	args[6] = uint->one; /* GLC */
	args[7] = uint->zero; /* SLC */
	args[8] = uint->zero; /* TFE */

	value = lp_build_intrinsic(gallivm->builder,
				   "llvm.SI.buffer.load.dword.i32.i32",
				   ctx->i32, args, 9,
				   LLVMReadOnlyAttribute);
	if (tgsi_type_is_64bit(type)) {
		/* Fetch the second dword of a 64-bit value from the next
		 * channel slot and merge the halves. */
		LLVMValueRef value2;
		args[2] = lp_build_const_int32(gallivm, (param * 4 + swizzle + 1) * 256);
		value2 = lp_build_intrinsic(gallivm->builder,
					    "llvm.SI.buffer.load.dword.i32.i32",
					    ctx->i32, args, 9,
					    LLVMReadOnlyAttribute);
		return radeon_llvm_emit_fetch_64bit(bld_base, type,
						    value, value2);
	}
	return LLVMBuildBitCast(gallivm->builder,
				value,
				tgsi2llvmtype(bld_base, type), "");
}
1250
1251 static int lookup_interp_param_index(unsigned interpolate, unsigned location)
1252 {
1253 switch (interpolate) {
1254 case TGSI_INTERPOLATE_CONSTANT:
1255 return 0;
1256
1257 case TGSI_INTERPOLATE_LINEAR:
1258 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1259 return SI_PARAM_LINEAR_SAMPLE;
1260 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1261 return SI_PARAM_LINEAR_CENTROID;
1262 else
1263 return SI_PARAM_LINEAR_CENTER;
1264 break;
1265 case TGSI_INTERPOLATE_COLOR:
1266 case TGSI_INTERPOLATE_PERSPECTIVE:
1267 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1268 return SI_PARAM_PERSP_SAMPLE;
1269 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1270 return SI_PARAM_PERSP_CENTROID;
1271 else
1272 return SI_PARAM_PERSP_CENTER;
1273 break;
1274 default:
1275 fprintf(stderr, "Warning: Unhandled interpolation mode.\n");
1276 return -1;
1277 }
1278 }
1279
1280 /* This shouldn't be used by explicit INTERP opcodes. */
1281 static unsigned select_interp_param(struct si_shader_context *ctx,
1282 unsigned param)
1283 {
1284 if (!ctx->is_monolithic)
1285 return param;
1286
1287 if (ctx->shader->key.ps.prolog.force_persp_sample_interp) {
1288 switch (param) {
1289 case SI_PARAM_PERSP_CENTROID:
1290 case SI_PARAM_PERSP_CENTER:
1291 return SI_PARAM_PERSP_SAMPLE;
1292 }
1293 }
1294 if (ctx->shader->key.ps.prolog.force_linear_sample_interp) {
1295 switch (param) {
1296 case SI_PARAM_LINEAR_CENTROID:
1297 case SI_PARAM_LINEAR_CENTER:
1298 return SI_PARAM_LINEAR_SAMPLE;
1299 }
1300 }
1301 if (ctx->shader->key.ps.prolog.force_persp_center_interp) {
1302 switch (param) {
1303 case SI_PARAM_PERSP_CENTROID:
1304 case SI_PARAM_PERSP_SAMPLE:
1305 return SI_PARAM_PERSP_CENTER;
1306 }
1307 }
1308 if (ctx->shader->key.ps.prolog.force_linear_center_interp) {
1309 switch (param) {
1310 case SI_PARAM_LINEAR_CENTROID:
1311 case SI_PARAM_LINEAR_SAMPLE:
1312 return SI_PARAM_LINEAR_CENTER;
1313 }
1314 }
1315
1316 return param;
1317 }
1318
/**
 * Interpolate a fragment shader input.
 *
 * @param ctx		context
 * @param input_index		index of the input in hardware
 * @param semantic_name		TGSI_SEMANTIC_*
 * @param semantic_index	semantic index
 * @param num_interp_inputs	number of all interpolated inputs (= BCOLOR offset)
 * @param colors_read_mask	color components read (4 bits for each color, 8 bits in total)
 * @param interp_param		interpolation weights (i,j); NULL selects flat
 *				(fs.constant) fetching
 * @param prim_mask		SI_PARAM_PRIM_MASK
 * @param face			SI_PARAM_FRONT_FACE
 * @param result		the return value (4 components)
 */
static void interp_fs_input(struct si_shader_context *ctx,
			    unsigned input_index,
			    unsigned semantic_name,
			    unsigned semantic_index,
			    unsigned num_interp_inputs,
			    unsigned colors_read_mask,
			    LLVMValueRef interp_param,
			    LLVMValueRef prim_mask,
			    LLVMValueRef face,
			    LLVMValueRef result[4])
{
	struct lp_build_context *base = &ctx->radeon_bld.soa.bld_base.base;
	struct lp_build_context *uint =	&ctx->radeon_bld.soa.bld_base.uint_bld;
	struct gallivm_state *gallivm = base->gallivm;
	const char *intr_name;
	LLVMValueRef attr_number;

	unsigned chan;

	attr_number = lp_build_const_int32(gallivm, input_index);

	/* fs.constant returns the param from the middle vertex, so it's not
	 * really useful for flat shading. It's meant to be used for custom
	 * interpolation (but the intrinsic can't fetch from the other two
	 * vertices).
	 *
	 * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
	 * to do the right thing. The only reason we use fs.constant is that
	 * fs.interp cannot be used on integers, because they can be equal
	 * to NaN.
	 */
	intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant";

	if (semantic_name == TGSI_SEMANTIC_COLOR &&
	    ctx->shader->key.ps.prolog.color_two_side) {
		/* Two-sided lighting: interpolate both the front and back
		 * color attributes, then select per fragment based on the
		 * FRONT_FACE input. */
		LLVMValueRef args[4];
		LLVMValueRef is_face_positive;
		LLVMValueRef back_attr_number;

		/* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
		 * otherwise it's at offset "num_inputs".
		 */
		unsigned back_attr_offset = num_interp_inputs;
		if (semantic_index == 1 && colors_read_mask & 0xf)
			back_attr_offset += 1;

		back_attr_number = lp_build_const_int32(gallivm, back_attr_offset);

		is_face_positive = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
						 face, uint->zero, "");

		/* args = { channel, attribute, prim_mask, (i,j) }; the last
		 * argument is omitted for fs.constant (when interp_param is
		 * NULL). */
		args[2] = prim_mask;
		args[3] = interp_param;
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
			LLVMValueRef front, back;

			args[0] = llvm_chan;
			args[1] = attr_number;
			front = lp_build_intrinsic(gallivm->builder, intr_name,
						ctx->f32, args, args[3] ? 4 : 3,
						LLVMReadNoneAttribute);

			args[1] = back_attr_number;
			back = lp_build_intrinsic(gallivm->builder, intr_name,
					       ctx->f32, args, args[3] ? 4 : 3,
					       LLVMReadNoneAttribute);

			result[chan] = LLVMBuildSelect(gallivm->builder,
						is_face_positive,
						front,
						back,
						"");
		}
	} else if (semantic_name == TGSI_SEMANTIC_FOG) {
		/* FOG: only .x is interpolated; .yz = 0 and .w = 1. */
		LLVMValueRef args[4];

		args[0] = uint->zero;
		args[1] = attr_number;
		args[2] = prim_mask;
		args[3] = interp_param;
		result[0] = lp_build_intrinsic(gallivm->builder, intr_name,
					ctx->f32, args, args[3] ? 4 : 3,
					LLVMReadNoneAttribute);
		result[1] =
		result[2] = lp_build_const_float(gallivm, 0.0f);
		result[3] = lp_build_const_float(gallivm, 1.0f);
	} else {
		/* Common case: interpolate all four channels. */
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			LLVMValueRef args[4];
			LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);

			args[0] = llvm_chan;
			args[1] = attr_number;
			args[2] = prim_mask;
			args[3] = interp_param;
			result[chan] = lp_build_intrinsic(gallivm->builder, intr_name,
						ctx->f32, args, args[3] ? 4 : 3,
						LLVMReadNoneAttribute);
		}
	}
}
1435
1436 /* LLVMGetParam with bc_optimize resolved. */
1437 static LLVMValueRef get_interp_param(struct si_shader_context *ctx,
1438 int interp_param_idx)
1439 {
1440 LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
1441 LLVMValueRef main_fn = ctx->radeon_bld.main_fn;
1442 LLVMValueRef param = NULL;
1443
1444 /* Handle PRIM_MASK[31] (bc_optimize). */
1445 if (ctx->is_monolithic &&
1446 ((ctx->shader->key.ps.prolog.bc_optimize_for_persp &&
1447 interp_param_idx == SI_PARAM_PERSP_CENTROID) ||
1448 (ctx->shader->key.ps.prolog.bc_optimize_for_linear &&
1449 interp_param_idx == SI_PARAM_LINEAR_CENTROID))) {
1450 /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
1451 * The hw doesn't compute CENTROID if the whole wave only
1452 * contains fully-covered quads.
1453 */
1454 LLVMValueRef bc_optimize =
1455 LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK);
1456 bc_optimize = LLVMBuildLShr(builder,
1457 bc_optimize,
1458 LLVMConstInt(ctx->i32, 31, 0), "");
1459 bc_optimize = LLVMBuildTrunc(builder, bc_optimize, ctx->i1, "");
1460
1461 if (ctx->shader->key.ps.prolog.bc_optimize_for_persp &&
1462 interp_param_idx == SI_PARAM_PERSP_CENTROID) {
1463 param = LLVMBuildSelect(builder, bc_optimize,
1464 LLVMGetParam(main_fn,
1465 SI_PARAM_PERSP_CENTER),
1466 LLVMGetParam(main_fn,
1467 SI_PARAM_PERSP_CENTROID),
1468 "");
1469 }
1470 if (ctx->shader->key.ps.prolog.bc_optimize_for_linear &&
1471 interp_param_idx == SI_PARAM_LINEAR_CENTROID) {
1472 param = LLVMBuildSelect(builder, bc_optimize,
1473 LLVMGetParam(main_fn,
1474 SI_PARAM_LINEAR_CENTER),
1475 LLVMGetParam(main_fn,
1476 SI_PARAM_LINEAR_CENTROID),
1477 "");
1478 }
1479 }
1480
1481 if (!param)
1482 param = LLVMGetParam(main_fn, interp_param_idx);
1483 return param;
1484 }
1485
/**
 * Declare (and interpolate) one fragment shader input, storing the four
 * resulting channels into radeon_bld->inputs.
 *
 * For non-monolithic shaders, COLOR inputs are not interpolated here:
 * the prolog has already done that and passes the values in as extra
 * parameters after SI_PARAM_POS_FIXED_PT.
 */
static void declare_input_fs(
	struct radeon_llvm_context *radeon_bld,
	unsigned input_index,
	const struct tgsi_full_declaration *decl)
{
	struct lp_build_context *base = &radeon_bld->soa.bld_base.base;
	struct si_shader_context *ctx =
		si_shader_context(&radeon_bld->soa.bld_base);
	struct si_shader *shader = ctx->shader;
	LLVMValueRef main_fn = radeon_bld->main_fn;
	LLVMValueRef interp_param = NULL;
	int interp_param_idx;

	/* Get colors from input VGPRs (set by the prolog). */
	if (!ctx->is_monolithic &&
	    decl->Semantic.Name == TGSI_SEMANTIC_COLOR) {
		unsigned i = decl->Semantic.Index;
		unsigned colors_read = shader->selector->info.colors_read;
		/* COLOR1's parameters follow however many COLOR0 components
		 * were actually read. Unread components stay undef. */
		unsigned mask = colors_read >> (i * 4);
		unsigned offset = SI_PARAM_POS_FIXED_PT + 1 +
				  (i ? util_bitcount(colors_read & 0xf) : 0);

		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)] =
			mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef;
		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 1)] =
			mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef;
		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 2)] =
			mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef;
		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 3)] =
			mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef;
		return;
	}

	interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate,
						     decl->Interp.Location);
	if (interp_param_idx == -1)
		return;
	else if (interp_param_idx) {
		/* Nonzero index: fetch the (i,j) weights, with monolithic
		 * interpolation overrides and bc_optimize applied. */
		interp_param_idx = select_interp_param(ctx,
						       interp_param_idx);
		interp_param = get_interp_param(ctx, interp_param_idx);
	}

	if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR &&
	    decl->Interp.Interpolate == TGSI_INTERPOLATE_COLOR &&
	    ctx->shader->key.ps.prolog.flatshade_colors)
		interp_param = NULL; /* load the constant color */

	interp_fs_input(ctx, input_index, decl->Semantic.Name,
			decl->Semantic.Index, shader->selector->info.num_inputs,
			shader->selector->info.colors_read, interp_param,
			LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK),
			LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE),
			&radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)]);
}
1541
1542 static LLVMValueRef get_sample_id(struct radeon_llvm_context *radeon_bld)
1543 {
1544 return unpack_param(si_shader_context(&radeon_bld->soa.bld_base),
1545 SI_PARAM_ANCILLARY, 8, 4);
1546 }
1547
1548 /**
1549 * Set range metadata on an instruction. This can only be used on load and
1550 * call instructions. If you know an instruction can only produce the values
1551 * 0, 1, 2, you would do set_range_metadata(value, 0, 3);
1552 * \p lo is the minimum value inclusive.
1553 * \p hi is the maximum value exclusive.
1554 */
1555 static void set_range_metadata(struct si_shader_context *ctx,
1556 LLVMValueRef value, unsigned lo, unsigned hi)
1557 {
1558 LLVMValueRef range_md, md_args[2];
1559 LLVMTypeRef type = LLVMTypeOf(value);
1560 LLVMContextRef context = LLVMGetTypeContext(type);
1561
1562 md_args[0] = LLVMConstInt(type, lo, false);
1563 md_args[1] = LLVMConstInt(type, hi, false);
1564 range_md = LLVMMDNodeInContext(context, md_args, 2);
1565 LLVMSetMetadata(value, ctx->range_md_kind, range_md);
1566 }
1567
1568 static LLVMValueRef get_thread_id(struct si_shader_context *ctx)
1569 {
1570 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
1571 LLVMValueRef tid;
1572
1573 if (HAVE_LLVM < 0x0308) {
1574 tid = lp_build_intrinsic(gallivm->builder, "llvm.SI.tid",
1575 ctx->i32, NULL, 0, LLVMReadNoneAttribute);
1576 } else {
1577 LLVMValueRef tid_args[2];
1578 tid_args[0] = lp_build_const_int32(gallivm, 0xffffffff);
1579 tid_args[1] = lp_build_const_int32(gallivm, 0);
1580 tid_args[1] = lp_build_intrinsic(gallivm->builder,
1581 "llvm.amdgcn.mbcnt.lo", ctx->i32,
1582 tid_args, 2, LLVMReadNoneAttribute);
1583
1584 tid = lp_build_intrinsic(gallivm->builder,
1585 "llvm.amdgcn.mbcnt.hi", ctx->i32,
1586 tid_args, 2, LLVMReadNoneAttribute);
1587 }
1588 set_range_metadata(ctx, tid, 0, 64);
1589 return tid;
1590 }
1591
1592 /**
1593 * Load a dword from a constant buffer.
1594 */
1595 static LLVMValueRef buffer_load_const(struct si_shader_context *ctx,
1596 LLVMValueRef resource,
1597 LLVMValueRef offset)
1598 {
1599 LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
1600 LLVMValueRef args[2] = {resource, offset};
1601
1602 return lp_build_intrinsic(builder, "llvm.SI.load.const", ctx->f32, args, 2,
1603 LLVMReadNoneAttribute);
1604 }
1605
1606 static LLVMValueRef load_sample_position(struct radeon_llvm_context *radeon_bld, LLVMValueRef sample_id)
1607 {
1608 struct si_shader_context *ctx =
1609 si_shader_context(&radeon_bld->soa.bld_base);
1610 struct lp_build_context *uint_bld = &radeon_bld->soa.bld_base.uint_bld;
1611 struct gallivm_state *gallivm = &radeon_bld->gallivm;
1612 LLVMBuilderRef builder = gallivm->builder;
1613 LLVMValueRef desc = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
1614 LLVMValueRef buf_index = lp_build_const_int32(gallivm, SI_PS_CONST_SAMPLE_POSITIONS);
1615 LLVMValueRef resource = build_indexed_load_const(ctx, desc, buf_index);
1616
1617 /* offset = sample_id * 8 (8 = 2 floats containing samplepos.xy) */
1618 LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, sample_id, 8);
1619 LLVMValueRef offset1 = LLVMBuildAdd(builder, offset0, lp_build_const_int32(gallivm, 4), "");
1620
1621 LLVMValueRef pos[4] = {
1622 buffer_load_const(ctx, resource, offset0),
1623 buffer_load_const(ctx, resource, offset1),
1624 lp_build_const_float(gallivm, 0),
1625 lp_build_const_float(gallivm, 0)
1626 };
1627
1628 return lp_build_gather_values(gallivm, pos, 4);
1629 }
1630
/**
 * Declare a TGSI system value, materializing it from the shader's input
 * parameters and storing the result in radeon_bld->system_values[index].
 */
static void declare_system_value(
	struct radeon_llvm_context *radeon_bld,
	unsigned index,
	const struct tgsi_full_declaration *decl)
{
	struct si_shader_context *ctx =
		si_shader_context(&radeon_bld->soa.bld_base);
	struct lp_build_context *bld = &radeon_bld->soa.bld_base.base;
	struct gallivm_state *gallivm = &radeon_bld->gallivm;
	LLVMValueRef value = 0;

	switch (decl->Semantic.Name) {
	case TGSI_SEMANTIC_INSTANCEID:
		value = LLVMGetParam(radeon_bld->main_fn,
				     ctx->param_instance_id);
		break;

	case TGSI_SEMANTIC_VERTEXID:
		/* VERTEXID = relative vertex id + base vertex. */
		value = LLVMBuildAdd(gallivm->builder,
				     LLVMGetParam(radeon_bld->main_fn,
						  ctx->param_vertex_id),
				     LLVMGetParam(radeon_bld->main_fn,
						  SI_PARAM_BASE_VERTEX), "");
		break;

	case TGSI_SEMANTIC_VERTEXID_NOBASE:
		value = LLVMGetParam(radeon_bld->main_fn,
				     ctx->param_vertex_id);
		break;

	case TGSI_SEMANTIC_BASEVERTEX:
		value = LLVMGetParam(radeon_bld->main_fn,
				     SI_PARAM_BASE_VERTEX);
		break;

	case TGSI_SEMANTIC_BASEINSTANCE:
		value = LLVMGetParam(radeon_bld->main_fn,
				     SI_PARAM_START_INSTANCE);
		break;

	case TGSI_SEMANTIC_DRAWID:
		value = LLVMGetParam(radeon_bld->main_fn,
				     SI_PARAM_DRAWID);
		break;

	case TGSI_SEMANTIC_INVOCATIONID:
		/* Source of the invocation ID depends on the stage:
		 * TCS unpacks it from REL_IDS, GS gets a dedicated param. */
		if (ctx->type == PIPE_SHADER_TESS_CTRL)
			value = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);
		else if (ctx->type == PIPE_SHADER_GEOMETRY)
			value = LLVMGetParam(radeon_bld->main_fn,
					     SI_PARAM_GS_INSTANCE_ID);
		else
			assert(!"INVOCATIONID not implemented");
		break;

	case TGSI_SEMANTIC_POSITION:
	{
		/* gl_FragCoord: (x, y, z, 1/w). */
		LLVMValueRef pos[4] = {
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_X_FLOAT),
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Y_FLOAT),
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Z_FLOAT),
			lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base, TGSI_OPCODE_RCP,
						 LLVMGetParam(radeon_bld->main_fn,
							      SI_PARAM_POS_W_FLOAT)),
		};
		value = lp_build_gather_values(gallivm, pos, 4);
		break;
	}

	case TGSI_SEMANTIC_FACE:
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_FRONT_FACE);
		break;

	case TGSI_SEMANTIC_SAMPLEID:
		value = get_sample_id(radeon_bld);
		break;

	case TGSI_SEMANTIC_SAMPLEPOS: {
		/* Sample position within the pixel = fractional part of the
		 * fragment position. */
		LLVMValueRef pos[4] = {
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_X_FLOAT),
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Y_FLOAT),
			lp_build_const_float(gallivm, 0),
			lp_build_const_float(gallivm, 0)
		};
		pos[0] = lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base,
						  TGSI_OPCODE_FRC, pos[0]);
		pos[1] = lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base,
						  TGSI_OPCODE_FRC, pos[1]);
		value = lp_build_gather_values(gallivm, pos, 4);
		break;
	}

	case TGSI_SEMANTIC_SAMPLEMASK:
		/* This can only occur with the OpenGL Core profile, which
		 * doesn't support smoothing.
		 */
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_SAMPLE_COVERAGE);
		break;

	case TGSI_SEMANTIC_TESSCOORD:
	{
		LLVMValueRef coord[4] = {
			LLVMGetParam(radeon_bld->main_fn, ctx->param_tes_u),
			LLVMGetParam(radeon_bld->main_fn, ctx->param_tes_v),
			bld->zero,
			bld->zero
		};

		/* For triangles, the vector should be (u, v, 1-u-v). */
		if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
		    PIPE_PRIM_TRIANGLES)
			coord[2] = lp_build_sub(bld, bld->one,
						lp_build_add(bld, coord[0], coord[1]));

		value = lp_build_gather_values(gallivm, coord, 4);
		break;
	}

	case TGSI_SEMANTIC_VERTICESIN:
		/* Patch vertex count: 6 bits at bit 26 of TCS_OUT_LAYOUT. */
		value = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 26, 6);
		break;

	case TGSI_SEMANTIC_TESSINNER:
	case TGSI_SEMANTIC_TESSOUTER:
	{
		/* Tess levels are read back from the off-chip tess buffer
		 * at the slot assigned to this semantic. */
		LLVMValueRef rw_buffers, buffer, base, addr;
		int param = si_shader_io_get_unique_index(decl->Semantic.Name, 0);

		rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
					SI_PARAM_RW_BUFFERS);
		buffer = build_indexed_load_const(ctx, rw_buffers,
		        lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));

		base = LLVMGetParam(ctx->radeon_bld.main_fn, ctx->param_oc_lds);
		addr = get_tcs_tes_buffer_address(ctx, NULL,
		                          lp_build_const_int32(gallivm, param));

		value = buffer_load(&radeon_bld->soa.bld_base, TGSI_TYPE_FLOAT,
		                    ~0, buffer, base, addr);

		break;
	}

	case TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI:
	case TGSI_SEMANTIC_DEFAULT_TESSINNER_SI:
	{
		/* Default tess levels come from a driver constant buffer:
		 * outer levels at dword 0, inner levels at dword 4. */
		LLVMValueRef buf, slot, val[4];
		int i, offset;

		slot = lp_build_const_int32(gallivm, SI_HS_CONST_DEFAULT_TESS_LEVELS);
		buf = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
		buf = build_indexed_load_const(ctx, buf, slot);
		offset = decl->Semantic.Name == TGSI_SEMANTIC_DEFAULT_TESSINNER_SI ? 4 : 0;

		for (i = 0; i < 4; i++)
			val[i] = buffer_load_const(ctx, buf,
						   lp_build_const_int32(gallivm, (offset + i) * 4));
		value = lp_build_gather_values(gallivm, val, 4);
		break;
	}

	case TGSI_SEMANTIC_PRIMID:
		value = get_primitive_id(&radeon_bld->soa.bld_base, 0);
		break;

	case TGSI_SEMANTIC_GRID_SIZE:
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_GRID_SIZE);
		break;

	case TGSI_SEMANTIC_BLOCK_SIZE:
	{
		/* The block size is compile-time constant for compute
		 * shaders; emit it as constants from the TGSI properties. */
		LLVMValueRef values[3];
		unsigned i;
		unsigned *properties = ctx->shader->selector->info.properties;
		unsigned sizes[3] = {
			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
		};

		for (i = 0; i < 3; ++i)
			values[i] = lp_build_const_int32(gallivm, sizes[i]);

		value = lp_build_gather_values(gallivm, values, 3);
		break;
	}

	case TGSI_SEMANTIC_BLOCK_ID:
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_BLOCK_ID);
		break;

	case TGSI_SEMANTIC_THREAD_ID:
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_THREAD_ID);
		break;

#if HAVE_LLVM >= 0x0309
	case TGSI_SEMANTIC_HELPER_INVOCATION:
		/* ps.live is false for helper invocations; invert and
		 * sign-extend to get the TGSI true/false convention. */
		value = lp_build_intrinsic(gallivm->builder,
					   "llvm.amdgcn.ps.live",
					   ctx->i1, NULL, 0,
					   LLVMReadNoneAttribute);
		value = LLVMBuildNot(gallivm->builder, value, "");
		value = LLVMBuildSExt(gallivm->builder, value, ctx->i32, "");
		break;
#endif

	default:
		assert(!"unknown system value");
		return;
	}

	radeon_bld->system_values[index] = value;
}
1844
1845 static void declare_compute_memory(struct radeon_llvm_context *radeon_bld,
1846 const struct tgsi_full_declaration *decl)
1847 {
1848 struct si_shader_context *ctx =
1849 si_shader_context(&radeon_bld->soa.bld_base);
1850 struct si_shader_selector *sel = ctx->shader->selector;
1851 struct gallivm_state *gallivm = &radeon_bld->gallivm;
1852
1853 LLVMTypeRef i8p = LLVMPointerType(ctx->i8, LOCAL_ADDR_SPACE);
1854 LLVMValueRef var;
1855
1856 assert(decl->Declaration.MemType == TGSI_MEMORY_TYPE_SHARED);
1857 assert(decl->Range.First == decl->Range.Last);
1858 assert(!ctx->shared_memory);
1859
1860 var = LLVMAddGlobalInAddressSpace(gallivm->module,
1861 LLVMArrayType(ctx->i8, sel->local_size),
1862 "compute_lds",
1863 LOCAL_ADDR_SPACE);
1864 LLVMSetAlignment(var, 4);
1865
1866 ctx->shared_memory = LLVMBuildBitCast(gallivm->builder, var, i8p, "");
1867 }
1868
1869 static LLVMValueRef fetch_constant(
1870 struct lp_build_tgsi_context *bld_base,
1871 const struct tgsi_full_src_register *reg,
1872 enum tgsi_opcode_type type,
1873 unsigned swizzle)
1874 {
1875 struct si_shader_context *ctx = si_shader_context(bld_base);
1876 struct lp_build_context *base = &bld_base->base;
1877 const struct tgsi_ind_register *ireg = &reg->Indirect;
1878 unsigned buf, idx;
1879
1880 LLVMValueRef addr, bufp;
1881 LLVMValueRef result;
1882
1883 if (swizzle == LP_CHAN_ALL) {
1884 unsigned chan;
1885 LLVMValueRef values[4];
1886 for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
1887 values[chan] = fetch_constant(bld_base, reg, type, chan);
1888
1889 return lp_build_gather_values(bld_base->base.gallivm, values, 4);
1890 }
1891
1892 buf = reg->Register.Dimension ? reg->Dimension.Index : 0;
1893 idx = reg->Register.Index * 4 + swizzle;
1894
1895 if (!reg->Register.Indirect && !reg->Dimension.Indirect) {
1896 if (!tgsi_type_is_64bit(type))
1897 return bitcast(bld_base, type, ctx->constants[buf][idx]);
1898 else {
1899 return radeon_llvm_emit_fetch_64bit(bld_base, type,
1900 ctx->constants[buf][idx],
1901 ctx->constants[buf][idx + 1]);
1902 }
1903 }
1904
1905 if (reg->Register.Dimension && reg->Dimension.Indirect) {
1906 LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
1907 LLVMValueRef index;
1908 index = get_bounded_indirect_index(ctx, &reg->DimIndirect,
1909 reg->Dimension.Index,
1910 SI_NUM_CONST_BUFFERS);
1911 bufp = build_indexed_load_const(ctx, ptr, index);
1912 } else
1913 bufp = ctx->const_buffers[buf];
1914
1915 addr = ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle];
1916 addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg");
1917 addr = lp_build_mul_imm(&bld_base->uint_bld, addr, 16);
1918 addr = lp_build_add(&bld_base->uint_bld, addr,
1919 lp_build_const_int32(base->gallivm, idx * 4));
1920
1921 result = buffer_load_const(ctx, bufp, addr);
1922
1923 if (!tgsi_type_is_64bit(type))
1924 result = bitcast(bld_base, type, result);
1925 else {
1926 LLVMValueRef addr2, result2;
1927 addr2 = ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle + 1];
1928 addr2 = LLVMBuildLoad(base->gallivm->builder, addr2, "load addr reg2");
1929 addr2 = lp_build_mul_imm(&bld_base->uint_bld, addr2, 16);
1930 addr2 = lp_build_add(&bld_base->uint_bld, addr2,
1931 lp_build_const_int32(base->gallivm, idx * 4));
1932
1933 result2 = buffer_load_const(ctx, ctx->const_buffers[buf],
1934 addr2);
1935
1936 result = radeon_llvm_emit_fetch_64bit(bld_base, type,
1937 result, result2);
1938 }
1939 return result;
1940 }
1941
1942 /* Upper 16 bits must be zero. */
1943 static LLVMValueRef si_llvm_pack_two_int16(struct gallivm_state *gallivm,
1944 LLVMValueRef val[2])
1945 {
1946 return LLVMBuildOr(gallivm->builder, val[0],
1947 LLVMBuildShl(gallivm->builder, val[1],
1948 lp_build_const_int32(gallivm, 16),
1949 ""), "");
1950 }
1951
1952 /* Upper 16 bits are ignored and will be dropped. */
1953 static LLVMValueRef si_llvm_pack_two_int32_as_int16(struct gallivm_state *gallivm,
1954 LLVMValueRef val[2])
1955 {
1956 LLVMValueRef v[2] = {
1957 LLVMBuildAnd(gallivm->builder, val[0],
1958 lp_build_const_int32(gallivm, 0xffff), ""),
1959 val[1],
1960 };
1961 return si_llvm_pack_two_int16(gallivm, v);
1962 }
1963
/* Initialize arguments for the shader export intrinsic.
 *
 * args[0..8] follow the llvm.SI.export argument layout:
 *   [0] writemask, [1] valid-mask flag, [2] "last export" flag,
 *   [3] export target, [4] COMPR flag, [5..8] the four data channels.
 * values[] are the four 32-bit output channels to export.
 * For fragment shaders the channel encoding is chosen from the
 * per-MRT spi_shader_col_format in the shader key; otherwise the
 * default 32_ABGR (raw 32-bit) path is used.
 */
static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
				     LLVMValueRef *values,
				     unsigned target,
				     LLVMValueRef *args)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *uint =
				&ctx->radeon_bld.soa.bld_base.uint_bld;
	struct lp_build_context *base = &bld_base->base;
	struct gallivm_state *gallivm = base->gallivm;
	LLVMBuilderRef builder = base->gallivm->builder;
	LLVMValueRef val[4];
	unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR;
	unsigned chan;
	bool is_int8;

	/* Default is 0xf. Adjusted below depending on the format. */
	args[0] = lp_build_const_int32(base->gallivm, 0xf); /* writemask */

	/* Specify whether the EXEC mask represents the valid mask */
	args[1] = uint->zero;

	/* Specify whether this is the last export */
	args[2] = uint->zero;

	/* Specify the target we are exporting */
	args[3] = lp_build_const_int32(base->gallivm, target);

	/* Fragment shaders select the export format of the targeted MRT
	 * from the shader key; is_int8 is only set (and only read) on
	 * this path. */
	if (ctx->type == PIPE_SHADER_FRAGMENT) {
		const union si_shader_key *key = &ctx->shader->key;
		unsigned col_formats = key->ps.epilog.spi_shader_col_format;
		int cbuf = target - V_008DFC_SQ_EXP_MRT;

		assert(cbuf >= 0 && cbuf < 8);
		spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
		is_int8 = (key->ps.epilog.color_is_int8 >> cbuf) & 0x1;
	}

	args[4] = uint->zero; /* COMPR flag */
	args[5] = base->undef;
	args[6] = base->undef;
	args[7] = base->undef;
	args[8] = base->undef;

	switch (spi_shader_col_format) {
	case V_028714_SPI_SHADER_ZERO:
		/* Nothing is written: turn this into a null export. */
		args[0] = uint->zero; /* writemask */
		args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_NULL);
		break;

	case V_028714_SPI_SHADER_32_R:
		args[0] = uint->one; /* writemask */
		args[5] = values[0];
		break;

	case V_028714_SPI_SHADER_32_GR:
		args[0] = lp_build_const_int32(base->gallivm, 0x3); /* writemask */
		args[5] = values[0];
		args[6] = values[1];
		break;

	case V_028714_SPI_SHADER_32_AR:
		args[0] = lp_build_const_int32(base->gallivm, 0x9); /* writemask */
		args[5] = values[0];
		args[8] = values[3];
		break;

	case V_028714_SPI_SHADER_FP16_ABGR:
		args[4] = uint->one; /* COMPR flag */

		/* Pack channel pairs into f16x2 via llvm.SI.packf16. */
		for (chan = 0; chan < 2; chan++) {
			LLVMValueRef pack_args[2] = {
				values[2 * chan],
				values[2 * chan + 1]
			};
			LLVMValueRef packed;

			packed = lp_build_intrinsic(base->gallivm->builder,
						    "llvm.SI.packf16",
						    ctx->i32, pack_args, 2,
						    LLVMReadNoneAttribute);
			args[chan + 5] =
				LLVMBuildBitCast(base->gallivm->builder,
						 packed, ctx->f32, "");
		}
		break;

	case V_028714_SPI_SHADER_UNORM16_ABGR:
		/* Clamp to [0,1], scale to [0,65535], round to nearest,
		 * then pack pairs of 16-bit values. */
		for (chan = 0; chan < 4; chan++) {
			val[chan] = radeon_llvm_saturate(bld_base, values[chan]);
			val[chan] = LLVMBuildFMul(builder, val[chan],
						  lp_build_const_float(gallivm, 65535), "");
			val[chan] = LLVMBuildFAdd(builder, val[chan],
						  lp_build_const_float(gallivm, 0.5), "");
			val[chan] = LLVMBuildFPToUI(builder, val[chan],
						    ctx->i32, "");
		}

		args[4] = uint->one; /* COMPR flag */
		args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int16(gallivm, val));
		args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int16(gallivm, val+2));
		break;

	case V_028714_SPI_SHADER_SNORM16_ABGR:
		for (chan = 0; chan < 4; chan++) {
			/* Clamp between [-1, 1]. */
			val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MIN,
							      values[chan],
							      lp_build_const_float(gallivm, 1));
			val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MAX,
							      val[chan],
							      lp_build_const_float(gallivm, -1));
			/* Convert to a signed integer in [-32767, 32767]. */
			val[chan] = LLVMBuildFMul(builder, val[chan],
						  lp_build_const_float(gallivm, 32767), "");
			/* If positive, add 0.5, else add -0.5. */
			val[chan] = LLVMBuildFAdd(builder, val[chan],
					LLVMBuildSelect(builder,
						LLVMBuildFCmp(builder, LLVMRealOGE,
							      val[chan], base->zero, ""),
						lp_build_const_float(gallivm, 0.5),
						lp_build_const_float(gallivm, -0.5), ""), "");
			val[chan] = LLVMBuildFPToSI(builder, val[chan], ctx->i32, "");
		}

		args[4] = uint->one; /* COMPR flag */
		args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int32_as_int16(gallivm, val));
		args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int32_as_int16(gallivm, val+2));
		break;

	case V_028714_SPI_SHADER_UINT16_ABGR: {
		/* 8-bit int color buffers clamp at 255, 16-bit at 65535. */
		LLVMValueRef max = lp_build_const_int32(gallivm, is_int8 ?
							255 : 65535);
		/* Clamp. */
		for (chan = 0; chan < 4; chan++) {
			val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
			val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_UMIN,
							      val[chan], max);
		}

		args[4] = uint->one; /* COMPR flag */
		args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int16(gallivm, val));
		args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int16(gallivm, val+2));
		break;
	}

	case V_028714_SPI_SHADER_SINT16_ABGR: {
		/* Signed ranges: [-128,127] for int8, [-32768,32767] for int16. */
		LLVMValueRef max = lp_build_const_int32(gallivm, is_int8 ?
							127 : 32767);
		LLVMValueRef min = lp_build_const_int32(gallivm, is_int8 ?
							-128 : -32768);
		/* Clamp. */
		for (chan = 0; chan < 4; chan++) {
			val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
			val[chan] = lp_build_emit_llvm_binary(bld_base,
							      TGSI_OPCODE_IMIN,
							      val[chan], max);
			val[chan] = lp_build_emit_llvm_binary(bld_base,
							      TGSI_OPCODE_IMAX,
							      val[chan], min);
		}

		args[4] = uint->one; /* COMPR flag */
		args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int32_as_int16(gallivm, val));
		args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int32_as_int16(gallivm, val+2));
		break;
	}

	case V_028714_SPI_SHADER_32_ABGR:
		/* Raw 32-bit export of all four channels. */
		memcpy(&args[5], values, sizeof(values[0]) * 4);
		break;
	}
}
2146
2147 static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
2148 LLVMValueRef alpha)
2149 {
2150 struct si_shader_context *ctx = si_shader_context(bld_base);
2151 struct gallivm_state *gallivm = bld_base->base.gallivm;
2152
2153 if (ctx->shader->key.ps.epilog.alpha_func != PIPE_FUNC_NEVER) {
2154 LLVMValueRef alpha_ref = LLVMGetParam(ctx->radeon_bld.main_fn,
2155 SI_PARAM_ALPHA_REF);
2156
2157 LLVMValueRef alpha_pass =
2158 lp_build_cmp(&bld_base->base,
2159 ctx->shader->key.ps.epilog.alpha_func,
2160 alpha, alpha_ref);
2161 LLVMValueRef arg =
2162 lp_build_select(&bld_base->base,
2163 alpha_pass,
2164 lp_build_const_float(gallivm, 1.0f),
2165 lp_build_const_float(gallivm, -1.0f));
2166
2167 lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
2168 ctx->voidt, &arg, 1, 0);
2169 } else {
2170 lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kilp",
2171 ctx->voidt, NULL, 0, 0);
2172 }
2173 }
2174
2175 static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
2176 LLVMValueRef alpha,
2177 unsigned samplemask_param)
2178 {
2179 struct si_shader_context *ctx = si_shader_context(bld_base);
2180 struct gallivm_state *gallivm = bld_base->base.gallivm;
2181 LLVMValueRef coverage;
2182
2183 /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
2184 coverage = LLVMGetParam(ctx->radeon_bld.main_fn,
2185 samplemask_param);
2186 coverage = bitcast(bld_base, TGSI_TYPE_SIGNED, coverage);
2187
2188 coverage = lp_build_intrinsic(gallivm->builder, "llvm.ctpop.i32",
2189 ctx->i32,
2190 &coverage, 1, LLVMReadNoneAttribute);
2191
2192 coverage = LLVMBuildUIToFP(gallivm->builder, coverage,
2193 ctx->f32, "");
2194
2195 coverage = LLVMBuildFMul(gallivm->builder, coverage,
2196 lp_build_const_float(gallivm,
2197 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
2198
2199 return LLVMBuildFMul(gallivm->builder, alpha, coverage, "");
2200 }
2201
/* Fill in the export args for the two clip-distance position exports
 * (POS+2 and POS+3) by dotting the clip-vertex output (out_elts) with
 * the user clip planes stored in the SI_VS_CONST_CLIP_PLANES constant
 * buffer. pos[2] and pos[3] receive complete llvm.SI.export argument
 * sets; the caller emits the actual export intrinsics.
 */
static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context *bld_base,
				    LLVMValueRef (*pos)[9], LLVMValueRef *out_elts)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	struct lp_build_context *uint = &ctx->radeon_bld.soa.bld_base.uint_bld;
	unsigned reg_index;
	unsigned chan;
	unsigned const_chan;
	LLVMValueRef base_elt;
	LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
	LLVMValueRef constbuf_index = lp_build_const_int32(base->gallivm,
							   SI_VS_CONST_CLIP_PLANES);
	LLVMValueRef const_resource = build_indexed_load_const(ctx, ptr, constbuf_index);

	for (reg_index = 0; reg_index < 2; reg_index ++) {
		LLVMValueRef *args = pos[2 + reg_index];

		/* Start the four clip distances of this export at 0. */
		args[5] =
		args[6] =
		args[7] =
		args[8] = lp_build_const_float(base->gallivm, 0.0f);

		/* Compute dot products of position and user clip plane vectors */
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			for (const_chan = 0; const_chan < TGSI_NUM_CHANNELS; const_chan++) {
				/* Byte offset of component const_chan of clip
				 * plane (reg_index * 4 + chan), 4 bytes each.
				 * args[1] is reused as a scratch slot here and
				 * overwritten with its real value below. */
				args[1] = lp_build_const_int32(base->gallivm,
							       ((reg_index * 4 + chan) * 4 +
								const_chan) * 4);
				base_elt = buffer_load_const(ctx, const_resource,
							     args[1]);
				args[5 + chan] =
					lp_build_add(base, args[5 + chan],
						     lp_build_mul(base, base_elt,
								  out_elts[const_chan]));
			}
		}

		/* Remaining export arguments: writemask, flags and target. */
		args[0] = lp_build_const_int32(base->gallivm, 0xf);
		args[1] = uint->zero;
		args[2] = uint->zero;
		args[3] = lp_build_const_int32(base->gallivm,
					       V_008DFC_SQ_EXP_POS + 2 + reg_index);
		args[4] = uint->zero;
	}
}
2248
2249 static void si_dump_streamout(struct pipe_stream_output_info *so)
2250 {
2251 unsigned i;
2252
2253 if (so->num_outputs)
2254 fprintf(stderr, "STREAMOUT\n");
2255
2256 for (i = 0; i < so->num_outputs; i++) {
2257 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
2258 so->output[i].start_component;
2259 fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
2260 i, so->output[i].output_buffer,
2261 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
2262 so->output[i].register_index,
2263 mask & 1 ? "x" : "",
2264 mask & 2 ? "y" : "",
2265 mask & 4 ? "z" : "",
2266 mask & 8 ? "w" : "");
2267 }
2268 }
2269
/* On SI, the vertex shader is responsible for writing streamout data
 * to buffers. */
static void si_llvm_emit_streamout(struct si_shader_context *ctx,
				   struct si_shader_output_values *outputs,
				   unsigned noutput)
{
	struct pipe_stream_output_info *so = &ctx->shader->selector->so;
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	int i, j;
	struct lp_build_if_state if_ctx;

	/* Get bits [22:16], i.e. (so_param >> 16) & 127; */
	LLVMValueRef so_vtx_count =
		unpack_param(ctx, ctx->param_streamout_config, 16, 7);

	LLVMValueRef tid = get_thread_id(ctx);

	/* can_emit = tid < so_vtx_count; */
	LLVMValueRef can_emit =
		LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");

	/* Active stream id: bits [25:24] of the streamout config SGPR. */
	LLVMValueRef stream_id =
		unpack_param(ctx, ctx->param_streamout_config, 24, 2);

	/* Emit the streamout code conditionally. This actually avoids
	 * out-of-bounds buffer access. The hw tells us via the SGPR
	 * (so_vtx_count) which threads are allowed to emit streamout data. */
	lp_build_if(&if_ctx, gallivm, can_emit);
	{
		/* The buffer offset is computed as follows:
		 *   ByteOffset = streamout_offset[buffer_id]*4 +
		 *                (streamout_write_index + thread_id)*stride[buffer_id] +
		 *                attrib_offset
		 */

		LLVMValueRef so_write_index =
			LLVMGetParam(ctx->radeon_bld.main_fn,
				     ctx->param_streamout_write_index);

		/* Compute (streamout_write_index + thread_id). */
		so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");

		/* Compute the write offset for each enabled buffer. */
		LLVMValueRef so_write_offset[4] = {};
		for (i = 0; i < 4; i++) {
			if (!so->stride[i])
				continue;

			/* streamout_offset is in dwords; convert to bytes. */
			LLVMValueRef so_offset = LLVMGetParam(ctx->radeon_bld.main_fn,
							      ctx->param_streamout_offset[i]);
			so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), "");

			/* Vertex slot within the buffer, in bytes. */
			so_write_offset[i] = LLVMBuildMul(builder, so_write_index,
							  LLVMConstInt(ctx->i32, so->stride[i]*4, 0), "");
			so_write_offset[i] = LLVMBuildAdd(builder, so_write_offset[i], so_offset, "");
		}

		/* Write streamout data. */
		for (i = 0; i < so->num_outputs; i++) {
			unsigned buf_idx = so->output[i].output_buffer;
			unsigned reg = so->output[i].register_index;
			unsigned start = so->output[i].start_component;
			unsigned num_comps = so->output[i].num_components;
			unsigned stream = so->output[i].stream;
			LLVMValueRef out[4];
			struct lp_build_if_state if_ctx_stream;

			/* Defensive: skip malformed or out-of-range entries. */
			assert(num_comps && num_comps <= 4);
			if (!num_comps || num_comps > 4)
				continue;

			if (reg >= noutput)
				continue;

			/* Load the output as int. */
			for (j = 0; j < num_comps; j++) {
				out[j] = LLVMBuildBitCast(builder,
							  outputs[reg].values[start+j],
							  ctx->i32, "");
			}

			/* Pack the output. */
			LLVMValueRef vdata = NULL;

			switch (num_comps) {
			case 1: /* as i32 */
				vdata = out[0];
				break;
			case 2: /* as v2i32 */
			case 3: /* as v4i32 (aligned to 4) */
			case 4: /* as v4i32 */
				vdata = LLVMGetUndef(LLVMVectorType(ctx->i32, util_next_power_of_two(num_comps)));
				for (j = 0; j < num_comps; j++) {
					vdata = LLVMBuildInsertElement(builder, vdata, out[j],
								       LLVMConstInt(ctx->i32, j, 0), "");
				}
				break;
			}

			/* Only store when the output's stream matches the
			 * currently selected stream. */
			LLVMValueRef can_emit_stream =
				LLVMBuildICmp(builder, LLVMIntEQ,
					      stream_id,
					      lp_build_const_int32(gallivm, stream), "");

			lp_build_if(&if_ctx_stream, gallivm, can_emit_stream);
			build_tbuffer_store_dwords(ctx, ctx->so_buffers[buf_idx],
						   vdata, num_comps,
						   so_write_offset[buf_idx],
						   LLVMConstInt(ctx->i32, 0, 0),
						   so->output[i].dst_offset*4);
			lp_build_endif(&if_ctx_stream);
		}
	}
	lp_build_endif(&if_ctx);
}
2386
2387
/* Generate export instructions for hardware VS shader stage.
 *
 * Emits streamout stores (if enabled), one parameter export per generic
 * output, and the position exports (position, misc vector with
 * psize/edgeflag/layer/viewport, clip distances). Also records
 * nr_param_exports / nr_pos_exports and the per-output parameter slots
 * in shader->info for the state tracker.
 */
static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base,
			      struct si_shader_output_values *outputs,
			      unsigned noutput)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *base = &bld_base->base;
	struct lp_build_context *uint =
		&ctx->radeon_bld.soa.bld_base.uint_bld;
	LLVMValueRef args[9];
	LLVMValueRef pos_args[4][9] = { { 0 } };
	LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
	unsigned semantic_name, semantic_index;
	unsigned target;
	unsigned param_count = 0;
	unsigned pos_idx;
	int i;

	if (outputs && ctx->shader->selector->so.num_outputs) {
		si_llvm_emit_streamout(ctx, outputs, noutput);
	}

	for (i = 0; i < noutput; i++) {
		semantic_name = outputs[i].name;
		semantic_index = outputs[i].sid;

		/* Some semantics are remapped to GENERIC and re-dispatched
		 * through this label (layer, viewport index, clipdist). */
handle_semantic:
		/* Select the correct target */
		switch(semantic_name) {
		case TGSI_SEMANTIC_PSIZE:
			/* Saved for the misc vector (POS+1) below. */
			psize_value = outputs[i].values[0];
			continue;
		case TGSI_SEMANTIC_EDGEFLAG:
			/* Saved for the misc vector (POS+1) below. */
			edgeflag_value = outputs[i].values[0];
			continue;
		case TGSI_SEMANTIC_LAYER:
			/* Saved for POS+1, but also exported as a param. */
			layer_value = outputs[i].values[0];
			semantic_name = TGSI_SEMANTIC_GENERIC;
			goto handle_semantic;
		case TGSI_SEMANTIC_VIEWPORT_INDEX:
			/* Saved for POS+1, but also exported as a param. */
			viewport_index_value = outputs[i].values[0];
			semantic_name = TGSI_SEMANTIC_GENERIC;
			goto handle_semantic;
		case TGSI_SEMANTIC_POSITION:
			target = V_008DFC_SQ_EXP_POS;
			break;
		case TGSI_SEMANTIC_COLOR:
		case TGSI_SEMANTIC_BCOLOR:
			target = V_008DFC_SQ_EXP_PARAM + param_count;
			assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
			shader->info.vs_output_param_offset[i] = param_count;
			param_count++;
			break;
		case TGSI_SEMANTIC_CLIPDIST:
			target = V_008DFC_SQ_EXP_POS + 2 + semantic_index;
			break;
		case TGSI_SEMANTIC_CLIPVERTEX:
			/* Expanded into clip-distance exports in pos_args. */
			si_llvm_emit_clipvertex(bld_base, pos_args, outputs[i].values);
			continue;
		case TGSI_SEMANTIC_PRIMID:
		case TGSI_SEMANTIC_FOG:
		case TGSI_SEMANTIC_TEXCOORD:
		case TGSI_SEMANTIC_GENERIC:
			target = V_008DFC_SQ_EXP_PARAM + param_count;
			assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
			shader->info.vs_output_param_offset[i] = param_count;
			param_count++;
			break;
		default:
			target = 0;
			fprintf(stderr,
				"Warning: SI unhandled vs output type:%d\n",
				semantic_name);
		}

		si_llvm_init_export_args(bld_base, outputs[i].values, target, args);

		if (target >= V_008DFC_SQ_EXP_POS &&
		    target <= (V_008DFC_SQ_EXP_POS + 3)) {
			/* Position exports are collected and emitted last,
			 * so the "last export" bit can be set correctly. */
			memcpy(pos_args[target - V_008DFC_SQ_EXP_POS],
			       args, sizeof(args));
		} else {
			lp_build_intrinsic(base->gallivm->builder,
					   "llvm.SI.export", ctx->voidt,
					   args, 9, 0);
		}

		if (semantic_name == TGSI_SEMANTIC_CLIPDIST) {
			/* Clip distances are additionally exported as params. */
			semantic_name = TGSI_SEMANTIC_GENERIC;
			goto handle_semantic;
		}
	}

	shader->info.nr_param_exports = param_count;

	/* We need to add the position output manually if it's missing. */
	if (!pos_args[0][0]) {
		pos_args[0][0] = lp_build_const_int32(base->gallivm, 0xf); /* writemask */
		pos_args[0][1] = uint->zero; /* EXEC mask */
		pos_args[0][2] = uint->zero; /* last export? */
		pos_args[0][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS);
		pos_args[0][4] = uint->zero; /* COMPR flag */
		pos_args[0][5] = base->zero; /* X */
		pos_args[0][6] = base->zero; /* Y */
		pos_args[0][7] = base->zero; /* Z */
		pos_args[0][8] = base->one;  /* W */
	}

	/* Write the misc vector (point size, edgeflag, layer, viewport). */
	if (shader->selector->info.writes_psize ||
	    shader->selector->info.writes_edgeflag ||
	    shader->selector->info.writes_viewport_index ||
	    shader->selector->info.writes_layer) {
		pos_args[1][0] = lp_build_const_int32(base->gallivm, /* writemask */
						      shader->selector->info.writes_psize |
						      (shader->selector->info.writes_edgeflag << 1) |
						      (shader->selector->info.writes_layer << 2) |
						      (shader->selector->info.writes_viewport_index << 3));
		pos_args[1][1] = uint->zero; /* EXEC mask */
		pos_args[1][2] = uint->zero; /* last export? */
		pos_args[1][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS + 1);
		pos_args[1][4] = uint->zero; /* COMPR flag */
		pos_args[1][5] = base->zero; /* X */
		pos_args[1][6] = base->zero; /* Y */
		pos_args[1][7] = base->zero; /* Z */
		pos_args[1][8] = base->zero; /* W */

		if (shader->selector->info.writes_psize)
			pos_args[1][5] = psize_value;

		if (shader->selector->info.writes_edgeflag) {
			/* The output is a float, but the hw expects an integer
			 * with the first bit containing the edge flag. */
			edgeflag_value = LLVMBuildFPToUI(base->gallivm->builder,
							 edgeflag_value,
							 ctx->i32, "");
			edgeflag_value = lp_build_min(&bld_base->int_bld,
						      edgeflag_value,
						      bld_base->int_bld.one);

			/* The LLVM intrinsic expects a float. */
			pos_args[1][6] = LLVMBuildBitCast(base->gallivm->builder,
							  edgeflag_value,
							  ctx->f32, "");
		}

		if (shader->selector->info.writes_layer)
			pos_args[1][7] = layer_value;

		if (shader->selector->info.writes_viewport_index)
			pos_args[1][8] = viewport_index_value;
	}

	/* Count the position exports so we know which one is last. */
	for (i = 0; i < 4; i++)
		if (pos_args[i][0])
			shader->info.nr_pos_exports++;

	pos_idx = 0;
	for (i = 0; i < 4; i++) {
		if (!pos_args[i][0])
			continue;

		/* Specify the target we are exporting */
		pos_args[i][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS + pos_idx++);

		if (pos_idx == shader->info.nr_pos_exports)
			/* Specify that this is the last export */
			pos_args[i][2] = uint->one;

		lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
				   ctx->voidt, pos_args[i], 9, 0);
	}
}
2562
2563 static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
2564 {
2565 struct si_shader_context *ctx = si_shader_context(bld_base);
2566 struct gallivm_state *gallivm = bld_base->base.gallivm;
2567 LLVMValueRef invocation_id, rw_buffers, buffer, buffer_offset;
2568 LLVMValueRef lds_vertex_stride, lds_vertex_offset, lds_base;
2569 uint64_t inputs;
2570
2571 invocation_id = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);
2572
2573 rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
2574 buffer = build_indexed_load_const(ctx, rw_buffers,
2575 lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));
2576
2577 buffer_offset = LLVMGetParam(ctx->radeon_bld.main_fn, ctx->param_oc_lds);
2578
2579 lds_vertex_stride = unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 13, 8);
2580 lds_vertex_offset = LLVMBuildMul(gallivm->builder, invocation_id,
2581 lds_vertex_stride, "");
2582 lds_base = get_tcs_in_current_patch_offset(ctx);
2583 lds_base = LLVMBuildAdd(gallivm->builder, lds_base, lds_vertex_offset, "");
2584
2585 inputs = ctx->shader->key.tcs.epilog.inputs_to_copy;
2586 while (inputs) {
2587 unsigned i = u_bit_scan64(&inputs);
2588
2589 LLVMValueRef lds_ptr = LLVMBuildAdd(gallivm->builder, lds_base,
2590 lp_build_const_int32(gallivm, 4 * i),
2591 "");
2592
2593 LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx,
2594 invocation_id,
2595 lp_build_const_int32(gallivm, i));
2596
2597 LLVMValueRef value = lds_load(bld_base, TGSI_TYPE_SIGNED, ~0,
2598 lds_ptr);
2599
2600 build_tbuffer_store_dwords(ctx, buffer, value, 4, buffer_addr,
2601 buffer_offset, 0);
2602 }
2603 }
2604
/* Read the tessellation levels (TESSINNER/TESSOUTER) of the current
 * patch from LDS and store them to the tess factor ring buffer.
 * Only invocation 0 performs the stores; patch 0 additionally writes
 * the dynamic HS control word at offset 0.
 */
static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
				  LLVMValueRef rel_patch_id,
				  LLVMValueRef invocation_id,
				  LLVMValueRef tcs_out_current_patch_data_offset)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	struct si_shader *shader = ctx->shader;
	unsigned tess_inner_index, tess_outer_index;
	LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
	LLVMValueRef out[6], vec0, vec1, rw_buffers, tf_base;
	unsigned stride, outer_comps, inner_comps, i;
	struct lp_build_if_state if_ctx, inner_if_ctx;

	/* Make sure all invocations have written their outputs to LDS
	 * before invocation 0 reads the tess levels back. */
	si_llvm_emit_barrier(NULL, bld_base, NULL);

	/* Do this only for invocation 0, because the tess levels are per-patch,
	 * not per-vertex.
	 *
	 * This can't jump, because invocation 0 executes this. It should
	 * at least mask out the loads and stores for other invocations.
	 */
	lp_build_if(&if_ctx, gallivm,
		    LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
				  invocation_id, bld_base->uint_bld.zero, ""));

	/* Determine the layout of one tess factor element in the buffer. */
	switch (shader->key.tcs.epilog.prim_mode) {
	case PIPE_PRIM_LINES:
		stride = 2; /* 2 dwords, 1 vec2 store */
		outer_comps = 2;
		inner_comps = 0;
		break;
	case PIPE_PRIM_TRIANGLES:
		stride = 4; /* 4 dwords, 1 vec4 store */
		outer_comps = 3;
		inner_comps = 1;
		break;
	case PIPE_PRIM_QUADS:
		stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
		outer_comps = 4;
		inner_comps = 2;
		break;
	default:
		assert(0);
		return;
	}

	/* Load tess_inner and tess_outer from LDS.
	 * Any invocation can write them, so we can't get them from a temporary.
	 */
	tess_inner_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSINNER, 0);
	tess_outer_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSOUTER, 0);

	lds_base = tcs_out_current_patch_data_offset;
	lds_inner = LLVMBuildAdd(gallivm->builder, lds_base,
				 lp_build_const_int32(gallivm,
						      tess_inner_index * 4), "");
	lds_outer = LLVMBuildAdd(gallivm->builder, lds_base,
				 lp_build_const_int32(gallivm,
						      tess_outer_index * 4), "");

	/* Gather the components: outer levels first, then inner. */
	for (i = 0; i < outer_comps; i++)
		out[i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer);
	for (i = 0; i < inner_comps; i++)
		out[outer_comps+i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_inner);

	/* Convert the outputs to vectors for stores. */
	vec0 = lp_build_gather_values(gallivm, out, MIN2(stride, 4));
	vec1 = NULL;

	/* A 6-dword element needs a second vec2 store. */
	if (stride > 4)
		vec1 = lp_build_gather_values(gallivm, out+4, stride - 4);

	/* Get the buffer. */
	rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
				  SI_PARAM_RW_BUFFERS);
	buffer = build_indexed_load_const(ctx, rw_buffers,
			lp_build_const_int32(gallivm, SI_HS_RING_TESS_FACTOR));

	/* Get the offset. */
	tf_base = LLVMGetParam(ctx->radeon_bld.main_fn,
			       SI_PARAM_TESS_FACTOR_OFFSET);
	byteoffset = LLVMBuildMul(gallivm->builder, rel_patch_id,
				  lp_build_const_int32(gallivm, 4 * stride), "");

	/* Only the thread handling patch 0 writes the control word. */
	lp_build_if(&inner_if_ctx, gallivm,
		    LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
				  rel_patch_id, bld_base->uint_bld.zero, ""));

	/* Store the dynamic HS control word. */
	build_tbuffer_store_dwords(ctx, buffer,
				   lp_build_const_int32(gallivm, 0x80000000),
				   1, lp_build_const_int32(gallivm, 0), tf_base, 0);

	lp_build_endif(&inner_if_ctx);

	/* Store the tessellation factors. */
	build_tbuffer_store_dwords(ctx, buffer, vec0,
				   MIN2(stride, 4), byteoffset, tf_base, 4);
	if (vec1)
		build_tbuffer_store_dwords(ctx, buffer, vec1,
					   stride - 4, byteoffset, tf_base, 20);
	lp_build_endif(&if_ctx);
}
2710
/* This only writes the tessellation factor levels.
 *
 * In the non-monolithic case the actual work is deferred to a separately
 * compiled epilog: this function only packs the values the epilog needs
 * (RW_BUFFERS pointer halves, tess factor soffset, and the three VGPRs)
 * into ctx->return_value in the exact slot order the epilog expects.
 * In the monolithic case it performs the copy and the factor writes
 * directly.
 */
static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;

	rel_patch_id = get_rel_patch_id(ctx);
	invocation_id = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);
	tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);

	if (!ctx->is_monolithic) {
		/* Return epilog parameters from this function. */
		LLVMBuilderRef builder = bld_base->base.gallivm->builder;
		LLVMValueRef ret = ctx->return_value;
		LLVMValueRef rw_buffers, rw0, rw1, tf_soffset;
		unsigned vgpr;

		/* RW_BUFFERS pointer: split the 64-bit pointer into two
		 * i32 halves so it fits the SGPR return slots 0 and 1. */
		rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
					  SI_PARAM_RW_BUFFERS);
		rw_buffers = LLVMBuildPtrToInt(builder, rw_buffers, ctx->i64, "");
		rw_buffers = LLVMBuildBitCast(builder, rw_buffers, ctx->v2i32, "");
		rw0 = LLVMBuildExtractElement(builder, rw_buffers,
					      bld_base->uint_bld.zero, "");
		rw1 = LLVMBuildExtractElement(builder, rw_buffers,
					      bld_base->uint_bld.one, "");
		ret = LLVMBuildInsertValue(builder, ret, rw0, 0, "");
		ret = LLVMBuildInsertValue(builder, ret, rw1, 1, "");

		/* Tess factor buffer soffset is after user SGPRs. */
		tf_soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
					  SI_PARAM_TESS_FACTOR_OFFSET);
		ret = LLVMBuildInsertValue(builder, ret, tf_soffset,
					   SI_TCS_NUM_USER_SGPR + 1, "");

		/* VGPRs: returned as floats, so bitcast from i32. */
		rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id);
		invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id);
		tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset);

		vgpr = SI_TCS_NUM_USER_SGPR + 2;
		ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
		ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
		ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
		ctx->return_value = ret;
		return;
	}

	si_copy_tcs_inputs(bld_base);
	si_write_tess_factors(bld_base, rel_patch_id, invocation_id, tf_lds_offset);
}
2762
2763 static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base)
2764 {
2765 struct si_shader_context *ctx = si_shader_context(bld_base);
2766 struct si_shader *shader = ctx->shader;
2767 struct tgsi_shader_info *info = &shader->selector->info;
2768 struct gallivm_state *gallivm = bld_base->base.gallivm;
2769 unsigned i, chan;
2770 LLVMValueRef vertex_id = LLVMGetParam(ctx->radeon_bld.main_fn,
2771 ctx->param_rel_auto_id);
2772 LLVMValueRef vertex_dw_stride =
2773 unpack_param(ctx, SI_PARAM_LS_OUT_LAYOUT, 13, 8);
2774 LLVMValueRef base_dw_addr = LLVMBuildMul(gallivm->builder, vertex_id,
2775 vertex_dw_stride, "");
2776
2777 /* Write outputs to LDS. The next shader (TCS aka HS) will read
2778 * its inputs from it. */
2779 for (i = 0; i < info->num_outputs; i++) {
2780 LLVMValueRef *out_ptr = ctx->radeon_bld.soa.outputs[i];
2781 unsigned name = info->output_semantic_name[i];
2782 unsigned index = info->output_semantic_index[i];
2783 int param = si_shader_io_get_unique_index(name, index);
2784 LLVMValueRef dw_addr = LLVMBuildAdd(gallivm->builder, base_dw_addr,
2785 lp_build_const_int32(gallivm, param * 4), "");
2786
2787 for (chan = 0; chan < 4; chan++) {
2788 lds_store(bld_base, chan, dw_addr,
2789 LLVMBuildLoad(gallivm->builder, out_ptr[chan], ""));
2790 }
2791 }
2792 }
2793
/* Epilog of the ES stage (shader feeding the GS): store all outputs to the
 * ESGS ring buffer, one dword per (slot, channel), from where the geometry
 * shader will read them.
 */
static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	struct si_shader *es = ctx->shader;
	struct tgsi_shader_info *info = &es->selector->info;
	/* Base offset into the ESGS ring for this shader invocation. */
	LLVMValueRef soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
					    ctx->param_es2gs_offset);
	unsigned chan;
	int i;

	for (i = 0; i < info->num_outputs; i++) {
		LLVMValueRef *out_ptr =
			ctx->radeon_bld.soa.outputs[i];
		int param_index;

		/* Viewport index and layer are not written through the ring
		 * here — presumably handled elsewhere; confirm with callers. */
		if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
		    info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
			continue;

		param_index = si_shader_io_get_unique_index(info->output_semantic_name[i],
							    info->output_semantic_index[i]);

		for (chan = 0; chan < 4; chan++) {
			LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
			out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");

			/* Byte offset: 4 dwords per slot, 4 bytes per dword. */
			build_tbuffer_store(ctx,
					    ctx->esgs_ring,
					    out_val, 1,
					    LLVMGetUndef(ctx->i32), soffset,
					    (4 * param_index + chan) * 4,
					    V_008F0C_BUF_DATA_FORMAT_32,
					    V_008F0C_BUF_NUM_FORMAT_UINT,
					    0, 0, 1, 1, 0);
		}
	}
}
2832
2833 static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
2834 {
2835 struct si_shader_context *ctx = si_shader_context(bld_base);
2836 struct gallivm_state *gallivm = bld_base->base.gallivm;
2837 LLVMValueRef args[2];
2838
2839 args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_NOP | SENDMSG_GS_DONE);
2840 args[1] = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
2841 lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
2842 ctx->voidt, args, 2, 0);
2843 }
2844
2845 static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base)
2846 {
2847 struct si_shader_context *ctx = si_shader_context(bld_base);
2848 struct gallivm_state *gallivm = bld_base->base.gallivm;
2849 struct tgsi_shader_info *info = &ctx->shader->selector->info;
2850 struct si_shader_output_values *outputs = NULL;
2851 int i,j;
2852
2853 assert(!ctx->is_gs_copy_shader);
2854
2855 outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));
2856
2857 /* Vertex color clamping.
2858 *
2859 * This uses a state constant loaded in a user data SGPR and
2860 * an IF statement is added that clamps all colors if the constant
2861 * is true.
2862 */
2863 if (ctx->type == PIPE_SHADER_VERTEX) {
2864 struct lp_build_if_state if_ctx;
2865 LLVMValueRef cond = NULL;
2866 LLVMValueRef addr, val;
2867
2868 for (i = 0; i < info->num_outputs; i++) {
2869 if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR &&
2870 info->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR)
2871 continue;
2872
2873 /* We've found a color. */
2874 if (!cond) {
2875 /* The state is in the first bit of the user SGPR. */
2876 cond = LLVMGetParam(ctx->radeon_bld.main_fn,
2877 SI_PARAM_VS_STATE_BITS);
2878 cond = LLVMBuildTrunc(gallivm->builder, cond,
2879 ctx->i1, "");
2880 lp_build_if(&if_ctx, gallivm, cond);
2881 }
2882
2883 for (j = 0; j < 4; j++) {
2884 addr = ctx->radeon_bld.soa.outputs[i][j];
2885 val = LLVMBuildLoad(gallivm->builder, addr, "");
2886 val = radeon_llvm_saturate(bld_base, val);
2887 LLVMBuildStore(gallivm->builder, val, addr);
2888 }
2889 }
2890
2891 if (cond)
2892 lp_build_endif(&if_ctx);
2893 }
2894
2895 for (i = 0; i < info->num_outputs; i++) {
2896 outputs[i].name = info->output_semantic_name[i];
2897 outputs[i].sid = info->output_semantic_index[i];
2898
2899 for (j = 0; j < 4; j++)
2900 outputs[i].values[j] =
2901 LLVMBuildLoad(gallivm->builder,
2902 ctx->radeon_bld.soa.outputs[i][j],
2903 "");
2904 }
2905
2906 if (ctx->is_monolithic) {
2907 /* Export PrimitiveID when PS needs it. */
2908 if (si_vs_exports_prim_id(ctx->shader)) {
2909 outputs[i].name = TGSI_SEMANTIC_PRIMID;
2910 outputs[i].sid = 0;
2911 outputs[i].values[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2912 get_primitive_id(bld_base, 0));
2913 outputs[i].values[1] = bld_base->base.undef;
2914 outputs[i].values[2] = bld_base->base.undef;
2915 outputs[i].values[3] = bld_base->base.undef;
2916 i++;
2917 }
2918 } else {
2919 /* Return the primitive ID from the LLVM function. */
2920 ctx->return_value =
2921 LLVMBuildInsertValue(gallivm->builder,
2922 ctx->return_value,
2923 bitcast(bld_base, TGSI_TYPE_FLOAT,
2924 get_primitive_id(bld_base, 0)),
2925 VS_EPILOG_PRIMID_LOC, "");
2926 }
2927
2928 si_llvm_export_vs(bld_base, outputs, i);
2929 FREE(outputs);
2930 }
2931
/* Queue of pending "llvm.SI.export" calls for the pixel shader epilog:
 * up to 10 exports of 9 arguments each, flushed by si_emit_ps_exports(). */
struct si_ps_exports {
	unsigned num;		/* number of queued exports */
	LLVMValueRef args[10][9];
};
2936
/* Queue the MRTZ export (depth / stencil / sample mask) into \p exp.
 * At least one of the three values must be non-NULL.
 */
static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
			    LLVMValueRef depth, LLVMValueRef stencil,
			    LLVMValueRef samplemask, struct si_ps_exports *exp)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	struct lp_build_context *uint = &bld_base->uint_bld;
	LLVMValueRef args[9];
	unsigned mask = 0;

	assert(depth || stencil || samplemask);

	args[1] = uint->one; /* whether the EXEC mask is valid */
	args[2] = uint->one; /* DONE bit */

	/* Specify the target we are exporting */
	args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_MRTZ);

	args[4] = uint->zero; /* COMP flag */
	args[5] = base->undef; /* R, depth */
	args[6] = base->undef; /* G, stencil test value[0:7], stencil op value[8:15] */
	args[7] = base->undef; /* B, sample mask */
	args[8] = base->undef; /* A, alpha to mask */

	if (depth) {
		args[5] = depth;
		mask |= 0x1;
	}

	if (stencil) {
		args[6] = stencil;
		mask |= 0x2;
	}

	if (samplemask) {
		args[7] = samplemask;
		mask |= 0x4;
	}

	/* SI (except OLAND) has a bug that it only looks
	 * at the X writemask component. */
	if (ctx->screen->b.chip_class == SI &&
	    ctx->screen->b.family != CHIP_OLAND)
		mask |= 0x1;

	/* Specify which components to enable */
	args[0] = lp_build_const_int32(base->gallivm, mask);

	/* Defer emission; si_emit_ps_exports() flushes the queue. */
	memcpy(exp->args[exp->num++], args, sizeof(args));
}
2987
/* Queue the color export(s) for color output \p index into \p exp,
 * applying the shader key's epilog state: color clamping, alpha-to-one,
 * alpha test, and line/polygon smoothing.
 *
 * \param is_last  whether this is the last color export of the shader
 *                 (sets the EXEC-valid and DONE bits on the final export)
 */
static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
				LLVMValueRef *color, unsigned index,
				unsigned samplemask_param,
				bool is_last, struct si_ps_exports *exp)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	int i;

	/* Clamp color */
	if (ctx->shader->key.ps.epilog.clamp_color)
		for (i = 0; i < 4; i++)
			color[i] = radeon_llvm_saturate(bld_base, color[i]);

	/* Alpha to one */
	if (ctx->shader->key.ps.epilog.alpha_to_one)
		color[3] = base->one;

	/* Alpha test */
	if (index == 0 &&
	    ctx->shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
		si_alpha_test(bld_base, color[3]);

	/* Line & polygon smoothing */
	if (ctx->shader->key.ps.epilog.poly_line_smoothing)
		color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3],
							 samplemask_param);

	/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
	if (ctx->shader->key.ps.epilog.last_cbuf > 0) {
		LLVMValueRef args[8][9];
		int c, last = -1;

		/* Get the export arguments, also find out what the last one is. */
		for (c = 0; c <= ctx->shader->key.ps.epilog.last_cbuf; c++) {
			si_llvm_init_export_args(bld_base, color,
						 V_008DFC_SQ_EXP_MRT + c, args[c]);
			if (args[c][0] != bld_base->uint_bld.zero)
				last = c;
		}

		/* Emit all exports. */
		for (c = 0; c <= ctx->shader->key.ps.epilog.last_cbuf; c++) {
			if (is_last && last == c) {
				args[c][1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */
				args[c][2] = bld_base->uint_bld.one; /* DONE bit */
			} else if (args[c][0] == bld_base->uint_bld.zero)
				continue; /* unnecessary NULL export */

			memcpy(exp->args[exp->num++], args[c], sizeof(args[c]));
		}
	} else {
		LLVMValueRef args[9];

		/* Export */
		si_llvm_init_export_args(bld_base, color, V_008DFC_SQ_EXP_MRT + index,
					 args);
		if (is_last) {
			args[1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */
			args[2] = bld_base->uint_bld.one; /* DONE bit */
		} else if (args[0] == bld_base->uint_bld.zero)
			return; /* unnecessary NULL export */

		memcpy(exp->args[exp->num++], args, sizeof(args));
	}
}
3054
3055 static void si_emit_ps_exports(struct si_shader_context *ctx,
3056 struct si_ps_exports *exp)
3057 {
3058 for (unsigned i = 0; i < exp->num; i++)
3059 lp_build_intrinsic(ctx->radeon_bld.gallivm.builder,
3060 "llvm.SI.export", ctx->voidt,
3061 exp->args[i], 9, 0);
3062 }
3063
3064 static void si_export_null(struct lp_build_tgsi_context *bld_base)
3065 {
3066 struct si_shader_context *ctx = si_shader_context(bld_base);
3067 struct lp_build_context *base = &bld_base->base;
3068 struct lp_build_context *uint = &bld_base->uint_bld;
3069 LLVMValueRef args[9];
3070
3071 args[0] = lp_build_const_int32(base->gallivm, 0x0); /* enabled channels */
3072 args[1] = uint->one; /* whether the EXEC mask is valid */
3073 args[2] = uint->one; /* DONE bit */
3074 args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_NULL);
3075 args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */
3076 args[5] = uint->undef; /* R */
3077 args[6] = uint->undef; /* G */
3078 args[7] = uint->undef; /* B */
3079 args[8] = uint->undef; /* A */
3080
3081 lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
3082 ctx->voidt, args, 9, 0);
3083 }
3084
/* Epilog of a monolithic pixel shader: read all TGSI outputs and emit the
 * color and MRTZ exports (or a NULL export if nothing is written).
 */
static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *base = &bld_base->base;
	struct tgsi_shader_info *info = &shader->selector->info;
	LLVMBuilderRef builder = base->gallivm->builder;
	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
	int last_color_export = -1;
	int i;
	struct si_ps_exports exp = {};

	/* Determine the last export. If MRTZ is present, it's always last.
	 * Otherwise, find the last color export.
	 */
	if (!info->writes_z && !info->writes_stencil && !info->writes_samplemask) {
		unsigned spi_format = shader->key.ps.epilog.spi_shader_col_format;

		/* Don't export NULL and return if alpha-test is enabled. */
		if (shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS &&
		    shader->key.ps.epilog.alpha_func != PIPE_FUNC_NEVER &&
		    (spi_format & 0xf) == 0)
			spi_format |= V_028714_SPI_SHADER_32_AR;

		for (i = 0; i < info->num_outputs; i++) {
			unsigned index = info->output_semantic_index[i];

			if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR)
				continue;

			/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
			if (shader->key.ps.epilog.last_cbuf > 0) {
				/* Just set this if any of the colorbuffers are enabled. */
				if (spi_format &
				    ((1llu << (4 * (shader->key.ps.epilog.last_cbuf + 1))) - 1))
					last_color_export = i;
				continue;
			}

			/* 4 format bits per color buffer in spi_format. */
			if ((spi_format >> (index * 4)) & 0xf)
				last_color_export = i;
		}

		/* If there are no outputs, export NULL. */
		if (last_color_export == -1) {
			si_export_null(bld_base);
			return;
		}
	}

	/* Read the outputs and queue the exports. */
	for (i = 0; i < info->num_outputs; i++) {
		unsigned semantic_name = info->output_semantic_name[i];
		unsigned semantic_index = info->output_semantic_index[i];
		unsigned j;
		LLVMValueRef color[4] = {};

		/* Select the correct target */
		switch (semantic_name) {
		case TGSI_SEMANTIC_POSITION:
			depth = LLVMBuildLoad(builder,
					      ctx->radeon_bld.soa.outputs[i][2], "");
			break;
		case TGSI_SEMANTIC_STENCIL:
			stencil = LLVMBuildLoad(builder,
						ctx->radeon_bld.soa.outputs[i][1], "");
			break;
		case TGSI_SEMANTIC_SAMPLEMASK:
			samplemask = LLVMBuildLoad(builder,
						   ctx->radeon_bld.soa.outputs[i][0], "");
			break;
		case TGSI_SEMANTIC_COLOR:
			for (j = 0; j < 4; j++)
				color[j] = LLVMBuildLoad(builder,
							 ctx->radeon_bld.soa.outputs[i][j], "");

			si_export_mrt_color(bld_base, color, semantic_index,
					    SI_PARAM_SAMPLE_COVERAGE,
					    last_color_export == i, &exp);
			break;
		default:
			fprintf(stderr,
				"Warning: SI unhandled fs output type:%d\n",
				semantic_name);
		}
	}

	/* MRTZ always goes last (after all color exports). */
	if (depth || stencil || samplemask)
		si_export_mrt_z(bld_base, depth, stencil, samplemask, &exp);

	si_emit_ps_exports(ctx, &exp);
}
3176
3177 /**
3178 * Return PS outputs in this order:
3179 *
3180 * v[0:3] = color0.xyzw
3181 * v[4:7] = color1.xyzw
3182 * ...
3183 * vN+0 = Depth
3184 * vN+1 = Stencil
3185 * vN+2 = SampleMask
3186 * vN+3 = SampleMaskIn (used for OpenGL smoothing)
3187 *
3188 * The alpha-ref SGPR is returned via its original location.
3189 */
static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *base = &bld_base->base;
	struct tgsi_shader_info *info = &shader->selector->info;
	LLVMBuilderRef builder = base->gallivm->builder;
	unsigned i, j, first_vgpr, vgpr;

	LLVMValueRef color[8][4] = {};
	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
	LLVMValueRef ret;

	/* Read the output values. */
	for (i = 0; i < info->num_outputs; i++) {
		unsigned semantic_name = info->output_semantic_name[i];
		unsigned semantic_index = info->output_semantic_index[i];

		switch (semantic_name) {
		case TGSI_SEMANTIC_COLOR:
			assert(semantic_index < 8);
			for (j = 0; j < 4; j++) {
				LLVMValueRef ptr = ctx->radeon_bld.soa.outputs[i][j];
				LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
				color[semantic_index][j] = result;
			}
			break;
		case TGSI_SEMANTIC_POSITION:
			depth = LLVMBuildLoad(builder,
					      ctx->radeon_bld.soa.outputs[i][2], "");
			break;
		case TGSI_SEMANTIC_STENCIL:
			stencil = LLVMBuildLoad(builder,
						ctx->radeon_bld.soa.outputs[i][1], "");
			break;
		case TGSI_SEMANTIC_SAMPLEMASK:
			samplemask = LLVMBuildLoad(builder,
						   ctx->radeon_bld.soa.outputs[i][0], "");
			break;
		default:
			fprintf(stderr, "Warning: SI unhandled fs output type:%d\n",
				semantic_name);
		}
	}

	/* Fill the return structure. */
	ret = ctx->return_value;

	/* Set SGPRs. */
	ret = LLVMBuildInsertValue(builder, ret,
				   bitcast(bld_base, TGSI_TYPE_SIGNED,
					   LLVMGetParam(ctx->radeon_bld.main_fn,
							SI_PARAM_ALPHA_REF)),
				   SI_SGPR_ALPHA_REF, "");

	/* Set VGPRs */
	first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
	/* Pack only the written colors; order matches the layout documented
	 * in the function comment above. */
	for (i = 0; i < ARRAY_SIZE(color); i++) {
		if (!color[i][0])
			continue;

		for (j = 0; j < 4; j++)
			ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
	}
	if (depth)
		ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
	if (stencil)
		ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
	if (samplemask)
		ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");

	/* Add the input sample mask for smoothing at the end. */
	if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
		vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
	ret = LLVMBuildInsertValue(builder, ret,
				   LLVMGetParam(ctx->radeon_bld.main_fn,
						SI_PARAM_SAMPLE_COVERAGE), vgpr++, "");

	ctx->return_value = ret;
}
3270
3271 /**
3272 * Given a v8i32 resource descriptor for a buffer, extract the size of the
3273 * buffer in number of elements and return it as an i32.
3274 */
static LLVMValueRef get_buffer_size(
	struct lp_build_tgsi_context *bld_base,
	LLVMValueRef descriptor)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	/* Dword 6 of the v8i32 descriptor holds the size (NUM_RECORDS). */
	LLVMValueRef size =
		LLVMBuildExtractElement(builder, descriptor,
					lp_build_const_int32(gallivm, 6), "");

	if (ctx->screen->b.chip_class >= VI) {
		/* On VI, the descriptor contains the size in bytes,
		 * but TXQ must return the size in elements.
		 * The stride is always non-zero for resources using TXQ.
		 */
		/* Element stride lives in bits [29:16] of dword 5. */
		LLVMValueRef stride =
			LLVMBuildExtractElement(builder, descriptor,
						lp_build_const_int32(gallivm, 5), "");
		stride = LLVMBuildLShr(builder, stride,
				       lp_build_const_int32(gallivm, 16), "");
		stride = LLVMBuildAnd(builder, stride,
				      lp_build_const_int32(gallivm, 0x3FFF), "");

		size = LLVMBuildUDiv(builder, size, stride, "");
	}

	return size;
}
3304
3305 /**
3306 * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
3307 * intrinsic names).
3308 */
static void build_int_type_name(
	LLVMTypeRef type,
	char *buf, unsigned bufsize)
{
	/* NOTE(review): a two-digit vector size ("v16i32") needs 7 bytes
	 * including the terminator, so bufsize == 6 would silently truncate
	 * via snprintf; all current callers pass at least 8. */
	assert(bufsize >= 6);

	if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
		snprintf(buf, bufsize, "v%ui32",
			 LLVMGetVectorSize(type));
	else
		strcpy(buf, "i32");
}
3321
3322 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
3323 struct lp_build_tgsi_context *bld_base,
3324 struct lp_build_emit_data *emit_data);
3325
3326 /* Prevent optimizations (at least of memory accesses) across the current
3327 * point in the program by emitting empty inline assembly that is marked as
3328 * having side effects.
3329 */
3330 static void emit_optimization_barrier(struct si_shader_context *ctx)
3331 {
3332 LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
3333 LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
3334 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, "", "", true, false);
3335 LLVMBuildCall(builder, inlineasm, NULL, 0, "");
3336 }
3337
3338 static void emit_waitcnt(struct si_shader_context *ctx)
3339 {
3340 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3341 LLVMBuilderRef builder = gallivm->builder;
3342 LLVMValueRef args[1] = {
3343 lp_build_const_int32(gallivm, 0xf70)
3344 };
3345 lp_build_intrinsic(builder, "llvm.amdgcn.s.waitcnt",
3346 ctx->voidt, args, 1, 0);
3347 }
3348
/* TGSI MEMBAR: implemented as a wait on outstanding memory operations. */
static void membar_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	emit_waitcnt(si_shader_context(bld_base));
}
3358
/* Return the buffer resource descriptor for a TGSI_FILE_BUFFER source:
 * a preloaded descriptor for direct indices, otherwise a bounds-clamped
 * indirect load from the SI_PARAM_SHADER_BUFFERS descriptor array.
 */
static LLVMValueRef
shader_buffer_fetch_rsrc(struct si_shader_context *ctx,
			 const struct tgsi_full_src_register *reg)
{
	LLVMValueRef ind_index;
	LLVMValueRef rsrc_ptr;

	if (!reg->Register.Indirect)
		return ctx->shader_buffers[reg->Register.Index];

	/* Clamp the index so an out-of-bounds indirect access cannot read
	 * past the descriptor array. */
	ind_index = get_bounded_indirect_index(ctx, &reg->Indirect,
					       reg->Register.Index,
					       SI_NUM_SHADER_BUFFERS);

	rsrc_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_SHADER_BUFFERS);
	return build_indexed_load_const(ctx, rsrc_ptr, ind_index);
}
3376
3377 static bool tgsi_is_array_sampler(unsigned target)
3378 {
3379 return target == TGSI_TEXTURE_1D_ARRAY ||
3380 target == TGSI_TEXTURE_SHADOW1D_ARRAY ||
3381 target == TGSI_TEXTURE_2D_ARRAY ||
3382 target == TGSI_TEXTURE_SHADOW2D_ARRAY ||
3383 target == TGSI_TEXTURE_CUBE_ARRAY ||
3384 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY ||
3385 target == TGSI_TEXTURE_2D_ARRAY_MSAA;
3386 }
3387
3388 static bool tgsi_is_array_image(unsigned target)
3389 {
3390 return target == TGSI_TEXTURE_3D ||
3391 target == TGSI_TEXTURE_CUBE ||
3392 target == TGSI_TEXTURE_1D_ARRAY ||
3393 target == TGSI_TEXTURE_2D_ARRAY ||
3394 target == TGSI_TEXTURE_CUBE_ARRAY ||
3395 target == TGSI_TEXTURE_2D_ARRAY_MSAA;
3396 }
3397
3398 /**
3399 * Given a 256-bit resource descriptor, force the DCC enable bit to off.
3400 *
3401 * At least on Tonga, executing image stores on images with DCC enabled and
3402 * non-trivial can eventually lead to lockups. This can occur when an
3403 * application binds an image as read-only but then uses a shader that writes
3404 * to it. The OpenGL spec allows almost arbitrarily bad behavior (including
3405 * program termination) in this case, but it doesn't cost much to be a bit
3406 * nicer: disabling DCC in the shader still leads to undefined results but
3407 * avoids the lockup.
3408 */
3409 static LLVMValueRef force_dcc_off(struct si_shader_context *ctx,
3410 LLVMValueRef rsrc)
3411 {
3412 if (ctx->screen->b.chip_class <= CIK) {
3413 return rsrc;
3414 } else {
3415 LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
3416 LLVMValueRef i32_6 = LLVMConstInt(ctx->i32, 6, 0);
3417 LLVMValueRef i32_C = LLVMConstInt(ctx->i32, C_008F28_COMPRESSION_EN, 0);
3418 LLVMValueRef tmp;
3419
3420 tmp = LLVMBuildExtractElement(builder, rsrc, i32_6, "");
3421 tmp = LLVMBuildAnd(builder, tmp, i32_C, "");
3422 return LLVMBuildInsertElement(builder, rsrc, tmp, i32_6, "");
3423 }
3424 }
3425
3426 /**
3427 * Load the resource descriptor for \p image.
3428 */
static void
image_fetch_rsrc(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *image,
	bool dcc_off,		/* clear the DCC bit (for image stores) */
	LLVMValueRef *rsrc)	/* out: v8i32 resource descriptor */
{
	struct si_shader_context *ctx = si_shader_context(bld_base);

	assert(image->Register.File == TGSI_FILE_IMAGE);

	if (!image->Register.Indirect) {
		/* Fast path: use preloaded resources */
		*rsrc = ctx->images[image->Register.Index];
	} else {
		/* Indexing and manual load */
		LLVMValueRef ind_index;
		LLVMValueRef rsrc_ptr;
		LLVMValueRef tmp;

		/* From the GL_ARB_shader_image_load_store extension spec:
		 *
		 *    If a shader performs an image load, store, or atomic
		 *    operation using an image variable declared as an array,
		 *    and if the index used to select an individual element is
		 *    negative or greater than or equal to the size of the
		 *    array, the results of the operation are undefined but may
		 *    not lead to termination.
		 */
		ind_index = get_bounded_indirect_index(ctx, &image->Indirect,
						       image->Register.Index,
						       SI_NUM_IMAGES);

		rsrc_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_IMAGES);
		tmp = build_indexed_load_const(ctx, rsrc_ptr, ind_index);
		/* For stores: force DCC off to avoid lockups (see force_dcc_off). */
		if (dcc_off)
			tmp = force_dcc_off(ctx, tmp);
		*rsrc = tmp;
	}
}
3469
/* Fetch the coordinate operands of an image instruction from source
 * operand \p src and pack them into a single i32 or <N x i32> value.
 */
static LLVMValueRef image_fetch_coords(
		struct lp_build_tgsi_context *bld_base,
		const struct tgsi_full_instruction *inst,
		unsigned src)
{
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	unsigned target = inst->Memory.Texture;
	unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
	LLVMValueRef coords[4];
	LLVMValueRef tmp;
	int chan;

	for (chan = 0; chan < num_coords; ++chan) {
		tmp = lp_build_emit_fetch(bld_base, inst, src, chan);
		/* Coordinates are integers; reinterpret the fetched float bits. */
		tmp = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
		coords[chan] = tmp;
	}

	/* A single coordinate stays scalar. */
	if (num_coords == 1)
		return coords[0];

	if (num_coords == 3) {
		/* LLVM has difficulties lowering 3-element vectors. */
		coords[3] = bld_base->uint_bld.undef;
		num_coords = 4;
	}

	return lp_build_gather_values(gallivm, coords, num_coords);
}
3500
3501 /**
3502 * Append the extra mode bits that are used by image load and store.
3503 */
static void image_append_args(
		struct si_shader_context *ctx,
		struct lp_build_emit_data * emit_data,
		unsigned target,
		bool atomic)	/* atomics take no glc operand */
{
	const struct tgsi_full_instruction *inst = emit_data->inst;
	LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
	LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);

	emit_data->args[emit_data->arg_count++] = i1false; /* r128 */
	emit_data->args[emit_data->arg_count++] =
		tgsi_is_array_image(target) ? i1true : i1false; /* da */
	if (!atomic) {
		/* glc: bypass the L1 cache for coherent/volatile accesses. */
		emit_data->args[emit_data->arg_count++] =
			inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
			i1true : i1false; /* glc */
	}
	emit_data->args[emit_data->arg_count++] = i1false; /* slc */
}
3524
3525 /**
3526 * Given a 256 bit resource, extract the top half (which stores the buffer
3527 * resource in the case of textures and images).
3528 */
3529 static LLVMValueRef extract_rsrc_top_half(
3530 struct si_shader_context *ctx,
3531 LLVMValueRef rsrc)
3532 {
3533 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3534 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
3535 LLVMTypeRef v2i128 = LLVMVectorType(ctx->i128, 2);
3536
3537 rsrc = LLVMBuildBitCast(gallivm->builder, rsrc, v2i128, "");
3538 rsrc = LLVMBuildExtractElement(gallivm->builder, rsrc, bld_base->uint_bld.one, "");
3539 rsrc = LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v4i32, "");
3540
3541 return rsrc;
3542 }
3543
3544 /**
3545 * Append the resource and indexing arguments for buffer intrinsics.
3546 *
3547 * \param rsrc the v4i32 buffer resource
3548 * \param index index into the buffer (stride-based)
3549 * \param offset byte offset into the buffer
3550 */
static void buffer_append_args(
		struct si_shader_context *ctx,
		struct lp_build_emit_data *emit_data,
		LLVMValueRef rsrc,
		LLVMValueRef index,
		LLVMValueRef offset,
		bool atomic)	/* atomics take no glc operand */
{
	const struct tgsi_full_instruction *inst = emit_data->inst;
	LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
	LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);

	emit_data->args[emit_data->arg_count++] = rsrc;
	emit_data->args[emit_data->arg_count++] = index; /* vindex */
	emit_data->args[emit_data->arg_count++] = offset; /* voffset */
	if (!atomic) {
		/* glc: bypass the L1 cache for coherent/volatile accesses. */
		emit_data->args[emit_data->arg_count++] =
			inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
			i1true : i1false; /* glc */
	}
	emit_data->args[emit_data->arg_count++] = i1false; /* slc */
}
3573
/* TGSI LOAD: collect the intrinsic arguments for SSBO, buffer-image, or
 * image loads into emit_data; load_emit() picks the intrinsic later.
 */
static void load_fetch_args(
		struct lp_build_tgsi_context * bld_base,
		struct lp_build_emit_data * emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	unsigned target = inst->Memory.Texture;
	LLVMValueRef rsrc;

	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		LLVMBuilderRef builder = gallivm->builder;
		LLVMValueRef offset;
		LLVMValueRef tmp;

		rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);

		/* Source operand 1 holds the byte offset. */
		tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
		offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");

		buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
				   offset, false);
	} else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
		LLVMValueRef coords;

		image_fetch_rsrc(bld_base, &inst->Src[0], false, &rsrc);
		coords = image_fetch_coords(bld_base, inst, 1);

		if (target == TGSI_TEXTURE_BUFFER) {
			/* Buffer images use the buffer.load path with the
			 * coordinate as vindex. */
			rsrc = extract_rsrc_top_half(ctx, rsrc);
			buffer_append_args(ctx, emit_data, rsrc, coords,
					   bld_base->uint_bld.zero, false);
		} else {
			emit_data->args[0] = coords;
			emit_data->args[1] = rsrc;
			emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */
			emit_data->arg_count = 3;

			image_append_args(ctx, emit_data, target, false);
		}
	}
}
3618
/* Emit an SSBO load using the narrowest amdgcn.buffer.load variant that
 * covers the instruction's writemask.
 */
static void load_emit_buffer(struct si_shader_context *ctx,
			     struct lp_build_emit_data *emit_data)
{
	const struct tgsi_full_instruction *inst = emit_data->inst;
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	uint writemask = inst->Dst[0].Register.WriteMask;
	/* Number of contiguous channels to load (highest set bit). */
	uint count = util_last_bit(writemask);
	const char *intrinsic_name;
	LLVMTypeRef dst_type;

	switch (count) {
	case 1:
		intrinsic_name = "llvm.amdgcn.buffer.load.f32";
		dst_type = ctx->f32;
		break;
	case 2:
		intrinsic_name = "llvm.amdgcn.buffer.load.v2f32";
		dst_type = LLVMVectorType(ctx->f32, 2);
		break;
	default: // 3 & 4
		/* There is no 3-channel variant; load 4 and let the
		 * writemask discard the extra channel. */
		intrinsic_name = "llvm.amdgcn.buffer.load.v4f32";
		dst_type = ctx->v4f32;
		count = 4;
	}

	emit_data->output[emit_data->chan] = lp_build_intrinsic(
			builder, intrinsic_name, dst_type,
			emit_data->args, emit_data->arg_count,
			LLVMReadOnlyAttribute);
}
3650
3651 static LLVMValueRef get_memory_ptr(struct si_shader_context *ctx,
3652 const struct tgsi_full_instruction *inst,
3653 LLVMTypeRef type, int arg)
3654 {
3655 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3656 LLVMBuilderRef builder = gallivm->builder;
3657 LLVMValueRef offset, ptr;
3658 int addr_space;
3659
3660 offset = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base, inst, arg, 0);
3661 offset = LLVMBuildBitCast(builder, offset, ctx->i32, "");
3662
3663 ptr = ctx->shared_memory;
3664 ptr = LLVMBuildGEP(builder, ptr, &offset, 1, "");
3665 addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
3666 ptr = LLVMBuildBitCast(builder, ptr, LLVMPointerType(type, addr_space), "");
3667
3668 return ptr;
3669 }
3670
/* Emit a TGSI LOAD from shared memory: load each written channel and
 * gather the result into a 4-component vector (unwritten channels are
 * undef).
 */
static void load_emit_memory(
		struct si_shader_context *ctx,
		struct lp_build_emit_data *emit_data)
{
	const struct tgsi_full_instruction *inst = emit_data->inst;
	struct lp_build_context *base = &ctx->radeon_bld.soa.bld_base.base;
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	unsigned writemask = inst->Dst[0].Register.WriteMask;
	LLVMValueRef channels[4], ptr, derived_ptr, index;
	int chan;

	ptr = get_memory_ptr(ctx, inst, base->elem_type, 1);

	for (chan = 0; chan < 4; ++chan) {
		if (!(writemask & (1 << chan))) {
			channels[chan] = LLVMGetUndef(base->elem_type);
			continue;
		}

		index = lp_build_const_int32(gallivm, chan);
		derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
		channels[chan] = LLVMBuildLoad(builder, derived_ptr, "");
	}
	emit_data->output[emit_data->chan] = lp_build_gather_values(gallivm, channels, 4);
}
3697
/* Emit a TGSI LOAD, dispatching on the source file: shared memory, SSBO,
 * buffer image, or regular image (args were set up by load_fetch_args).
 */
static void load_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	char intrinsic_name[32];
	char coords_type[8];

	if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
		load_emit_memory(ctx, emit_data);
		return;
	}

	/* Volatile loads must not be reordered with earlier memory ops. */
	if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
		emit_waitcnt(ctx);

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		load_emit_buffer(ctx, emit_data);
		return;
	}

	if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
		emit_data->output[emit_data->chan] =
			lp_build_intrinsic(
				builder, "llvm.amdgcn.buffer.load.format.v4f32", emit_data->dst_type,
				emit_data->args, emit_data->arg_count,
				LLVMReadOnlyAttribute);
	} else {
		/* The image.load intrinsic is overloaded on the coordinate
		 * type; build the matching name suffix. */
		build_int_type_name(LLVMTypeOf(emit_data->args[0]),
				    coords_type, sizeof(coords_type));

		snprintf(intrinsic_name, sizeof(intrinsic_name),
			 "llvm.amdgcn.image.load.%s", coords_type);

		emit_data->output[emit_data->chan] =
			lp_build_intrinsic(
				builder, intrinsic_name, emit_data->dst_type,
				emit_data->args, emit_data->arg_count,
				LLVMReadOnlyAttribute);
	}
}
3743
/* TGSI STORE: collect the intrinsic arguments (data first, then resource,
 * coordinates and mode bits) for SSBO, buffer-image, or image stores.
 */
static void store_fetch_args(
		struct lp_build_tgsi_context * bld_base,
		struct lp_build_emit_data * emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	struct tgsi_full_src_register memory;
	LLVMValueRef chans[4];
	LLVMValueRef data;
	LLVMValueRef rsrc;
	unsigned chan;

	emit_data->dst_type = LLVMVoidTypeInContext(gallivm->context);

	/* The value to store comes from source operand 1. */
	for (chan = 0; chan < 4; ++chan) {
		chans[chan] = lp_build_emit_fetch(bld_base, inst, 1, chan);
	}
	data = lp_build_gather_values(gallivm, chans, 4);

	emit_data->args[emit_data->arg_count++] = data;

	/* The destination register names the buffer/image resource. */
	memory = tgsi_full_src_register_from_dst(&inst->Dst[0]);

	if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
		LLVMValueRef offset;
		LLVMValueRef tmp;

		rsrc = shader_buffer_fetch_rsrc(ctx, &memory);

		/* Source operand 0 holds the byte offset. */
		tmp = lp_build_emit_fetch(bld_base, inst, 0, 0);
		offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");

		buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
				   offset, false);
	} else if (inst->Dst[0].Register.File == TGSI_FILE_IMAGE) {
		unsigned target = inst->Memory.Texture;
		LLVMValueRef coords;

		coords = image_fetch_coords(bld_base, inst, 0);

		if (target == TGSI_TEXTURE_BUFFER) {
			image_fetch_rsrc(bld_base, &memory, false, &rsrc);

			rsrc = extract_rsrc_top_half(ctx, rsrc);
			buffer_append_args(ctx, emit_data, rsrc, coords,
					   bld_base->uint_bld.zero, false);
		} else {
			emit_data->args[1] = coords;
			/* dcc_off=true: image stores with DCC can lock up
			 * the GPU (see force_dcc_off). */
			image_fetch_rsrc(bld_base, &memory, true, &emit_data->args[2]);
			emit_data->args[3] = lp_build_const_int32(gallivm, 15); /* dmask */
			emit_data->arg_count = 4;

			image_append_args(ctx, emit_data, target, false);
		}
	}
}
3802
/* Emit a TGSI STORE to a shader storage buffer.
 *
 * emit_data->args was set up by store_fetch_args:
 *   args[0] = value to store (v4f32), args[3] = byte offset.
 * The writemask is split into runs of consecutive channels and one
 * buffer.store intrinsic (f32/v2f32/v4f32) is emitted per run.
 */
static void store_emit_buffer(
		struct si_shader_context *ctx,
		struct lp_build_emit_data *emit_data)
{
	const struct tgsi_full_instruction *inst = emit_data->inst;
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	struct lp_build_context *uint_bld = &ctx->radeon_bld.soa.bld_base.uint_bld;
	LLVMValueRef base_data = emit_data->args[0];
	LLVMValueRef base_offset = emit_data->args[3];
	unsigned writemask = inst->Dst[0].Register.WriteMask;

	while (writemask) {
		int start, count;
		const char *intrinsic_name;
		LLVMValueRef data;
		LLVMValueRef offset;
		LLVMValueRef tmp;

		u_bit_scan_consecutive_range(&writemask, &start, &count);

		/* Due to an LLVM limitation, split 3-element writes
		 * into a 2-element and a 1-element write. */
		if (count == 3) {
			writemask |= 1 << (start + 2);
			count = 2;
		}

		if (count == 4) {
			data = base_data;
			intrinsic_name = "llvm.amdgcn.buffer.store.v4f32";
		} else if (count == 2) {
			LLVMTypeRef v2f32 = LLVMVectorType(ctx->f32, 2);

			/* Build a v2f32 from channels start, start+1. */
			tmp = LLVMBuildExtractElement(
				builder, base_data,
				lp_build_const_int32(gallivm, start), "");
			data = LLVMBuildInsertElement(
				builder, LLVMGetUndef(v2f32), tmp,
				uint_bld->zero, "");

			tmp = LLVMBuildExtractElement(
				builder, base_data,
				lp_build_const_int32(gallivm, start + 1), "");
			data = LLVMBuildInsertElement(
				builder, data, tmp, uint_bld->one, "");

			intrinsic_name = "llvm.amdgcn.buffer.store.v2f32";
		} else {
			assert(count == 1);
			data = LLVMBuildExtractElement(
				builder, base_data,
				lp_build_const_int32(gallivm, start), "");
			intrinsic_name = "llvm.amdgcn.buffer.store.f32";
		}

		/* Advance the byte offset to the first channel of the run
		 * (4 bytes per channel). */
		offset = base_offset;
		if (start != 0) {
			offset = LLVMBuildAdd(
				builder, offset,
				lp_build_const_int32(gallivm, start * 4), "");
		}

		emit_data->args[0] = data;
		emit_data->args[3] = offset;

		lp_build_intrinsic(
			builder, intrinsic_name, emit_data->dst_type,
			emit_data->args, emit_data->arg_count, 0);
	}
}
3874
3875 static void store_emit_memory(
3876 struct si_shader_context *ctx,
3877 struct lp_build_emit_data *emit_data)
3878 {
3879 const struct tgsi_full_instruction *inst = emit_data->inst;
3880 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3881 struct lp_build_context *base = &ctx->radeon_bld.soa.bld_base.base;
3882 LLVMBuilderRef builder = gallivm->builder;
3883 unsigned writemask = inst->Dst[0].Register.WriteMask;
3884 LLVMValueRef ptr, derived_ptr, data, index;
3885 int chan;
3886
3887 ptr = get_memory_ptr(ctx, inst, base->elem_type, 0);
3888
3889 for (chan = 0; chan < 4; ++chan) {
3890 if (!(writemask & (1 << chan))) {
3891 continue;
3892 }
3893 data = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base, inst, 1, chan);
3894 index = lp_build_const_int32(gallivm, chan);
3895 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
3896 LLVMBuildStore(builder, data, derived_ptr);
3897 }
3898 }
3899
3900 static void store_emit(
3901 const struct lp_build_tgsi_action *action,
3902 struct lp_build_tgsi_context *bld_base,
3903 struct lp_build_emit_data *emit_data)
3904 {
3905 struct si_shader_context *ctx = si_shader_context(bld_base);
3906 struct gallivm_state *gallivm = bld_base->base.gallivm;
3907 LLVMBuilderRef builder = gallivm->builder;
3908 const struct tgsi_full_instruction * inst = emit_data->inst;
3909 unsigned target = inst->Memory.Texture;
3910 char intrinsic_name[32];
3911 char coords_type[8];
3912
3913 if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY) {
3914 store_emit_memory(ctx, emit_data);
3915 return;
3916 }
3917
3918 if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
3919 emit_waitcnt(ctx);
3920
3921 if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
3922 store_emit_buffer(ctx, emit_data);
3923 return;
3924 }
3925
3926 if (target == TGSI_TEXTURE_BUFFER) {
3927 emit_data->output[emit_data->chan] = lp_build_intrinsic(
3928 builder, "llvm.amdgcn.buffer.store.format.v4f32",
3929 emit_data->dst_type, emit_data->args,
3930 emit_data->arg_count, 0);
3931 } else {
3932 build_int_type_name(LLVMTypeOf(emit_data->args[1]),
3933 coords_type, sizeof(coords_type));
3934 snprintf(intrinsic_name, sizeof(intrinsic_name),
3935 "llvm.amdgcn.image.store.%s", coords_type);
3936
3937 emit_data->output[emit_data->chan] =
3938 lp_build_intrinsic(
3939 builder, intrinsic_name, emit_data->dst_type,
3940 emit_data->args, emit_data->arg_count, 0);
3941 }
3942 }
3943
/* Fetch arguments for a TGSI atomic opcode on a buffer or image.
 *
 * Argument layout: [swap value (ATOMCAS only),] data value, then the
 * resource and addressing arguments appended by buffer_append_args or
 * image_append_args.
 */
static void atomic_fetch_args(
		struct lp_build_tgsi_context * bld_base,
		struct lp_build_emit_data * emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	LLVMValueRef data1, data2;
	LLVMValueRef rsrc;
	LLVMValueRef tmp;

	/* The result is the value before the atomic operation. */
	emit_data->dst_type = bld_base->base.elem_type;

	/* Src[2].x: the operand (for ATOMCAS, the comparison value). */
	tmp = lp_build_emit_fetch(bld_base, inst, 2, 0);
	data1 = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");

	if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
		/* Src[3].x: the value stored on a successful compare. */
		tmp = lp_build_emit_fetch(bld_base, inst, 3, 0);
		data2 = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
	}

	/* llvm.amdgcn.image/buffer.atomic.cmpswap reflect the hardware order
	 * of arguments, which is reversed relative to TGSI (and GLSL)
	 */
	if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
		emit_data->args[emit_data->arg_count++] = data2;
	emit_data->args[emit_data->arg_count++] = data1;

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		LLVMValueRef offset;

		rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);

		/* Src[1].x is the byte offset into the buffer. */
		tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
		offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");

		buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
				   offset, true);
	} else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
		unsigned target = inst->Memory.Texture;
		LLVMValueRef coords;

		image_fetch_rsrc(bld_base, &inst->Src[0],
				 target != TGSI_TEXTURE_BUFFER, &rsrc);
		coords = image_fetch_coords(bld_base, inst, 1);

		if (target == TGSI_TEXTURE_BUFFER) {
			/* Buffer images use only the second half of the
			 * combined descriptor. */
			rsrc = extract_rsrc_top_half(ctx, rsrc);
			buffer_append_args(ctx, emit_data, rsrc, coords,
					   bld_base->uint_bld.zero, true);
		} else {
			emit_data->args[emit_data->arg_count++] = coords;
			emit_data->args[emit_data->arg_count++] = rsrc;

			image_append_args(ctx, emit_data, target, true);
		}
	}
}
4003
4004 static void atomic_emit_memory(struct si_shader_context *ctx,
4005 struct lp_build_emit_data *emit_data) {
4006 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
4007 LLVMBuilderRef builder = gallivm->builder;
4008 const struct tgsi_full_instruction * inst = emit_data->inst;
4009 LLVMValueRef ptr, result, arg;
4010
4011 ptr = get_memory_ptr(ctx, inst, ctx->i32, 1);
4012
4013 arg = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base, inst, 2, 0);
4014 arg = LLVMBuildBitCast(builder, arg, ctx->i32, "");
4015
4016 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
4017 LLVMValueRef new_data;
4018 new_data = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base,
4019 inst, 3, 0);
4020
4021 new_data = LLVMBuildBitCast(builder, new_data, ctx->i32, "");
4022
4023 #if HAVE_LLVM >= 0x309
4024 result = LLVMBuildAtomicCmpXchg(builder, ptr, arg, new_data,
4025 LLVMAtomicOrderingSequentiallyConsistent,
4026 LLVMAtomicOrderingSequentiallyConsistent,
4027 false);
4028 #endif
4029
4030 result = LLVMBuildExtractValue(builder, result, 0, "");
4031 } else {
4032 LLVMAtomicRMWBinOp op;
4033
4034 switch(inst->Instruction.Opcode) {
4035 case TGSI_OPCODE_ATOMUADD:
4036 op = LLVMAtomicRMWBinOpAdd;
4037 break;
4038 case TGSI_OPCODE_ATOMXCHG:
4039 op = LLVMAtomicRMWBinOpXchg;
4040 break;
4041 case TGSI_OPCODE_ATOMAND:
4042 op = LLVMAtomicRMWBinOpAnd;
4043 break;
4044 case TGSI_OPCODE_ATOMOR:
4045 op = LLVMAtomicRMWBinOpOr;
4046 break;
4047 case TGSI_OPCODE_ATOMXOR:
4048 op = LLVMAtomicRMWBinOpXor;
4049 break;
4050 case TGSI_OPCODE_ATOMUMIN:
4051 op = LLVMAtomicRMWBinOpUMin;
4052 break;
4053 case TGSI_OPCODE_ATOMUMAX:
4054 op = LLVMAtomicRMWBinOpUMax;
4055 break;
4056 case TGSI_OPCODE_ATOMIMIN:
4057 op = LLVMAtomicRMWBinOpMin;
4058 break;
4059 case TGSI_OPCODE_ATOMIMAX:
4060 op = LLVMAtomicRMWBinOpMax;
4061 break;
4062 default:
4063 unreachable("unknown atomic opcode");
4064 }
4065
4066 result = LLVMBuildAtomicRMW(builder, op, ptr, arg,
4067 LLVMAtomicOrderingSequentiallyConsistent,
4068 false);
4069 }
4070 emit_data->output[emit_data->chan] = LLVMBuildBitCast(builder, result, emit_data->dst_type, "");
4071 }
4072
4073 static void atomic_emit(
4074 const struct lp_build_tgsi_action *action,
4075 struct lp_build_tgsi_context *bld_base,
4076 struct lp_build_emit_data *emit_data)
4077 {
4078 struct si_shader_context *ctx = si_shader_context(bld_base);
4079 struct gallivm_state *gallivm = bld_base->base.gallivm;
4080 LLVMBuilderRef builder = gallivm->builder;
4081 const struct tgsi_full_instruction * inst = emit_data->inst;
4082 char intrinsic_name[40];
4083 LLVMValueRef tmp;
4084
4085 if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
4086 atomic_emit_memory(ctx, emit_data);
4087 return;
4088 }
4089
4090 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
4091 inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
4092 snprintf(intrinsic_name, sizeof(intrinsic_name),
4093 "llvm.amdgcn.buffer.atomic.%s", action->intr_name);
4094 } else {
4095 char coords_type[8];
4096
4097 build_int_type_name(LLVMTypeOf(emit_data->args[1]),
4098 coords_type, sizeof(coords_type));
4099 snprintf(intrinsic_name, sizeof(intrinsic_name),
4100 "llvm.amdgcn.image.atomic.%s.%s",
4101 action->intr_name, coords_type);
4102 }
4103
4104 tmp = lp_build_intrinsic(
4105 builder, intrinsic_name, bld_base->uint_bld.elem_type,
4106 emit_data->args, emit_data->arg_count, 0);
4107 emit_data->output[emit_data->chan] =
4108 LLVMBuildBitCast(builder, tmp, bld_base->base.elem_type, "");
4109 }
4110
/* Fetch arguments for a TGSI RESQ (resource size query) opcode. */
static void resq_fetch_args(
		struct lp_build_tgsi_context * bld_base,
		struct lp_build_emit_data * emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	const struct tgsi_full_src_register *reg = &inst->Src[0];

	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);

	if (reg->Register.File == TGSI_FILE_BUFFER) {
		/* Shader storage buffer: only the descriptor is needed. */
		emit_data->args[0] = shader_buffer_fetch_rsrc(ctx, reg);
		emit_data->arg_count = 1;
	} else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
		/* Buffer image: the size is derived from the descriptor. */
		image_fetch_rsrc(bld_base, reg, false, &emit_data->args[0]);
		emit_data->arg_count = 1;
	} else {
		/* Other images: build the llvm.SI.getresinfo.i32 argument
		 * list (see resq_emit). */
		emit_data->args[0] = bld_base->uint_bld.zero; /* mip level */
		image_fetch_rsrc(bld_base, reg, false, &emit_data->args[1]);
		emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */
		emit_data->args[3] = bld_base->uint_bld.zero; /* unorm */
		emit_data->args[4] = bld_base->uint_bld.zero; /* r128 */
		emit_data->args[5] = tgsi_is_array_image(inst->Memory.Texture) ?
			bld_base->uint_bld.one : bld_base->uint_bld.zero; /* da */
		emit_data->args[6] = bld_base->uint_bld.zero; /* glc */
		emit_data->args[7] = bld_base->uint_bld.zero; /* slc */
		emit_data->args[8] = bld_base->uint_bld.zero; /* tfe */
		emit_data->args[9] = bld_base->uint_bld.zero; /* lwe */
		emit_data->arg_count = 10;
	}
}
4143
/* Emit a TGSI RESQ opcode using the arguments set up by resq_fetch_args. */
static void resq_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	LLVMValueRef out;

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		/* Buffer size in bytes is element 2 of the descriptor. */
		out = LLVMBuildExtractElement(builder, emit_data->args[0],
					      lp_build_const_int32(gallivm, 2), "");
	} else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
		out = get_buffer_size(bld_base, emit_data->args[0]);
	} else {
		out = lp_build_intrinsic(
			builder, "llvm.SI.getresinfo.i32", emit_data->dst_type,
			emit_data->args, emit_data->arg_count,
			LLVMReadNoneAttribute);

		/* Divide the number of layers by 6 to get the number of cubes. */
		if (inst->Memory.Texture == TGSI_TEXTURE_CUBE_ARRAY) {
			LLVMValueRef imm2 = lp_build_const_int32(gallivm, 2);
			LLVMValueRef imm6 = lp_build_const_int32(gallivm, 6);

			LLVMValueRef z = LLVMBuildExtractElement(builder, out, imm2, "");
			z = LLVMBuildBitCast(builder, z, bld_base->uint_bld.elem_type, "");
			z = LLVMBuildSDiv(builder, z, imm6, "");
			z = LLVMBuildBitCast(builder, z, bld_base->base.elem_type, "");
			out = LLVMBuildInsertElement(builder, out, z, imm2, "");
		}
	}

	emit_data->output[emit_data->chan] = out;
}
4180
/* Fill emit_data->args for a texture sample/fetch/query intrinsic.
 *
 * \param param  the texture address components; may be padded in place
 *               with undefs up to the next power of two
 * \param count  number of valid entries in \p param
 * \param dmask  4-bit mask of result components to enable
 */
static void set_tex_fetch_args(struct si_shader_context *ctx,
			       struct lp_build_emit_data *emit_data,
			       unsigned opcode, unsigned target,
			       LLVMValueRef res_ptr, LLVMValueRef samp_ptr,
			       LLVMValueRef *param, unsigned count,
			       unsigned dmask)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	unsigned num_args;
	unsigned is_rect = target == TGSI_TEXTURE_RECT;

	/* Pad to power of two vector */
	while (count < util_next_power_of_two(count))
		param[count++] = LLVMGetUndef(ctx->i32);

	/* Texture coordinates. */
	if (count > 1)
		emit_data->args[0] = lp_build_gather_values(gallivm, param, count);
	else
		emit_data->args[0] = param[0];

	/* Resource. */
	emit_data->args[1] = res_ptr;
	num_args = 2;

	/* TXF/TXQ return raw integers and take no sampler state. */
	if (opcode == TGSI_OPCODE_TXF || opcode == TGSI_OPCODE_TXQ)
		emit_data->dst_type = ctx->v4i32;
	else {
		emit_data->dst_type = ctx->v4f32;

		emit_data->args[num_args++] = samp_ptr;
	}

	emit_data->args[num_args++] = lp_build_const_int32(gallivm, dmask);
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, is_rect); /* unorm */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* r128 */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm,
					tgsi_is_array_sampler(target)); /* da */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* glc */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* slc */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* tfe */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* lwe */

	emit_data->arg_count = num_args;
}
4226
4227 static const struct lp_build_tgsi_action tex_action;
4228
/* Kinds of descriptors that can be loaded from the combined
 * sampler+image descriptor list (see get_sampler_desc_custom). */
enum desc_type {
	DESC_IMAGE,
	DESC_FMASK,
	DESC_SAMPLER
};
4234
4235 static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements)
4236 {
4237 return LLVMPointerType(LLVMArrayType(elem_type, num_elements),
4238 CONST_ADDR_SPACE);
4239 }
4240
4241 /**
4242 * Load an image view, fmask view. or sampler state descriptor.
4243 */
4244 static LLVMValueRef get_sampler_desc_custom(struct si_shader_context *ctx,
4245 LLVMValueRef list, LLVMValueRef index,
4246 enum desc_type type)
4247 {
4248 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
4249 LLVMBuilderRef builder = gallivm->builder;
4250
4251 switch (type) {
4252 case DESC_IMAGE:
4253 /* The image is at [0:7]. */
4254 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
4255 break;
4256 case DESC_FMASK:
4257 /* The FMASK is at [8:15]. */
4258 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
4259 index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 1, 0), "");
4260 break;
4261 case DESC_SAMPLER:
4262 /* The sampler state is at [12:15]. */
4263 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
4264 index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 3, 0), "");
4265 list = LLVMBuildPointerCast(builder, list,
4266 const_array(ctx->v4i32, 0), "");
4267 break;
4268 }
4269
4270 return build_indexed_load_const(ctx, list, index);
4271 }
4272
4273 static LLVMValueRef get_sampler_desc(struct si_shader_context *ctx,
4274 LLVMValueRef index, enum desc_type type)
4275 {
4276 LLVMValueRef list = LLVMGetParam(ctx->radeon_bld.main_fn,
4277 SI_PARAM_SAMPLERS);
4278
4279 return get_sampler_desc_custom(ctx, list, index, type);
4280 }
4281
4282 /* Disable anisotropic filtering if BASE_LEVEL == LAST_LEVEL.
4283 *
4284 * SI-CI:
4285 * If BASE_LEVEL == LAST_LEVEL, the shader must disable anisotropic
4286 * filtering manually. The driver sets img7 to a mask clearing
4287 * MAX_ANISO_RATIO if BASE_LEVEL == LAST_LEVEL. The shader must do:
4288 * s_and_b32 samp0, samp0, img7
4289 *
4290 * VI:
4291 * The ANISO_OVERRIDE sampler field enables this fix in TA.
4292 */
4293 static LLVMValueRef sici_fix_sampler_aniso(struct si_shader_context *ctx,
4294 LLVMValueRef res, LLVMValueRef samp)
4295 {
4296 LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
4297 LLVMValueRef img7, samp0;
4298
4299 if (ctx->screen->b.chip_class >= VI)
4300 return samp;
4301
4302 img7 = LLVMBuildExtractElement(builder, res,
4303 LLVMConstInt(ctx->i32, 7, 0), "");
4304 samp0 = LLVMBuildExtractElement(builder, samp,
4305 LLVMConstInt(ctx->i32, 0, 0), "");
4306 samp0 = LLVMBuildAnd(builder, samp0, img7, "");
4307 return LLVMBuildInsertElement(builder, samp, samp0,
4308 LLVMConstInt(ctx->i32, 0, 0), "");
4309 }
4310
/* Fetch the resource, sampler state and FMASK descriptors for a texture
 * instruction, handling both direct and indirect sampler indexing.
 *
 * samp_ptr and fmask_ptr may be NULL if the caller does not need them.
 */
static void tex_fetch_ptrs(
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data,
	LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr, LLVMValueRef *fmask_ptr)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	const struct tgsi_full_instruction *inst = emit_data->inst;
	unsigned target = inst->Texture.Texture;
	unsigned sampler_src;
	unsigned sampler_index;

	/* The sampler is always the last source operand. */
	sampler_src = emit_data->inst->Instruction.NumSrcRegs - 1;
	sampler_index = emit_data->inst->Src[sampler_src].Register.Index;

	if (emit_data->inst->Src[sampler_src].Register.Indirect) {
		/* Indirect indexing: load the descriptors at runtime. */
		const struct tgsi_full_src_register *reg = &emit_data->inst->Src[sampler_src];
		LLVMValueRef ind_index;

		/* Bound the index to avoid reading past the descriptor array. */
		ind_index = get_bounded_indirect_index(ctx,
						       &reg->Indirect,
						       reg->Register.Index,
						       SI_NUM_SAMPLERS);

		*res_ptr = get_sampler_desc(ctx, ind_index, DESC_IMAGE);

		if (target == TGSI_TEXTURE_2D_MSAA ||
		    target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
			/* MSAA targets use an FMASK and no sampler state. */
			if (samp_ptr)
				*samp_ptr = NULL;
			if (fmask_ptr)
				*fmask_ptr = get_sampler_desc(ctx, ind_index, DESC_FMASK);
		} else {
			if (samp_ptr) {
				*samp_ptr = get_sampler_desc(ctx, ind_index, DESC_SAMPLER);
				*samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr);
			}
			if (fmask_ptr)
				*fmask_ptr = NULL;
		}
	} else {
		/* Direct indexing: use the preloaded descriptor values. */
		*res_ptr = ctx->sampler_views[sampler_index];
		if (samp_ptr)
			*samp_ptr = ctx->sampler_states[sampler_index];
		if (fmask_ptr)
			*fmask_ptr = ctx->fmasks[sampler_index];
	}
}
4358
/* Fetch arguments for a TGSI TXQ (texture size query) opcode. */
static void txq_fetch_args(
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	unsigned target = inst->Texture.Texture;
	LLVMValueRef res_ptr;
	LLVMValueRef address;

	tex_fetch_ptrs(bld_base, emit_data, &res_ptr, NULL, NULL);

	if (target == TGSI_TEXTURE_BUFFER) {
		/* Read the size from the buffer descriptor directly. */
		LLVMValueRef res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, "");
		emit_data->args[0] = get_buffer_size(bld_base, res);
		return;
	}

	/* Textures - set the mip level. */
	address = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_X);

	set_tex_fetch_args(ctx, emit_data, TGSI_OPCODE_TXQ, target, res_ptr,
			   NULL, &address, 1, 0xf);
}
4386
/* Emit a TGSI TXQ opcode using the arguments set up by txq_fetch_args. */
static void txq_emit(const struct lp_build_tgsi_action *action,
		     struct lp_build_tgsi_context *bld_base,
		     struct lp_build_emit_data *emit_data)
{
	struct lp_build_context *base = &bld_base->base;
	unsigned target = emit_data->inst->Texture.Texture;

	if (target == TGSI_TEXTURE_BUFFER) {
		/* Just return the buffer size. */
		emit_data->output[emit_data->chan] = emit_data->args[0];
		return;
	}

	emit_data->output[emit_data->chan] = lp_build_intrinsic(
		base->gallivm->builder, "llvm.SI.getresinfo.i32",
		emit_data->dst_type, emit_data->args, emit_data->arg_count,
		LLVMReadNoneAttribute);

	/* Divide the number of layers by 6 to get the number of cubes. */
	if (target == TGSI_TEXTURE_CUBE_ARRAY ||
	    target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
		LLVMBuilderRef builder = bld_base->base.gallivm->builder;
		LLVMValueRef two = lp_build_const_int32(bld_base->base.gallivm, 2);
		LLVMValueRef six = lp_build_const_int32(bld_base->base.gallivm, 6);

		LLVMValueRef v4 = emit_data->output[emit_data->chan];
		LLVMValueRef z = LLVMBuildExtractElement(builder, v4, two, "");
		z = LLVMBuildSDiv(builder, z, six, "");

		emit_data->output[emit_data->chan] =
			LLVMBuildInsertElement(builder, v4, z, two, "");
	}
}
4420
4421 static void tex_fetch_args(
4422 struct lp_build_tgsi_context *bld_base,
4423 struct lp_build_emit_data *emit_data)
4424 {
4425 struct si_shader_context *ctx = si_shader_context(bld_base);
4426 struct gallivm_state *gallivm = bld_base->base.gallivm;
4427 const struct tgsi_full_instruction *inst = emit_data->inst;
4428 unsigned opcode = inst->Instruction.Opcode;
4429 unsigned target = inst->Texture.Texture;
4430 LLVMValueRef coords[5], derivs[6];
4431 LLVMValueRef address[16];
4432 unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
4433 int ref_pos = tgsi_util_get_shadow_ref_src_index(target);
4434 unsigned count = 0;
4435 unsigned chan;
4436 unsigned num_deriv_channels = 0;
4437 bool has_offset = inst->Texture.NumOffsets > 0;
4438 LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
4439 unsigned dmask = 0xf;
4440
4441 tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
4442
4443 if (target == TGSI_TEXTURE_BUFFER) {
4444 LLVMTypeRef v2i128 = LLVMVectorType(ctx->i128, 2);
4445
4446 /* Bitcast and truncate v8i32 to v16i8. */
4447 LLVMValueRef res = res_ptr;
4448 res = LLVMBuildBitCast(gallivm->builder, res, v2i128, "");
4449 res = LLVMBuildExtractElement(gallivm->builder, res, bld_base->uint_bld.one, "");
4450 res = LLVMBuildBitCast(gallivm->builder, res, ctx->v16i8, "");
4451
4452 emit_data->dst_type = ctx->v4f32;
4453 emit_data->args[0] = res;
4454 emit_data->args[1] = bld_base->uint_bld.zero;
4455 emit_data->args[2] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
4456 emit_data->arg_count = 3;
4457 return;
4458 }
4459
4460 /* Fetch and project texture coordinates */
4461 coords[3] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_W);
4462 for (chan = 0; chan < 3; chan++ ) {
4463 coords[chan] = lp_build_emit_fetch(bld_base,
4464 emit_data->inst, 0,
4465 chan);
4466 if (opcode == TGSI_OPCODE_TXP)
4467 coords[chan] = lp_build_emit_llvm_binary(bld_base,
4468 TGSI_OPCODE_DIV,
4469 coords[chan],
4470 coords[3]);
4471 }
4472
4473 if (opcode == TGSI_OPCODE_TXP)
4474 coords[3] = bld_base->base.one;
4475
4476 /* Pack offsets. */
4477 if (has_offset && opcode != TGSI_OPCODE_TXF) {
4478 /* The offsets are six-bit signed integers packed like this:
4479 * X=[5:0], Y=[13:8], and Z=[21:16].
4480 */
4481 LLVMValueRef offset[3], pack;
4482
4483 assert(inst->Texture.NumOffsets == 1);
4484
4485 for (chan = 0; chan < 3; chan++) {
4486 offset[chan] = lp_build_emit_fetch_texoffset(bld_base,
4487 emit_data->inst, 0, chan);
4488 offset[chan] = LLVMBuildAnd(gallivm->builder, offset[chan],
4489 lp_build_const_int32(gallivm, 0x3f), "");
4490 if (chan)
4491 offset[chan] = LLVMBuildShl(gallivm->builder, offset[chan],
4492 lp_build_const_int32(gallivm, chan*8), "");
4493 }
4494
4495 pack = LLVMBuildOr(gallivm->builder, offset[0], offset[1], "");
4496 pack = LLVMBuildOr(gallivm->builder, pack, offset[2], "");
4497 address[count++] = pack;
4498 }
4499
4500 /* Pack LOD bias value */
4501 if (opcode == TGSI_OPCODE_TXB)
4502 address[count++] = coords[3];
4503 if (opcode == TGSI_OPCODE_TXB2)
4504 address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4505
4506 /* Pack depth comparison value */
4507 if (tgsi_is_shadow_target(target) && opcode != TGSI_OPCODE_LODQ) {
4508 if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
4509 address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4510 } else {
4511 assert(ref_pos >= 0);
4512 address[count++] = coords[ref_pos];
4513 }
4514 }
4515
4516 /* Pack user derivatives */
4517 if (opcode == TGSI_OPCODE_TXD) {
4518 int param, num_src_deriv_channels;
4519
4520 switch (target) {
4521 case TGSI_TEXTURE_3D:
4522 num_src_deriv_channels = 3;
4523 num_deriv_channels = 3;
4524 break;
4525 case TGSI_TEXTURE_2D:
4526 case TGSI_TEXTURE_SHADOW2D:
4527 case TGSI_TEXTURE_RECT:
4528 case TGSI_TEXTURE_SHADOWRECT:
4529 case TGSI_TEXTURE_2D_ARRAY:
4530 case TGSI_TEXTURE_SHADOW2D_ARRAY:
4531 num_src_deriv_channels = 2;
4532 num_deriv_channels = 2;
4533 break;
4534 case TGSI_TEXTURE_CUBE:
4535 case TGSI_TEXTURE_SHADOWCUBE:
4536 case TGSI_TEXTURE_CUBE_ARRAY:
4537 case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
4538 /* Cube derivatives will be converted to 2D. */
4539 num_src_deriv_channels = 3;
4540 num_deriv_channels = 2;
4541 break;
4542 case TGSI_TEXTURE_1D:
4543 case TGSI_TEXTURE_SHADOW1D:
4544 case TGSI_TEXTURE_1D_ARRAY:
4545 case TGSI_TEXTURE_SHADOW1D_ARRAY:
4546 num_src_deriv_channels = 1;
4547 num_deriv_channels = 1;
4548 break;
4549 default:
4550 unreachable("invalid target");
4551 }
4552
4553 for (param = 0; param < 2; param++)
4554 for (chan = 0; chan < num_src_deriv_channels; chan++)
4555 derivs[param * num_src_deriv_channels + chan] =
4556 lp_build_emit_fetch(bld_base, inst, param+1, chan);
4557 }
4558
4559 if (target == TGSI_TEXTURE_CUBE ||
4560 target == TGSI_TEXTURE_CUBE_ARRAY ||
4561 target == TGSI_TEXTURE_SHADOWCUBE ||
4562 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
4563 radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords, derivs);
4564
4565 if (opcode == TGSI_OPCODE_TXD)
4566 for (int i = 0; i < num_deriv_channels * 2; i++)
4567 address[count++] = derivs[i];
4568
4569 /* Pack texture coordinates */
4570 address[count++] = coords[0];
4571 if (num_coords > 1)
4572 address[count++] = coords[1];
4573 if (num_coords > 2)
4574 address[count++] = coords[2];
4575
4576 /* Pack LOD or sample index */
4577 if (opcode == TGSI_OPCODE_TXL || opcode == TGSI_OPCODE_TXF)
4578 address[count++] = coords[3];
4579 else if (opcode == TGSI_OPCODE_TXL2)
4580 address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4581
4582 if (count > 16) {
4583 assert(!"Cannot handle more than 16 texture address parameters");
4584 count = 16;
4585 }
4586
4587 for (chan = 0; chan < count; chan++ ) {
4588 address[chan] = LLVMBuildBitCast(gallivm->builder,
4589 address[chan], ctx->i32, "");
4590 }
4591
4592 /* Adjust the sample index according to FMASK.
4593 *
4594 * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
4595 * which is the identity mapping. Each nibble says which physical sample
4596 * should be fetched to get that sample.
4597 *
4598 * For example, 0x11111100 means there are only 2 samples stored and
4599 * the second sample covers 3/4 of the pixel. When reading samples 0
4600 * and 1, return physical sample 0 (determined by the first two 0s
4601 * in FMASK), otherwise return physical sample 1.
4602 *
4603 * The sample index should be adjusted as follows:
4604 * sample_index = (fmask >> (sample_index * 4)) & 0xF;
4605 */
4606 if (target == TGSI_TEXTURE_2D_MSAA ||
4607 target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
4608 struct lp_build_context *uint_bld = &bld_base->uint_bld;
4609 struct lp_build_emit_data txf_emit_data = *emit_data;
4610 LLVMValueRef txf_address[4];
4611 unsigned txf_count = count;
4612 struct tgsi_full_instruction inst = {};
4613
4614 memcpy(txf_address, address, sizeof(txf_address));
4615
4616 if (target == TGSI_TEXTURE_2D_MSAA) {
4617 txf_address[2] = bld_base->uint_bld.zero;
4618 }
4619 txf_address[3] = bld_base->uint_bld.zero;
4620
4621 /* Read FMASK using TXF. */
4622 inst.Instruction.Opcode = TGSI_OPCODE_TXF;
4623 inst.Texture.Texture = target;
4624 txf_emit_data.inst = &inst;
4625 txf_emit_data.chan = 0;
4626 set_tex_fetch_args(ctx, &txf_emit_data, TGSI_OPCODE_TXF,
4627 target, fmask_ptr, NULL,
4628 txf_address, txf_count, 0xf);
4629 build_tex_intrinsic(&tex_action, bld_base, &txf_emit_data);
4630
4631 /* Initialize some constants. */
4632 LLVMValueRef four = LLVMConstInt(ctx->i32, 4, 0);
4633 LLVMValueRef F = LLVMConstInt(ctx->i32, 0xF, 0);
4634
4635 /* Apply the formula. */
4636 LLVMValueRef fmask =
4637 LLVMBuildExtractElement(gallivm->builder,
4638 txf_emit_data.output[0],
4639 uint_bld->zero, "");
4640
4641 unsigned sample_chan = target == TGSI_TEXTURE_2D_MSAA ? 2 : 3;
4642
4643 LLVMValueRef sample_index4 =
4644 LLVMBuildMul(gallivm->builder, address[sample_chan], four, "");
4645
4646 LLVMValueRef shifted_fmask =
4647 LLVMBuildLShr(gallivm->builder, fmask, sample_index4, "");
4648
4649 LLVMValueRef final_sample =
4650 LLVMBuildAnd(gallivm->builder, shifted_fmask, F, "");
4651
4652 /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
4653 * resource descriptor is 0 (invalid),
4654 */
4655 LLVMValueRef fmask_desc =
4656 LLVMBuildBitCast(gallivm->builder, fmask_ptr,
4657 ctx->v8i32, "");
4658
4659 LLVMValueRef fmask_word1 =
4660 LLVMBuildExtractElement(gallivm->builder, fmask_desc,
4661 uint_bld->one, "");
4662
4663 LLVMValueRef word1_is_nonzero =
4664 LLVMBuildICmp(gallivm->builder, LLVMIntNE,
4665 fmask_word1, uint_bld->zero, "");
4666
4667 /* Replace the MSAA sample index. */
4668 address[sample_chan] =
4669 LLVMBuildSelect(gallivm->builder, word1_is_nonzero,
4670 final_sample, address[sample_chan], "");
4671 }
4672
4673 if (opcode == TGSI_OPCODE_TXF) {
4674 /* add tex offsets */
4675 if (inst->Texture.NumOffsets) {
4676 struct lp_build_context *uint_bld = &bld_base->uint_bld;
4677 struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
4678 const struct tgsi_texture_offset *off = inst->TexOffsets;
4679
4680 assert(inst->Texture.NumOffsets == 1);
4681
4682 switch (target) {
4683 case TGSI_TEXTURE_3D:
4684 address[2] = lp_build_add(uint_bld, address[2],
4685 bld->immediates[off->Index][off->SwizzleZ]);
4686 /* fall through */
4687 case TGSI_TEXTURE_2D:
4688 case TGSI_TEXTURE_SHADOW2D:
4689 case TGSI_TEXTURE_RECT:
4690 case TGSI_TEXTURE_SHADOWRECT:
4691 case TGSI_TEXTURE_2D_ARRAY:
4692 case TGSI_TEXTURE_SHADOW2D_ARRAY:
4693 address[1] =
4694 lp_build_add(uint_bld, address[1],
4695 bld->immediates[off->Index][off->SwizzleY]);
4696 /* fall through */
4697 case TGSI_TEXTURE_1D:
4698 case TGSI_TEXTURE_SHADOW1D:
4699 case TGSI_TEXTURE_1D_ARRAY:
4700 case TGSI_TEXTURE_SHADOW1D_ARRAY:
4701 address[0] =
4702 lp_build_add(uint_bld, address[0],
4703 bld->immediates[off->Index][off->SwizzleX]);
4704 break;
4705 /* texture offsets do not apply to other texture targets */
4706 }
4707 }
4708 }
4709
4710 if (opcode == TGSI_OPCODE_TG4) {
4711 unsigned gather_comp = 0;
4712
4713 /* DMASK was repurposed for GATHER4. 4 components are always
4714 * returned and DMASK works like a swizzle - it selects
4715 * the component to fetch. The only valid DMASK values are
4716 * 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
4717 * (red,red,red,red) etc.) The ISA document doesn't mention
4718 * this.
4719 */
4720
4721 /* Get the component index from src1.x for Gather4. */
4722 if (!tgsi_is_shadow_target(target)) {
4723 LLVMValueRef (*imms)[4] = lp_soa_context(bld_base)->immediates;
4724 LLVMValueRef comp_imm;
4725 struct tgsi_src_register src1 = inst->Src[1].Register;
4726
4727 assert(src1.File == TGSI_FILE_IMMEDIATE);
4728
4729 comp_imm = imms[src1.Index][src1.SwizzleX];
4730 gather_comp = LLVMConstIntGetZExtValue(comp_imm);
4731 gather_comp = CLAMP(gather_comp, 0, 3);
4732 }
4733
4734 dmask = 1 << gather_comp;
4735 }
4736
4737 set_tex_fetch_args(ctx, emit_data, opcode, target, res_ptr,
4738 samp_ptr, address, count, dmask);
4739 }
4740
4741 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
4742 struct lp_build_tgsi_context *bld_base,
4743 struct lp_build_emit_data *emit_data)
4744 {
4745 struct si_shader_context *ctx = si_shader_context(bld_base);
4746 struct lp_build_context *base = &bld_base->base;
4747 unsigned opcode = emit_data->inst->Instruction.Opcode;
4748 unsigned target = emit_data->inst->Texture.Texture;
4749 char intr_name[127];
4750 bool has_offset = emit_data->inst->Texture.NumOffsets > 0;
4751 bool is_shadow = tgsi_is_shadow_target(target);
4752 char type[64];
4753 const char *name = "llvm.SI.image.sample";
4754 const char *infix = "";
4755
4756 if (target == TGSI_TEXTURE_BUFFER) {
4757 emit_data->output[emit_data->chan] = lp_build_intrinsic(
4758 base->gallivm->builder,
4759 "llvm.SI.vs.load.input", emit_data->dst_type,
4760 emit_data->args, emit_data->arg_count,
4761 LLVMReadNoneAttribute);
4762 return;
4763 }
4764
4765 switch (opcode) {
4766 case TGSI_OPCODE_TXF:
4767 name = target == TGSI_TEXTURE_2D_MSAA ||
4768 target == TGSI_TEXTURE_2D_ARRAY_MSAA ?
4769 "llvm.SI.image.load" :
4770 "llvm.SI.image.load.mip";
4771 is_shadow = false;
4772 has_offset = false;
4773 break;
4774 case TGSI_OPCODE_LODQ:
4775 name = "llvm.SI.getlod";
4776 is_shadow = false;
4777 has_offset = false;
4778 break;
4779 case TGSI_OPCODE_TEX:
4780 case TGSI_OPCODE_TEX2:
4781 case TGSI_OPCODE_TXP:
4782 if (ctx->type != PIPE_SHADER_FRAGMENT)
4783 infix = ".lz";
4784 break;
4785 case TGSI_OPCODE_TXB:
4786 case TGSI_OPCODE_TXB2:
4787 assert(ctx->type == PIPE_SHADER_FRAGMENT);
4788 infix = ".b";
4789 break;
4790 case TGSI_OPCODE_TXL:
4791 case TGSI_OPCODE_TXL2:
4792 infix = ".l";
4793 break;
4794 case TGSI_OPCODE_TXD:
4795 infix = ".d";
4796 break;
4797 case TGSI_OPCODE_TG4:
4798 name = "llvm.SI.gather4";
4799 infix = ".lz";
4800 break;
4801 default:
4802 assert(0);
4803 return;
4804 }
4805
4806 /* Add the type and suffixes .c, .o if needed. */
4807 build_int_type_name(LLVMTypeOf(emit_data->args[0]), type, sizeof(type));
4808 sprintf(intr_name, "%s%s%s%s.%s",
4809 name, is_shadow ? ".c" : "", infix,
4810 has_offset ? ".o" : "", type);
4811
4812 emit_data->output[emit_data->chan] = lp_build_intrinsic(
4813 base->gallivm->builder, intr_name, emit_data->dst_type,
4814 emit_data->args, emit_data->arg_count,
4815 LLVMReadNoneAttribute);
4816 }
4817
4818 static void si_llvm_emit_txqs(
4819 const struct lp_build_tgsi_action *action,
4820 struct lp_build_tgsi_context *bld_base,
4821 struct lp_build_emit_data *emit_data)
4822 {
4823 struct si_shader_context *ctx = si_shader_context(bld_base);
4824 struct gallivm_state *gallivm = bld_base->base.gallivm;
4825 LLVMBuilderRef builder = gallivm->builder;
4826 LLVMValueRef res, samples;
4827 LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
4828
4829 tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
4830
4831
4832 /* Read the samples from the descriptor directly. */
4833 res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, "");
4834 samples = LLVMBuildExtractElement(
4835 builder, res,
4836 lp_build_const_int32(gallivm, 3), "");
4837 samples = LLVMBuildLShr(builder, samples,
4838 lp_build_const_int32(gallivm, 16), "");
4839 samples = LLVMBuildAnd(builder, samples,
4840 lp_build_const_int32(gallivm, 0xf), "");
4841 samples = LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1),
4842 samples, "");
4843
4844 emit_data->output[emit_data->chan] = samples;
4845 }
4846
4847 /*
4848 * SI implements derivatives using the local data store (LDS)
4849 * All writes to the LDS happen in all executing threads at
4850 * the same time. TID is the Thread ID for the current
4851 * thread and is a value between 0 and 63, representing
4852 * the thread's position in the wavefront.
4853 *
4854 * For the pixel shader threads are grouped into quads of four pixels.
4855 * The TIDs of the pixels of a quad are:
4856 *
4857 * +------+------+
4858 * |4n + 0|4n + 1|
4859 * +------+------+
4860 * |4n + 2|4n + 3|
4861 * +------+------+
4862 *
4863 * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
4864 * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
4865 * the current pixel's column, and masking with 0xfffffffe yields the TID
4866 * of the left pixel of the current pixel's row.
4867 *
4868 * Adding 1 yields the TID of the pixel to the right of the left pixel, and
4869 * adding 2 yields the TID of the pixel below the top pixel.
4870 */
4871 /* masks for thread ID. */
4872 #define TID_MASK_TOP_LEFT 0xfffffffc
4873 #define TID_MASK_TOP 0xfffffffd
4874 #define TID_MASK_LEFT 0xfffffffe
4875
/**
 * Emit DDX/DDY (coarse and fine derivatives).
 *
 * Each pixel in a quad exchanges its source value with the relevant
 * neighbor and computes result = neighbor_value - base_value.
 * With LLVM >= 3.9 on Tonga and later the exchange uses the
 * llvm.amdgcn.ds.bpermute intrinsic; otherwise each thread writes its
 * value to its LDS slot and reads back the neighbors' slots.
 */
static void si_llvm_emit_ddxy(
	const struct lp_build_tgsi_action *action,
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	unsigned opcode = inst->Instruction.Opcode;
	LLVMValueRef indices[2];
	LLVMValueRef store_ptr, load_ptr0, load_ptr1;
	LLVMValueRef tl, trbl, result[4];
	LLVMValueRef tl_tid, trbl_tid;
	unsigned swizzle[4];
	unsigned c;
	int idx;
	unsigned mask;

	/* This thread's own LDS slot. */
	indices[0] = bld_base->uint_bld.zero;
	indices[1] = get_thread_id(ctx);
	store_ptr = LLVMBuildGEP(gallivm->builder, ctx->lds,
				 indices, 2, "");

	/* Pick the base pixel of the difference: left pixel for fine DDX,
	 * top pixel for fine DDY, the quad's top-left pixel for the
	 * coarse variants (see the TID masks above). */
	if (opcode == TGSI_OPCODE_DDX_FINE)
		mask = TID_MASK_LEFT;
	else if (opcode == TGSI_OPCODE_DDY_FINE)
		mask = TID_MASK_TOP;
	else
		mask = TID_MASK_TOP_LEFT;

	tl_tid = LLVMBuildAnd(gallivm->builder, indices[1],
			      lp_build_const_int32(gallivm, mask), "");
	indices[1] = tl_tid;
	load_ptr0 = LLVMBuildGEP(gallivm->builder, ctx->lds,
				 indices, 2, "");

	/* For DDX we want the next X pixel, for DDY the next Y pixel. */
	idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;
	trbl_tid = LLVMBuildAdd(gallivm->builder, indices[1],
				lp_build_const_int32(gallivm, idx), "");
	indices[1] = trbl_tid;
	load_ptr1 = LLVMBuildGEP(gallivm->builder, ctx->lds,
				 indices, 2, "");

	for (c = 0; c < 4; ++c) {
		unsigned i;
		LLVMValueRef val;
		LLVMValueRef args[2];

		/* Reuse an earlier channel's result if it reads the same
		 * source swizzle. */
		swizzle[c] = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], c);
		for (i = 0; i < c; ++i) {
			if (swizzle[i] == swizzle[c]) {
				result[c] = result[i];
				break;
			}
		}
		if (i != c)
			continue;

		val = LLVMBuildBitCast(gallivm->builder,
				       lp_build_emit_fetch(bld_base, inst, 0, c),
				       ctx->i32, "");

		if ((HAVE_LLVM >= 0x0309) && ctx->screen->b.family >= CHIP_TONGA) {

			/* ds_bpermute takes byte addresses, hence tid * 4. */
			args[0] = LLVMBuildMul(gallivm->builder, tl_tid,
					       lp_build_const_int32(gallivm, 4), "");
			args[1] = val;
			tl = lp_build_intrinsic(gallivm->builder,
						"llvm.amdgcn.ds.bpermute", ctx->i32,
						args, 2, LLVMReadNoneAttribute);

			args[0] = LLVMBuildMul(gallivm->builder, trbl_tid,
					       lp_build_const_int32(gallivm, 4), "");
			trbl = lp_build_intrinsic(gallivm->builder,
						  "llvm.amdgcn.ds.bpermute", ctx->i32,
						  args, 2, LLVMReadNoneAttribute);
		} else {
			/* Exchange through LDS: every thread stores its
			 * value, then loads the neighbors' slots. */
			LLVMBuildStore(gallivm->builder, val, store_ptr);
			tl = LLVMBuildLoad(gallivm->builder, load_ptr0, "");
			trbl = LLVMBuildLoad(gallivm->builder, load_ptr1, "");
		}
		tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");
		trbl = LLVMBuildBitCast(gallivm->builder, trbl, ctx->f32, "");
		result[c] = LLVMBuildFSub(gallivm->builder, trbl, tl, "");
	}

	emit_data->output[0] = lp_build_gather_values(gallivm, result, 4);
}
4965
4966 /*
4967 * this takes an I,J coordinate pair,
4968 * and works out the X and Y derivatives.
4969 * it returns DDX(I), DDX(J), DDY(I), DDY(J).
4970 */
4971 static LLVMValueRef si_llvm_emit_ddxy_interp(
4972 struct lp_build_tgsi_context *bld_base,
4973 LLVMValueRef interp_ij)
4974 {
4975 struct si_shader_context *ctx = si_shader_context(bld_base);
4976 struct gallivm_state *gallivm = bld_base->base.gallivm;
4977 LLVMValueRef indices[2];
4978 LLVMValueRef store_ptr, load_ptr_x, load_ptr_y, load_ptr_ddx, load_ptr_ddy, temp, temp2;
4979 LLVMValueRef tl, tr, bl, result[4];
4980 unsigned c;
4981
4982 indices[0] = bld_base->uint_bld.zero;
4983 indices[1] = get_thread_id(ctx);
4984 store_ptr = LLVMBuildGEP(gallivm->builder, ctx->lds,
4985 indices, 2, "");
4986
4987 temp = LLVMBuildAnd(gallivm->builder, indices[1],
4988 lp_build_const_int32(gallivm, TID_MASK_LEFT), "");
4989
4990 temp2 = LLVMBuildAnd(gallivm->builder, indices[1],
4991 lp_build_const_int32(gallivm, TID_MASK_TOP), "");
4992
4993 indices[1] = temp;
4994 load_ptr_x = LLVMBuildGEP(gallivm->builder, ctx->lds,
4995 indices, 2, "");
4996
4997 indices[1] = temp2;
4998 load_ptr_y = LLVMBuildGEP(gallivm->builder, ctx->lds,
4999 indices, 2, "");
5000
5001 indices[1] = LLVMBuildAdd(gallivm->builder, temp,
5002 lp_build_const_int32(gallivm, 1), "");
5003 load_ptr_ddx = LLVMBuildGEP(gallivm->builder, ctx->lds,
5004 indices, 2, "");
5005
5006 indices[1] = LLVMBuildAdd(gallivm->builder, temp2,
5007 lp_build_const_int32(gallivm, 2), "");
5008 load_ptr_ddy = LLVMBuildGEP(gallivm->builder, ctx->lds,
5009 indices, 2, "");
5010
5011 for (c = 0; c < 2; ++c) {
5012 LLVMValueRef store_val;
5013 LLVMValueRef c_ll = lp_build_const_int32(gallivm, c);
5014
5015 store_val = LLVMBuildExtractElement(gallivm->builder,
5016 interp_ij, c_ll, "");
5017 LLVMBuildStore(gallivm->builder,
5018 store_val,
5019 store_ptr);
5020
5021 tl = LLVMBuildLoad(gallivm->builder, load_ptr_x, "");
5022 tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");
5023
5024 tr = LLVMBuildLoad(gallivm->builder, load_ptr_ddx, "");
5025 tr = LLVMBuildBitCast(gallivm->builder, tr, ctx->f32, "");
5026
5027 result[c] = LLVMBuildFSub(gallivm->builder, tr, tl, "");
5028
5029 tl = LLVMBuildLoad(gallivm->builder, load_ptr_y, "");
5030 tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");
5031
5032 bl = LLVMBuildLoad(gallivm->builder, load_ptr_ddy, "");
5033 bl = LLVMBuildBitCast(gallivm->builder, bl, ctx->f32, "");
5034
5035 result[c + 2] = LLVMBuildFSub(gallivm->builder, bl, tl, "");
5036 }
5037
5038 return lp_build_gather_values(gallivm, result, 4);
5039 }
5040
5041 static void interp_fetch_args(
5042 struct lp_build_tgsi_context *bld_base,
5043 struct lp_build_emit_data *emit_data)
5044 {
5045 struct si_shader_context *ctx = si_shader_context(bld_base);
5046 struct gallivm_state *gallivm = bld_base->base.gallivm;
5047 const struct tgsi_full_instruction *inst = emit_data->inst;
5048
5049 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
5050 /* offset is in second src, first two channels */
5051 emit_data->args[0] = lp_build_emit_fetch(bld_base,
5052 emit_data->inst, 1,
5053 TGSI_CHAN_X);
5054 emit_data->args[1] = lp_build_emit_fetch(bld_base,
5055 emit_data->inst, 1,
5056 TGSI_CHAN_Y);
5057 emit_data->arg_count = 2;
5058 } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
5059 LLVMValueRef sample_position;
5060 LLVMValueRef sample_id;
5061 LLVMValueRef halfval = lp_build_const_float(gallivm, 0.5f);
5062
5063 /* fetch sample ID, then fetch its sample position,
5064 * and place into first two channels.
5065 */
5066 sample_id = lp_build_emit_fetch(bld_base,
5067 emit_data->inst, 1, TGSI_CHAN_X);
5068 sample_id = LLVMBuildBitCast(gallivm->builder, sample_id,
5069 ctx->i32, "");
5070 sample_position = load_sample_position(&ctx->radeon_bld, sample_id);
5071
5072 emit_data->args[0] = LLVMBuildExtractElement(gallivm->builder,
5073 sample_position,
5074 lp_build_const_int32(gallivm, 0), "");
5075
5076 emit_data->args[0] = LLVMBuildFSub(gallivm->builder, emit_data->args[0], halfval, "");
5077 emit_data->args[1] = LLVMBuildExtractElement(gallivm->builder,
5078 sample_position,
5079 lp_build_const_int32(gallivm, 1), "");
5080 emit_data->args[1] = LLVMBuildFSub(gallivm->builder, emit_data->args[1], halfval, "");
5081 emit_data->arg_count = 2;
5082 }
5083 }
5084
/**
 * Emit TGSI_OPCODE_INTERP_{CENTROID,OFFSET,SAMPLE} for a fragment input.
 *
 * Selects the barycentric (I,J) parameter matching the input's
 * interpolation mode and the opcode's location (center for OFFSET and
 * SAMPLE, centroid otherwise). For OFFSET/SAMPLE, the (I,J) pair is
 * re-derived by applying the requested offset to its screen-space
 * derivatives, then each output channel is produced with
 * llvm.SI.fs.interp (or llvm.SI.fs.constant for flat inputs).
 */
static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
				struct lp_build_tgsi_context *bld_base,
				struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef interp_param;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	const char *intr_name;
	int input_index = inst->Src[0].Register.Index;
	int chan;
	int i;
	LLVMValueRef attr_number;
	LLVMValueRef params = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_PRIM_MASK);
	int interp_param_idx;
	unsigned interp = shader->selector->info.input_interpolate[input_index];
	unsigned location;

	assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);

	/* OFFSET and SAMPLE start from the pixel center. */
	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
	    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE)
		location = TGSI_INTERPOLATE_LOC_CENTER;
	else
		location = TGSI_INTERPOLATE_LOC_CENTROID;

	interp_param_idx = lookup_interp_param_index(interp, location);
	if (interp_param_idx == -1)
		return;
	else if (interp_param_idx)
		interp_param = get_interp_param(ctx, interp_param_idx);
	else
		interp_param = NULL; /* flat input: no (I,J) needed */

	attr_number = lp_build_const_int32(gallivm, input_index);

	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
	    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		LLVMValueRef ij_out[2];
		LLVMValueRef ddxy_out = si_llvm_emit_ddxy_interp(bld_base, interp_param);

		/*
		 * take the I then J parameters, and the DDX/Y for it, and
		 * calculate the IJ inputs for the interpolator.
		 * temp1 = ddx * offset/sample.x + I;
		 * interp_param.I = ddy * offset/sample.y + temp1;
		 * temp1 = ddx * offset/sample.x + J;
		 * interp_param.J = ddy * offset/sample.y + temp1;
		 */
		for (i = 0; i < 2; i++) {
			LLVMValueRef ix_ll = lp_build_const_int32(gallivm, i);
			LLVMValueRef iy_ll = lp_build_const_int32(gallivm, i + 2);
			LLVMValueRef ddx_el = LLVMBuildExtractElement(gallivm->builder,
								      ddxy_out, ix_ll, "");
			LLVMValueRef ddy_el = LLVMBuildExtractElement(gallivm->builder,
								      ddxy_out, iy_ll, "");
			LLVMValueRef interp_el = LLVMBuildExtractElement(gallivm->builder,
									 interp_param, ix_ll, "");
			LLVMValueRef temp1, temp2;

			interp_el = LLVMBuildBitCast(gallivm->builder, interp_el,
						     ctx->f32, "");

			temp1 = LLVMBuildFMul(gallivm->builder, ddx_el, emit_data->args[0], "");

			temp1 = LLVMBuildFAdd(gallivm->builder, temp1, interp_el, "");

			temp2 = LLVMBuildFMul(gallivm->builder, ddy_el, emit_data->args[1], "");

			temp2 = LLVMBuildFAdd(gallivm->builder, temp2, temp1, "");

			ij_out[i] = LLVMBuildBitCast(gallivm->builder,
						     temp2, ctx->i32, "");
		}
		interp_param = lp_build_gather_values(bld_base->base.gallivm, ij_out, 2);
	}

	intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant";
	/* NOTE(review): only output channels 0-1 are written here — confirm
	 * callers never consume the .zw components of INTERP_* results. */
	for (chan = 0; chan < 2; chan++) {
		LLVMValueRef args[4];
		LLVMValueRef llvm_chan;
		unsigned schan;

		schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan);
		llvm_chan = lp_build_const_int32(gallivm, schan);

		args[0] = llvm_chan;
		args[1] = attr_number;
		args[2] = params;
		args[3] = interp_param;

		emit_data->output[chan] =
			lp_build_intrinsic(gallivm->builder, intr_name,
					   ctx->f32, args, args[3] ? 4 : 3,
					   LLVMReadNoneAttribute);
	}
}
5183
5184 static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base,
5185 struct lp_build_emit_data *emit_data)
5186 {
5187 LLVMValueRef (*imms)[4] = lp_soa_context(bld_base)->immediates;
5188 struct tgsi_src_register src0 = emit_data->inst->Src[0].Register;
5189 unsigned stream;
5190
5191 assert(src0.File == TGSI_FILE_IMMEDIATE);
5192
5193 stream = LLVMConstIntGetZExtValue(imms[src0.Index][src0.SwizzleX]) & 0x3;
5194 return stream;
5195 }
5196
/* Emit one vertex from the geometry shader */
static void si_llvm_emit_vertex(
	const struct lp_build_tgsi_action *action,
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *uint = &bld_base->uint_bld;
	struct si_shader *shader = ctx->shader;
	struct tgsi_shader_info *info = &shader->selector->info;
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
					    SI_PARAM_GS2VS_OFFSET);
	LLVMValueRef gs_next_vertex;
	LLVMValueRef can_emit, kill;
	LLVMValueRef args[2];
	unsigned chan;
	int i;
	unsigned stream;

	stream = si_llvm_get_stream(bld_base, emit_data);

	/* Write vertex attribute values to GSVS ring */
	gs_next_vertex = LLVMBuildLoad(gallivm->builder,
				       ctx->gs_next_vertex[stream],
				       "");

	/* If this thread has already emitted the declared maximum number of
	 * vertices, kill it: excessive vertex emissions are not supposed to
	 * have any effect, and GS threads have no externally observable
	 * effects other than emitting vertices.
	 */
	can_emit = LLVMBuildICmp(gallivm->builder, LLVMIntULE, gs_next_vertex,
				 lp_build_const_int32(gallivm,
						      shader->selector->gs_max_out_vertices), "");
	kill = lp_build_select(&bld_base->base, can_emit,
			       lp_build_const_float(gallivm, 1.0f),
			       lp_build_const_float(gallivm, -1.0f));

	lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
			   ctx->voidt, &kill, 1, 0);

	/* Store every declared output of this vertex, one dword at a
	 * time, into the stream's GSVS ring buffer. */
	for (i = 0; i < info->num_outputs; i++) {
		LLVMValueRef *out_ptr =
			ctx->radeon_bld.soa.outputs[i];

		for (chan = 0; chan < 4; chan++) {
			LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
			/* Each (output, channel) pair owns a contiguous
			 * gs_max_out_vertices-element region of the ring;
			 * the slot within it is the vertex index. */
			LLVMValueRef voffset =
				lp_build_const_int32(gallivm, (i * 4 + chan) *
						     shader->selector->gs_max_out_vertices);

			voffset = lp_build_add(uint, voffset, gs_next_vertex);
			voffset = lp_build_mul_imm(uint, voffset, 4);

			out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");

			build_tbuffer_store(ctx,
					    ctx->gsvs_ring[stream],
					    out_val, 1,
					    voffset, soffset, 0,
					    V_008F0C_BUF_DATA_FORMAT_32,
					    V_008F0C_BUF_NUM_FORMAT_UINT,
					    1, 0, 1, 1, 0);
		}
	}
	/* Advance the per-stream emitted-vertex counter. */
	gs_next_vertex = lp_build_add(uint, gs_next_vertex,
				      lp_build_const_int32(gallivm, 1));

	LLVMBuildStore(gallivm->builder, gs_next_vertex, ctx->gs_next_vertex[stream]);

	/* Signal vertex emission */
	args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_EMIT | SENDMSG_GS | (stream << 8));
	args[1] = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
	lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
			   ctx->voidt, args, 2, 0);
}
5274
5275 /* Cut one primitive from the geometry shader */
5276 static void si_llvm_emit_primitive(
5277 const struct lp_build_tgsi_action *action,
5278 struct lp_build_tgsi_context *bld_base,
5279 struct lp_build_emit_data *emit_data)
5280 {
5281 struct si_shader_context *ctx = si_shader_context(bld_base);
5282 struct gallivm_state *gallivm = bld_base->base.gallivm;
5283 LLVMValueRef args[2];
5284 unsigned stream;
5285
5286 /* Signal primitive cut */
5287 stream = si_llvm_get_stream(bld_base, emit_data);
5288 args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_CUT | SENDMSG_GS | (stream << 8));
5289 args[1] = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
5290 lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
5291 ctx->voidt, args, 2, 0);
5292 }
5293
5294 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
5295 struct lp_build_tgsi_context *bld_base,
5296 struct lp_build_emit_data *emit_data)
5297 {
5298 struct si_shader_context *ctx = si_shader_context(bld_base);
5299 struct gallivm_state *gallivm = bld_base->base.gallivm;
5300
5301 /* The real barrier instruction isn’t needed, because an entire patch
5302 * always fits into a single wave.
5303 */
5304 if (ctx->type == PIPE_SHADER_TESS_CTRL) {
5305 emit_optimization_barrier(ctx);
5306 return;
5307 }
5308
5309 lp_build_intrinsic(gallivm->builder,
5310 HAVE_LLVM >= 0x0309 ? "llvm.amdgcn.s.barrier"
5311 : "llvm.AMDGPU.barrier.local",
5312 ctx->voidt, NULL, 0, 0);
5313 }
5314
/* Action table for TGSI texture opcodes. */
static const struct lp_build_tgsi_action tex_action = {
	.fetch_args = tex_fetch_args,
	.emit = build_tex_intrinsic,
};
5319
/* Action table for TGSI_OPCODE_INTERP_* opcodes. */
static const struct lp_build_tgsi_action interp_action = {
	.fetch_args = interp_fetch_args,
	.emit = build_interp_intrinsic,
};
5324
/**
 * Create the main LLVM function for the current shader stage and
 * annotate its parameters.
 *
 * Parameters 0..last_sgpr are annotated as uniform inputs: pointer
 * parameters get ByVal + dereferenceable, everything else gets InReg.
 * The remaining parameters (VGPR inputs) are left unannotated.
 */
static void si_create_function(struct si_shader_context *ctx,
			       LLVMTypeRef *returns, unsigned num_returns,
			       LLVMTypeRef *params, unsigned num_params,
			       int last_sgpr)
{
	int i;

	radeon_llvm_create_func(&ctx->radeon_bld, returns, num_returns,
				params, num_params);
	radeon_llvm_shader_type(ctx->radeon_bld.main_fn, ctx->type);
	/* Start from an undef return value; epilog code fills it in. */
	ctx->return_value = LLVMGetUndef(ctx->radeon_bld.return_type);

	for (i = 0; i <= last_sgpr; ++i) {
		LLVMValueRef P = LLVMGetParam(ctx->radeon_bld.main_fn, i);

		/* The combination of:
		 * - ByVal
		 * - dereferenceable
		 * - invariant.load
		 * allows the optimization passes to move loads and reduces
		 * SGPR spilling significantly.
		 */
		if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
			LLVMAddAttribute(P, LLVMByValAttribute);
			lp_add_attr_dereferenceable(P, UINT64_MAX);
		} else
			LLVMAddAttribute(P, LLVMInRegAttribute);
	}

	if (ctx->screen->b.debug_flags & DBG_UNSAFE_MATH) {
		/* These were copied from some LLVM test. */
		LLVMAddTargetDependentFunctionAttr(ctx->radeon_bld.main_fn,
						   "less-precise-fpmad",
						   "true");
		LLVMAddTargetDependentFunctionAttr(ctx->radeon_bld.main_fn,
						   "no-infs-fp-math",
						   "true");
		LLVMAddTargetDependentFunctionAttr(ctx->radeon_bld.main_fn,
						   "no-nans-fp-math",
						   "true");
		LLVMAddTargetDependentFunctionAttr(ctx->radeon_bld.main_fn,
						   "unsafe-fp-math",
						   "true");
	}
}
5370
5371 static void create_meta_data(struct si_shader_context *ctx)
5372 {
5373 struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
5374
5375 ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(gallivm->context,
5376 "invariant.load", 14);
5377 ctx->range_md_kind = LLVMGetMDKindIDInContext(gallivm->context,
5378 "range", 5);
5379 ctx->uniform_md_kind = LLVMGetMDKindIDInContext(gallivm->context,
5380 "amdgpu.uniform", 14);
5381
5382 ctx->empty_md = LLVMMDNodeInContext(gallivm->context, NULL, 0);
5383 }
5384
5385 static void declare_streamout_params(struct si_shader_context *ctx,
5386 struct pipe_stream_output_info *so,
5387 LLVMTypeRef *params, LLVMTypeRef i32,
5388 unsigned *num_params)
5389 {
5390 int i;
5391
5392 /* Streamout SGPRs. */
5393 if (so->num_outputs) {
5394 if (ctx->type != PIPE_SHADER_TESS_EVAL)
5395 params[ctx->param_streamout_config = (*num_params)++] = i32;
5396 else
5397 ctx->param_streamout_config = ctx->param_tess_offchip;
5398
5399 params[ctx->param_streamout_write_index = (*num_params)++] = i32;
5400 }
5401 /* A streamout buffer offset is loaded if the stride is non-zero. */
5402 for (i = 0; i < 4; i++) {
5403 if (!so->stride[i])
5404 continue;
5405
5406 params[ctx->param_streamout_offset[i] = (*num_params)++] = i32;
5407 }
5408 }
5409
5410 static unsigned llvm_get_type_size(LLVMTypeRef type)
5411 {
5412 LLVMTypeKind kind = LLVMGetTypeKind(type);
5413
5414 switch (kind) {
5415 case LLVMIntegerTypeKind:
5416 return LLVMGetIntTypeWidth(type) / 8;
5417 case LLVMFloatTypeKind:
5418 return 4;
5419 case LLVMPointerTypeKind:
5420 return 8;
5421 case LLVMVectorTypeKind:
5422 return LLVMGetVectorSize(type) *
5423 llvm_get_type_size(LLVMGetElementType(type));
5424 default:
5425 assert(0);
5426 return 0;
5427 }
5428 }
5429
5430 static void declare_tess_lds(struct si_shader_context *ctx)
5431 {
5432 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
5433 LLVMTypeRef i32 = ctx->radeon_bld.soa.bld_base.uint_bld.elem_type;
5434 unsigned lds_size = ctx->screen->b.chip_class >= CIK ? 65536 : 32768;
5435
5436 /* The actual size is computed outside of the shader to reduce
5437 * the number of shader variants. */
5438 ctx->lds =
5439 LLVMAddGlobalInAddressSpace(gallivm->module,
5440 LLVMArrayType(i32, lds_size / 4),
5441 "tess_lds",
5442 LOCAL_ADDR_SPACE);
5443 }
5444
5445 static void create_function(struct si_shader_context *ctx)
5446 {
5447 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5448 struct gallivm_state *gallivm = bld_base->base.gallivm;
5449 struct si_shader *shader = ctx->shader;
5450 LLVMTypeRef params[SI_NUM_PARAMS + SI_NUM_VERTEX_BUFFERS], v3i32;
5451 LLVMTypeRef returns[16+32*4];
5452 unsigned i, last_sgpr, num_params, num_return_sgprs;
5453 unsigned num_returns = 0;
5454
5455 v3i32 = LLVMVectorType(ctx->i32, 3);
5456
5457 params[SI_PARAM_RW_BUFFERS] = const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
5458 params[SI_PARAM_CONST_BUFFERS] = const_array(ctx->v16i8, SI_NUM_CONST_BUFFERS);
5459 params[SI_PARAM_SAMPLERS] = const_array(ctx->v8i32, SI_NUM_SAMPLERS);
5460 params[SI_PARAM_IMAGES] = const_array(ctx->v8i32, SI_NUM_IMAGES);
5461 params[SI_PARAM_SHADER_BUFFERS] = const_array(ctx->v4i32, SI_NUM_SHADER_BUFFERS);
5462
5463 switch (ctx->type) {
5464 case PIPE_SHADER_VERTEX:
5465 params[SI_PARAM_VERTEX_BUFFERS] = const_array(ctx->v16i8, SI_NUM_VERTEX_BUFFERS);
5466 params[SI_PARAM_BASE_VERTEX] = ctx->i32;
5467 params[SI_PARAM_START_INSTANCE] = ctx->i32;
5468 params[SI_PARAM_DRAWID] = ctx->i32;
5469 num_params = SI_PARAM_DRAWID+1;
5470
5471 if (shader->key.vs.as_es) {
5472 params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
5473 } else if (shader->key.vs.as_ls) {
5474 params[SI_PARAM_LS_OUT_LAYOUT] = ctx->i32;
5475 num_params = SI_PARAM_LS_OUT_LAYOUT+1;
5476 } else {
5477 if (ctx->is_gs_copy_shader) {
5478 num_params = SI_PARAM_RW_BUFFERS+1;
5479 } else {
5480 params[SI_PARAM_VS_STATE_BITS] = ctx->i32;
5481 num_params = SI_PARAM_VS_STATE_BITS+1;
5482 }
5483
5484 /* The locations of the other parameters are assigned dynamically. */
5485 declare_streamout_params(ctx, &shader->selector->so,
5486 params, ctx->i32, &num_params);
5487 }
5488
5489 last_sgpr = num_params-1;
5490
5491 /* VGPRs */
5492 params[ctx->param_vertex_id = num_params++] = ctx->i32;
5493 params[ctx->param_rel_auto_id = num_params++] = ctx->i32;
5494 params[ctx->param_vs_prim_id = num_params++] = ctx->i32;
5495 params[ctx->param_instance_id = num_params++] = ctx->i32;
5496
5497 if (!ctx->is_monolithic &&
5498 !ctx->is_gs_copy_shader) {
5499 /* Vertex load indices. */
5500 ctx->param_vertex_index0 = num_params;
5501
5502 for (i = 0; i < shader->selector->info.num_inputs; i++)
5503 params[num_params++] = ctx->i32;
5504
5505 /* PrimitiveID output. */
5506 if (!shader->key.vs.as_es && !shader->key.vs.as_ls)
5507 for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
5508 returns[num_returns++] = ctx->f32;
5509 }
5510 break;
5511
5512 case PIPE_SHADER_TESS_CTRL:
5513 params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx->i32;
5514 params[SI_PARAM_TCS_OUT_OFFSETS] = ctx->i32;
5515 params[SI_PARAM_TCS_OUT_LAYOUT] = ctx->i32;
5516 params[SI_PARAM_TCS_IN_LAYOUT] = ctx->i32;
5517 params[ctx->param_oc_lds = SI_PARAM_TCS_OC_LDS] = ctx->i32;
5518 params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx->i32;
5519 last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET;
5520
5521 /* VGPRs */
5522 params[SI_PARAM_PATCH_ID] = ctx->i32;
5523 params[SI_PARAM_REL_IDS] = ctx->i32;
5524 num_params = SI_PARAM_REL_IDS+1;
5525
5526 if (!ctx->is_monolithic) {
5527 /* SI_PARAM_TCS_OC_LDS and PARAM_TESS_FACTOR_OFFSET are
5528 * placed after the user SGPRs.
5529 */
5530 for (i = 0; i < SI_TCS_NUM_USER_SGPR + 2; i++)
5531 returns[num_returns++] = ctx->i32; /* SGPRs */
5532
5533 for (i = 0; i < 3; i++)
5534 returns[num_returns++] = ctx->f32; /* VGPRs */
5535 }
5536 break;
5537
5538 case PIPE_SHADER_TESS_EVAL:
5539 params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx->i32;
5540 num_params = SI_PARAM_TCS_OFFCHIP_LAYOUT+1;
5541
5542 if (shader->key.tes.as_es) {
5543 params[ctx->param_oc_lds = num_params++] = ctx->i32;
5544 params[ctx->param_tess_offchip = num_params++] = ctx->i32;
5545 params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
5546 } else {
5547 params[ctx->param_tess_offchip = num_params++] = ctx->i32;
5548 declare_streamout_params(ctx, &shader->selector->so,
5549 params, ctx->i32, &num_params);
5550 params[ctx->param_oc_lds = num_params++] = ctx->i32;
5551 }
5552 last_sgpr = num_params - 1;
5553
5554 /* VGPRs */
5555 params[ctx->param_tes_u = num_params++] = ctx->f32;
5556 params[ctx->param_tes_v = num_params++] = ctx->f32;
5557 params[ctx->param_tes_rel_patch_id = num_params++] = ctx->i32;
5558 params[ctx->param_tes_patch_id = num_params++] = ctx->i32;
5559
5560 /* PrimitiveID output. */
5561 if (!ctx->is_monolithic && !shader->key.tes.as_es)
5562 for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
5563 returns[num_returns++] = ctx->f32;
5564 break;
5565
5566 case PIPE_SHADER_GEOMETRY:
5567 params[SI_PARAM_GS2VS_OFFSET] = ctx->i32;
5568 params[SI_PARAM_GS_WAVE_ID] = ctx->i32;
5569 last_sgpr = SI_PARAM_GS_WAVE_ID;
5570
5571 /* VGPRs */
5572 params[SI_PARAM_VTX0_OFFSET] = ctx->i32;
5573 params[SI_PARAM_VTX1_OFFSET] = ctx->i32;
5574 params[SI_PARAM_PRIMITIVE_ID] = ctx->i32;
5575 params[SI_PARAM_VTX2_OFFSET] = ctx->i32;
5576 params[SI_PARAM_VTX3_OFFSET] = ctx->i32;
5577 params[SI_PARAM_VTX4_OFFSET] = ctx->i32;
5578 params[SI_PARAM_VTX5_OFFSET] = ctx->i32;
5579 params[SI_PARAM_GS_INSTANCE_ID] = ctx->i32;
5580 num_params = SI_PARAM_GS_INSTANCE_ID+1;
5581 break;
5582
5583 case PIPE_SHADER_FRAGMENT:
5584 params[SI_PARAM_ALPHA_REF] = ctx->f32;
5585 params[SI_PARAM_PRIM_MASK] = ctx->i32;
5586 last_sgpr = SI_PARAM_PRIM_MASK;
5587 params[SI_PARAM_PERSP_SAMPLE] = ctx->v2i32;
5588 params[SI_PARAM_PERSP_CENTER] = ctx->v2i32;
5589 params[SI_PARAM_PERSP_CENTROID] = ctx->v2i32;
5590 params[SI_PARAM_PERSP_PULL_MODEL] = v3i32;
5591 params[SI_PARAM_LINEAR_SAMPLE] = ctx->v2i32;
5592 params[SI_PARAM_LINEAR_CENTER] = ctx->v2i32;
5593 params[SI_PARAM_LINEAR_CENTROID] = ctx->v2i32;
5594 params[SI_PARAM_LINE_STIPPLE_TEX] = ctx->f32;
5595 params[SI_PARAM_POS_X_FLOAT] = ctx->f32;
5596 params[SI_PARAM_POS_Y_FLOAT] = ctx->f32;
5597 params[SI_PARAM_POS_Z_FLOAT] = ctx->f32;
5598 params[SI_PARAM_POS_W_FLOAT] = ctx->f32;
5599 params[SI_PARAM_FRONT_FACE] = ctx->i32;
5600 params[SI_PARAM_ANCILLARY] = ctx->i32;
5601 params[SI_PARAM_SAMPLE_COVERAGE] = ctx->f32;
5602 params[SI_PARAM_POS_FIXED_PT] = ctx->i32;
5603 num_params = SI_PARAM_POS_FIXED_PT+1;
5604
5605 if (!ctx->is_monolithic) {
5606 /* Color inputs from the prolog. */
5607 if (shader->selector->info.colors_read) {
5608 unsigned num_color_elements =
5609 util_bitcount(shader->selector->info.colors_read);
5610
5611 assert(num_params + num_color_elements <= ARRAY_SIZE(params));
5612 for (i = 0; i < num_color_elements; i++)
5613 params[num_params++] = ctx->f32;
5614 }
5615
5616 /* Outputs for the epilog. */
5617 num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
5618 num_returns =
5619 num_return_sgprs +
5620 util_bitcount(shader->selector->info.colors_written) * 4 +
5621 shader->selector->info.writes_z +
5622 shader->selector->info.writes_stencil +
5623 shader->selector->info.writes_samplemask +
5624 1 /* SampleMaskIn */;
5625
5626 num_returns = MAX2(num_returns,
5627 num_return_sgprs +
5628 PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
5629
5630 for (i = 0; i < num_return_sgprs; i++)
5631 returns[i] = ctx->i32;
5632 for (; i < num_returns; i++)
5633 returns[i] = ctx->f32;
5634 }
5635 break;
5636
5637 case PIPE_SHADER_COMPUTE:
5638 params[SI_PARAM_GRID_SIZE] = v3i32;
5639 params[SI_PARAM_BLOCK_ID] = v3i32;
5640 last_sgpr = SI_PARAM_BLOCK_ID;
5641
5642 params[SI_PARAM_THREAD_ID] = v3i32;
5643 num_params = SI_PARAM_THREAD_ID + 1;
5644 break;
5645 default:
5646 assert(0 && "unimplemented shader");
5647 return;
5648 }
5649
5650 assert(num_params <= ARRAY_SIZE(params));
5651
5652 si_create_function(ctx, returns, num_returns, params,
5653 num_params, last_sgpr);
5654
5655 /* Reserve register locations for VGPR inputs the PS prolog may need. */
5656 if (ctx->type == PIPE_SHADER_FRAGMENT &&
5657 !ctx->is_monolithic) {
5658 radeon_llvm_add_attribute(ctx->radeon_bld.main_fn,
5659 "InitialPSInputAddr",
5660 S_0286D0_PERSP_SAMPLE_ENA(1) |
5661 S_0286D0_PERSP_CENTER_ENA(1) |
5662 S_0286D0_PERSP_CENTROID_ENA(1) |
5663 S_0286D0_LINEAR_SAMPLE_ENA(1) |
5664 S_0286D0_LINEAR_CENTER_ENA(1) |
5665 S_0286D0_LINEAR_CENTROID_ENA(1) |
5666 S_0286D0_FRONT_FACE_ENA(1) |
5667 S_0286D0_POS_FIXED_PT_ENA(1));
5668 } else if (ctx->type == PIPE_SHADER_COMPUTE) {
5669 const unsigned *properties = shader->selector->info.properties;
5670 unsigned max_work_group_size =
5671 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
5672 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
5673 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
5674
5675 assert(max_work_group_size);
5676
5677 radeon_llvm_add_attribute(ctx->radeon_bld.main_fn,
5678 "amdgpu-max-work-group-size",
5679 max_work_group_size);
5680 }
5681
5682 shader->info.num_input_sgprs = 0;
5683 shader->info.num_input_vgprs = 0;
5684
5685 for (i = 0; i <= last_sgpr; ++i)
5686 shader->info.num_input_sgprs += llvm_get_type_size(params[i]) / 4;
5687
5688 /* Unused fragment shader inputs are eliminated by the compiler,
5689 * so we don't know yet how many there will be.
5690 */
5691 if (ctx->type != PIPE_SHADER_FRAGMENT)
5692 for (; i < num_params; ++i)
5693 shader->info.num_input_vgprs += llvm_get_type_size(params[i]) / 4;
5694
5695 if (bld_base->info &&
5696 (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 ||
5697 bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 ||
5698 bld_base->info->opcode_count[TGSI_OPCODE_DDX_FINE] > 0 ||
5699 bld_base->info->opcode_count[TGSI_OPCODE_DDY_FINE] > 0 ||
5700 bld_base->info->opcode_count[TGSI_OPCODE_INTERP_OFFSET] > 0 ||
5701 bld_base->info->opcode_count[TGSI_OPCODE_INTERP_SAMPLE] > 0))
5702 ctx->lds =
5703 LLVMAddGlobalInAddressSpace(gallivm->module,
5704 LLVMArrayType(ctx->i32, 64),
5705 "ddxy_lds",
5706 LOCAL_ADDR_SPACE);
5707
5708 if ((ctx->type == PIPE_SHADER_VERTEX && shader->key.vs.as_ls) ||
5709 ctx->type == PIPE_SHADER_TESS_CTRL ||
5710 ctx->type == PIPE_SHADER_TESS_EVAL)
5711 declare_tess_lds(ctx);
5712 }
5713
/* Preload all declared constant buffers: load each buffer's descriptor and
 * every dword constant up front, relying on LLVM code sinking to eliminate
 * the loads that end up unused.
 */
static void preload_constants(struct si_shader_context *ctx)
{
	struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_shader_info *info = bld_base->info;
	unsigned buf;
	LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);

	for (buf = 0; buf < SI_NUM_CONST_BUFFERS; buf++) {
		/* const_file_max is the highest index used, -1 if unused. */
		unsigned i, num_const = info->const_file_max[buf] + 1;

		if (num_const == 0)
			continue;

		/* Allocate space for the constant values */
		/* NOTE(review): the CALLOC result is not checked — an OOM
		 * here leads to a NULL dereference in the loop below. */
		ctx->constants[buf] = CALLOC(num_const * 4, sizeof(LLVMValueRef));

		/* Load the resource descriptor */
		ctx->const_buffers[buf] =
			build_indexed_load_const(ctx, ptr, lp_build_const_int32(gallivm, buf));

		/* Load the constants, we rely on the code sinking to do the rest */
		for (i = 0; i < num_const * 4; ++i) {
			ctx->constants[buf][i] =
				buffer_load_const(ctx,
					ctx->const_buffers[buf],
					lp_build_const_int32(gallivm, i * 4));
		}
	}
}
5744
5745 static void preload_shader_buffers(struct si_shader_context *ctx)
5746 {
5747 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
5748 LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_SHADER_BUFFERS);
5749 int buf, maxbuf;
5750
5751 maxbuf = MIN2(ctx->shader->selector->info.file_max[TGSI_FILE_BUFFER],
5752 SI_NUM_SHADER_BUFFERS - 1);
5753 for (buf = 0; buf <= maxbuf; ++buf) {
5754 ctx->shader_buffers[buf] =
5755 build_indexed_load_const(
5756 ctx, ptr, lp_build_const_int32(gallivm, buf));
5757 }
5758 }
5759
/* Preload resource, FMASK, and sampler-state descriptors for every sampler
 * the shader declares; LLVM code sinking removes the unused loads.
 */
static void preload_samplers(struct si_shader_context *ctx)
{
	struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_shader_info *info = bld_base->info;
	unsigned i, num_samplers = info->file_max[TGSI_FILE_SAMPLER] + 1;
	LLVMValueRef offset;

	if (num_samplers == 0)
		return;

	/* Load the resources and samplers, we rely on the code sinking to do the rest */
	for (i = 0; i < num_samplers; ++i) {
		/* Resource */
		offset = lp_build_const_int32(gallivm, i);
		ctx->sampler_views[i] =
			get_sampler_desc(ctx, offset, DESC_IMAGE);

		/* FMASK resource */
		if (info->is_msaa_sampler[i])
			ctx->fmasks[i] =
				get_sampler_desc(ctx, offset, DESC_FMASK);
		else {
			/* NOTE(review): sampler_states[i] is only loaded for
			 * non-MSAA samplers — presumably MSAA fetches go
			 * through FMASK and don't need a sampler state;
			 * confirm against the texture fetch path. */
			ctx->sampler_states[i] =
				get_sampler_desc(ctx, offset, DESC_SAMPLER);
			/* Patch the sampler state against the view; the helper
			 * name suggests an SI/CI anisotropy fixup. */
			ctx->sampler_states[i] =
				sici_fix_sampler_aniso(ctx, ctx->sampler_views[i],
						       ctx->sampler_states[i]);
		}
	}
}
5791
5792 static void preload_images(struct si_shader_context *ctx)
5793 {
5794 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5795 struct tgsi_shader_info *info = &ctx->shader->selector->info;
5796 struct gallivm_state *gallivm = bld_base->base.gallivm;
5797 unsigned num_images = bld_base->info->file_max[TGSI_FILE_IMAGE] + 1;
5798 LLVMValueRef res_ptr;
5799 unsigned i;
5800
5801 if (num_images == 0)
5802 return;
5803
5804 res_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_IMAGES);
5805
5806 for (i = 0; i < num_images; ++i) {
5807 /* Rely on LLVM to shrink the load for buffer resources. */
5808 LLVMValueRef rsrc =
5809 build_indexed_load_const(ctx, res_ptr,
5810 lp_build_const_int32(gallivm, i));
5811
5812 if (info->images_writemask & (1 << i) &&
5813 !(info->images_buffers & (1 << i)))
5814 rsrc = force_dcc_off(ctx, rsrc);
5815
5816 ctx->images[i] = rsrc;
5817 }
5818 }
5819
5820 static void preload_streamout_buffers(struct si_shader_context *ctx)
5821 {
5822 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5823 struct gallivm_state *gallivm = bld_base->base.gallivm;
5824 unsigned i;
5825
5826 /* Streamout can only be used if the shader is compiled as VS. */
5827 if (!ctx->shader->selector->so.num_outputs ||
5828 (ctx->type == PIPE_SHADER_VERTEX &&
5829 (ctx->shader->key.vs.as_es ||
5830 ctx->shader->key.vs.as_ls)) ||
5831 (ctx->type == PIPE_SHADER_TESS_EVAL &&
5832 ctx->shader->key.tes.as_es))
5833 return;
5834
5835 LLVMValueRef buf_ptr = LLVMGetParam(ctx->radeon_bld.main_fn,
5836 SI_PARAM_RW_BUFFERS);
5837
5838 /* Load the resources, we rely on the code sinking to do the rest */
5839 for (i = 0; i < 4; ++i) {
5840 if (ctx->shader->selector->so.stride[i]) {
5841 LLVMValueRef offset = lp_build_const_int32(gallivm,
5842 SI_VS_STREAMOUT_BUF0 + i);
5843
5844 ctx->so_buffers[i] = build_indexed_load_const(ctx, buf_ptr, offset);
5845 }
5846 }
5847 }
5848
5849 /**
5850 * Load ESGS and GSVS ring buffer resource descriptors and save the variables
5851 * for later use.
5852 */
5853 static void preload_ring_buffers(struct si_shader_context *ctx)
5854 {
5855 struct gallivm_state *gallivm =
5856 ctx->radeon_bld.soa.bld_base.base.gallivm;
5857
5858 LLVMValueRef buf_ptr = LLVMGetParam(ctx->radeon_bld.main_fn,
5859 SI_PARAM_RW_BUFFERS);
5860
5861 if ((ctx->type == PIPE_SHADER_VERTEX &&
5862 ctx->shader->key.vs.as_es) ||
5863 (ctx->type == PIPE_SHADER_TESS_EVAL &&
5864 ctx->shader->key.tes.as_es) ||
5865 ctx->type == PIPE_SHADER_GEOMETRY) {
5866 unsigned ring =
5867 ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS
5868 : SI_ES_RING_ESGS;
5869 LLVMValueRef offset = lp_build_const_int32(gallivm, ring);
5870
5871 ctx->esgs_ring =
5872 build_indexed_load_const(ctx, buf_ptr, offset);
5873 }
5874
5875 if (ctx->is_gs_copy_shader) {
5876 LLVMValueRef offset = lp_build_const_int32(gallivm, SI_VS_RING_GSVS);
5877
5878 ctx->gsvs_ring[0] =
5879 build_indexed_load_const(ctx, buf_ptr, offset);
5880 }
5881 if (ctx->type == PIPE_SHADER_GEOMETRY) {
5882 int i;
5883 for (i = 0; i < 4; i++) {
5884 LLVMValueRef offset = lp_build_const_int32(gallivm, SI_GS_RING_GSVS0 + i);
5885
5886 ctx->gsvs_ring[i] =
5887 build_indexed_load_const(ctx, buf_ptr, offset);
5888 }
5889 }
5890 }
5891
/* Emit code that kills the fragment when the corresponding bit in the 32x32
 * polygon stipple pattern is zero.
 *
 * \param param_rw_buffers    function parameter holding the RW buffer
 *                            descriptor array
 * \param param_pos_fixed_pt  function parameter holding the fixed-point
 *                            fragment position
 */
static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
					 LLVMValueRef param_rw_buffers,
					 unsigned param_pos_fixed_pt)
{
	struct lp_build_tgsi_context *bld_base =
		&ctx->radeon_bld.soa.bld_base;
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	LLVMValueRef slot, desc, offset, row, bit, address[2];

	/* Use the fixed-point gl_FragCoord input.
	 * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
	 * per coordinate to get the repeating effect.
	 */
	address[0] = unpack_param(ctx, param_pos_fixed_pt, 0, 5);
	address[1] = unpack_param(ctx, param_pos_fixed_pt, 16, 5);

	/* Load the buffer descriptor. */
	slot = lp_build_const_int32(gallivm, SI_PS_CONST_POLY_STIPPLE);
	desc = build_indexed_load_const(ctx, param_rw_buffers, slot);

	/* The stipple pattern is 32x32, each row has 32 bits. */
	offset = LLVMBuildMul(builder, address[1],
			      LLVMConstInt(ctx->i32, 4, 0), "");
	row = buffer_load_const(ctx, desc, offset);
	row = LLVMBuildBitCast(builder, row, ctx->i32, "");
	bit = LLVMBuildLShr(builder, row, address[0], "");
	bit = LLVMBuildTrunc(builder, bit, ctx->i1, "");

	/* The intrinsic kills the thread if arg < 0. */
	/* bit set -> 0 (keep), bit clear -> -1 (kill). */
	bit = LLVMBuildSelect(builder, bit, LLVMConstReal(ctx->f32, 0),
			      LLVMConstReal(ctx->f32, -1), "");
	lp_build_intrinsic(builder, "llvm.AMDGPU.kill", ctx->voidt, &bit, 1, 0);
}
5926
/* Parse the config section LLVM emitted into the shader binary and fill in
 * the shader config (register values, SGPR/VGPR counts, LDS and scratch
 * sizes).
 *
 * \param binary         compiled shader binary containing the config section
 * \param conf           output; several fields accumulate with MAX2, so this
 *                       may be called for multiple symbols/parts
 * \param symbol_offset  offset of the symbol whose config is read
 */
void si_shader_binary_read_config(struct radeon_shader_binary *binary,
				  struct si_shader_config *conf,
				  unsigned symbol_offset)
{
	unsigned i;
	const unsigned char *config =
		radeon_shader_binary_config_start(binary, symbol_offset);
	bool really_needs_scratch = false;

	/* LLVM adds SGPR spills to the scratch size.
	 * Find out if we really need the scratch buffer.
	 */
	for (i = 0; i < binary->reloc_count; i++) {
		const struct radeon_shader_reloc *reloc = &binary->relocs[i];

		if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
		    !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
			really_needs_scratch = true;
			break;
		}
	}

	/* XXX: We may be able to emit some of these values directly rather than
	 * extracting fields to be emitted later.
	 */

	/* The config section is a sequence of little-endian
	 * (register, value) dword pairs. */
	for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
		unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
		unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
		switch (reg) {
		case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
		case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
		case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
		case R_00B848_COMPUTE_PGM_RSRC1:
			/* SGPRS/VGPRS are encoded in groups of 8/4 regs. */
			conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
			conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
			conf->float_mode = G_00B028_FLOAT_MODE(value);
			conf->rsrc1 = value;
			break;
		case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
			conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
			break;
		case R_00B84C_COMPUTE_PGM_RSRC2:
			conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
			conf->rsrc2 = value;
			break;
		case R_0286CC_SPI_PS_INPUT_ENA:
			conf->spi_ps_input_ena = value;
			break;
		case R_0286D0_SPI_PS_INPUT_ADDR:
			conf->spi_ps_input_addr = value;
			break;
		case R_0286E8_SPI_TMPRING_SIZE:
		case R_00B860_COMPUTE_TMPRING_SIZE:
			/* WAVESIZE is in units of 256 dwords. */
			/* Ignore LLVM's scratch size if only spills caused it
			 * (see really_needs_scratch above). */
			if (really_needs_scratch)
				conf->scratch_bytes_per_wave =
					G_00B860_WAVESIZE(value) * 256 * 4;
			break;
		case 0x4: /* SPILLED_SGPRS */
			conf->spilled_sgprs = value;
			break;
		case 0x8: /* SPILLED_VGPRS */
			conf->spilled_vgprs = value;
			break;
		default:
		{
			/* Warn only once per process about unknown registers. */
			static bool printed;

			if (!printed) {
				fprintf(stderr, "Warning: LLVM emitted unknown "
					"config register: 0x%x\n", reg);
				printed = true;
			}
		}
		break;
		}
	}

	/* Fall back to INPUT_ENA if LLVM didn't emit INPUT_ADDR. */
	if (!conf->spi_ps_input_addr)
		conf->spi_ps_input_addr = conf->spi_ps_input_ena;
}
6009
/* Patch the two scratch buffer resource descriptor dwords into the shader
 * binary at every relocation that references the scratch rsrc symbols.
 *
 * \param scratch_va  GPU virtual address of the scratch buffer
 */
void si_shader_apply_scratch_relocs(struct si_context *sctx,
			struct si_shader *shader,
			struct si_shader_config *config,
			uint64_t scratch_va)
{
	unsigned i;
	uint32_t scratch_rsrc_dword0 = scratch_va;
	uint32_t scratch_rsrc_dword1 =
		S_008F04_BASE_ADDRESS_HI(scratch_va >> 32);

	/* Enable scratch coalescing if LLVM sets ELEMENT_SIZE & INDEX_STRIDE
	 * correctly.
	 */
	if (HAVE_LLVM >= 0x0309)
		scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1);
	else
		/* Older LLVM: encode the per-wave stride instead. */
		scratch_rsrc_dword1 |=
			S_008F04_STRIDE(config->scratch_bytes_per_wave / 64);

	/* Write both descriptor dwords wherever the relocations point. */
	for (i = 0 ; i < shader->binary.reloc_count; i++) {
		const struct radeon_shader_reloc *reloc =
					&shader->binary.relocs[i];
		if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
			util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
			&scratch_rsrc_dword0, 4);
		} else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
			util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
			&scratch_rsrc_dword1, 4);
		}
	}
}
6041
6042 static unsigned si_get_shader_binary_size(struct si_shader *shader)
6043 {
6044 unsigned size = shader->binary.code_size;
6045
6046 if (shader->prolog)
6047 size += shader->prolog->binary.code_size;
6048 if (shader->epilog)
6049 size += shader->epilog->binary.code_size;
6050 return size;
6051 }
6052
6053 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
6054 {
6055 const struct radeon_shader_binary *prolog =
6056 shader->prolog ? &shader->prolog->binary : NULL;
6057 const struct radeon_shader_binary *epilog =
6058 shader->epilog ? &shader->epilog->binary : NULL;
6059 const struct radeon_shader_binary *mainb = &shader->binary;
6060 unsigned bo_size = si_get_shader_binary_size(shader) +
6061 (!epilog ? mainb->rodata_size : 0);
6062 unsigned char *ptr;
6063
6064 assert(!prolog || !prolog->rodata_size);
6065 assert((!prolog && !epilog) || !mainb->rodata_size);
6066 assert(!epilog || !epilog->rodata_size);
6067
6068 r600_resource_reference(&shader->bo, NULL);
6069 shader->bo = si_resource_create_custom(&sscreen->b.b,
6070 PIPE_USAGE_IMMUTABLE,
6071 bo_size);
6072 if (!shader->bo)
6073 return -ENOMEM;
6074
6075 /* Upload. */
6076 ptr = sscreen->b.ws->buffer_map(shader->bo->buf, NULL,
6077 PIPE_TRANSFER_READ_WRITE);
6078
6079 if (prolog) {
6080 util_memcpy_cpu_to_le32(ptr, prolog->code, prolog->code_size);
6081 ptr += prolog->code_size;
6082 }
6083
6084 util_memcpy_cpu_to_le32(ptr, mainb->code, mainb->code_size);
6085 ptr += mainb->code_size;
6086
6087 if (epilog)
6088 util_memcpy_cpu_to_le32(ptr, epilog->code, epilog->code_size);
6089 else if (mainb->rodata_size > 0)
6090 util_memcpy_cpu_to_le32(ptr, mainb->rodata, mainb->rodata_size);
6091
6092 sscreen->b.ws->buffer_unmap(shader->bo->buf);
6093 return 0;
6094 }
6095
/* Print the disassembly of one shader part to \p file and forward it line by
 * line to the debug callback; falls back to a raw hex dump of the binary when
 * no disassembly string is available.
 *
 * NOTE(review): the hex-dump loop reads code[i+1..i+3] in steps of 4, which
 * assumes code_size is dword-aligned — confirm that guarantee.
 */
static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary,
				       struct pipe_debug_callback *debug,
				       const char *name, FILE *file)
{
	char *line, *p;
	unsigned i, count;

	if (binary->disasm_string) {
		fprintf(file, "Shader %s disassembly:\n", name);
		fprintf(file, "%s", binary->disasm_string);

		if (debug && debug->debug_message) {
			/* Very long debug messages are cut off, so send the
			 * disassembly one line at a time. This causes more
			 * overhead, but on the plus side it simplifies
			 * parsing of resulting logs.
			 */
			pipe_debug_message(debug, SHADER_INFO,
					   "Shader Disassembly Begin");

			line = binary->disasm_string;
			while (*line) {
				p = util_strchrnul(line, '\n');
				count = p - line;

				if (count) {
					pipe_debug_message(debug, SHADER_INFO,
							   "%.*s", count, line);
				}

				if (!*p)
					break;
				line = p + 1;
			}

			pipe_debug_message(debug, SHADER_INFO,
					   "Shader Disassembly End");
		}
	} else {
		fprintf(file, "Shader %s binary:\n", name);
		/* Dump one little-endian dword per line. */
		for (i = 0; i < binary->code_size; i += 4) {
			fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i,
				binary->code[i + 3], binary->code[i + 2],
				binary->code[i + 1], binary->code[i]);
		}
	}
}
6143
/* Compute and print shader resource statistics (register usage, LDS,
 * scratch, estimated max waves per SIMD) to \p file and to the debug
 * callback.
 */
static void si_shader_dump_stats(struct si_screen *sscreen,
			         struct si_shader_config *conf,
				 unsigned num_inputs,
				 unsigned code_size,
			         struct pipe_debug_callback *debug,
			         unsigned processor,
				 FILE *file)
{
	unsigned lds_increment = sscreen->b.chip_class >= CIK ? 512 : 256;
	unsigned lds_per_wave = 0;
	unsigned max_simd_waves = 10;

	/* Compute LDS usage for PS. */
	if (processor == PIPE_SHADER_FRAGMENT) {
		/* The minimum usage per wave is (num_inputs * 48). The maximum
		 * usage is (num_inputs * 48 * 16).
		 * We can get anything in between and it varies between waves.
		 *
		 * The 48 bytes per input for a single primitive is equal to
		 * 4 bytes/component * 4 components/input * 3 points.
		 *
		 * Other stages don't know the size at compile time or don't
		 * allocate LDS per wave, but instead they do it per thread group.
		 */
		lds_per_wave = conf->lds_size * lds_increment +
			       align(num_inputs * 48, lds_increment);
	}

	/* Compute the per-SIMD wave counts. */
	if (conf->num_sgprs) {
		if (sscreen->b.chip_class >= VI)
			max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs);
		else
			max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs);
	}

	if (conf->num_vgprs)
		/* 256 VGPRs per SIMD are shared between all waves. */
		max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs);

	/* LDS is 64KB per CU (4 SIMDs), divided into 16KB blocks per SIMD
	 * that PS can use.
	 */
	if (lds_per_wave)
		max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);

	if (file != stderr ||
	    r600_can_dump_shader(&sscreen->b, processor)) {
		if (processor == PIPE_SHADER_FRAGMENT) {
			fprintf(file, "*** SHADER CONFIG ***\n"
				"SPI_PS_INPUT_ADDR = 0x%04x\n"
				"SPI_PS_INPUT_ENA  = 0x%04x\n",
				conf->spi_ps_input_addr, conf->spi_ps_input_ena);
		}

		fprintf(file, "*** SHADER STATS ***\n"
			"SGPRS: %d\n"
			"VGPRS: %d\n"
			"Spilled SGPRs: %d\n"
			"Spilled VGPRs: %d\n"
			"Code Size: %d bytes\n"
			"LDS: %d blocks\n"
			"Scratch: %d bytes per wave\n"
			"Max Waves: %d\n"
			"********************\n\n\n",
			conf->num_sgprs, conf->num_vgprs,
			conf->spilled_sgprs, conf->spilled_vgprs, code_size,
			conf->lds_size, conf->scratch_bytes_per_wave,
			max_simd_waves);
	}

	pipe_debug_message(debug, SHADER_INFO,
			   "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
			   "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
			   "Spilled VGPRs: %d",
			   conf->num_sgprs, conf->num_vgprs, code_size,
			   conf->lds_size, conf->scratch_bytes_per_wave,
			   max_simd_waves, conf->spilled_sgprs,
			   conf->spilled_vgprs);
}
6223
6224 static const char *si_get_shader_name(struct si_shader *shader,
6225 unsigned processor)
6226 {
6227 switch (processor) {
6228 case PIPE_SHADER_VERTEX:
6229 if (shader->key.vs.as_es)
6230 return "Vertex Shader as ES";
6231 else if (shader->key.vs.as_ls)
6232 return "Vertex Shader as LS";
6233 else
6234 return "Vertex Shader as VS";
6235 case PIPE_SHADER_TESS_CTRL:
6236 return "Tessellation Control Shader";
6237 case PIPE_SHADER_TESS_EVAL:
6238 if (shader->key.tes.as_es)
6239 return "Tessellation Evaluation Shader as ES";
6240 else
6241 return "Tessellation Evaluation Shader as VS";
6242 case PIPE_SHADER_GEOMETRY:
6243 if (shader->gs_copy_shader == NULL)
6244 return "GS Copy Shader as VS";
6245 else
6246 return "Geometry Shader";
6247 case PIPE_SHADER_FRAGMENT:
6248 return "Pixel Shader";
6249 case PIPE_SHADER_COMPUTE:
6250 return "Compute Shader";
6251 default:
6252 return "Unknown Shader";
6253 }
6254 }
6255
/* Dump everything known about a shader to \p file: the shader key, the saved
 * LLVM IR (if recorded), the disassembly of all parts, and the resource
 * statistics.  Output to stderr is gated on the per-stage debug flags.
 */
void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
		    struct pipe_debug_callback *debug, unsigned processor,
		    FILE *file)
{
	if (file != stderr ||
	    r600_can_dump_shader(&sscreen->b, processor))
		si_dump_shader_key(processor, &shader->key, file);

	if (file != stderr && shader->binary.llvm_ir_string) {
		fprintf(file, "\n%s - main shader part - LLVM IR:\n\n",
			si_get_shader_name(shader, processor));
		fprintf(file, "%s\n", shader->binary.llvm_ir_string);
	}

	if (file != stderr ||
	    (r600_can_dump_shader(&sscreen->b, processor) &&
	     !(sscreen->b.debug_flags & DBG_NO_ASM))) {
		fprintf(file, "\n%s:\n", si_get_shader_name(shader, processor));

		if (shader->prolog)
			si_shader_dump_disassembly(&shader->prolog->binary,
						   debug, "prolog", file);

		si_shader_dump_disassembly(&shader->binary, debug, "main", file);

		if (shader->epilog)
			si_shader_dump_disassembly(&shader->epilog->binary,
						   debug, "epilog", file);
		fprintf(file, "\n");
	}

	si_shader_dump_stats(sscreen, &shader->config,
			     shader->selector ? shader->selector->info.num_inputs : 0,
			     si_get_shader_binary_size(shader), debug, processor,
			     file);
}
6292
/* Compile an LLVM module to a shader binary and parse its config section.
 *
 * \param binary     output; receives the compiled code and (optionally) the
 *                   saved LLVM IR string
 * \param conf       output; filled from the binary's config section
 * \param name       human-readable shader name used in debug output
 * \return 0 on success, non-zero on compile failure, -EINVAL when the shader
 *         has rodata but its binary may be concatenated with other parts
 */
int si_compile_llvm(struct si_screen *sscreen,
		    struct radeon_shader_binary *binary,
		    struct si_shader_config *conf,
		    LLVMTargetMachineRef tm,
		    LLVMModuleRef mod,
		    struct pipe_debug_callback *debug,
		    unsigned processor,
		    const char *name)
{
	int r = 0;
	unsigned count = p_atomic_inc_return(&sscreen->b.num_compilations);

	if (r600_can_dump_shader(&sscreen->b, processor)) {
		fprintf(stderr, "radeonsi: Compiling shader %d\n", count);

		if (!(sscreen->b.debug_flags & (DBG_NO_IR | DBG_PREOPT_IR))) {
			fprintf(stderr, "%s LLVM IR:\n\n", name);
			LLVMDumpModule(mod);
			fprintf(stderr, "\n");
		}
	}

	if (sscreen->record_llvm_ir) {
		char *ir = LLVMPrintModuleToString(mod);
		binary->llvm_ir_string = strdup(ir);
		LLVMDisposeMessage(ir);
	}

	/* si_replace_shader may substitute a binary loaded from disk;
	 * compile only if it didn't. */
	if (!si_replace_shader(count, binary)) {
		r = radeon_llvm_compile(mod, binary, tm, debug);
		if (r)
			return r;
	}

	si_shader_binary_read_config(binary, conf, 0);

	/* Enable 64-bit and 16-bit denormals, because there is no performance
	 * cost.
	 *
	 * If denormals are enabled, all floating-point output modifiers are
	 * ignored.
	 *
	 * Don't enable denormals for 32-bit floats, because:
	 * - Floating-point output modifiers would be ignored by the hw.
	 * - Some opcodes don't support denormals, such as v_mad_f32. We would
	 *   have to stop using those.
	 * - SI & CI would be very slow.
	 */
	conf->float_mode |= V_00B028_FP_64_DENORMS;

	/* The raw config data has been parsed; drop it. */
	FREE(binary->config);
	FREE(binary->global_symbol_offsets);
	binary->config = NULL;
	binary->global_symbol_offsets = NULL;

	/* Some shaders can't have rodata because their binaries can be
	 * concatenated.
	 */
	if (binary->rodata_size &&
	    (processor == PIPE_SHADER_VERTEX ||
	     processor == PIPE_SHADER_TESS_CTRL ||
	     processor == PIPE_SHADER_TESS_EVAL ||
	     processor == PIPE_SHADER_FRAGMENT)) {
		fprintf(stderr, "radeonsi: The shader can't have rodata.");
		return -EINVAL;
	}

	return r;
}
6362
6363 static void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret)
6364 {
6365 if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
6366 LLVMBuildRetVoid(ctx->radeon_bld.gallivm.builder);
6367 else
6368 LLVMBuildRet(ctx->radeon_bld.gallivm.builder, ret);
6369 }
6370
6371 /* Generate code for the hardware VS shader stage to go with a geometry shader */
6372 static int si_generate_gs_copy_shader(struct si_screen *sscreen,
6373 struct si_shader_context *ctx,
6374 struct si_shader *gs,
6375 struct pipe_debug_callback *debug)
6376 {
6377 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
6378 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
6379 struct lp_build_context *uint = &bld_base->uint_bld;
6380 struct si_shader_output_values *outputs;
6381 struct tgsi_shader_info *gsinfo = &gs->selector->info;
6382 LLVMValueRef args[9];
6383 int i, r;
6384
6385 outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0]));
6386
6387 si_init_shader_ctx(ctx, sscreen, ctx->shader, ctx->tm);
6388 ctx->type = PIPE_SHADER_VERTEX;
6389 ctx->is_gs_copy_shader = true;
6390
6391 create_meta_data(ctx);
6392 create_function(ctx);
6393 preload_streamout_buffers(ctx);
6394 preload_ring_buffers(ctx);
6395
6396 args[0] = ctx->gsvs_ring[0];
6397 args[1] = lp_build_mul_imm(uint,
6398 LLVMGetParam(ctx->radeon_bld.main_fn,
6399 ctx->param_vertex_id),
6400 4);
6401 args[3] = uint->zero;
6402 args[4] = uint->one; /* OFFEN */
6403 args[5] = uint->zero; /* IDXEN */
6404 args[6] = uint->one; /* GLC */
6405 args[7] = uint->one; /* SLC */
6406 args[8] = uint->zero; /* TFE */
6407
6408 /* Fetch vertex data from GSVS ring */
6409 for (i = 0; i < gsinfo->num_outputs; ++i) {
6410 unsigned chan;
6411
6412 outputs[i].name = gsinfo->output_semantic_name[i];
6413 outputs[i].sid = gsinfo->output_semantic_index[i];
6414
6415 for (chan = 0; chan < 4; chan++) {
6416 args[2] = lp_build_const_int32(gallivm,
6417 (i * 4 + chan) *
6418 gs->selector->gs_max_out_vertices * 16 * 4);
6419
6420 outputs[i].values[chan] =
6421 LLVMBuildBitCast(gallivm->builder,
6422 lp_build_intrinsic(gallivm->builder,
6423 "llvm.SI.buffer.load.dword.i32.i32",
6424 ctx->i32, args, 9,
6425 LLVMReadOnlyAttribute),
6426 ctx->f32, "");
6427 }
6428 }
6429
6430 si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs);
6431
6432 LLVMBuildRetVoid(gallivm->builder);
6433
6434 /* Dump LLVM IR before any optimization passes */
6435 if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
6436 r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
6437 LLVMDumpModule(bld_base->base.gallivm->module);
6438
6439 radeon_llvm_finalize_module(&ctx->radeon_bld);
6440
6441 r = si_compile_llvm(sscreen, &ctx->shader->binary,
6442 &ctx->shader->config, ctx->tm,
6443 bld_base->base.gallivm->module,
6444 debug, PIPE_SHADER_GEOMETRY,
6445 "GS Copy Shader");
6446 if (!r) {
6447 if (r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
6448 fprintf(stderr, "GS Copy Shader:\n");
6449 si_shader_dump(sscreen, ctx->shader, debug,
6450 PIPE_SHADER_GEOMETRY, stderr);
6451 r = si_shader_binary_upload(sscreen, ctx->shader);
6452 }
6453
6454 radeon_llvm_dispose(&ctx->radeon_bld);
6455
6456 FREE(outputs);
6457 return r;
6458 }
6459
/* Print the shader variant key for one shader stage to \p f, for debugging.
 * Each stage only has the key fields relevant to it.
 */
static void si_dump_shader_key(unsigned shader, union si_shader_key *key,
			       FILE *f)
{
	int i;

	fprintf(f, "SHADER KEY\n");

	switch (shader) {
	case PIPE_SHADER_VERTEX:
		fprintf(f, "  instance_divisors = {");
		for (i = 0; i < ARRAY_SIZE(key->vs.prolog.instance_divisors); i++)
			fprintf(f, !i ? "%u" : ", %u",
				key->vs.prolog.instance_divisors[i]);
		fprintf(f, "}\n");
		fprintf(f, "  as_es = %u\n", key->vs.as_es);
		fprintf(f, "  as_ls = %u\n", key->vs.as_ls);
		fprintf(f, "  export_prim_id = %u\n", key->vs.epilog.export_prim_id);
		break;

	case PIPE_SHADER_TESS_CTRL:
		fprintf(f, "  prim_mode = %u\n", key->tcs.epilog.prim_mode);
		break;

	case PIPE_SHADER_TESS_EVAL:
		fprintf(f, "  as_es = %u\n", key->tes.as_es);
		fprintf(f, "  export_prim_id = %u\n", key->tes.epilog.export_prim_id);
		break;

	case PIPE_SHADER_GEOMETRY:
	case PIPE_SHADER_COMPUTE:
		/* GS and CS have no variant key fields to print. */
		break;

	case PIPE_SHADER_FRAGMENT:
		fprintf(f, "  prolog.color_two_side = %u\n", key->ps.prolog.color_two_side);
		fprintf(f, "  prolog.flatshade_colors = %u\n", key->ps.prolog.flatshade_colors);
		fprintf(f, "  prolog.poly_stipple = %u\n", key->ps.prolog.poly_stipple);
		fprintf(f, "  prolog.force_persp_sample_interp = %u\n", key->ps.prolog.force_persp_sample_interp);
		fprintf(f, "  prolog.force_linear_sample_interp = %u\n", key->ps.prolog.force_linear_sample_interp);
		fprintf(f, "  prolog.force_persp_center_interp = %u\n", key->ps.prolog.force_persp_center_interp);
		fprintf(f, "  prolog.force_linear_center_interp = %u\n", key->ps.prolog.force_linear_center_interp);
		fprintf(f, "  prolog.bc_optimize_for_persp = %u\n", key->ps.prolog.bc_optimize_for_persp);
		fprintf(f, "  prolog.bc_optimize_for_linear = %u\n", key->ps.prolog.bc_optimize_for_linear);
		fprintf(f, "  epilog.spi_shader_col_format = 0x%x\n", key->ps.epilog.spi_shader_col_format);
		fprintf(f, "  epilog.color_is_int8 = 0x%X\n", key->ps.epilog.color_is_int8);
		fprintf(f, "  epilog.last_cbuf = %u\n", key->ps.epilog.last_cbuf);
		fprintf(f, "  epilog.alpha_func = %u\n", key->ps.epilog.alpha_func);
		fprintf(f, "  epilog.alpha_to_one = %u\n", key->ps.epilog.alpha_to_one);
		fprintf(f, "  epilog.poly_line_smoothing = %u\n", key->ps.epilog.poly_line_smoothing);
		fprintf(f, "  epilog.clamp_color = %u\n", key->ps.epilog.clamp_color);
		break;

	default:
		assert(0);
	}
}
6515
6516 static void si_init_shader_ctx(struct si_shader_context *ctx,
6517 struct si_screen *sscreen,
6518 struct si_shader *shader,
6519 LLVMTargetMachineRef tm)
6520 {
6521 struct lp_build_tgsi_context *bld_base;
6522 struct lp_build_tgsi_action tmpl = {};
6523
6524 memset(ctx, 0, sizeof(*ctx));
6525 radeon_llvm_context_init(
6526 &ctx->radeon_bld, "amdgcn--",
6527 (shader && shader->selector) ? &shader->selector->info : NULL);
6528 ctx->tm = tm;
6529 ctx->screen = sscreen;
6530 if (shader && shader->selector)
6531 ctx->type = shader->selector->info.processor;
6532 else
6533 ctx->type = -1;
6534 ctx->shader = shader;
6535
6536 ctx->voidt = LLVMVoidTypeInContext(ctx->radeon_bld.gallivm.context);
6537 ctx->i1 = LLVMInt1TypeInContext(ctx->radeon_bld.gallivm.context);
6538 ctx->i8 = LLVMInt8TypeInContext(ctx->radeon_bld.gallivm.context);
6539 ctx->i32 = LLVMInt32TypeInContext(ctx->radeon_bld.gallivm.context);
6540 ctx->i64 = LLVMInt64TypeInContext(ctx->radeon_bld.gallivm.context);
6541 ctx->i128 = LLVMIntTypeInContext(ctx->radeon_bld.gallivm.context, 128);
6542 ctx->f32 = LLVMFloatTypeInContext(ctx->radeon_bld.gallivm.context);
6543 ctx->v16i8 = LLVMVectorType(ctx->i8, 16);
6544 ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
6545 ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
6546 ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
6547 ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
6548
6549 bld_base = &ctx->radeon_bld.soa.bld_base;
6550 bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;
6551
6552 bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action;
6553 bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE] = interp_action;
6554 bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET] = interp_action;
6555
6556 bld_base->op_actions[TGSI_OPCODE_TEX] = tex_action;
6557 bld_base->op_actions[TGSI_OPCODE_TEX2] = tex_action;
6558 bld_base->op_actions[TGSI_OPCODE_TXB] = tex_action;
6559 bld_base->op_actions[TGSI_OPCODE_TXB2] = tex_action;
6560 bld_base->op_actions[TGSI_OPCODE_TXD] = tex_action;
6561 bld_base->op_actions[TGSI_OPCODE_TXF] = tex_action;
6562 bld_base->op_actions[TGSI_OPCODE_TXL] = tex_action;
6563 bld_base->op_actions[TGSI_OPCODE_TXL2] = tex_action;
6564 bld_base->op_actions[TGSI_OPCODE_TXP] = tex_action;
6565 bld_base->op_actions[TGSI_OPCODE_TXQ].fetch_args = txq_fetch_args;
6566 bld_base->op_actions[TGSI_OPCODE_TXQ].emit = txq_emit;
6567 bld_base->op_actions[TGSI_OPCODE_TG4] = tex_action;
6568 bld_base->op_actions[TGSI_OPCODE_LODQ] = tex_action;
6569 bld_base->op_actions[TGSI_OPCODE_TXQS].emit = si_llvm_emit_txqs;
6570
6571 bld_base->op_actions[TGSI_OPCODE_LOAD].fetch_args = load_fetch_args;
6572 bld_base->op_actions[TGSI_OPCODE_LOAD].emit = load_emit;
6573 bld_base->op_actions[TGSI_OPCODE_STORE].fetch_args = store_fetch_args;
6574 bld_base->op_actions[TGSI_OPCODE_STORE].emit = store_emit;
6575 bld_base->op_actions[TGSI_OPCODE_RESQ].fetch_args = resq_fetch_args;
6576 bld_base->op_actions[TGSI_OPCODE_RESQ].emit = resq_emit;
6577
6578 tmpl.fetch_args = atomic_fetch_args;
6579 tmpl.emit = atomic_emit;
6580 bld_base->op_actions[TGSI_OPCODE_ATOMUADD] = tmpl;
6581 bld_base->op_actions[TGSI_OPCODE_ATOMUADD].intr_name = "add";
6582 bld_base->op_actions[TGSI_OPCODE_ATOMXCHG] = tmpl;
6583 bld_base->op_actions[TGSI_OPCODE_ATOMXCHG].intr_name = "swap";
6584 bld_base->op_actions[TGSI_OPCODE_ATOMCAS] = tmpl;
6585 bld_base->op_actions[TGSI_OPCODE_ATOMCAS].intr_name = "cmpswap";
6586 bld_base->op_actions[TGSI_OPCODE_ATOMAND] = tmpl;
6587 bld_base->op_actions[TGSI_OPCODE_ATOMAND].intr_name = "and";
6588 bld_base->op_actions[TGSI_OPCODE_ATOMOR] = tmpl;
6589 bld_base->op_actions[TGSI_OPCODE_ATOMOR].intr_name = "or";
6590 bld_base->op_actions[TGSI_OPCODE_ATOMXOR] = tmpl;
6591 bld_base->op_actions[TGSI_OPCODE_ATOMXOR].intr_name = "xor";
6592 bld_base->op_actions[TGSI_OPCODE_ATOMUMIN] = tmpl;
6593 bld_base->op_actions[TGSI_OPCODE_ATOMUMIN].intr_name = "umin";
6594 bld_base->op_actions[TGSI_OPCODE_ATOMUMAX] = tmpl;
6595 bld_base->op_actions[TGSI_OPCODE_ATOMUMAX].intr_name = "umax";
6596 bld_base->op_actions[TGSI_OPCODE_ATOMIMIN] = tmpl;
6597 bld_base->op_actions[TGSI_OPCODE_ATOMIMIN].intr_name = "smin";
6598 bld_base->op_actions[TGSI_OPCODE_ATOMIMAX] = tmpl;
6599 bld_base->op_actions[TGSI_OPCODE_ATOMIMAX].intr_name = "smax";
6600
6601 bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit;
6602
6603 bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy;
6604 bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy;
6605 bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy;
6606 bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy;
6607
6608 bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_llvm_emit_vertex;
6609 bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive;
6610 bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier;
6611
6612 bld_base->op_actions[TGSI_OPCODE_MAX].emit = build_tgsi_intrinsic_nomem;
6613 bld_base->op_actions[TGSI_OPCODE_MAX].intr_name = "llvm.maxnum.f32";
6614 bld_base->op_actions[TGSI_OPCODE_MIN].emit = build_tgsi_intrinsic_nomem;
6615 bld_base->op_actions[TGSI_OPCODE_MIN].intr_name = "llvm.minnum.f32";
6616 }
6617
/**
 * Translate a complete TGSI shader to LLVM IR and compile it to machine code.
 *
 * \param sscreen	screen
 * \param tm		LLVM target machine
 * \param shader	the shader to compile; shader->selector must be set
 * \param is_monolithic	whether prolog/epilog code is baked into the main part
 * \param debug		debug callback
 * \return		0 on success, non-zero on failure
 */
int si_compile_tgsi_shader(struct si_screen *sscreen,
			   LLVMTargetMachineRef tm,
			   struct si_shader *shader,
			   bool is_monolithic,
			   struct pipe_debug_callback *debug)
{
	struct si_shader_selector *sel = shader->selector;
	struct si_shader_context ctx;
	struct lp_build_tgsi_context *bld_base;
	LLVMModuleRef mod;
	int r = 0;

	/* Dump TGSI code before doing TGSI->LLVM conversion in case the
	 * conversion fails. */
	if (r600_can_dump_shader(&sscreen->b, sel->info.processor) &&
	    !(sscreen->b.debug_flags & DBG_NO_TGSI)) {
		tgsi_dump(sel->tokens, 0);
		si_dump_streamout(&sel->so);
	}

	si_init_shader_ctx(&ctx, sscreen, shader, tm);
	ctx.is_monolithic = is_monolithic;

	shader->info.uses_instanceid = sel->info.uses_instanceid;

	bld_base = &ctx.radeon_bld.soa.bld_base;
	ctx.radeon_bld.load_system_value = declare_system_value;

	/* Pick per-stage input-fetch and epilogue-emission callbacks. */
	switch (ctx.type) {
	case PIPE_SHADER_VERTEX:
		ctx.radeon_bld.load_input = declare_input_vs;
		if (shader->key.vs.as_ls)
			bld_base->emit_epilogue = si_llvm_emit_ls_epilogue;
		else if (shader->key.vs.as_es)
			bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
		else
			bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
		break;
	case PIPE_SHADER_TESS_CTRL:
		bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs;
		bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = fetch_output_tcs;
		bld_base->emit_store = store_output_tcs;
		bld_base->emit_epilogue = si_llvm_emit_tcs_epilogue;
		break;
	case PIPE_SHADER_TESS_EVAL:
		bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes;
		if (shader->key.tes.as_es)
			bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
		else
			bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
		break;
	case PIPE_SHADER_GEOMETRY:
		bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs;
		bld_base->emit_epilogue = si_llvm_emit_gs_epilogue;
		break;
	case PIPE_SHADER_FRAGMENT:
		ctx.radeon_bld.load_input = declare_input_fs;
		if (is_monolithic)
			bld_base->emit_epilogue = si_llvm_emit_fs_epilogue;
		else
			bld_base->emit_epilogue = si_llvm_return_fs_outputs;
		break;
	case PIPE_SHADER_COMPUTE:
		ctx.radeon_bld.declare_memory_region = declare_compute_memory;
		break;
	default:
		assert(!"Unsupported shader type");
		return -1;
	}

	/* Declare the LLVM function and preload descriptor/resource values. */
	create_meta_data(&ctx);
	create_function(&ctx);
	preload_constants(&ctx);
	preload_shader_buffers(&ctx);
	preload_samplers(&ctx);
	preload_images(&ctx);
	preload_streamout_buffers(&ctx);
	preload_ring_buffers(&ctx);

	/* Monolithic fragment shaders emit polygon stippling inline instead
	 * of in a separate prolog part. */
	if (ctx.is_monolithic && sel->type == PIPE_SHADER_FRAGMENT &&
	    shader->key.ps.prolog.poly_stipple) {
		LLVMValueRef list = LLVMGetParam(ctx.radeon_bld.main_fn,
						 SI_PARAM_RW_BUFFERS);
		si_llvm_emit_polygon_stipple(&ctx, list,
					     SI_PARAM_POS_FIXED_PT);
	}

	/* One emitted-vertex counter per GS stream (4 allocas). */
	if (ctx.type == PIPE_SHADER_GEOMETRY) {
		int i;
		for (i = 0; i < 4; i++) {
			ctx.gs_next_vertex[i] =
				lp_build_alloca(bld_base->base.gallivm,
						ctx.i32, "");
		}
	}

	if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) {
		fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
		/* NOTE(review): this path jumps to "out" without calling
		 * radeon_llvm_dispose(), which looks like it leaks the LLVM
		 * context/module — confirm who owns the cleanup here. */
		goto out;
	}

	si_llvm_build_ret(&ctx, ctx.return_value);
	mod = bld_base->base.gallivm->module;

	/* Dump LLVM IR before any optimization passes */
	if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
	    r600_can_dump_shader(&sscreen->b, ctx.type))
		LLVMDumpModule(mod);

	radeon_llvm_finalize_module(&ctx.radeon_bld);

	r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm,
			    mod, debug, ctx.type, "TGSI shader");
	if (r) {
		fprintf(stderr, "LLVM failed to compile shader\n");
		/* NOTE(review): radeon_llvm_dispose() is also skipped on this
		 * error path — confirm. */
		goto out;
	}

	radeon_llvm_dispose(&ctx.radeon_bld);

	/* Validate SGPR and VGPR usage for compute to detect compiler bugs.
	 * LLVM 3.9svn has this bug.
	 */
	if (sel->type == PIPE_SHADER_COMPUTE) {
		unsigned *props = sel->info.properties;
		unsigned wave_size = 64;
		unsigned max_vgprs = 256;
		unsigned max_sgprs = sscreen->b.chip_class >= VI ? 800 : 512;
		unsigned max_sgprs_per_wave = 128;
		unsigned min_waves_per_cu =
			DIV_ROUND_UP(props[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
				     props[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
				     props[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH],
				     wave_size);
		unsigned min_waves_per_simd = DIV_ROUND_UP(min_waves_per_cu, 4);

		/* Registers are divided among the waves resident on a SIMD. */
		max_vgprs = max_vgprs / min_waves_per_simd;
		max_sgprs = MIN2(max_sgprs / min_waves_per_simd, max_sgprs_per_wave);

		if (shader->config.num_sgprs > max_sgprs ||
		    shader->config.num_vgprs > max_vgprs) {
			fprintf(stderr, "LLVM failed to compile a shader correctly: "
				"SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
				shader->config.num_sgprs, shader->config.num_vgprs,
				max_sgprs, max_vgprs);

			/* Just terminate the process, because dependent
			 * shaders can hang due to bad input data, but use
			 * the env var to allow shader-db to work.
			 */
			if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
				abort();
		}
	}

	/* Add the scratch offset to input SGPRs. */
	if (shader->config.scratch_bytes_per_wave)
		shader->info.num_input_sgprs += 1; /* scratch byte offset */

	/* Calculate the number of fragment input VGPRs. */
	if (ctx.type == PIPE_SHADER_FRAGMENT) {
		shader->info.num_input_vgprs = 0;
		shader->info.face_vgpr_index = -1;

		/* Count VGPRs in SPI_PS_INPUT_ADDR bit order. */
		if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 2;
		if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 2;
		if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 2;
		if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 3;
		if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 2;
		if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 2;
		if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 2;
		if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
		if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
		if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
		if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
		if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
		if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) {
			/* Remember where the face VGPR lives for the prolog. */
			shader->info.face_vgpr_index = shader->info.num_input_vgprs;
			shader->info.num_input_vgprs += 1;
		}
		if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
		if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
		if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
	}

	/* Geometry shaders also need a copy shader, compiled with the same
	 * context. */
	if (ctx.type == PIPE_SHADER_GEOMETRY) {
		shader->gs_copy_shader = CALLOC_STRUCT(si_shader);
		/* NOTE(review): no NULL check before the dereference below —
		 * confirm whether allocation failure must be handled here. */
		shader->gs_copy_shader->selector = shader->selector;
		ctx.shader = shader->gs_copy_shader;
		if ((r = si_generate_gs_copy_shader(sscreen, &ctx,
						    shader, debug))) {
			free(shader->gs_copy_shader);
			shader->gs_copy_shader = NULL;
			goto out;
		}
	}

out:
	/* Free the per-const-buffer arrays held in the context. */
	for (int i = 0; i < SI_NUM_CONST_BUFFERS; i++)
		FREE(ctx.constants[i]);
	return r;
}
6835
6836 /**
6837 * Create, compile and return a shader part (prolog or epilog).
6838 *
6839 * \param sscreen screen
6840 * \param list list of shader parts of the same category
6841 * \param key shader part key
6842 * \param tm LLVM target machine
6843 * \param debug debug callback
6844 * \param compile the callback responsible for compilation
6845 * \return non-NULL on success
6846 */
6847 static struct si_shader_part *
6848 si_get_shader_part(struct si_screen *sscreen,
6849 struct si_shader_part **list,
6850 union si_shader_part_key *key,
6851 LLVMTargetMachineRef tm,
6852 struct pipe_debug_callback *debug,
6853 bool (*compile)(struct si_screen *,
6854 LLVMTargetMachineRef,
6855 struct pipe_debug_callback *,
6856 struct si_shader_part *))
6857 {
6858 struct si_shader_part *result;
6859
6860 pipe_mutex_lock(sscreen->shader_parts_mutex);
6861
6862 /* Find existing. */
6863 for (result = *list; result; result = result->next) {
6864 if (memcmp(&result->key, key, sizeof(*key)) == 0) {
6865 pipe_mutex_unlock(sscreen->shader_parts_mutex);
6866 return result;
6867 }
6868 }
6869
6870 /* Compile a new one. */
6871 result = CALLOC_STRUCT(si_shader_part);
6872 result->key = *key;
6873 if (!compile(sscreen, tm, debug, result)) {
6874 FREE(result);
6875 pipe_mutex_unlock(sscreen->shader_parts_mutex);
6876 return NULL;
6877 }
6878
6879 result->next = *list;
6880 *list = result;
6881 pipe_mutex_unlock(sscreen->shader_parts_mutex);
6882 return result;
6883 }
6884
6885 /**
6886 * Create a vertex shader prolog.
6887 *
6888 * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
6889 * All inputs are returned unmodified. The vertex load indices are
 * stored after them, which will be used by the API VS for fetching inputs.
6891 *
6892 * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
6893 * input_v0,
6894 * input_v1,
6895 * input_v2,
6896 * input_v3,
6897 * (VertexID + BaseVertex),
6898 * (InstanceID + StartInstance),
6899 * (InstanceID / 2 + StartInstance)
6900 */
6901 static bool si_compile_vs_prolog(struct si_screen *sscreen,
6902 LLVMTargetMachineRef tm,
6903 struct pipe_debug_callback *debug,
6904 struct si_shader_part *out)
6905 {
6906 union si_shader_part_key *key = &out->key;
6907 struct si_shader shader = {};
6908 struct si_shader_context ctx;
6909 struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
6910 LLVMTypeRef *params, *returns;
6911 LLVMValueRef ret, func;
6912 int last_sgpr, num_params, num_returns, i;
6913 bool status = true;
6914
6915 si_init_shader_ctx(&ctx, sscreen, &shader, tm);
6916 ctx.type = PIPE_SHADER_VERTEX;
6917 ctx.param_vertex_id = key->vs_prolog.num_input_sgprs;
6918 ctx.param_instance_id = key->vs_prolog.num_input_sgprs + 3;
6919
6920 /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
6921 params = alloca((key->vs_prolog.num_input_sgprs + 4) *
6922 sizeof(LLVMTypeRef));
6923 returns = alloca((key->vs_prolog.num_input_sgprs + 4 +
6924 key->vs_prolog.last_input + 1) *
6925 sizeof(LLVMTypeRef));
6926 num_params = 0;
6927 num_returns = 0;
6928
6929 /* Declare input and output SGPRs. */
6930 num_params = 0;
6931 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
6932 params[num_params++] = ctx.i32;
6933 returns[num_returns++] = ctx.i32;
6934 }
6935 last_sgpr = num_params - 1;
6936
6937 /* 4 preloaded VGPRs (outputs must be floats) */
6938 for (i = 0; i < 4; i++) {
6939 params[num_params++] = ctx.i32;
6940 returns[num_returns++] = ctx.f32;
6941 }
6942
6943 /* Vertex load indices. */
6944 for (i = 0; i <= key->vs_prolog.last_input; i++)
6945 returns[num_returns++] = ctx.f32;
6946
6947 /* Create the function. */
6948 si_create_function(&ctx, returns, num_returns, params,
6949 num_params, last_sgpr);
6950 func = ctx.radeon_bld.main_fn;
6951
6952 /* Copy inputs to outputs. This should be no-op, as the registers match,
6953 * but it will prevent the compiler from overwriting them unintentionally.
6954 */
6955 ret = ctx.return_value;
6956 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
6957 LLVMValueRef p = LLVMGetParam(func, i);
6958 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
6959 }
6960 for (i = num_params - 4; i < num_params; i++) {
6961 LLVMValueRef p = LLVMGetParam(func, i);
6962 p = LLVMBuildBitCast(gallivm->builder, p, ctx.f32, "");
6963 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
6964 }
6965
6966 /* Compute vertex load indices from instance divisors. */
6967 for (i = 0; i <= key->vs_prolog.last_input; i++) {
6968 unsigned divisor = key->vs_prolog.states.instance_divisors[i];
6969 LLVMValueRef index;
6970
6971 if (divisor) {
6972 /* InstanceID / Divisor + StartInstance */
6973 index = get_instance_index_for_fetch(&ctx.radeon_bld,
6974 SI_SGPR_START_INSTANCE,
6975 divisor);
6976 } else {
6977 /* VertexID + BaseVertex */
6978 index = LLVMBuildAdd(gallivm->builder,
6979 LLVMGetParam(func, ctx.param_vertex_id),
6980 LLVMGetParam(func, SI_SGPR_BASE_VERTEX), "");
6981 }
6982
6983 index = LLVMBuildBitCast(gallivm->builder, index, ctx.f32, "");
6984 ret = LLVMBuildInsertValue(gallivm->builder, ret, index,
6985 num_params++, "");
6986 }
6987
6988 /* Compile. */
6989 si_llvm_build_ret(&ctx, ret);
6990 radeon_llvm_finalize_module(&ctx.radeon_bld);
6991
6992 if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
6993 gallivm->module, debug, ctx.type,
6994 "Vertex Shader Prolog"))
6995 status = false;
6996
6997 radeon_llvm_dispose(&ctx.radeon_bld);
6998 return status;
6999 }
7000
7001 /**
7002 * Compile the vertex shader epilog. This is also used by the tessellation
7003 * evaluation shader compiled as VS.
7004 *
7005 * The input is PrimitiveID.
7006 *
7007 * If PrimitiveID is required by the pixel shader, export it.
7008 * Otherwise, do nothing.
7009 */
static bool si_compile_vs_epilog(struct si_screen *sscreen,
				 LLVMTargetMachineRef tm,
				 struct pipe_debug_callback *debug,
				 struct si_shader_part *out)
{
	union si_shader_part_key *key = &out->key;
	struct si_shader_context ctx;
	/* Address of a member of the still-uninitialized ctx; the pointee is
	 * filled in by si_init_shader_ctx below. */
	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
	struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
	LLVMTypeRef params[5];
	int num_params, i;
	/* Set to false if compilation fails. */
	bool status = true;

	si_init_shader_ctx(&ctx, sscreen, NULL, tm);
	ctx.type = PIPE_SHADER_VERTEX;

	/* Declare input VGPRs. */
	num_params = key->vs_epilog.states.export_prim_id ?
		(VS_EPILOG_PRIMID_LOC + 1) : 0;
	assert(num_params <= ARRAY_SIZE(params));

	for (i = 0; i < num_params; i++)
		params[i] = ctx.f32;

	/* Create the function (no return values, no user SGPRs). */
	si_create_function(&ctx, NULL, 0, params, num_params, -1);

	/* Emit exports. */
	if (key->vs_epilog.states.export_prim_id) {
		struct lp_build_context *base = &bld_base->base;
		struct lp_build_context *uint = &bld_base->uint_bld;
		/* Argument layout of the llvm.SI.export intrinsic. */
		LLVMValueRef args[9];

		args[0] = lp_build_const_int32(base->gallivm, 0x0); /* enabled channels */
		args[1] = uint->zero; /* whether the EXEC mask is valid */
		args[2] = uint->zero; /* DONE bit */
		args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_PARAM +
					       key->vs_epilog.prim_id_param_offset);
		args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */
		args[5] = LLVMGetParam(ctx.radeon_bld.main_fn,
				       VS_EPILOG_PRIMID_LOC); /* X */
		args[6] = uint->undef; /* Y */
		args[7] = uint->undef; /* Z */
		args[8] = uint->undef; /* W */

		lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
				   LLVMVoidTypeInContext(base->gallivm->context),
				   args, 9, 0);
	}

	/* Compile. */
	LLVMBuildRetVoid(gallivm->builder);
	radeon_llvm_finalize_module(&ctx.radeon_bld);

	if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
			    gallivm->module, debug, ctx.type,
			    "Vertex Shader Epilog"))
		status = false;

	radeon_llvm_dispose(&ctx.radeon_bld);
	return status;
}
7072
7073 /**
7074 * Create & compile a vertex shader epilog. This a helper used by VS and TES.
7075 */
7076 static bool si_get_vs_epilog(struct si_screen *sscreen,
7077 LLVMTargetMachineRef tm,
7078 struct si_shader *shader,
7079 struct pipe_debug_callback *debug,
7080 struct si_vs_epilog_bits *states)
7081 {
7082 union si_shader_part_key epilog_key;
7083
7084 memset(&epilog_key, 0, sizeof(epilog_key));
7085 epilog_key.vs_epilog.states = *states;
7086
7087 /* Set up the PrimitiveID output. */
7088 if (shader->key.vs.epilog.export_prim_id) {
7089 unsigned index = shader->selector->info.num_outputs;
7090 unsigned offset = shader->info.nr_param_exports++;
7091
7092 epilog_key.vs_epilog.prim_id_param_offset = offset;
7093 assert(index < ARRAY_SIZE(shader->info.vs_output_param_offset));
7094 shader->info.vs_output_param_offset[index] = offset;
7095 }
7096
7097 shader->epilog = si_get_shader_part(sscreen, &sscreen->vs_epilogs,
7098 &epilog_key, tm, debug,
7099 si_compile_vs_epilog);
7100 return shader->epilog != NULL;
7101 }
7102
7103 /**
7104 * Select and compile (or reuse) vertex shader parts (prolog & epilog).
7105 */
7106 static bool si_shader_select_vs_parts(struct si_screen *sscreen,
7107 LLVMTargetMachineRef tm,
7108 struct si_shader *shader,
7109 struct pipe_debug_callback *debug)
7110 {
7111 struct tgsi_shader_info *info = &shader->selector->info;
7112 union si_shader_part_key prolog_key;
7113 unsigned i;
7114
7115 /* Get the prolog. */
7116 memset(&prolog_key, 0, sizeof(prolog_key));
7117 prolog_key.vs_prolog.states = shader->key.vs.prolog;
7118 prolog_key.vs_prolog.num_input_sgprs = shader->info.num_input_sgprs;
7119 prolog_key.vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
7120
7121 /* The prolog is a no-op if there are no inputs. */
7122 if (info->num_inputs) {
7123 shader->prolog =
7124 si_get_shader_part(sscreen, &sscreen->vs_prologs,
7125 &prolog_key, tm, debug,
7126 si_compile_vs_prolog);
7127 if (!shader->prolog)
7128 return false;
7129 }
7130
7131 /* Get the epilog. */
7132 if (!shader->key.vs.as_es && !shader->key.vs.as_ls &&
7133 !si_get_vs_epilog(sscreen, tm, shader, debug,
7134 &shader->key.vs.epilog))
7135 return false;
7136
7137 /* Set the instanceID flag. */
7138 for (i = 0; i < info->num_inputs; i++)
7139 if (prolog_key.vs_prolog.states.instance_divisors[i])
7140 shader->info.uses_instanceid = true;
7141
7142 return true;
7143 }
7144
7145 /**
7146 * Select and compile (or reuse) TES parts (epilog).
7147 */
7148 static bool si_shader_select_tes_parts(struct si_screen *sscreen,
7149 LLVMTargetMachineRef tm,
7150 struct si_shader *shader,
7151 struct pipe_debug_callback *debug)
7152 {
7153 if (shader->key.tes.as_es)
7154 return true;
7155
7156 /* TES compiled as VS. */
7157 return si_get_vs_epilog(sscreen, tm, shader, debug,
7158 &shader->key.tes.epilog);
7159 }
7160
7161 /**
 * Compile the TCS epilog. This writes tessellation factors to memory based on
 * the output primitive type of the tessellator (determined by TES).
7164 */
static bool si_compile_tcs_epilog(struct si_screen *sscreen,
				  LLVMTargetMachineRef tm,
				  struct pipe_debug_callback *debug,
				  struct si_shader_part *out)
{
	union si_shader_part_key *key = &out->key;
	struct si_shader shader = {};
	struct si_shader_context ctx;
	/* Address of a member of the still-uninitialized ctx; the pointee is
	 * filled in by si_init_shader_ctx below. */
	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
	struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
	LLVMTypeRef params[16];
	LLVMValueRef func;
	int last_sgpr, num_params;
	/* Set to false if compilation fails. */
	bool status = true;

	si_init_shader_ctx(&ctx, sscreen, &shader, tm);
	ctx.type = PIPE_SHADER_TESS_CTRL;
	shader.key.tcs.epilog = key->tcs_epilog.states;

	/* Declare inputs. Only RW_BUFFERS and TESS_FACTOR_OFFSET are used. */
	params[SI_PARAM_RW_BUFFERS] = const_array(ctx.v16i8, SI_NUM_RW_BUFFERS);
	params[SI_PARAM_CONST_BUFFERS] = ctx.i64;
	params[SI_PARAM_SAMPLERS] = ctx.i64;
	params[SI_PARAM_IMAGES] = ctx.i64;
	params[SI_PARAM_SHADER_BUFFERS] = ctx.i64;
	params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx.i32;
	params[SI_PARAM_TCS_OUT_OFFSETS] = ctx.i32;
	params[SI_PARAM_TCS_OUT_LAYOUT] = ctx.i32;
	params[SI_PARAM_TCS_IN_LAYOUT] = ctx.i32;
	params[ctx.param_oc_lds = SI_PARAM_TCS_OC_LDS] = ctx.i32;
	params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx.i32;
	last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET;
	num_params = last_sgpr + 1;

	/* Three VGPR inputs follow the SGPRs. */
	params[num_params++] = ctx.i32; /* patch index within the wave (REL_PATCH_ID) */
	params[num_params++] = ctx.i32; /* invocation ID within the patch */
	params[num_params++] = ctx.i32; /* LDS offset where tess factors should be loaded from */

	/* Create the function. */
	si_create_function(&ctx, NULL, 0, params, num_params, last_sgpr);
	declare_tess_lds(&ctx);
	func = ctx.radeon_bld.main_fn;

	/* Write the tess factors using the three VGPR inputs declared above. */
	si_write_tess_factors(bld_base,
			      LLVMGetParam(func, last_sgpr + 1),
			      LLVMGetParam(func, last_sgpr + 2),
			      LLVMGetParam(func, last_sgpr + 3));

	/* Compile. */
	LLVMBuildRetVoid(gallivm->builder);
	radeon_llvm_finalize_module(&ctx.radeon_bld);

	if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
			    gallivm->module, debug, ctx.type,
			    "Tessellation Control Shader Epilog"))
		status = false;

	radeon_llvm_dispose(&ctx.radeon_bld);
	return status;
}
7225
7226 /**
7227 * Select and compile (or reuse) TCS parts (epilog).
7228 */
7229 static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
7230 LLVMTargetMachineRef tm,
7231 struct si_shader *shader,
7232 struct pipe_debug_callback *debug)
7233 {
7234 union si_shader_part_key epilog_key;
7235
7236 /* Get the epilog. */
7237 memset(&epilog_key, 0, sizeof(epilog_key));
7238 epilog_key.tcs_epilog.states = shader->key.tcs.epilog;
7239
7240 shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
7241 &epilog_key, tm, debug,
7242 si_compile_tcs_epilog);
7243 return shader->epilog != NULL;
7244 }
7245
7246 /**
7247 * Compile the pixel shader prolog. This handles:
7248 * - two-side color selection and interpolation
7249 * - overriding interpolation parameters for the API PS
7250 * - polygon stippling
7251 *
7252 * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
 * overridden by other states. (e.g. per-sample interpolation)
7254 * Interpolated colors are stored after the preloaded VGPRs.
7255 */
7256 static bool si_compile_ps_prolog(struct si_screen *sscreen,
7257 LLVMTargetMachineRef tm,
7258 struct pipe_debug_callback *debug,
7259 struct si_shader_part *out)
7260 {
7261 union si_shader_part_key *key = &out->key;
7262 struct si_shader shader = {};
7263 struct si_shader_context ctx;
7264 struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
7265 LLVMTypeRef *params;
7266 LLVMValueRef ret, func;
7267 int last_sgpr, num_params, num_returns, i, num_color_channels;
7268 bool status = true;
7269
7270 si_init_shader_ctx(&ctx, sscreen, &shader, tm);
7271 ctx.type = PIPE_SHADER_FRAGMENT;
7272 shader.key.ps.prolog = key->ps_prolog.states;
7273
7274 /* Number of inputs + 8 color elements. */
7275 params = alloca((key->ps_prolog.num_input_sgprs +
7276 key->ps_prolog.num_input_vgprs + 8) *
7277 sizeof(LLVMTypeRef));
7278
7279 /* Declare inputs. */
7280 num_params = 0;
7281 for (i = 0; i < key->ps_prolog.num_input_sgprs; i++)
7282 params[num_params++] = ctx.i32;
7283 last_sgpr = num_params - 1;
7284
7285 for (i = 0; i < key->ps_prolog.num_input_vgprs; i++)
7286 params[num_params++] = ctx.f32;
7287
7288 /* Declare outputs (same as inputs + add colors if needed) */
7289 num_returns = num_params;
7290 num_color_channels = util_bitcount(key->ps_prolog.colors_read);
7291 for (i = 0; i < num_color_channels; i++)
7292 params[num_returns++] = ctx.f32;
7293
7294 /* Create the function. */
7295 si_create_function(&ctx, params, num_returns, params,
7296 num_params, last_sgpr);
7297 func = ctx.radeon_bld.main_fn;
7298
7299 /* Copy inputs to outputs. This should be no-op, as the registers match,
7300 * but it will prevent the compiler from overwriting them unintentionally.
7301 */
7302 ret = ctx.return_value;
7303 for (i = 0; i < num_params; i++) {
7304 LLVMValueRef p = LLVMGetParam(func, i);
7305 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
7306 }
7307
7308 /* Polygon stippling. */
7309 if (key->ps_prolog.states.poly_stipple) {
7310 /* POS_FIXED_PT is always last. */
7311 unsigned pos = key->ps_prolog.num_input_sgprs +
7312 key->ps_prolog.num_input_vgprs - 1;
7313 LLVMValueRef ptr[2], list;
7314
7315 /* Get the pointer to rw buffers. */
7316 ptr[0] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS);
7317 ptr[1] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS_HI);
7318 list = lp_build_gather_values(gallivm, ptr, 2);
7319 list = LLVMBuildBitCast(gallivm->builder, list, ctx.i64, "");
7320 list = LLVMBuildIntToPtr(gallivm->builder, list,
7321 const_array(ctx.v16i8, SI_NUM_RW_BUFFERS), "");
7322
7323 si_llvm_emit_polygon_stipple(&ctx, list, pos);
7324 }
7325
7326 if (key->ps_prolog.states.bc_optimize_for_persp ||
7327 key->ps_prolog.states.bc_optimize_for_linear) {
7328 unsigned i, base = key->ps_prolog.num_input_sgprs;
7329 LLVMValueRef center[2], centroid[2], tmp, bc_optimize;
7330
7331 /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
7332 * The hw doesn't compute CENTROID if the whole wave only
7333 * contains fully-covered quads.
7334 *
7335 * PRIM_MASK is after user SGPRs.
7336 */
7337 bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
7338 bc_optimize = LLVMBuildLShr(gallivm->builder, bc_optimize,
7339 LLVMConstInt(ctx.i32, 31, 0), "");
7340 bc_optimize = LLVMBuildTrunc(gallivm->builder, bc_optimize,
7341 ctx.i1, "");
7342
7343 if (key->ps_prolog.states.bc_optimize_for_persp) {
7344 /* Read PERSP_CENTER. */
7345 for (i = 0; i < 2; i++)
7346 center[i] = LLVMGetParam(func, base + 2 + i);
7347 /* Read PERSP_CENTROID. */
7348 for (i = 0; i < 2; i++)
7349 centroid[i] = LLVMGetParam(func, base + 4 + i);
7350 /* Select PERSP_CENTROID. */
7351 for (i = 0; i < 2; i++) {
7352 tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
7353 center[i], centroid[i], "");
7354 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7355 tmp, base + 4 + i, "");
7356 }
7357 }
7358 if (key->ps_prolog.states.bc_optimize_for_linear) {
7359 /* Read LINEAR_CENTER. */
7360 for (i = 0; i < 2; i++)
7361 center[i] = LLVMGetParam(func, base + 8 + i);
7362 /* Read LINEAR_CENTROID. */
7363 for (i = 0; i < 2; i++)
7364 centroid[i] = LLVMGetParam(func, base + 10 + i);
7365 /* Select LINEAR_CENTROID. */
7366 for (i = 0; i < 2; i++) {
7367 tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
7368 center[i], centroid[i], "");
7369 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7370 tmp, base + 10 + i, "");
7371 }
7372 }
7373 }
7374
7375 /* Interpolate colors. */
7376 for (i = 0; i < 2; i++) {
7377 unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
7378 unsigned face_vgpr = key->ps_prolog.num_input_sgprs +
7379 key->ps_prolog.face_vgpr_index;
7380 LLVMValueRef interp[2], color[4];
7381 LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL;
7382
7383 if (!writemask)
7384 continue;
7385
7386 /* If the interpolation qualifier is not CONSTANT (-1). */
7387 if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
7388 unsigned interp_vgpr = key->ps_prolog.num_input_sgprs +
7389 key->ps_prolog.color_interp_vgpr_index[i];
7390
7391 /* Get the (i,j) updated by bc_optimize handling. */
7392 interp[0] = LLVMBuildExtractValue(gallivm->builder, ret,
7393 interp_vgpr, "");
7394 interp[1] = LLVMBuildExtractValue(gallivm->builder, ret,
7395 interp_vgpr + 1, "");
7396 interp_ij = lp_build_gather_values(gallivm, interp, 2);
7397 interp_ij = LLVMBuildBitCast(gallivm->builder, interp_ij,
7398 ctx.v2i32, "");
7399 }
7400
7401 /* Use the absolute location of the input. */
7402 prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
7403
7404 if (key->ps_prolog.states.color_two_side) {
7405 face = LLVMGetParam(func, face_vgpr);
7406 face = LLVMBuildBitCast(gallivm->builder, face, ctx.i32, "");
7407 }
7408
7409 interp_fs_input(&ctx,
7410 key->ps_prolog.color_attr_index[i],
7411 TGSI_SEMANTIC_COLOR, i,
7412 key->ps_prolog.num_interp_inputs,
7413 key->ps_prolog.colors_read, interp_ij,
7414 prim_mask, face, color);
7415
7416 while (writemask) {
7417 unsigned chan = u_bit_scan(&writemask);
7418 ret = LLVMBuildInsertValue(gallivm->builder, ret, color[chan],
7419 num_params++, "");
7420 }
7421 }
7422
7423 /* Force per-sample interpolation. */
7424 if (key->ps_prolog.states.force_persp_sample_interp) {
7425 unsigned i, base = key->ps_prolog.num_input_sgprs;
7426 LLVMValueRef persp_sample[2];
7427
7428 /* Read PERSP_SAMPLE. */
7429 for (i = 0; i < 2; i++)
7430 persp_sample[i] = LLVMGetParam(func, base + i);
7431 /* Overwrite PERSP_CENTER. */
7432 for (i = 0; i < 2; i++)
7433 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7434 persp_sample[i], base + 2 + i, "");
7435 /* Overwrite PERSP_CENTROID. */
7436 for (i = 0; i < 2; i++)
7437 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7438 persp_sample[i], base + 4 + i, "");
7439 }
7440 if (key->ps_prolog.states.force_linear_sample_interp) {
7441 unsigned i, base = key->ps_prolog.num_input_sgprs;
7442 LLVMValueRef linear_sample[2];
7443
7444 /* Read LINEAR_SAMPLE. */
7445 for (i = 0; i < 2; i++)
7446 linear_sample[i] = LLVMGetParam(func, base + 6 + i);
7447 /* Overwrite LINEAR_CENTER. */
7448 for (i = 0; i < 2; i++)
7449 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7450 linear_sample[i], base + 8 + i, "");
7451 /* Overwrite LINEAR_CENTROID. */
7452 for (i = 0; i < 2; i++)
7453 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7454 linear_sample[i], base + 10 + i, "");
7455 }
7456
7457 /* Force center interpolation. */
7458 if (key->ps_prolog.states.force_persp_center_interp) {
7459 unsigned i, base = key->ps_prolog.num_input_sgprs;
7460 LLVMValueRef persp_center[2];
7461
7462 /* Read PERSP_CENTER. */
7463 for (i = 0; i < 2; i++)
7464 persp_center[i] = LLVMGetParam(func, base + 2 + i);
7465 /* Overwrite PERSP_SAMPLE. */
7466 for (i = 0; i < 2; i++)
7467 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7468 persp_center[i], base + i, "");
7469 /* Overwrite PERSP_CENTROID. */
7470 for (i = 0; i < 2; i++)
7471 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7472 persp_center[i], base + 4 + i, "");
7473 }
7474 if (key->ps_prolog.states.force_linear_center_interp) {
7475 unsigned i, base = key->ps_prolog.num_input_sgprs;
7476 LLVMValueRef linear_center[2];
7477
7478 /* Read LINEAR_CENTER. */
7479 for (i = 0; i < 2; i++)
7480 linear_center[i] = LLVMGetParam(func, base + 8 + i);
7481 /* Overwrite LINEAR_SAMPLE. */
7482 for (i = 0; i < 2; i++)
7483 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7484 linear_center[i], base + 6 + i, "");
7485 /* Overwrite LINEAR_CENTROID. */
7486 for (i = 0; i < 2; i++)
7487 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7488 linear_center[i], base + 10 + i, "");
7489 }
7490
7491 /* Tell LLVM to insert WQM instruction sequence when needed. */
7492 if (key->ps_prolog.wqm) {
7493 LLVMAddTargetDependentFunctionAttr(func,
7494 "amdgpu-ps-wqm-outputs", "");
7495 }
7496
7497 /* Compile. */
7498 si_llvm_build_ret(&ctx, ret);
7499 radeon_llvm_finalize_module(&ctx.radeon_bld);
7500
7501 if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
7502 gallivm->module, debug, ctx.type,
7503 "Fragment Shader Prolog"))
7504 status = false;
7505
7506 radeon_llvm_dispose(&ctx.radeon_bld);
7507 return status;
7508 }
7509
/**
 * Compile the pixel shader epilog. This handles everything that must be
 * emulated for pixel shader exports. (alpha-test, format conversions, etc)
 *
 * \param sscreen  screen the part is compiled for
 * \param tm       LLVM target machine to compile with
 * \param debug    debug callback for compiler diagnostics
 * \param out      shader part; supplies the epilog key (out->key) and
 *                 receives the compiled binary and config
 * \return true on success, false if LLVM compilation failed
 */
static bool si_compile_ps_epilog(struct si_screen *sscreen,
				 LLVMTargetMachineRef tm,
				 struct pipe_debug_callback *debug,
				 struct si_shader_part *out)
{
	union si_shader_part_key *key = &out->key;
	struct si_shader shader = {};
	struct si_shader_context ctx;
	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
	struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
	/* 16 SGPR slots + 8 MRTs * 4 channels + Z, stencil, samplemask VGPRs */
	LLVMTypeRef params[16+8*4+3];
	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
	int last_sgpr, num_params, i;
	bool status = true;
	struct si_ps_exports exp = {};

	si_init_shader_ctx(&ctx, sscreen, &shader, tm);
	ctx.type = PIPE_SHADER_FRAGMENT;
	shader.key.ps.epilog = key->ps_epilog.states;

	/* Declare input SGPRs. */
	params[SI_PARAM_RW_BUFFERS] = ctx.i64;
	params[SI_PARAM_CONST_BUFFERS] = ctx.i64;
	params[SI_PARAM_SAMPLERS] = ctx.i64;
	params[SI_PARAM_IMAGES] = ctx.i64;
	params[SI_PARAM_SHADER_BUFFERS] = ctx.i64;
	params[SI_PARAM_ALPHA_REF] = ctx.f32;
	last_sgpr = SI_PARAM_ALPHA_REF;

	/* Declare input VGPRs: 4 channels per written color, plus one each
	 * for Z, stencil and samplemask when they are written. */
	num_params = (last_sgpr + 1) +
		     util_bitcount(key->ps_epilog.colors_written) * 4 +
		     key->ps_epilog.writes_z +
		     key->ps_epilog.writes_stencil +
		     key->ps_epilog.writes_samplemask;

	/* Make sure the declaration reaches at least up to the samplemask
	 * slot, which the API shader always passes through (see the
	 * num_params - 1 use below). */
	num_params = MAX2(num_params,
			  last_sgpr + 1 + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);

	assert(num_params <= ARRAY_SIZE(params));

	for (i = last_sgpr + 1; i < num_params; i++)
		params[i] = ctx.f32;

	/* Create the function. */
	si_create_function(&ctx, NULL, 0, params, num_params, last_sgpr);
	/* Disable elimination of unused inputs. */
	radeon_llvm_add_attribute(ctx.radeon_bld.main_fn,
				  "InitialPSInputAddr", 0xffffff);

	/* Process colors. */
	unsigned vgpr = last_sgpr + 1;
	unsigned colors_written = key->ps_epilog.colors_written;
	int last_color_export = -1;

	/* Find the last color export. Only needed when Z/stencil/samplemask
	 * aren't exported, because then a color export is the final one. */
	if (!key->ps_epilog.writes_z &&
	    !key->ps_epilog.writes_stencil &&
	    !key->ps_epilog.writes_samplemask) {
		unsigned spi_format = key->ps_epilog.states.spi_shader_col_format;

		/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
		if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
			/* Just set this if any of the colorbuffers are enabled. */
			if (spi_format &
			    ((1llu << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
				last_color_export = 0;
		} else {
			/* Pick the highest MRT that is both written and has a
			 * non-NONE export format (4 bits per MRT). */
			for (i = 0; i < 8; i++)
				if (colors_written & (1 << i) &&
				    (spi_format >> (i * 4)) & 0xf)
					last_color_export = i;
		}
	}

	/* Read 4 consecutive VGPRs per written color and export it. */
	while (colors_written) {
		LLVMValueRef color[4];
		int mrt = u_bit_scan(&colors_written);

		for (i = 0; i < 4; i++)
			color[i] = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);

		si_export_mrt_color(bld_base, color, mrt,
				    num_params - 1, /* samplemask param index */
				    mrt == last_color_export, &exp);
	}

	/* Process depth, stencil, samplemask. */
	if (key->ps_epilog.writes_z)
		depth = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
	if (key->ps_epilog.writes_stencil)
		stencil = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
	if (key->ps_epilog.writes_samplemask)
		samplemask = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);

	if (depth || stencil || samplemask)
		si_export_mrt_z(bld_base, depth, stencil, samplemask, &exp);
	else if (last_color_export == -1)
		/* Nothing else was exported; emit a null export. */
		si_export_null(bld_base);

	if (exp.num)
		si_emit_ps_exports(&ctx, &exp);

	/* Compile. */
	LLVMBuildRetVoid(gallivm->builder);
	radeon_llvm_finalize_module(&ctx.radeon_bld);

	if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
			    gallivm->module, debug, ctx.type,
			    "Fragment Shader Epilog"))
		status = false;

	radeon_llvm_dispose(&ctx.radeon_bld);
	return status;
}
7629
/**
 * Select and compile (or reuse) pixel shader parts (prolog & epilog).
 *
 * Builds the prolog and epilog keys from the shader key and TGSI info,
 * fetches (or compiles) matching parts via si_get_shader_part, and then
 * fixes up spi_ps_input_ena so the enabled interpolation inputs match
 * what the selected parts actually consume.
 *
 * \return true on success, false if a part failed to compile
 */
static bool si_shader_select_ps_parts(struct si_screen *sscreen,
				      LLVMTargetMachineRef tm,
				      struct si_shader *shader,
				      struct pipe_debug_callback *debug)
{
	struct tgsi_shader_info *info = &shader->selector->info;
	union si_shader_part_key prolog_key;
	union si_shader_part_key epilog_key;
	unsigned i;

	/* Get the prolog. */
	memset(&prolog_key, 0, sizeof(prolog_key));
	prolog_key.ps_prolog.states = shader->key.ps.prolog;
	prolog_key.ps_prolog.colors_read = info->colors_read;
	prolog_key.ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
	prolog_key.ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
	/* wqm makes the prolog request whole-quad-mode handling (it sets the
	 * "amdgpu-ps-wqm-outputs" attribute in si_compile_ps_prolog) when a
	 * derivative-using shader has any prolog-computed interpolants. */
	prolog_key.ps_prolog.wqm = info->uses_derivatives &&
		(prolog_key.ps_prolog.colors_read ||
		 prolog_key.ps_prolog.states.force_persp_sample_interp ||
		 prolog_key.ps_prolog.states.force_linear_sample_interp ||
		 prolog_key.ps_prolog.states.force_persp_center_interp ||
		 prolog_key.ps_prolog.states.force_linear_center_interp ||
		 prolog_key.ps_prolog.states.bc_optimize_for_persp ||
		 prolog_key.ps_prolog.states.bc_optimize_for_linear);

	if (info->colors_read) {
		unsigned *color = shader->selector->color_attr_index;

		if (shader->key.ps.prolog.color_two_side) {
			/* BCOLORs are stored after the last input. */
			prolog_key.ps_prolog.num_interp_inputs = info->num_inputs;
			prolog_key.ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
			shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
		}

		/* Per color input: pick which (i,j) VGPR pair the prolog
		 * reads (-1 = constant, no interpolation) and enable the
		 * matching SPI input. The indices 0/2/4 and 6/8/10 are the
		 * PERSP and LINEAR sample/center/centroid pair offsets used
		 * by si_compile_ps_prolog. */
		for (i = 0; i < 2; i++) {
			unsigned interp = info->input_interpolate[color[i]];
			unsigned location = info->input_interpolate_loc[color[i]];

			if (!(info->colors_read & (0xf << i*4)))
				continue;

			prolog_key.ps_prolog.color_attr_index[i] = color[i];

			if (shader->key.ps.prolog.flatshade_colors &&
			    interp == TGSI_INTERPOLATE_COLOR)
				interp = TGSI_INTERPOLATE_CONSTANT;

			switch (interp) {
			case TGSI_INTERPOLATE_CONSTANT:
				prolog_key.ps_prolog.color_interp_vgpr_index[i] = -1;
				break;
			case TGSI_INTERPOLATE_PERSPECTIVE:
			case TGSI_INTERPOLATE_COLOR:
				/* Force the interpolation location for colors here. */
				if (shader->key.ps.prolog.force_persp_sample_interp)
					location = TGSI_INTERPOLATE_LOC_SAMPLE;
				if (shader->key.ps.prolog.force_persp_center_interp)
					location = TGSI_INTERPOLATE_LOC_CENTER;

				switch (location) {
				case TGSI_INTERPOLATE_LOC_SAMPLE:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 0;
					shader->config.spi_ps_input_ena |=
						S_0286CC_PERSP_SAMPLE_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTER:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 2;
					shader->config.spi_ps_input_ena |=
						S_0286CC_PERSP_CENTER_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTROID:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 4;
					shader->config.spi_ps_input_ena |=
						S_0286CC_PERSP_CENTROID_ENA(1);
					break;
				default:
					assert(0);
				}
				break;
			case TGSI_INTERPOLATE_LINEAR:
				/* Force the interpolation location for colors here. */
				if (shader->key.ps.prolog.force_linear_sample_interp)
					location = TGSI_INTERPOLATE_LOC_SAMPLE;
				if (shader->key.ps.prolog.force_linear_center_interp)
					location = TGSI_INTERPOLATE_LOC_CENTER;

				switch (location) {
				case TGSI_INTERPOLATE_LOC_SAMPLE:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 6;
					shader->config.spi_ps_input_ena |=
						S_0286CC_LINEAR_SAMPLE_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTER:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 8;
					shader->config.spi_ps_input_ena |=
						S_0286CC_LINEAR_CENTER_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTROID:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 10;
					shader->config.spi_ps_input_ena |=
						S_0286CC_LINEAR_CENTROID_ENA(1);
					break;
				default:
					assert(0);
				}
				break;
			default:
				assert(0);
			}
		}
	}

	/* The prolog is a no-op if these aren't set. */
	if (prolog_key.ps_prolog.colors_read ||
	    prolog_key.ps_prolog.states.force_persp_sample_interp ||
	    prolog_key.ps_prolog.states.force_linear_sample_interp ||
	    prolog_key.ps_prolog.states.force_persp_center_interp ||
	    prolog_key.ps_prolog.states.force_linear_center_interp ||
	    prolog_key.ps_prolog.states.bc_optimize_for_persp ||
	    prolog_key.ps_prolog.states.bc_optimize_for_linear ||
	    prolog_key.ps_prolog.states.poly_stipple) {
		shader->prolog =
			si_get_shader_part(sscreen, &sscreen->ps_prologs,
					   &prolog_key, tm, debug,
					   si_compile_ps_prolog);
		if (!shader->prolog)
			return false;
	}

	/* Get the epilog. The epilog is always needed (unconditional). */
	memset(&epilog_key, 0, sizeof(epilog_key));
	epilog_key.ps_epilog.colors_written = info->colors_written;
	epilog_key.ps_epilog.writes_z = info->writes_z;
	epilog_key.ps_epilog.writes_stencil = info->writes_stencil;
	epilog_key.ps_epilog.writes_samplemask = info->writes_samplemask;
	epilog_key.ps_epilog.states = shader->key.ps.epilog;

	shader->epilog =
		si_get_shader_part(sscreen, &sscreen->ps_epilogs,
				   &epilog_key, tm, debug,
				   si_compile_ps_epilog);
	if (!shader->epilog)
		return false;

	/* Enable POS_FIXED_PT if polygon stippling is enabled. */
	if (shader->key.ps.prolog.poly_stipple) {
		shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
		assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
	}

	/* Set up the enable bits for per-sample shading if needed:
	 * when an interpolation mode is forced, replace the original
	 * center/centroid/sample enables with the forced one. */
	if (shader->key.ps.prolog.force_persp_sample_interp &&
	    (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
	     G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
		shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
	}
	if (shader->key.ps.prolog.force_linear_sample_interp &&
	    (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
	     G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
		shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
	}
	if (shader->key.ps.prolog.force_persp_center_interp &&
	    (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
	     G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
		shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
	}
	if (shader->key.ps.prolog.force_linear_center_interp &&
	    (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
	     G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
		shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
	}

	/* POW_W_FLOAT requires that one of the perspective weights is enabled. */
	if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
	    !(shader->config.spi_ps_input_ena & 0xf)) {
		shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
		assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
	}

	/* At least one pair of interpolation weights must be enabled. */
	if (!(shader->config.spi_ps_input_ena & 0x7f)) {
		shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
		assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
	}

	/* The sample mask input is always enabled, because the API shader always
	 * passes it through to the epilog. Disable it here if it's unused.
	 */
	if (!shader->key.ps.epilog.poly_line_smoothing &&
	    !shader->selector->info.reads_samplemask)
		shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;

	return true;
}
7836
7837 static void si_fix_num_sgprs(struct si_shader *shader)
7838 {
7839 unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
7840
7841 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
7842 }
7843
/**
 * Create a shader variant: either compile the whole TGSI shader
 * monolithically, or reuse the precompiled main part and attach
 * prolog/epilog parts, then dump and upload the final binary.
 *
 * \param sscreen  screen the shader belongs to
 * \param tm       LLVM target machine used for any compilation
 * \param shader   shader variant to create; shader->selector and
 *                 shader->key must be set
 * \param debug    debug callback for compiler diagnostics
 * \return 0 on success, non-zero on compile or upload failure
 */
int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
		     struct si_shader *shader,
		     struct pipe_debug_callback *debug)
{
	struct si_shader *mainp = shader->selector->main_shader_part;
	int r;

	/* LS, ES, VS are compiled on demand if the main part hasn't been
	 * compiled for that stage.
	 */
	if (!mainp ||
	    (shader->selector->type == PIPE_SHADER_VERTEX &&
	     (shader->key.vs.as_es != mainp->key.vs.as_es ||
	      shader->key.vs.as_ls != mainp->key.vs.as_ls)) ||
	    (shader->selector->type == PIPE_SHADER_TESS_EVAL &&
	     shader->key.tes.as_es != mainp->key.tes.as_es) ||
	    (shader->selector->type == PIPE_SHADER_TESS_CTRL &&
	     shader->key.tcs.epilog.inputs_to_copy) ||
	    shader->selector->type == PIPE_SHADER_COMPUTE) {
		/* Monolithic shader (compiled as a whole, has many variants,
		 * may take a long time to compile).
		 */
		r = si_compile_tgsi_shader(sscreen, tm, shader, true, debug);
		if (r)
			return r;
	} else {
		/* The shader consists of 2-3 parts:
		 *
		 * - the middle part is the user shader, it has 1 variant only
		 *   and it was compiled during the creation of the shader
		 *   selector
		 * - the prolog part is inserted at the beginning
		 * - the epilog part is inserted at the end
		 *
		 * The prolog and epilog have many (but simple) variants.
		 */

		/* Copy the compiled TGSI shader data over. The binary is
		 * shared with the main part, so it must not be freed here
		 * (see is_binary_shared in si_shader_destroy). */
		shader->is_binary_shared = true;
		shader->binary = mainp->binary;
		shader->config = mainp->config;
		shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
		shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
		shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
		memcpy(shader->info.vs_output_param_offset,
		       mainp->info.vs_output_param_offset,
		       sizeof(mainp->info.vs_output_param_offset));
		shader->info.uses_instanceid = mainp->info.uses_instanceid;
		shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
		shader->info.nr_param_exports = mainp->info.nr_param_exports;

		/* Select prologs and/or epilogs. Stages without parts
		 * (e.g. GS) intentionally fall through. */
		switch (shader->selector->type) {
		case PIPE_SHADER_VERTEX:
			if (!si_shader_select_vs_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_TESS_CTRL:
			if (!si_shader_select_tcs_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_TESS_EVAL:
			if (!si_shader_select_tes_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_FRAGMENT:
			if (!si_shader_select_ps_parts(sscreen, tm, shader, debug))
				return -1;

			/* Make sure we have at least as many VGPRs as there
			 * are allocated inputs.
			 */
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->info.num_input_vgprs);
			break;
		}

		/* Update SGPR and VGPR counts: the combined shader must
		 * satisfy the largest requirement of any of its parts. */
		if (shader->prolog) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->prolog->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->prolog->config.num_vgprs);
		}
		if (shader->epilog) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->epilog->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->epilog->config.num_vgprs);
		}
	}

	si_fix_num_sgprs(shader);
	si_shader_dump(sscreen, shader, debug, shader->selector->info.processor,
		       stderr);

	/* Upload. */
	r = si_shader_binary_upload(sscreen, shader);
	if (r) {
		fprintf(stderr, "LLVM failed to upload shader\n");
		return r;
	}

	return 0;
}
7949
7950 void si_shader_destroy(struct si_shader *shader)
7951 {
7952 if (shader->gs_copy_shader) {
7953 si_shader_destroy(shader->gs_copy_shader);
7954 FREE(shader->gs_copy_shader);
7955 }
7956
7957 if (shader->scratch_bo)
7958 r600_resource_reference(&shader->scratch_bo, NULL);
7959
7960 r600_resource_reference(&shader->bo, NULL);
7961
7962 if (!shader->is_binary_shared)
7963 radeon_shader_binary_clean(&shader->binary);
7964
7965 free(shader->shader_log);
7966 }