radeonsi: split ps.prolog.force_persample_interp into persp and linear bits
[mesa.git] / src / gallium / drivers / radeonsi / si_shader.c
1 /*
2 * Copyright 2012 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Tom Stellard <thomas.stellard@amd.com>
25 * Michel Dänzer <michel.daenzer@amd.com>
26 * Christian König <christian.koenig@amd.com>
27 */
28
29 #include "gallivm/lp_bld_const.h"
30 #include "gallivm/lp_bld_gather.h"
31 #include "gallivm/lp_bld_intr.h"
32 #include "gallivm/lp_bld_logic.h"
33 #include "gallivm/lp_bld_arit.h"
34 #include "gallivm/lp_bld_bitarit.h"
35 #include "gallivm/lp_bld_flow.h"
36 #include "radeon/r600_cs.h"
37 #include "radeon/radeon_llvm.h"
38 #include "radeon/radeon_elf_util.h"
39 #include "radeon/radeon_llvm_emit.h"
40 #include "util/u_memory.h"
41 #include "util/u_pstipple.h"
42 #include "util/u_string.h"
43 #include "tgsi/tgsi_parse.h"
44 #include "tgsi/tgsi_build.h"
45 #include "tgsi/tgsi_util.h"
46 #include "tgsi/tgsi_dump.h"
47
48 #include "si_pipe.h"
49 #include "si_shader.h"
50 #include "sid.h"
51
52 #include <errno.h>
53
/* Names of the two dwords of the scratch buffer resource descriptor.
 * NOTE(review): presumably these are resolved/patched against the compiled
 * ELF binary when the shader is uploaded — confirm at the use sites. */
static const char *scratch_rsrc_dword0_symbol =
	"SCRATCH_RSRC_DWORD0";

static const char *scratch_rsrc_dword1_symbol =
	"SCRATCH_RSRC_DWORD1";
59
/* One shader output to be exported: up to four channel values plus the
 * TGSI semantic (name + index) identifying the output. */
struct si_shader_output_values
{
	LLVMValueRef values[4];	/* one value per channel (x, y, z, w) */
	unsigned name;		/* TGSI_SEMANTIC_* */
	unsigned sid;		/* semantic index */
};
66
/* Per-compilation state threaded through the whole TGSI->LLVM translation.
 * radeon_bld must remain the first member so that si_shader_context() can
 * downcast from the embedded lp_build_tgsi_context. */
struct si_shader_context
{
	struct radeon_llvm_context radeon_bld;	/* base context; must be first */
	struct si_shader *shader;		/* the shader being compiled */
	struct si_screen *screen;

	unsigned type; /* PIPE_SHADER_* specifies the type of shader. */
	bool is_gs_copy_shader;

	/* Whether to generate the optimized shader variant compiled as a whole
	 * (without a prolog and epilog)
	 */
	bool is_monolithic;

	/* Indices of the main function's input parameters, used with
	 * LLVMGetParam. Which of these are valid depends on the shader
	 * stage being compiled. */
	int param_streamout_config;
	int param_streamout_write_index;
	int param_streamout_offset[4];
	int param_vertex_id;
	int param_rel_auto_id;
	int param_vs_prim_id;
	int param_instance_id;
	int param_vertex_index0;
	int param_tes_u;
	int param_tes_v;
	int param_tes_rel_patch_id;
	int param_tes_patch_id;
	int param_es2gs_offset;
	int param_oc_lds;

	/* Sets a bit if the dynamic HS control word was 0x80000000. The bit is
	 * 0x800000 for VS, 0x1 for ES.
	 */
	int param_tess_offchip;

	LLVMTargetMachineRef tm;	/* target machine used for codegen */

	/* Metadata used to annotate loads; see build_indexed_load() and
	 * build_indexed_load_const(). */
	unsigned uniform_md_kind;
	LLVMValueRef const_md;
	LLVMValueRef empty_md;
	/* Cached LLVM values for descriptors, rings and shared memory. */
	LLVMValueRef const_buffers[SI_NUM_CONST_BUFFERS];
	LLVMValueRef lds;	/* LDS array; see lds_load()/lds_store() */
	LLVMValueRef *constants[SI_NUM_CONST_BUFFERS];
	LLVMValueRef shader_buffers[SI_NUM_SHADER_BUFFERS];
	LLVMValueRef sampler_views[SI_NUM_SAMPLERS];
	LLVMValueRef sampler_states[SI_NUM_SAMPLERS];
	LLVMValueRef fmasks[SI_NUM_SAMPLERS];
	LLVMValueRef images[SI_NUM_IMAGES];
	LLVMValueRef so_buffers[4];	/* streamout buffers */
	LLVMValueRef esgs_ring;
	LLVMValueRef gsvs_ring[4];
	LLVMValueRef gs_next_vertex[4];
	LLVMValueRef return_value;

	/* Frequently used LLVM types, cached once per context. */
	LLVMTypeRef voidt;
	LLVMTypeRef i1;
	LLVMTypeRef i8;
	LLVMTypeRef i32;
	LLVMTypeRef i64;
	LLVMTypeRef i128;
	LLVMTypeRef f32;
	LLVMTypeRef v16i8;
	LLVMTypeRef v2i32;
	LLVMTypeRef v4i32;
	LLVMTypeRef v4f32;
	LLVMTypeRef v8i32;

	LLVMValueRef shared_memory;
};
135
/* Downcast the generic TGSI build context to our derived context.
 * Relies on radeon_bld being the first member of si_shader_context. */
static struct si_shader_context *si_shader_context(
	struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx =
		(struct si_shader_context *)bld_base;

	return ctx;
}
141
/* Forward declarations for helpers defined later in this file. */
static void si_init_shader_ctx(struct si_shader_context *ctx,
			       struct si_screen *sscreen,
			       struct si_shader *shader,
			       LLVMTargetMachineRef tm);

static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
				 struct lp_build_tgsi_context *bld_base,
				 struct lp_build_emit_data *emit_data);

/* Ideally pass the sample mask input to the PS epilog as v13, which
 * is its usual location, so that the shader doesn't have to add v_mov.
 */
#define PS_EPILOG_SAMPLEMASK_MIN_LOC 13

/* The VS location of the PrimitiveID input is the same in the epilog,
 * so that the main shader part doesn't have to move it.
 */
#define VS_EPILOG_PRIMID_LOC 2

/* Base indices of the perspective and linear interpolation inputs,
 * with per-mode offsets below (sample/center/centroid). */
#define PERSPECTIVE_BASE 0
#define LINEAR_BASE 9

#define SAMPLE_OFFSET 0
#define CENTER_OFFSET 2
/* NOTE(review): "OFSET" is a long-standing typo; keep the name as-is
 * since other code references it. */
#define CENTROID_OFSET 4

#define USE_SGPR_MAX_SUFFIX_LEN 5
/* LLVM address-space numbers used when building pointer types. */
#define CONST_ADDR_SPACE 2
#define LOCAL_ADDR_SPACE 3
#define USER_SGPR_ADDR_SPACE 8


/* Message types / opcodes for the s_sendmsg GS messages. */
#define SENDMSG_GS 2
#define SENDMSG_GS_DONE 3

#define SENDMSG_GS_OP_NOP (0 << 4)
#define SENDMSG_GS_OP_CUT (1 << 4)
#define SENDMSG_GS_OP_EMIT (2 << 4)
#define SENDMSG_GS_OP_EMIT_CUT (3 << 4)
181
182 /**
183 * Returns a unique index for a semantic name and index. The index must be
184 * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
185 * calculated.
186 */
187 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
188 {
189 switch (semantic_name) {
190 case TGSI_SEMANTIC_POSITION:
191 return 0;
192 case TGSI_SEMANTIC_PSIZE:
193 return 1;
194 case TGSI_SEMANTIC_CLIPDIST:
195 assert(index <= 1);
196 return 2 + index;
197 case TGSI_SEMANTIC_GENERIC:
198 if (index <= 63-4)
199 return 4 + index;
200 else
201 /* same explanation as in the default statement,
202 * the only user hitting this is st/nine.
203 */
204 return 0;
205
206 /* patch indices are completely separate and thus start from 0 */
207 case TGSI_SEMANTIC_TESSOUTER:
208 return 0;
209 case TGSI_SEMANTIC_TESSINNER:
210 return 1;
211 case TGSI_SEMANTIC_PATCH:
212 return 2 + index;
213
214 default:
215 /* Don't fail here. The result of this function is only used
216 * for LS, TCS, TES, and GS, where legacy GL semantics can't
217 * occur, but this function is called for all vertex shaders
218 * before it's known whether LS will be compiled or not.
219 */
220 return 0;
221 }
222 }
223
224 /**
225 * Get the value of a shader input parameter and extract a bitfield.
226 */
227 static LLVMValueRef unpack_param(struct si_shader_context *ctx,
228 unsigned param, unsigned rshift,
229 unsigned bitwidth)
230 {
231 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
232 LLVMValueRef value = LLVMGetParam(ctx->radeon_bld.main_fn,
233 param);
234
235 if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
236 value = bitcast(&ctx->radeon_bld.soa.bld_base,
237 TGSI_TYPE_UNSIGNED, value);
238
239 if (rshift)
240 value = LLVMBuildLShr(gallivm->builder, value,
241 lp_build_const_int32(gallivm, rshift), "");
242
243 if (rshift + bitwidth < 32) {
244 unsigned mask = (1 << bitwidth) - 1;
245 value = LLVMBuildAnd(gallivm->builder, value,
246 lp_build_const_int32(gallivm, mask), "");
247 }
248
249 return value;
250 }
251
252 static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx)
253 {
254 switch (ctx->type) {
255 case PIPE_SHADER_TESS_CTRL:
256 return unpack_param(ctx, SI_PARAM_REL_IDS, 0, 8);
257
258 case PIPE_SHADER_TESS_EVAL:
259 return LLVMGetParam(ctx->radeon_bld.main_fn,
260 ctx->param_tes_rel_patch_id);
261
262 default:
263 assert(0);
264 return NULL;
265 }
266 }
267
268 /* Tessellation shaders pass outputs to the next shader using LDS.
269 *
270 * LS outputs = TCS inputs
271 * TCS outputs = TES inputs
272 *
273 * The LDS layout is:
274 * - TCS inputs for patch 0
275 * - TCS inputs for patch 1
276 * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2)
277 * - ...
278 * - TCS outputs for patch 0 = get_tcs_out_patch0_offset
279 * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset
280 * - TCS outputs for patch 1
281 * - Per-patch TCS outputs for patch 1
282 * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2)
283 * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
284 * - ...
285 *
286 * All three shaders VS(LS), TCS, TES share the same LDS space.
287 */
288
289 static LLVMValueRef
290 get_tcs_in_patch_stride(struct si_shader_context *ctx)
291 {
292 if (ctx->type == PIPE_SHADER_VERTEX)
293 return unpack_param(ctx, SI_PARAM_LS_OUT_LAYOUT, 0, 13);
294 else if (ctx->type == PIPE_SHADER_TESS_CTRL)
295 return unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 0, 13);
296 else {
297 assert(0);
298 return NULL;
299 }
300 }
301
302 static LLVMValueRef
303 get_tcs_out_patch_stride(struct si_shader_context *ctx)
304 {
305 return unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 0, 13);
306 }
307
308 static LLVMValueRef
309 get_tcs_out_patch0_offset(struct si_shader_context *ctx)
310 {
311 return lp_build_mul_imm(&ctx->radeon_bld.soa.bld_base.uint_bld,
312 unpack_param(ctx,
313 SI_PARAM_TCS_OUT_OFFSETS,
314 0, 16),
315 4);
316 }
317
318 static LLVMValueRef
319 get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
320 {
321 return lp_build_mul_imm(&ctx->radeon_bld.soa.bld_base.uint_bld,
322 unpack_param(ctx,
323 SI_PARAM_TCS_OUT_OFFSETS,
324 16, 16),
325 4);
326 }
327
328 static LLVMValueRef
329 get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
330 {
331 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
332 LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
333 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
334
335 return LLVMBuildMul(gallivm->builder, patch_stride, rel_patch_id, "");
336 }
337
338 static LLVMValueRef
339 get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
340 {
341 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
342 LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
343 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
344 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
345
346 return LLVMBuildAdd(gallivm->builder, patch0_offset,
347 LLVMBuildMul(gallivm->builder, patch_stride,
348 rel_patch_id, ""),
349 "");
350 }
351
352 static LLVMValueRef
353 get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
354 {
355 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
356 LLVMValueRef patch0_patch_data_offset =
357 get_tcs_out_patch0_patch_data_offset(ctx);
358 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
359 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
360
361 return LLVMBuildAdd(gallivm->builder, patch0_patch_data_offset,
362 LLVMBuildMul(gallivm->builder, patch_stride,
363 rel_patch_id, ""),
364 "");
365 }
366
367 static void build_indexed_store(struct si_shader_context *ctx,
368 LLVMValueRef base_ptr, LLVMValueRef index,
369 LLVMValueRef value)
370 {
371 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
372 struct gallivm_state *gallivm = bld_base->base.gallivm;
373 LLVMValueRef indices[2], pointer;
374
375 indices[0] = bld_base->uint_bld.zero;
376 indices[1] = index;
377
378 pointer = LLVMBuildGEP(gallivm->builder, base_ptr, indices, 2, "");
379 LLVMBuildStore(gallivm->builder, value, pointer);
380 }
381
382 /**
383 * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad.
384 * It's equivalent to doing a load from &base_ptr[index].
385 *
386 * \param base_ptr Where the array starts.
387 * \param index The element index into the array.
388 * \param uniform Whether the base_ptr and index can be assumed to be
389 * dynamically uniform
390 */
391 static LLVMValueRef build_indexed_load(struct si_shader_context *ctx,
392 LLVMValueRef base_ptr, LLVMValueRef index,
393 bool uniform)
394 {
395 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
396 struct gallivm_state *gallivm = bld_base->base.gallivm;
397 LLVMValueRef indices[2], pointer;
398
399 indices[0] = bld_base->uint_bld.zero;
400 indices[1] = index;
401
402 pointer = LLVMBuildGEP(gallivm->builder, base_ptr, indices, 2, "");
403 if (uniform)
404 LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md);
405 return LLVMBuildLoad(gallivm->builder, pointer, "");
406 }
407
408 /**
409 * Do a load from &base_ptr[index], but also add a flag that it's loading
410 * a constant from a dynamically uniform index.
411 */
412 static LLVMValueRef build_indexed_load_const(
413 struct si_shader_context *ctx,
414 LLVMValueRef base_ptr, LLVMValueRef index)
415 {
416 LLVMValueRef result = build_indexed_load(ctx, base_ptr, index, true);
417 LLVMSetMetadata(result, 1, ctx->const_md);
418 return result;
419 }
420
421 static LLVMValueRef get_instance_index_for_fetch(
422 struct radeon_llvm_context *radeon_bld,
423 unsigned param_start_instance, unsigned divisor)
424 {
425 struct si_shader_context *ctx =
426 si_shader_context(&radeon_bld->soa.bld_base);
427 struct gallivm_state *gallivm = radeon_bld->soa.bld_base.base.gallivm;
428
429 LLVMValueRef result = LLVMGetParam(radeon_bld->main_fn,
430 ctx->param_instance_id);
431
432 /* The division must be done before START_INSTANCE is added. */
433 if (divisor > 1)
434 result = LLVMBuildUDiv(gallivm->builder, result,
435 lp_build_const_int32(gallivm, divisor), "");
436
437 return LLVMBuildAdd(gallivm->builder, result,
438 LLVMGetParam(radeon_bld->main_fn, param_start_instance), "");
439 }
440
/* Declare one vertex-shader input: fetch the attribute from its vertex
 * buffer with llvm.SI.vs.load.input and split the resulting vec4 into
 * per-channel values in radeon_bld.inputs.
 */
static void declare_input_vs(
	struct radeon_llvm_context *radeon_bld,
	unsigned input_index,
	const struct tgsi_full_declaration *decl)
{
	struct lp_build_context *base = &radeon_bld->soa.bld_base.base;
	struct gallivm_state *gallivm = base->gallivm;
	struct si_shader_context *ctx =
		si_shader_context(&radeon_bld->soa.bld_base);
	unsigned divisor =
		ctx->shader->key.vs.prolog.instance_divisors[input_index];

	unsigned chan;

	LLVMValueRef t_list_ptr;
	LLVMValueRef t_offset;
	LLVMValueRef t_list;
	LLVMValueRef attribute_offset;
	LLVMValueRef buffer_index;
	LLVMValueRef args[3];
	LLVMValueRef input;

	/* Load the T list */
	t_list_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_VERTEX_BUFFERS);

	t_offset = lp_build_const_int32(gallivm, input_index);

	t_list = build_indexed_load_const(ctx, t_list_ptr, t_offset);

	/* Build the attribute offset */
	attribute_offset = lp_build_const_int32(gallivm, 0);

	if (!ctx->is_monolithic) {
		/* Non-monolithic shaders receive per-attribute vertex
		 * indices as input parameters starting at
		 * param_vertex_index0 (NOTE(review): presumably computed
		 * by the VS prolog — confirm there). */
		buffer_index = LLVMGetParam(radeon_bld->main_fn,
					    ctx->param_vertex_index0 +
					    input_index);
	} else if (divisor) {
		/* Build index from instance ID, start instance and divisor */
		ctx->shader->info.uses_instanceid = true;
		buffer_index = get_instance_index_for_fetch(&ctx->radeon_bld,
							    SI_PARAM_START_INSTANCE,
							    divisor);
	} else {
		/* Load the buffer index for vertices. */
		LLVMValueRef vertex_id = LLVMGetParam(ctx->radeon_bld.main_fn,
						      ctx->param_vertex_id);
		LLVMValueRef base_vertex = LLVMGetParam(radeon_bld->main_fn,
							SI_PARAM_BASE_VERTEX);
		buffer_index = LLVMBuildAdd(gallivm->builder, base_vertex, vertex_id, "");
	}

	args[0] = t_list;
	args[1] = attribute_offset;
	args[2] = buffer_index;
	input = lp_build_intrinsic(gallivm->builder,
				   "llvm.SI.vs.load.input", ctx->v4f32, args, 3,
				   LLVMReadNoneAttribute | LLVMNoUnwindAttribute);

	/* Break up the vec4 into individual components */
	for (chan = 0; chan < 4; chan++) {
		LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
		/* XXX: Use a helper function for this. There is one in
		 * tgsi_llvm.c. */
		ctx->radeon_bld.inputs[radeon_llvm_reg_index_soa(input_index, chan)] =
			LLVMBuildExtractElement(gallivm->builder,
						input, llvm_chan, "");
	}
}
509
510 static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base,
511 unsigned swizzle)
512 {
513 struct si_shader_context *ctx = si_shader_context(bld_base);
514
515 if (swizzle > 0)
516 return bld_base->uint_bld.zero;
517
518 switch (ctx->type) {
519 case PIPE_SHADER_VERTEX:
520 return LLVMGetParam(ctx->radeon_bld.main_fn,
521 ctx->param_vs_prim_id);
522 case PIPE_SHADER_TESS_CTRL:
523 return LLVMGetParam(ctx->radeon_bld.main_fn,
524 SI_PARAM_PATCH_ID);
525 case PIPE_SHADER_TESS_EVAL:
526 return LLVMGetParam(ctx->radeon_bld.main_fn,
527 ctx->param_tes_patch_id);
528 case PIPE_SHADER_GEOMETRY:
529 return LLVMGetParam(ctx->radeon_bld.main_fn,
530 SI_PARAM_PRIMITIVE_ID);
531 default:
532 assert(0);
533 return bld_base->uint_bld.zero;
534 }
535 }
536
537 /**
538 * Return the value of tgsi_ind_register for indexing.
539 * This is the indirect index with the constant offset added to it.
540 */
541 static LLVMValueRef get_indirect_index(struct si_shader_context *ctx,
542 const struct tgsi_ind_register *ind,
543 int rel_index)
544 {
545 struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
546 LLVMValueRef result;
547
548 result = ctx->radeon_bld.soa.addr[ind->Index][ind->Swizzle];
549 result = LLVMBuildLoad(gallivm->builder, result, "");
550 result = LLVMBuildAdd(gallivm->builder, result,
551 lp_build_const_int32(gallivm, rel_index), "");
552 return result;
553 }
554
555 /**
556 * Like get_indirect_index, but restricts the return value to a (possibly
557 * undefined) value inside [0..num).
558 */
559 static LLVMValueRef get_bounded_indirect_index(struct si_shader_context *ctx,
560 const struct tgsi_ind_register *ind,
561 int rel_index, unsigned num)
562 {
563 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
564 LLVMBuilderRef builder = gallivm->builder;
565 LLVMValueRef result = get_indirect_index(ctx, ind, rel_index);
566 LLVMValueRef c_max = LLVMConstInt(ctx->i32, num - 1, 0);
567 LLVMValueRef cc;
568
569 /* LLVM 3.8: If indirect resource indexing is used:
570 * - SI & CIK hang
571 * - VI crashes
572 */
573 if (HAVE_LLVM <= 0x0308)
574 return LLVMGetUndef(ctx->i32);
575
576 if (util_is_power_of_two(num)) {
577 result = LLVMBuildAnd(builder, result, c_max, "");
578 } else {
579 /* In theory, this MAX pattern should result in code that is
580 * as good as the bit-wise AND above.
581 *
582 * In practice, LLVM generates worse code (at the time of
583 * writing), because its value tracking is not strong enough.
584 */
585 cc = LLVMBuildICmp(builder, LLVMIntULE, result, c_max, "");
586 result = LLVMBuildSelect(builder, cc, result, c_max, "");
587 }
588
589 return result;
590 }
591
592
/**
 * Calculate a dword address given an input or output register and a stride.
 */
static LLVMValueRef get_dw_address(struct si_shader_context *ctx,
				   const struct tgsi_full_dst_register *dst,
				   const struct tgsi_full_src_register *src,
				   LLVMValueRef vertex_dw_stride,
				   LLVMValueRef base_addr)
{
	struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	ubyte *name, *index, *array_first;
	int first, param;
	struct tgsi_full_dst_register reg;

	/* Set the register description. The address computation is the same
	 * for sources and destinations. */
	if (src) {
		reg.Register.File = src->Register.File;
		reg.Register.Index = src->Register.Index;
		reg.Register.Indirect = src->Register.Indirect;
		reg.Register.Dimension = src->Register.Dimension;
		reg.Indirect = src->Indirect;
		reg.Dimension = src->Dimension;
		reg.DimIndirect = src->DimIndirect;
	} else
		reg = *dst;

	/* If the register is 2-dimensional (e.g. an array of vertices
	 * in a primitive), calculate the base address of the vertex. */
	if (reg.Register.Dimension) {
		LLVMValueRef index;

		if (reg.Dimension.Indirect)
			index = get_indirect_index(ctx, &reg.DimIndirect,
						   reg.Dimension.Index);
		else
			index = lp_build_const_int32(gallivm, reg.Dimension.Index);

		/* base += vertex_index * vertex_dw_stride */
		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 LLVMBuildMul(gallivm->builder, index,
						      vertex_dw_stride, ""), "");
	}

	/* Get information about the register. */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		/* Only inputs and outputs have dword addresses here. */
		assert(0);
		return NULL;
	}

	if (reg.Register.Indirect) {
		/* Add the relative address of the element. */
		LLVMValueRef ind_index;

		/* The semantic lookup below must use the first register
		 * of the array, not the indirectly-addressed one. */
		if (reg.Indirect.ArrayID)
			first = array_first[reg.Indirect.ArrayID];
		else
			first = reg.Register.Index;

		ind_index = get_indirect_index(ctx, &reg.Indirect,
					       reg.Register.Index - first);

		/* Each register slot is 4 dwords (one vec4). */
		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 LLVMBuildMul(gallivm->builder, ind_index,
						      lp_build_const_int32(gallivm, 4), ""), "");

		param = si_shader_io_get_unique_index(name[first], index[first]);
	} else {
		param = si_shader_io_get_unique_index(name[reg.Register.Index],
						      index[reg.Register.Index]);
	}

	/* Add the base address of the element. */
	return LLVMBuildAdd(gallivm->builder, base_addr,
			    lp_build_const_int32(gallivm, param * 4), "");
}
677
678 /* The offchip buffer layout for TCS->TES is
679 *
680 * - attribute 0 of patch 0 vertex 0
681 * - attribute 0 of patch 0 vertex 1
682 * - attribute 0 of patch 0 vertex 2
683 * ...
684 * - attribute 0 of patch 1 vertex 0
685 * - attribute 0 of patch 1 vertex 1
686 * ...
687 * - attribute 1 of patch 0 vertex 0
688 * - attribute 1 of patch 0 vertex 1
689 * ...
690 * - per patch attribute 0 of patch 0
691 * - per patch attribute 0 of patch 1
692 * ...
693 *
694 * Note that every attribute has 4 components.
695 */
/* Compute a byte address into the TCS->TES off-chip buffer (layout
 * documented in the comment above). If vertex_index is NULL, the
 * address refers to a per-patch attribute instead of a per-vertex one.
 */
static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
					       LLVMValueRef vertex_index,
					       LLVMValueRef param_index)
{
	struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
	LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
	LLVMValueRef param_stride, constant16;

	/* Both counts are packed into SI_PARAM_TCS_OFFCHIP_LAYOUT. */
	vertices_per_patch = unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 9, 6);
	num_patches = unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 0, 9);
	total_vertices = LLVMBuildMul(gallivm->builder, vertices_per_patch,
				      num_patches, "");

	constant16 = lp_build_const_int32(gallivm, 16);
	if (vertex_index) {
		/* Per-vertex: element = RelPatchID * verts_per_patch
		 *                       + vertex_index,
		 * and attributes are strided by the total vertex count. */
		base_addr = LLVMBuildMul(gallivm->builder, get_rel_patch_id(ctx),
					 vertices_per_patch, "");

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 vertex_index, "");

		param_stride = total_vertices;
	} else {
		/* Per-patch: one element per patch. */
		base_addr = get_rel_patch_id(ctx);
		param_stride = num_patches;
	}

	base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
				 LLVMBuildMul(gallivm->builder, param_index,
					      param_stride, ""), "");

	/* Every attribute is a vec4 = 16 bytes. */
	base_addr = LLVMBuildMul(gallivm->builder, base_addr, constant16, "");

	if (!vertex_index) {
		/* Per-patch attributes start after all per-vertex data. */
		LLVMValueRef patch_data_offset =
			unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 16, 16);

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 patch_data_offset, "");
	}
	return base_addr;
}
738
/* Build the off-chip buffer address (see get_tcs_tes_buffer_address)
 * for a TGSI source or destination register; exactly one of dst/src
 * must be non-NULL.
 */
static LLVMValueRef get_tcs_tes_buffer_address_from_reg(
			struct si_shader_context *ctx,
			const struct tgsi_full_dst_register *dst,
			const struct tgsi_full_src_register *src)
{
	struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	ubyte *name, *index, *array_first;
	struct tgsi_full_src_register reg;
	LLVMValueRef vertex_index = NULL;
	LLVMValueRef param_index = NULL;
	unsigned param_index_base, param_base;

	/* Normalize to a source register; the address math is identical. */
	reg = src ? *src : tgsi_full_src_register_from_dst(dst);

	/* A 2-dimensional register selects a vertex within the patch. */
	if (reg.Register.Dimension) {

		if (reg.Dimension.Indirect)
			vertex_index = get_indirect_index(ctx, &reg.DimIndirect,
							  reg.Dimension.Index);
		else
			vertex_index = lp_build_const_int32(gallivm,
							    reg.Dimension.Index);
	}

	/* Get information about the register. */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return NULL;
	}

	if (reg.Register.Indirect) {
		/* The semantic lookup must use the array's first register. */
		if (reg.Indirect.ArrayID)
			param_base = array_first[reg.Indirect.ArrayID];
		else
			param_base = reg.Register.Index;

		param_index = get_indirect_index(ctx, &reg.Indirect,
						 reg.Register.Index - param_base);

	} else {
		param_base = reg.Register.Index;
		param_index = lp_build_const_int32(gallivm, 0);
	}

	/* Translate the semantic into the attribute slot number. */
	param_index_base = si_shader_io_get_unique_index(name[param_base],
							 index[param_base]);

	param_index = LLVMBuildAdd(gallivm->builder, param_index,
				   lp_build_const_int32(gallivm, param_index_base),
				   "");

	return get_tcs_tes_buffer_address(ctx, vertex_index, param_index);
}
801
802 /* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4.
803 * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2),
804 * or v4i32 (num_channels=3,4). */
805 static void build_tbuffer_store(struct si_shader_context *ctx,
806 LLVMValueRef rsrc,
807 LLVMValueRef vdata,
808 unsigned num_channels,
809 LLVMValueRef vaddr,
810 LLVMValueRef soffset,
811 unsigned inst_offset,
812 unsigned dfmt,
813 unsigned nfmt,
814 unsigned offen,
815 unsigned idxen,
816 unsigned glc,
817 unsigned slc,
818 unsigned tfe)
819 {
820 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
821 LLVMValueRef args[] = {
822 rsrc,
823 vdata,
824 LLVMConstInt(ctx->i32, num_channels, 0),
825 vaddr,
826 soffset,
827 LLVMConstInt(ctx->i32, inst_offset, 0),
828 LLVMConstInt(ctx->i32, dfmt, 0),
829 LLVMConstInt(ctx->i32, nfmt, 0),
830 LLVMConstInt(ctx->i32, offen, 0),
831 LLVMConstInt(ctx->i32, idxen, 0),
832 LLVMConstInt(ctx->i32, glc, 0),
833 LLVMConstInt(ctx->i32, slc, 0),
834 LLVMConstInt(ctx->i32, tfe, 0)
835 };
836
837 /* The instruction offset field has 12 bits */
838 assert(offen || inst_offset < (1 << 12));
839
840 /* The intrinsic is overloaded, we need to add a type suffix for overloading to work. */
841 unsigned func = CLAMP(num_channels, 1, 3) - 1;
842 const char *types[] = {"i32", "v2i32", "v4i32"};
843 char name[256];
844 snprintf(name, sizeof(name), "llvm.SI.tbuffer.store.%s", types[func]);
845
846 lp_build_intrinsic(gallivm->builder, name, ctx->voidt,
847 args, ARRAY_SIZE(args), 0);
848 }
849
850 static void build_tbuffer_store_dwords(struct si_shader_context *ctx,
851 LLVMValueRef rsrc,
852 LLVMValueRef vdata,
853 unsigned num_channels,
854 LLVMValueRef vaddr,
855 LLVMValueRef soffset,
856 unsigned inst_offset)
857 {
858 static unsigned dfmt[] = {
859 V_008F0C_BUF_DATA_FORMAT_32,
860 V_008F0C_BUF_DATA_FORMAT_32_32,
861 V_008F0C_BUF_DATA_FORMAT_32_32_32,
862 V_008F0C_BUF_DATA_FORMAT_32_32_32_32
863 };
864 assert(num_channels >= 1 && num_channels <= 4);
865
866 build_tbuffer_store(ctx, rsrc, vdata, num_channels, vaddr, soffset,
867 inst_offset, dfmt[num_channels-1],
868 V_008F0C_BUF_NUM_FORMAT_UINT, 1, 0, 1, 1, 0);
869 }
870
/* Emit a 1-, 2- or 4-channel buffer load, returning f32/v2f32/v4f32
 * (LLVM >= 3.9, llvm.amdgcn.buffer.load) or i32/v2i32/v4i32 (older LLVM,
 * llvm.SI.buffer.load.dword). num_channels == 3 uses the 4-channel form.
 * vindex/voffset/soffset may each be NULL when unused.
 */
static LLVMValueRef build_buffer_load(struct si_shader_context *ctx,
				      LLVMValueRef rsrc,
				      int num_channels,
				      LLVMValueRef vindex,
				      LLVMValueRef voffset,
				      LLVMValueRef soffset,
				      unsigned inst_offset,
				      unsigned glc,
				      unsigned slc)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	/* 3 and 4 channels share the last overload. */
	unsigned func = CLAMP(num_channels, 1, 3) - 1;

	if (HAVE_LLVM >= 0x309) {
		LLVMValueRef args[] = {
			LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v4i32, ""),
			vindex ? vindex : LLVMConstInt(ctx->i32, 0, 0),
			LLVMConstInt(ctx->i32, inst_offset, 0),
			LLVMConstInt(ctx->i1, glc, 0),
			LLVMConstInt(ctx->i1, slc, 0)
		};

		LLVMTypeRef types[] = {ctx->f32, LLVMVectorType(ctx->f32, 2),
				       ctx->v4f32};
		const char *type_names[] = {"f32", "v2f32", "v4f32"};
		char name[256];

		/* The new intrinsic takes a single combined offset
		 * (args[2]); fold voffset and soffset into it. */
		if (voffset) {
			args[2] = LLVMBuildAdd(gallivm->builder, args[2], voffset,
					       "");
		}

		if (soffset) {
			args[2] = LLVMBuildAdd(gallivm->builder, args[2], soffset,
					       "");
		}

		snprintf(name, sizeof(name), "llvm.amdgcn.buffer.load.%s",
			 type_names[func]);

		return lp_build_intrinsic(gallivm->builder, name, types[func], args,
					  ARRAY_SIZE(args), LLVMReadOnlyAttribute |
					  LLVMNoUnwindAttribute);
	} else {
		LLVMValueRef args[] = {
			LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v16i8, ""),
			voffset ? voffset : vindex,
			soffset,
			LLVMConstInt(ctx->i32, inst_offset, 0),
			LLVMConstInt(ctx->i32, voffset ? 1 : 0, 0), // offen
			LLVMConstInt(ctx->i32, vindex ? 1 : 0, 0), //idxen
			LLVMConstInt(ctx->i32, glc, 0),
			LLVMConstInt(ctx->i32, slc, 0),
			LLVMConstInt(ctx->i32, 0, 0), // TFE
		};

		LLVMTypeRef types[] = {ctx->i32, LLVMVectorType(ctx->i32, 2),
				       ctx->v4i32};
		const char *type_names[] = {"i32", "v2i32", "v4i32"};
		const char *arg_type = "i32";
		char name[256];

		/* When both an index and an offset are present, the old
		 * intrinsic takes them as a combined v2i32 vaddr. */
		if (voffset && vindex) {
			LLVMValueRef vaddr[] = {vindex, voffset};

			arg_type = "v2i32";
			args[1] = lp_build_gather_values(gallivm, vaddr, 2);
		}

		snprintf(name, sizeof(name), "llvm.SI.buffer.load.dword.%s.%s",
			 type_names[func], arg_type);

		return lp_build_intrinsic(gallivm->builder, name, types[func], args,
					  ARRAY_SIZE(args), LLVMReadOnlyAttribute |
					  LLVMNoUnwindAttribute);
	}
}
948
949 static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base,
950 enum tgsi_opcode_type type, unsigned swizzle,
951 LLVMValueRef buffer, LLVMValueRef offset,
952 LLVMValueRef base)
953 {
954 struct si_shader_context *ctx = si_shader_context(bld_base);
955 struct gallivm_state *gallivm = bld_base->base.gallivm;
956 LLVMValueRef value, value2;
957 LLVMTypeRef llvm_type = tgsi2llvmtype(bld_base, type);
958 LLVMTypeRef vec_type = LLVMVectorType(llvm_type, 4);
959
960 if (swizzle == ~0) {
961 value = build_buffer_load(ctx, buffer, 4, NULL, base, offset,
962 0, 1, 0);
963
964 return LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
965 }
966
967 if (!tgsi_type_is_64bit(type)) {
968 value = build_buffer_load(ctx, buffer, 4, NULL, base, offset,
969 0, 1, 0);
970
971 value = LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
972 return LLVMBuildExtractElement(gallivm->builder, value,
973 lp_build_const_int32(gallivm, swizzle), "");
974 }
975
976 value = build_buffer_load(ctx, buffer, 1, NULL, base, offset,
977 swizzle * 4, 1, 0);
978
979 value2 = build_buffer_load(ctx, buffer, 1, NULL, base, offset,
980 swizzle * 4 + 4, 1, 0);
981
982 return radeon_llvm_emit_fetch_64bit(bld_base, type, value, value2);
983 }
984
985 /**
986 * Load from LDS.
987 *
988 * \param type output value type
989 * \param swizzle offset (typically 0..3); it can be ~0, which loads a vec4
990 * \param dw_addr address in dwords
991 */
992 static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
993 enum tgsi_opcode_type type, unsigned swizzle,
994 LLVMValueRef dw_addr)
995 {
996 struct si_shader_context *ctx = si_shader_context(bld_base);
997 struct gallivm_state *gallivm = bld_base->base.gallivm;
998 LLVMValueRef value;
999
1000 if (swizzle == ~0) {
1001 LLVMValueRef values[TGSI_NUM_CHANNELS];
1002
1003 for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++)
1004 values[chan] = lds_load(bld_base, type, chan, dw_addr);
1005
1006 return lp_build_gather_values(bld_base->base.gallivm, values,
1007 TGSI_NUM_CHANNELS);
1008 }
1009
1010 dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
1011 lp_build_const_int32(gallivm, swizzle));
1012
1013 value = build_indexed_load(ctx, ctx->lds, dw_addr, false);
1014 if (tgsi_type_is_64bit(type)) {
1015 LLVMValueRef value2;
1016 dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
1017 lp_build_const_int32(gallivm, swizzle + 1));
1018 value2 = build_indexed_load(ctx, ctx->lds, dw_addr, false);
1019 return radeon_llvm_emit_fetch_64bit(bld_base, type, value, value2);
1020 }
1021
1022 return LLVMBuildBitCast(gallivm->builder, value,
1023 tgsi2llvmtype(bld_base, type), "");
1024 }
1025
1026 /**
1027 * Store to LDS.
1028 *
1029 * \param swizzle offset (typically 0..3)
1030 * \param dw_addr address in dwords
1031 * \param value value to store
1032 */
1033 static void lds_store(struct lp_build_tgsi_context *bld_base,
1034 unsigned swizzle, LLVMValueRef dw_addr,
1035 LLVMValueRef value)
1036 {
1037 struct si_shader_context *ctx = si_shader_context(bld_base);
1038 struct gallivm_state *gallivm = bld_base->base.gallivm;
1039
1040 dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
1041 lp_build_const_int32(gallivm, swizzle));
1042
1043 value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
1044 build_indexed_store(ctx, ctx->lds,
1045 dw_addr, value);
1046 }
1047
1048 static LLVMValueRef fetch_input_tcs(
1049 struct lp_build_tgsi_context *bld_base,
1050 const struct tgsi_full_src_register *reg,
1051 enum tgsi_opcode_type type, unsigned swizzle)
1052 {
1053 struct si_shader_context *ctx = si_shader_context(bld_base);
1054 LLVMValueRef dw_addr, stride;
1055
1056 stride = unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 13, 8);
1057 dw_addr = get_tcs_in_current_patch_offset(ctx);
1058 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
1059
1060 return lds_load(bld_base, type, swizzle, dw_addr);
1061 }
1062
1063 static LLVMValueRef fetch_output_tcs(
1064 struct lp_build_tgsi_context *bld_base,
1065 const struct tgsi_full_src_register *reg,
1066 enum tgsi_opcode_type type, unsigned swizzle)
1067 {
1068 struct si_shader_context *ctx = si_shader_context(bld_base);
1069 LLVMValueRef dw_addr, stride;
1070
1071 if (reg->Register.Dimension) {
1072 stride = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
1073 dw_addr = get_tcs_out_current_patch_offset(ctx);
1074 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
1075 } else {
1076 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
1077 dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr);
1078 }
1079
1080 return lds_load(bld_base, type, swizzle, dw_addr);
1081 }
1082
1083 static LLVMValueRef fetch_input_tes(
1084 struct lp_build_tgsi_context *bld_base,
1085 const struct tgsi_full_src_register *reg,
1086 enum tgsi_opcode_type type, unsigned swizzle)
1087 {
1088 struct si_shader_context *ctx = si_shader_context(bld_base);
1089 struct gallivm_state *gallivm = bld_base->base.gallivm;
1090 LLVMValueRef rw_buffers, buffer, base, addr;
1091
1092 rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
1093 SI_PARAM_RW_BUFFERS);
1094 buffer = build_indexed_load_const(ctx, rw_buffers,
1095 lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));
1096
1097 base = LLVMGetParam(ctx->radeon_bld.main_fn, ctx->param_oc_lds);
1098 addr = get_tcs_tes_buffer_address_from_reg(ctx, NULL, reg);
1099
1100 return buffer_load(bld_base, type, swizzle, buffer, base, addr);
1101 }
1102
/* Store a TCS output both to LDS (so the TCS itself can read it back)
 * and to the off-chip tessellation ring (where the TES reads it).
 */
static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
			     const struct tgsi_full_instruction *inst,
			     const struct tgsi_opcode_info *info,
			     LLVMValueRef dst[4])
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_full_dst_register *reg = &inst->Dst[0];
	unsigned chan_index;
	LLVMValueRef dw_addr, stride;
	LLVMValueRef rw_buffers, buffer, base, buf_addr;
	LLVMValueRef values[4];

	/* Only handle per-patch and per-vertex outputs here.
	 * Vectors will be lowered to scalars and this function will be called again.
	 */
	if (reg->Register.File != TGSI_FILE_OUTPUT ||
	    (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) {
		radeon_llvm_emit_store(bld_base, inst, info, dst);
		return;
	}

	/* Compute the LDS address of the output. */
	if (reg->Register.Dimension) {
		/* Per-vertex output: vertex stride from TCS_OUT_LAYOUT bits [20:13]. */
		stride = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
		dw_addr = get_tcs_out_current_patch_offset(ctx);
		dw_addr = get_dw_address(ctx, reg, NULL, stride, dw_addr);
	} else {
		/* Per-patch output: no vertex stride. */
		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
		dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr);
	}

	/* Get the off-chip tess ring descriptor and the buffer address. */
	rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
				  SI_PARAM_RW_BUFFERS);
	buffer = build_indexed_load_const(ctx, rw_buffers,
			lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));

	base = LLVMGetParam(ctx->radeon_bld.main_fn, ctx->param_oc_lds);
	buf_addr = get_tcs_tes_buffer_address_from_reg(ctx, reg, NULL);


	TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan_index) {
		LLVMValueRef value = dst[chan_index];

		if (inst->Instruction.Saturate)
			value = radeon_llvm_saturate(bld_base, value);

		/* Always store to LDS. */
		lds_store(bld_base, chan_index, dw_addr, value);

		value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
		values[chan_index] = value;

		/* Partial writemask: store each enabled dword separately. */
		if (inst->Dst[0].Register.WriteMask != 0xF) {
			build_tbuffer_store_dwords(ctx, buffer, value, 1,
						   buf_addr, base,
						   4 * chan_index);
		}
	}

	/* Full writemask: one combined 4-dword store instead of four. */
	if (inst->Dst[0].Register.WriteMask == 0xF) {
		LLVMValueRef value = lp_build_gather_values(bld_base->base.gallivm,
							    values, 4);
		build_tbuffer_store_dwords(ctx, buffer, value, 4, buf_addr,
					   base, 0);
	}
}
1168
/* Fetch a GS input: load the corresponding ES output from the ESGS ring. */
static LLVMValueRef fetch_input_gs(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type,
	unsigned swizzle)
{
	struct lp_build_context *base = &bld_base->base;
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *uint =	&ctx->radeon_bld.soa.bld_base.uint_bld;
	struct gallivm_state *gallivm = base->gallivm;
	LLVMValueRef vtx_offset;
	LLVMValueRef args[9];
	unsigned vtx_offset_param;
	struct tgsi_shader_info *info = &shader->selector->info;
	unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
	unsigned semantic_index = info->input_semantic_index[reg->Register.Index];
	unsigned param;
	LLVMValueRef value;

	/* PRIMID doesn't come from the ring; it has its own source. */
	if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
		return get_primitive_id(bld_base, swizzle);

	if (!reg->Register.Dimension)
		return NULL;

	/* ~0 requests a whole vec4: fetch each channel and gather. */
	if (swizzle == ~0) {
		LLVMValueRef values[TGSI_NUM_CHANNELS];
		unsigned chan;
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			values[chan] = fetch_input_gs(bld_base, reg, type, chan);
		}
		return lp_build_gather_values(bld_base->base.gallivm, values,
					      TGSI_NUM_CHANNELS);
	}

	/* Get the vertex offset parameter */
	vtx_offset_param = reg->Dimension.Index;
	if (vtx_offset_param < 2) {
		vtx_offset_param += SI_PARAM_VTX0_OFFSET;
	} else {
		assert(vtx_offset_param < 6);
		vtx_offset_param += SI_PARAM_VTX2_OFFSET - 2;
	}
	/* The offsets are given in dwords; convert to bytes. */
	vtx_offset = lp_build_mul_imm(uint,
				      LLVMGetParam(ctx->radeon_bld.main_fn,
						   vtx_offset_param),
				      4);

	param = si_shader_io_get_unique_index(semantic_name, semantic_index);
	args[0] = ctx->esgs_ring;
	args[1] = vtx_offset;
	args[2] = lp_build_const_int32(gallivm, (param * 4 + swizzle) * 256);
	args[3] = uint->zero;
	args[4] = uint->one; /* OFFEN */
	args[5] = uint->zero; /* IDXEN */
	args[6] = uint->one; /* GLC */
	args[7] = uint->zero; /* SLC */
	args[8] = uint->zero; /* TFE */

	value = lp_build_intrinsic(gallivm->builder,
				   "llvm.SI.buffer.load.dword.i32.i32",
				   ctx->i32, args, 9,
				   LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
	if (tgsi_type_is_64bit(type)) {
		LLVMValueRef value2;
		/* Load the second dword of the 64-bit value from the
		 * next dword slot (only args[2] needs updating).
		 */
		args[2] = lp_build_const_int32(gallivm, (param * 4 + swizzle + 1) * 256);
		value2 = lp_build_intrinsic(gallivm->builder,
					    "llvm.SI.buffer.load.dword.i32.i32",
					    ctx->i32, args, 9,
					    LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
		return radeon_llvm_emit_fetch_64bit(bld_base, type,
						    value, value2);
	}
	return LLVMBuildBitCast(gallivm->builder,
				value,
				tgsi2llvmtype(bld_base, type), "");
}
1247
1248 static int lookup_interp_param_index(unsigned interpolate, unsigned location)
1249 {
1250 switch (interpolate) {
1251 case TGSI_INTERPOLATE_CONSTANT:
1252 return 0;
1253
1254 case TGSI_INTERPOLATE_LINEAR:
1255 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1256 return SI_PARAM_LINEAR_SAMPLE;
1257 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1258 return SI_PARAM_LINEAR_CENTROID;
1259 else
1260 return SI_PARAM_LINEAR_CENTER;
1261 break;
1262 case TGSI_INTERPOLATE_COLOR:
1263 case TGSI_INTERPOLATE_PERSPECTIVE:
1264 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1265 return SI_PARAM_PERSP_SAMPLE;
1266 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1267 return SI_PARAM_PERSP_CENTROID;
1268 else
1269 return SI_PARAM_PERSP_CENTER;
1270 break;
1271 default:
1272 fprintf(stderr, "Warning: Unhandled interpolation mode.\n");
1273 return -1;
1274 }
1275 }
1276
1277 /* This shouldn't be used by explicit INTERP opcodes. */
1278 static unsigned select_interp_param(struct si_shader_context *ctx,
1279 unsigned param)
1280 {
1281 if (!ctx->is_monolithic)
1282 return param;
1283
1284 /* If the shader doesn't use center/centroid, just return the parameter.
1285 *
1286 * If the shader only uses one set of (i,j), "si_emit_spi_ps_input" can
1287 * switch between center/centroid and sample without shader changes.
1288 */
1289 if (ctx->shader->key.ps.prolog.force_persp_sample_interp) {
1290 switch (param) {
1291 case SI_PARAM_PERSP_CENTROID:
1292 case SI_PARAM_PERSP_CENTER:
1293 return SI_PARAM_PERSP_SAMPLE;
1294 }
1295 }
1296 if (ctx->shader->key.ps.prolog.force_linear_sample_interp) {
1297 switch (param) {
1298 case SI_PARAM_LINEAR_CENTROID:
1299 case SI_PARAM_LINEAR_CENTER:
1300 return SI_PARAM_LINEAR_SAMPLE;
1301 }
1302 }
1303
1304 return param;
1305 }
1306
1307 /**
1308 * Interpolate a fragment shader input.
1309 *
1310 * @param ctx context
1311 * @param input_index index of the input in hardware
1312 * @param semantic_name TGSI_SEMANTIC_*
1313 * @param semantic_index semantic index
1314 * @param num_interp_inputs number of all interpolated inputs (= BCOLOR offset)
1315 * @param colors_read_mask color components read (4 bits for each color, 8 bits in total)
1316 * @param interp_param interpolation weights (i,j)
1317 * @param prim_mask SI_PARAM_PRIM_MASK
1318 * @param face SI_PARAM_FRONT_FACE
1319 * @param result the return value (4 components)
1320 */
1321 static void interp_fs_input(struct si_shader_context *ctx,
1322 unsigned input_index,
1323 unsigned semantic_name,
1324 unsigned semantic_index,
1325 unsigned num_interp_inputs,
1326 unsigned colors_read_mask,
1327 LLVMValueRef interp_param,
1328 LLVMValueRef prim_mask,
1329 LLVMValueRef face,
1330 LLVMValueRef result[4])
1331 {
1332 struct lp_build_context *base = &ctx->radeon_bld.soa.bld_base.base;
1333 struct lp_build_context *uint = &ctx->radeon_bld.soa.bld_base.uint_bld;
1334 struct gallivm_state *gallivm = base->gallivm;
1335 const char *intr_name;
1336 LLVMValueRef attr_number;
1337
1338 unsigned chan;
1339
1340 attr_number = lp_build_const_int32(gallivm, input_index);
1341
1342 /* fs.constant returns the param from the middle vertex, so it's not
1343 * really useful for flat shading. It's meant to be used for custom
1344 * interpolation (but the intrinsic can't fetch from the other two
1345 * vertices).
1346 *
1347 * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
1348 * to do the right thing. The only reason we use fs.constant is that
1349 * fs.interp cannot be used on integers, because they can be equal
1350 * to NaN.
1351 */
1352 intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant";
1353
1354 if (semantic_name == TGSI_SEMANTIC_COLOR &&
1355 ctx->shader->key.ps.prolog.color_two_side) {
1356 LLVMValueRef args[4];
1357 LLVMValueRef is_face_positive;
1358 LLVMValueRef back_attr_number;
1359
1360 /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
1361 * otherwise it's at offset "num_inputs".
1362 */
1363 unsigned back_attr_offset = num_interp_inputs;
1364 if (semantic_index == 1 && colors_read_mask & 0xf)
1365 back_attr_offset += 1;
1366
1367 back_attr_number = lp_build_const_int32(gallivm, back_attr_offset);
1368
1369 is_face_positive = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
1370 face, uint->zero, "");
1371
1372 args[2] = prim_mask;
1373 args[3] = interp_param;
1374 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1375 LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
1376 LLVMValueRef front, back;
1377
1378 args[0] = llvm_chan;
1379 args[1] = attr_number;
1380 front = lp_build_intrinsic(gallivm->builder, intr_name,
1381 ctx->f32, args, args[3] ? 4 : 3,
1382 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
1383
1384 args[1] = back_attr_number;
1385 back = lp_build_intrinsic(gallivm->builder, intr_name,
1386 ctx->f32, args, args[3] ? 4 : 3,
1387 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
1388
1389 result[chan] = LLVMBuildSelect(gallivm->builder,
1390 is_face_positive,
1391 front,
1392 back,
1393 "");
1394 }
1395 } else if (semantic_name == TGSI_SEMANTIC_FOG) {
1396 LLVMValueRef args[4];
1397
1398 args[0] = uint->zero;
1399 args[1] = attr_number;
1400 args[2] = prim_mask;
1401 args[3] = interp_param;
1402 result[0] = lp_build_intrinsic(gallivm->builder, intr_name,
1403 ctx->f32, args, args[3] ? 4 : 3,
1404 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
1405 result[1] =
1406 result[2] = lp_build_const_float(gallivm, 0.0f);
1407 result[3] = lp_build_const_float(gallivm, 1.0f);
1408 } else {
1409 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1410 LLVMValueRef args[4];
1411 LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
1412
1413 args[0] = llvm_chan;
1414 args[1] = attr_number;
1415 args[2] = prim_mask;
1416 args[3] = interp_param;
1417 result[chan] = lp_build_intrinsic(gallivm->builder, intr_name,
1418 ctx->f32, args, args[3] ? 4 : 3,
1419 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
1420 }
1421 }
1422 }
1423
/* Declare a fragment shader input: either take the color values already
 * computed by the PS prolog from input VGPRs, or emit interpolation code.
 */
static void declare_input_fs(
	struct radeon_llvm_context *radeon_bld,
	unsigned input_index,
	const struct tgsi_full_declaration *decl)
{
	struct lp_build_context *base = &radeon_bld->soa.bld_base.base;
	struct si_shader_context *ctx =
		si_shader_context(&radeon_bld->soa.bld_base);
	struct si_shader *shader = ctx->shader;
	LLVMValueRef main_fn = radeon_bld->main_fn;
	LLVMValueRef interp_param = NULL;
	int interp_param_idx;

	/* Get colors from input VGPRs (set by the prolog). */
	if (!ctx->is_monolithic &&
	    decl->Semantic.Name == TGSI_SEMANTIC_COLOR) {
		unsigned i = decl->Semantic.Index;
		unsigned colors_read = shader->selector->info.colors_read;
		/* The VGPRs hold only the components that are actually read;
		 * COLOR1's components start after COLOR0's.
		 */
		unsigned mask = colors_read >> (i * 4);
		unsigned offset = SI_PARAM_POS_FIXED_PT + 1 +
				  (i ? util_bitcount(colors_read & 0xf) : 0);

		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)] =
			mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef;
		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 1)] =
			mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef;
		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 2)] =
			mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef;
		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 3)] =
			mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef;
		return;
	}

	/* Pick the (i,j) input for the interpolation mode; 0 means flat
	 * (no weights), -1 means an unhandled mode.
	 */
	interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate,
						     decl->Interp.Location);
	if (interp_param_idx == -1)
		return;
	else if (interp_param_idx) {
		interp_param_idx = select_interp_param(ctx,
						       interp_param_idx);
		interp_param = LLVMGetParam(main_fn, interp_param_idx);
	}

	interp_fs_input(ctx, input_index, decl->Semantic.Name,
			decl->Semantic.Index, shader->selector->info.num_inputs,
			shader->selector->info.colors_read, interp_param,
			LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK),
			LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE),
			&radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)]);
}
1474
1475 static LLVMValueRef get_sample_id(struct radeon_llvm_context *radeon_bld)
1476 {
1477 return unpack_param(si_shader_context(&radeon_bld->soa.bld_base),
1478 SI_PARAM_ANCILLARY, 8, 4);
1479 }
1480
1481 /**
1482 * Set range metadata on an instruction. This can only be used on load and
1483 * call instructions. If you know an instruction can only produce the values
1484 * 0, 1, 2, you would do set_range_metadata(value, 0, 3);
1485 * \p lo is the minimum value inclusive.
1486 * \p hi is the maximum value exclusive.
1487 */
1488 static void set_range_metadata(LLVMValueRef value, unsigned lo, unsigned hi)
1489 {
1490 const char *range_md_string = "range";
1491 LLVMValueRef range_md, md_args[2];
1492 LLVMTypeRef type = LLVMTypeOf(value);
1493 LLVMContextRef context = LLVMGetTypeContext(type);
1494 unsigned md_range_id = LLVMGetMDKindIDInContext(context,
1495 range_md_string, strlen(range_md_string));
1496
1497 md_args[0] = LLVMConstInt(type, lo, false);
1498 md_args[1] = LLVMConstInt(type, hi, false);
1499 range_md = LLVMMDNodeInContext(context, md_args, 2);
1500 LLVMSetMetadata(value, md_range_id, range_md);
1501 }
1502
1503 static LLVMValueRef get_thread_id(struct si_shader_context *ctx)
1504 {
1505 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
1506 LLVMValueRef tid;
1507
1508 if (HAVE_LLVM < 0x0308) {
1509 tid = lp_build_intrinsic(gallivm->builder, "llvm.SI.tid",
1510 ctx->i32, NULL, 0, LLVMReadNoneAttribute);
1511 } else {
1512 LLVMValueRef tid_args[2];
1513 tid_args[0] = lp_build_const_int32(gallivm, 0xffffffff);
1514 tid_args[1] = lp_build_const_int32(gallivm, 0);
1515 tid_args[1] = lp_build_intrinsic(gallivm->builder,
1516 "llvm.amdgcn.mbcnt.lo", ctx->i32,
1517 tid_args, 2, LLVMReadNoneAttribute);
1518
1519 tid = lp_build_intrinsic(gallivm->builder,
1520 "llvm.amdgcn.mbcnt.hi", ctx->i32,
1521 tid_args, 2, LLVMReadNoneAttribute);
1522 }
1523 set_range_metadata(tid, 0, 64);
1524 return tid;
1525 }
1526
1527 /**
1528 * Load a dword from a constant buffer.
1529 */
1530 static LLVMValueRef buffer_load_const(LLVMBuilderRef builder, LLVMValueRef resource,
1531 LLVMValueRef offset, LLVMTypeRef return_type)
1532 {
1533 LLVMValueRef args[2] = {resource, offset};
1534
1535 return lp_build_intrinsic(builder, "llvm.SI.load.const", return_type, args, 2,
1536 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
1537 }
1538
/* Load the xy position of the given sample from the
 * SI_PS_CONST_SAMPLE_POSITIONS constant buffer. Returns (x, y, 0, 0).
 */
static LLVMValueRef load_sample_position(struct radeon_llvm_context *radeon_bld, LLVMValueRef sample_id)
{
	struct si_shader_context *ctx =
		si_shader_context(&radeon_bld->soa.bld_base);
	struct lp_build_context *uint_bld = &radeon_bld->soa.bld_base.uint_bld;
	struct gallivm_state *gallivm = &radeon_bld->gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	/* Get the descriptor of the sample-positions constant buffer. */
	LLVMValueRef desc = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
	LLVMValueRef buf_index = lp_build_const_int32(gallivm, SI_PS_CONST_SAMPLE_POSITIONS);
	LLVMValueRef resource = build_indexed_load_const(ctx, desc, buf_index);

	/* offset = sample_id * 8  (8 = 2 floats containing samplepos.xy) */
	LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, sample_id, 8);
	LLVMValueRef offset1 = LLVMBuildAdd(builder, offset0, lp_build_const_int32(gallivm, 4), "");

	/* zw are padded with zero. */
	LLVMValueRef pos[4] = {
		buffer_load_const(builder, resource, offset0, ctx->f32),
		buffer_load_const(builder, resource, offset1, ctx->f32),
		lp_build_const_float(gallivm, 0),
		lp_build_const_float(gallivm, 0)
	};

	return lp_build_gather_values(gallivm, pos, 4);
}
1563
/* Declare a TGSI system value and store the LLVM value that provides it
 * into radeon_bld->system_values[index]. Unhandled semantics assert.
 */
static void declare_system_value(
	struct radeon_llvm_context *radeon_bld,
	unsigned index,
	const struct tgsi_full_declaration *decl)
{
	struct si_shader_context *ctx =
		si_shader_context(&radeon_bld->soa.bld_base);
	struct lp_build_context *bld = &radeon_bld->soa.bld_base.base;
	struct gallivm_state *gallivm = &radeon_bld->gallivm;
	LLVMValueRef value = 0;

	switch (decl->Semantic.Name) {
	case TGSI_SEMANTIC_INSTANCEID:
		value = LLVMGetParam(radeon_bld->main_fn,
				     ctx->param_instance_id);
		break;

	case TGSI_SEMANTIC_VERTEXID:
		/* VERTEXID includes the base vertex of indexed draws. */
		value = LLVMBuildAdd(gallivm->builder,
				     LLVMGetParam(radeon_bld->main_fn,
						  ctx->param_vertex_id),
				     LLVMGetParam(radeon_bld->main_fn,
						  SI_PARAM_BASE_VERTEX), "");
		break;

	case TGSI_SEMANTIC_VERTEXID_NOBASE:
		value = LLVMGetParam(radeon_bld->main_fn,
				     ctx->param_vertex_id);
		break;

	case TGSI_SEMANTIC_BASEVERTEX:
		value = LLVMGetParam(radeon_bld->main_fn,
				     SI_PARAM_BASE_VERTEX);
		break;

	case TGSI_SEMANTIC_INVOCATIONID:
		/* Only TCS (packed in REL_IDS) and GS provide this. */
		if (ctx->type == PIPE_SHADER_TESS_CTRL)
			value = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);
		else if (ctx->type == PIPE_SHADER_GEOMETRY)
			value = LLVMGetParam(radeon_bld->main_fn,
					     SI_PARAM_GS_INSTANCE_ID);
		else
			assert(!"INVOCATIONID not implemented");
		break;

	case TGSI_SEMANTIC_POSITION:
	{
		/* The W component is passed through RCP. */
		LLVMValueRef pos[4] = {
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_X_FLOAT),
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Y_FLOAT),
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Z_FLOAT),
			lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base, TGSI_OPCODE_RCP,
						 LLVMGetParam(radeon_bld->main_fn,
							      SI_PARAM_POS_W_FLOAT)),
		};
		value = lp_build_gather_values(gallivm, pos, 4);
		break;
	}

	case TGSI_SEMANTIC_FACE:
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_FRONT_FACE);
		break;

	case TGSI_SEMANTIC_SAMPLEID:
		value = get_sample_id(radeon_bld);
		break;

	case TGSI_SEMANTIC_SAMPLEPOS: {
		/* The sample position is the fractional part of the
		 * pixel position (FRC of x and y); zw are zero.
		 */
		LLVMValueRef pos[4] = {
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_X_FLOAT),
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Y_FLOAT),
			lp_build_const_float(gallivm, 0),
			lp_build_const_float(gallivm, 0)
		};
		pos[0] = lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base,
						  TGSI_OPCODE_FRC, pos[0]);
		pos[1] = lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base,
						  TGSI_OPCODE_FRC, pos[1]);
		value = lp_build_gather_values(gallivm, pos, 4);
		break;
	}

	case TGSI_SEMANTIC_SAMPLEMASK:
		/* This can only occur with the OpenGL Core profile, which
		 * doesn't support smoothing.
		 */
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_SAMPLE_COVERAGE);
		break;

	case TGSI_SEMANTIC_TESSCOORD:
	{
		LLVMValueRef coord[4] = {
			LLVMGetParam(radeon_bld->main_fn, ctx->param_tes_u),
			LLVMGetParam(radeon_bld->main_fn, ctx->param_tes_v),
			bld->zero,
			bld->zero
		};

		/* For triangles, the vector should be (u, v, 1-u-v). */
		if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
		    PIPE_PRIM_TRIANGLES)
			coord[2] = lp_build_sub(bld, bld->one,
						lp_build_add(bld, coord[0], coord[1]));

		value = lp_build_gather_values(gallivm, coord, 4);
		break;
	}

	case TGSI_SEMANTIC_VERTICESIN:
		/* Patch vertex count, packed in TCS_OUT_LAYOUT bits [31:26]. */
		value = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 26, 6);
		break;

	case TGSI_SEMANTIC_TESSINNER:
	case TGSI_SEMANTIC_TESSOUTER:
	{
		/* Tess levels are read back from the off-chip tess ring. */
		LLVMValueRef rw_buffers, buffer, base, addr;
		int param = si_shader_io_get_unique_index(decl->Semantic.Name, 0);

		rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
					  SI_PARAM_RW_BUFFERS);
		buffer = build_indexed_load_const(ctx, rw_buffers,
			lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));

		base = LLVMGetParam(ctx->radeon_bld.main_fn, ctx->param_oc_lds);
		addr = get_tcs_tes_buffer_address(ctx, NULL,
						  lp_build_const_int32(gallivm, param));

		value = buffer_load(&radeon_bld->soa.bld_base, TGSI_TYPE_FLOAT,
				    ~0, buffer, base, addr);

		break;
	}

	case TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI:
	case TGSI_SEMANTIC_DEFAULT_TESSINNER_SI:
	{
		/* Default tess levels come from a constant buffer:
		 * outer levels at dwords 0..3, inner levels at 4..7.
		 */
		LLVMValueRef buf, slot, val[4];
		int i, offset;

		slot = lp_build_const_int32(gallivm, SI_HS_CONST_DEFAULT_TESS_LEVELS);
		buf = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
		buf = build_indexed_load_const(ctx, buf, slot);
		offset = decl->Semantic.Name == TGSI_SEMANTIC_DEFAULT_TESSINNER_SI ? 4 : 0;

		for (i = 0; i < 4; i++)
			val[i] = buffer_load_const(gallivm->builder, buf,
						   lp_build_const_int32(gallivm, (offset + i) * 4),
						   ctx->f32);
		value = lp_build_gather_values(gallivm, val, 4);
		break;
	}

	case TGSI_SEMANTIC_PRIMID:
		value = get_primitive_id(&radeon_bld->soa.bld_base, 0);
		break;

	case TGSI_SEMANTIC_GRID_SIZE:
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_GRID_SIZE);
		break;

	case TGSI_SEMANTIC_BLOCK_SIZE:
	{
		/* The block size is a compile-time constant taken from the
		 * CS_FIXED_BLOCK_* shader properties.
		 */
		LLVMValueRef values[3];
		unsigned i;
		unsigned *properties = ctx->shader->selector->info.properties;
		unsigned sizes[3] = {
			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
		};

		for (i = 0; i < 3; ++i)
			values[i] = lp_build_const_int32(gallivm, sizes[i]);

		value = lp_build_gather_values(gallivm, values, 3);
		break;
	}

	case TGSI_SEMANTIC_BLOCK_ID:
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_BLOCK_ID);
		break;

	case TGSI_SEMANTIC_THREAD_ID:
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_THREAD_ID);
		break;

#if HAVE_LLVM >= 0x0309
	case TGSI_SEMANTIC_HELPER_INVOCATION:
		/* A helper invocation is an invocation that is NOT live. */
		value = lp_build_intrinsic(gallivm->builder,
					   "llvm.amdgcn.ps.live",
					   ctx->i1, NULL, 0,
					   LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
		value = LLVMBuildNot(gallivm->builder, value, "");
		value = LLVMBuildSExt(gallivm->builder, value, ctx->i32, "");
		break;
#endif

	default:
		assert(!"unknown system value");
		return;
	}

	radeon_bld->system_values[index] = value;
}
1768
1769 static void declare_compute_memory(struct radeon_llvm_context *radeon_bld,
1770 const struct tgsi_full_declaration *decl)
1771 {
1772 struct si_shader_context *ctx =
1773 si_shader_context(&radeon_bld->soa.bld_base);
1774 struct si_shader_selector *sel = ctx->shader->selector;
1775 struct gallivm_state *gallivm = &radeon_bld->gallivm;
1776
1777 LLVMTypeRef i8p = LLVMPointerType(ctx->i8, LOCAL_ADDR_SPACE);
1778 LLVMValueRef var;
1779
1780 assert(decl->Declaration.MemType == TGSI_MEMORY_TYPE_SHARED);
1781 assert(decl->Range.First == decl->Range.Last);
1782 assert(!ctx->shared_memory);
1783
1784 var = LLVMAddGlobalInAddressSpace(gallivm->module,
1785 LLVMArrayType(ctx->i8, sel->local_size),
1786 "compute_lds",
1787 LOCAL_ADDR_SPACE);
1788 LLVMSetAlignment(var, 4);
1789
1790 ctx->shared_memory = LLVMBuildBitCast(gallivm->builder, var, i8p, "");
1791 }
1792
1793 static LLVMValueRef fetch_constant(
1794 struct lp_build_tgsi_context *bld_base,
1795 const struct tgsi_full_src_register *reg,
1796 enum tgsi_opcode_type type,
1797 unsigned swizzle)
1798 {
1799 struct si_shader_context *ctx = si_shader_context(bld_base);
1800 struct lp_build_context *base = &bld_base->base;
1801 const struct tgsi_ind_register *ireg = &reg->Indirect;
1802 unsigned buf, idx;
1803
1804 LLVMValueRef addr, bufp;
1805 LLVMValueRef result;
1806
1807 if (swizzle == LP_CHAN_ALL) {
1808 unsigned chan;
1809 LLVMValueRef values[4];
1810 for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
1811 values[chan] = fetch_constant(bld_base, reg, type, chan);
1812
1813 return lp_build_gather_values(bld_base->base.gallivm, values, 4);
1814 }
1815
1816 buf = reg->Register.Dimension ? reg->Dimension.Index : 0;
1817 idx = reg->Register.Index * 4 + swizzle;
1818
1819 if (!reg->Register.Indirect && !reg->Dimension.Indirect) {
1820 if (!tgsi_type_is_64bit(type))
1821 return bitcast(bld_base, type, ctx->constants[buf][idx]);
1822 else {
1823 return radeon_llvm_emit_fetch_64bit(bld_base, type,
1824 ctx->constants[buf][idx],
1825 ctx->constants[buf][idx + 1]);
1826 }
1827 }
1828
1829 if (reg->Register.Dimension && reg->Dimension.Indirect) {
1830 LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
1831 LLVMValueRef index;
1832 index = get_bounded_indirect_index(ctx, &reg->DimIndirect,
1833 reg->Dimension.Index,
1834 SI_NUM_CONST_BUFFERS);
1835 bufp = build_indexed_load_const(ctx, ptr, index);
1836 } else
1837 bufp = ctx->const_buffers[buf];
1838
1839 addr = ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle];
1840 addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg");
1841 addr = lp_build_mul_imm(&bld_base->uint_bld, addr, 16);
1842 addr = lp_build_add(&bld_base->uint_bld, addr,
1843 lp_build_const_int32(base->gallivm, idx * 4));
1844
1845 result = buffer_load_const(base->gallivm->builder, bufp,
1846 addr, ctx->f32);
1847
1848 if (!tgsi_type_is_64bit(type))
1849 result = bitcast(bld_base, type, result);
1850 else {
1851 LLVMValueRef addr2, result2;
1852 addr2 = ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle + 1];
1853 addr2 = LLVMBuildLoad(base->gallivm->builder, addr2, "load addr reg2");
1854 addr2 = lp_build_mul_imm(&bld_base->uint_bld, addr2, 16);
1855 addr2 = lp_build_add(&bld_base->uint_bld, addr2,
1856 lp_build_const_int32(base->gallivm, idx * 4));
1857
1858 result2 = buffer_load_const(base->gallivm->builder, ctx->const_buffers[buf],
1859 addr2, ctx->f32);
1860
1861 result = radeon_llvm_emit_fetch_64bit(bld_base, type,
1862 result, result2);
1863 }
1864 return result;
1865 }
1866
1867 /* Upper 16 bits must be zero. */
1868 static LLVMValueRef si_llvm_pack_two_int16(struct gallivm_state *gallivm,
1869 LLVMValueRef val[2])
1870 {
1871 return LLVMBuildOr(gallivm->builder, val[0],
1872 LLVMBuildShl(gallivm->builder, val[1],
1873 lp_build_const_int32(gallivm, 16),
1874 ""), "");
1875 }
1876
1877 /* Upper 16 bits are ignored and will be dropped. */
1878 static LLVMValueRef si_llvm_pack_two_int32_as_int16(struct gallivm_state *gallivm,
1879 LLVMValueRef val[2])
1880 {
1881 LLVMValueRef v[2] = {
1882 LLVMBuildAnd(gallivm->builder, val[0],
1883 lp_build_const_int32(gallivm, 0xffff), ""),
1884 val[1],
1885 };
1886 return si_llvm_pack_two_int16(gallivm, v);
1887 }
1888
1889 /* Initialize arguments for the shader export intrinsic */
1890 static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
1891 LLVMValueRef *values,
1892 unsigned target,
1893 LLVMValueRef *args)
1894 {
1895 struct si_shader_context *ctx = si_shader_context(bld_base);
1896 struct lp_build_context *uint =
1897 &ctx->radeon_bld.soa.bld_base.uint_bld;
1898 struct lp_build_context *base = &bld_base->base;
1899 struct gallivm_state *gallivm = base->gallivm;
1900 LLVMBuilderRef builder = base->gallivm->builder;
1901 LLVMValueRef val[4];
1902 unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR;
1903 unsigned chan;
1904 bool is_int8;
1905
1906 /* Default is 0xf. Adjusted below depending on the format. */
1907 args[0] = lp_build_const_int32(base->gallivm, 0xf); /* writemask */
1908
1909 /* Specify whether the EXEC mask represents the valid mask */
1910 args[1] = uint->zero;
1911
1912 /* Specify whether this is the last export */
1913 args[2] = uint->zero;
1914
1915 /* Specify the target we are exporting */
1916 args[3] = lp_build_const_int32(base->gallivm, target);
1917
1918 if (ctx->type == PIPE_SHADER_FRAGMENT) {
1919 const union si_shader_key *key = &ctx->shader->key;
1920 unsigned col_formats = key->ps.epilog.spi_shader_col_format;
1921 int cbuf = target - V_008DFC_SQ_EXP_MRT;
1922
1923 assert(cbuf >= 0 && cbuf < 8);
1924 spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
1925 is_int8 = (key->ps.epilog.color_is_int8 >> cbuf) & 0x1;
1926 }
1927
1928 args[4] = uint->zero; /* COMPR flag */
1929 args[5] = base->undef;
1930 args[6] = base->undef;
1931 args[7] = base->undef;
1932 args[8] = base->undef;
1933
1934 switch (spi_shader_col_format) {
1935 case V_028714_SPI_SHADER_ZERO:
1936 args[0] = uint->zero; /* writemask */
1937 args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_NULL);
1938 break;
1939
1940 case V_028714_SPI_SHADER_32_R:
1941 args[0] = uint->one; /* writemask */
1942 args[5] = values[0];
1943 break;
1944
1945 case V_028714_SPI_SHADER_32_GR:
1946 args[0] = lp_build_const_int32(base->gallivm, 0x3); /* writemask */
1947 args[5] = values[0];
1948 args[6] = values[1];
1949 break;
1950
1951 case V_028714_SPI_SHADER_32_AR:
1952 args[0] = lp_build_const_int32(base->gallivm, 0x9); /* writemask */
1953 args[5] = values[0];
1954 args[8] = values[3];
1955 break;
1956
1957 case V_028714_SPI_SHADER_FP16_ABGR:
1958 args[4] = uint->one; /* COMPR flag */
1959
1960 for (chan = 0; chan < 2; chan++) {
1961 LLVMValueRef pack_args[2] = {
1962 values[2 * chan],
1963 values[2 * chan + 1]
1964 };
1965 LLVMValueRef packed;
1966
1967 packed = lp_build_intrinsic(base->gallivm->builder,
1968 "llvm.SI.packf16",
1969 ctx->i32, pack_args, 2,
1970 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
1971 args[chan + 5] =
1972 LLVMBuildBitCast(base->gallivm->builder,
1973 packed, ctx->f32, "");
1974 }
1975 break;
1976
1977 case V_028714_SPI_SHADER_UNORM16_ABGR:
1978 for (chan = 0; chan < 4; chan++) {
1979 val[chan] = radeon_llvm_saturate(bld_base, values[chan]);
1980 val[chan] = LLVMBuildFMul(builder, val[chan],
1981 lp_build_const_float(gallivm, 65535), "");
1982 val[chan] = LLVMBuildFAdd(builder, val[chan],
1983 lp_build_const_float(gallivm, 0.5), "");
1984 val[chan] = LLVMBuildFPToUI(builder, val[chan],
1985 ctx->i32, "");
1986 }
1987
1988 args[4] = uint->one; /* COMPR flag */
1989 args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1990 si_llvm_pack_two_int16(gallivm, val));
1991 args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1992 si_llvm_pack_two_int16(gallivm, val+2));
1993 break;
1994
1995 case V_028714_SPI_SHADER_SNORM16_ABGR:
1996 for (chan = 0; chan < 4; chan++) {
1997 /* Clamp between [-1, 1]. */
1998 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MIN,
1999 values[chan],
2000 lp_build_const_float(gallivm, 1));
2001 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MAX,
2002 val[chan],
2003 lp_build_const_float(gallivm, -1));
2004 /* Convert to a signed integer in [-32767, 32767]. */
2005 val[chan] = LLVMBuildFMul(builder, val[chan],
2006 lp_build_const_float(gallivm, 32767), "");
2007 /* If positive, add 0.5, else add -0.5. */
2008 val[chan] = LLVMBuildFAdd(builder, val[chan],
2009 LLVMBuildSelect(builder,
2010 LLVMBuildFCmp(builder, LLVMRealOGE,
2011 val[chan], base->zero, ""),
2012 lp_build_const_float(gallivm, 0.5),
2013 lp_build_const_float(gallivm, -0.5), ""), "");
2014 val[chan] = LLVMBuildFPToSI(builder, val[chan], ctx->i32, "");
2015 }
2016
2017 args[4] = uint->one; /* COMPR flag */
2018 args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2019 si_llvm_pack_two_int32_as_int16(gallivm, val));
2020 args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2021 si_llvm_pack_two_int32_as_int16(gallivm, val+2));
2022 break;
2023
2024 case V_028714_SPI_SHADER_UINT16_ABGR: {
2025 LLVMValueRef max = lp_build_const_int32(gallivm, is_int8 ?
2026 255 : 65535);
2027 /* Clamp. */
2028 for (chan = 0; chan < 4; chan++) {
2029 val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
2030 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_UMIN,
2031 val[chan], max);
2032 }
2033
2034 args[4] = uint->one; /* COMPR flag */
2035 args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2036 si_llvm_pack_two_int16(gallivm, val));
2037 args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2038 si_llvm_pack_two_int16(gallivm, val+2));
2039 break;
2040 }
2041
2042 case V_028714_SPI_SHADER_SINT16_ABGR: {
2043 LLVMValueRef max = lp_build_const_int32(gallivm, is_int8 ?
2044 127 : 32767);
2045 LLVMValueRef min = lp_build_const_int32(gallivm, is_int8 ?
2046 -128 : -32768);
2047 /* Clamp. */
2048 for (chan = 0; chan < 4; chan++) {
2049 val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
2050 val[chan] = lp_build_emit_llvm_binary(bld_base,
2051 TGSI_OPCODE_IMIN,
2052 val[chan], max);
2053 val[chan] = lp_build_emit_llvm_binary(bld_base,
2054 TGSI_OPCODE_IMAX,
2055 val[chan], min);
2056 }
2057
2058 args[4] = uint->one; /* COMPR flag */
2059 args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2060 si_llvm_pack_two_int32_as_int16(gallivm, val));
2061 args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2062 si_llvm_pack_two_int32_as_int16(gallivm, val+2));
2063 break;
2064 }
2065
2066 case V_028714_SPI_SHADER_32_ABGR:
2067 memcpy(&args[5], values, sizeof(values[0]) * 4);
2068 break;
2069 }
2070 }
2071
2072 static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
2073 LLVMValueRef alpha)
2074 {
2075 struct si_shader_context *ctx = si_shader_context(bld_base);
2076 struct gallivm_state *gallivm = bld_base->base.gallivm;
2077
2078 if (ctx->shader->key.ps.epilog.alpha_func != PIPE_FUNC_NEVER) {
2079 LLVMValueRef alpha_ref = LLVMGetParam(ctx->radeon_bld.main_fn,
2080 SI_PARAM_ALPHA_REF);
2081
2082 LLVMValueRef alpha_pass =
2083 lp_build_cmp(&bld_base->base,
2084 ctx->shader->key.ps.epilog.alpha_func,
2085 alpha, alpha_ref);
2086 LLVMValueRef arg =
2087 lp_build_select(&bld_base->base,
2088 alpha_pass,
2089 lp_build_const_float(gallivm, 1.0f),
2090 lp_build_const_float(gallivm, -1.0f));
2091
2092 lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
2093 ctx->voidt, &arg, 1, 0);
2094 } else {
2095 lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kilp",
2096 ctx->voidt, NULL, 0, 0);
2097 }
2098 }
2099
2100 static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
2101 LLVMValueRef alpha,
2102 unsigned samplemask_param)
2103 {
2104 struct si_shader_context *ctx = si_shader_context(bld_base);
2105 struct gallivm_state *gallivm = bld_base->base.gallivm;
2106 LLVMValueRef coverage;
2107
2108 /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
2109 coverage = LLVMGetParam(ctx->radeon_bld.main_fn,
2110 samplemask_param);
2111 coverage = bitcast(bld_base, TGSI_TYPE_SIGNED, coverage);
2112
2113 coverage = lp_build_intrinsic(gallivm->builder, "llvm.ctpop.i32",
2114 ctx->i32,
2115 &coverage, 1, LLVMReadNoneAttribute);
2116
2117 coverage = LLVMBuildUIToFP(gallivm->builder, coverage,
2118 ctx->f32, "");
2119
2120 coverage = LLVMBuildFMul(gallivm->builder, coverage,
2121 lp_build_const_float(gallivm,
2122 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
2123
2124 return LLVMBuildFMul(gallivm->builder, alpha, coverage, "");
2125 }
2126
/* Compute user clip distances from a CLIPVERTEX output and fill the export
 * args for the two clip-distance position exports (pos[2] and pos[3]).
 * The plane coefficients are read from the SI_VS_CONST_CLIP_PLANES
 * constant buffer. */
static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context *bld_base,
				    LLVMValueRef (*pos)[9], LLVMValueRef *out_elts)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	struct lp_build_context *uint = &ctx->radeon_bld.soa.bld_base.uint_bld;
	unsigned reg_index;
	unsigned chan;
	unsigned const_chan;
	LLVMValueRef base_elt;
	LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
	LLVMValueRef constbuf_index = lp_build_const_int32(base->gallivm,
							   SI_VS_CONST_CLIP_PLANES);
	LLVMValueRef const_resource = build_indexed_load_const(ctx, ptr, constbuf_index);

	/* Two exports cover clip distances 0-3 and 4-7. */
	for (reg_index = 0; reg_index < 2; reg_index ++) {
		LLVMValueRef *args = pos[2 + reg_index];

		/* Start all four distances at 0. */
		args[5] =
		args[6] =
		args[7] =
		args[8] = lp_build_const_float(base->gallivm, 0.0f);

		/* Compute dot products of position and user clip plane vectors */
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			for (const_chan = 0; const_chan < TGSI_NUM_CHANNELS; const_chan++) {
				/* args[1] is reused as a scratch byte offset here;
				 * it is overwritten with the real EXEC-mask flag below. */
				args[1] = lp_build_const_int32(base->gallivm,
							       ((reg_index * 4 + chan) * 4 +
								const_chan) * 4);
				base_elt = buffer_load_const(base->gallivm->builder, const_resource,
							     args[1], ctx->f32);
				args[5 + chan] =
					lp_build_add(base, args[5 + chan],
						     lp_build_mul(base, base_elt,
								  out_elts[const_chan]));
			}
		}

		/* Fill in the remaining export arguments. */
		args[0] = lp_build_const_int32(base->gallivm, 0xf); /* writemask */
		args[1] = uint->zero; /* EXEC mask */
		args[2] = uint->zero; /* last export? */
		args[3] = lp_build_const_int32(base->gallivm,
					       V_008DFC_SQ_EXP_POS + 2 + reg_index);
		args[4] = uint->zero; /* COMPR flag */
	}
}
2173
2174 static void si_dump_streamout(struct pipe_stream_output_info *so)
2175 {
2176 unsigned i;
2177
2178 if (so->num_outputs)
2179 fprintf(stderr, "STREAMOUT\n");
2180
2181 for (i = 0; i < so->num_outputs; i++) {
2182 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
2183 so->output[i].start_component;
2184 fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
2185 i, so->output[i].output_buffer,
2186 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
2187 so->output[i].register_index,
2188 mask & 1 ? "x" : "",
2189 mask & 2 ? "y" : "",
2190 mask & 4 ? "z" : "",
2191 mask & 8 ? "w" : "");
2192 }
2193 }
2194
/* On SI, the vertex shader is responsible for writing streamout data
 * to buffers. */
static void si_llvm_emit_streamout(struct si_shader_context *ctx,
				   struct si_shader_output_values *outputs,
				   unsigned noutput)
{
	struct pipe_stream_output_info *so = &ctx->shader->selector->so;
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	int i, j;
	struct lp_build_if_state if_ctx;

	/* Get bits [22:16], i.e. (so_param >> 16) & 127; */
	LLVMValueRef so_vtx_count =
		unpack_param(ctx, ctx->param_streamout_config, 16, 7);

	LLVMValueRef tid = get_thread_id(ctx);

	/* can_emit = tid < so_vtx_count; */
	LLVMValueRef can_emit =
		LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");

	/* Active vertex stream, bits [25:24] of the streamout config. */
	LLVMValueRef stream_id =
		unpack_param(ctx, ctx->param_streamout_config, 24, 2);

	/* Emit the streamout code conditionally. This actually avoids
	 * out-of-bounds buffer access. The hw tells us via the SGPR
	 * (so_vtx_count) which threads are allowed to emit streamout data. */
	lp_build_if(&if_ctx, gallivm, can_emit);
	{
		/* The buffer offset is computed as follows:
		 *   ByteOffset = streamout_offset[buffer_id]*4 +
		 *                (streamout_write_index + thread_id)*stride[buffer_id] +
		 *                attrib_offset
                 */

		LLVMValueRef so_write_index =
			LLVMGetParam(ctx->radeon_bld.main_fn,
				     ctx->param_streamout_write_index);

		/* Compute (streamout_write_index + thread_id). */
		so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");

		/* Compute the write offset for each enabled buffer. */
		LLVMValueRef so_write_offset[4] = {};
		for (i = 0; i < 4; i++) {
			if (!so->stride[i])
				continue;

			LLVMValueRef so_offset = LLVMGetParam(ctx->radeon_bld.main_fn,
							      ctx->param_streamout_offset[i]);
			/* The SGPR offset is in dwords; convert to bytes. */
			so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), "");

			so_write_offset[i] = LLVMBuildMul(builder, so_write_index,
							  LLVMConstInt(ctx->i32, so->stride[i]*4, 0), "");
			so_write_offset[i] = LLVMBuildAdd(builder, so_write_offset[i], so_offset, "");
		}

		/* Write streamout data. */
		for (i = 0; i < so->num_outputs; i++) {
			unsigned buf_idx = so->output[i].output_buffer;
			unsigned reg = so->output[i].register_index;
			unsigned start = so->output[i].start_component;
			unsigned num_comps = so->output[i].num_components;
			unsigned stream = so->output[i].stream;
			LLVMValueRef out[4];
			struct lp_build_if_state if_ctx_stream;

			assert(num_comps && num_comps <= 4);
			if (!num_comps || num_comps > 4)
				continue;

			/* Skip outputs the shader does not actually write. */
			if (reg >= noutput)
				continue;

			/* Load the output as int. */
			for (j = 0; j < num_comps; j++) {
				out[j] = LLVMBuildBitCast(builder,
							  outputs[reg].values[start+j],
							  ctx->i32, "");
			}

			/* Pack the output. */
			LLVMValueRef vdata = NULL;

			switch (num_comps) {
			case 1: /* as i32 */
				vdata = out[0];
				break;
			case 2: /* as v2i32 */
			case 3: /* as v4i32 (aligned to 4) */
			case 4: /* as v4i32 */
				vdata = LLVMGetUndef(LLVMVectorType(ctx->i32, util_next_power_of_two(num_comps)));
				for (j = 0; j < num_comps; j++) {
					vdata = LLVMBuildInsertElement(builder, vdata, out[j],
								       LLVMConstInt(ctx->i32, j, 0), "");
				}
				break;
			}

			/* Only store when this output's stream is the active one. */
			LLVMValueRef can_emit_stream =
				LLVMBuildICmp(builder, LLVMIntEQ,
					      stream_id,
					      lp_build_const_int32(gallivm, stream), "");

			lp_build_if(&if_ctx_stream, gallivm, can_emit_stream);
			build_tbuffer_store_dwords(ctx, ctx->so_buffers[buf_idx],
						   vdata, num_comps,
						   so_write_offset[buf_idx],
						   LLVMConstInt(ctx->i32, 0, 0),
						   so->output[i].dst_offset*4);
			lp_build_endif(&if_ctx_stream);
		}
	}
	lp_build_endif(&if_ctx);
}
2311
2312
/* Generate export instructions for hardware VS shader stage */
static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base,
			      struct si_shader_output_values *outputs,
			      unsigned noutput)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *base = &bld_base->base;
	struct lp_build_context *uint =
		&ctx->radeon_bld.soa.bld_base.uint_bld;
	LLVMValueRef args[9];
	/* Position exports (POS0..POS3) are buffered so the last one can be
	 * marked "done" and all of them numbered contiguously. */
	LLVMValueRef pos_args[4][9] = { { 0 } };
	LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
	unsigned semantic_name, semantic_index;
	unsigned target;
	unsigned param_count = 0;
	unsigned pos_idx;
	int i;

	/* Emit streamout stores before the exports. */
	if (outputs && ctx->shader->selector->so.num_outputs) {
		si_llvm_emit_streamout(ctx, outputs, noutput);
	}

	for (i = 0; i < noutput; i++) {
		semantic_name = outputs[i].name;
		semantic_index = outputs[i].sid;

handle_semantic:
		/* Select the correct target */
		switch(semantic_name) {
		case TGSI_SEMANTIC_PSIZE:
			/* Saved for the misc position export (POS1) below. */
			psize_value = outputs[i].values[0];
			continue;
		case TGSI_SEMANTIC_EDGEFLAG:
			edgeflag_value = outputs[i].values[0];
			continue;
		case TGSI_SEMANTIC_LAYER:
			/* Also exported as a generic param for the PS to read. */
			layer_value = outputs[i].values[0];
			semantic_name = TGSI_SEMANTIC_GENERIC;
			goto handle_semantic;
		case TGSI_SEMANTIC_VIEWPORT_INDEX:
			viewport_index_value = outputs[i].values[0];
			semantic_name = TGSI_SEMANTIC_GENERIC;
			goto handle_semantic;
		case TGSI_SEMANTIC_POSITION:
			target = V_008DFC_SQ_EXP_POS;
			break;
		case TGSI_SEMANTIC_COLOR:
		case TGSI_SEMANTIC_BCOLOR:
			target = V_008DFC_SQ_EXP_PARAM + param_count;
			assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
			shader->info.vs_output_param_offset[i] = param_count;
			param_count++;
			break;
		case TGSI_SEMANTIC_CLIPDIST:
			target = V_008DFC_SQ_EXP_POS + 2 + semantic_index;
			break;
		case TGSI_SEMANTIC_CLIPVERTEX:
			/* Fills pos_args[2] and pos_args[3] directly. */
			si_llvm_emit_clipvertex(bld_base, pos_args, outputs[i].values);
			continue;
		case TGSI_SEMANTIC_PRIMID:
		case TGSI_SEMANTIC_FOG:
		case TGSI_SEMANTIC_TEXCOORD:
		case TGSI_SEMANTIC_GENERIC:
			target = V_008DFC_SQ_EXP_PARAM + param_count;
			assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
			shader->info.vs_output_param_offset[i] = param_count;
			param_count++;
			break;
		default:
			target = 0;
			fprintf(stderr,
				"Warning: SI unhandled vs output type:%d\n",
				semantic_name);
		}

		si_llvm_init_export_args(bld_base, outputs[i].values, target, args);

		if (target >= V_008DFC_SQ_EXP_POS &&
		    target <= (V_008DFC_SQ_EXP_POS + 3)) {
			/* Buffer position exports; emitted together below. */
			memcpy(pos_args[target - V_008DFC_SQ_EXP_POS],
			       args, sizeof(args));
		} else {
			lp_build_intrinsic(base->gallivm->builder,
					   "llvm.SI.export", ctx->voidt,
					   args, 9, 0);
		}

		/* Clip distances are also exported as generic params. */
		if (semantic_name == TGSI_SEMANTIC_CLIPDIST) {
			semantic_name = TGSI_SEMANTIC_GENERIC;
			goto handle_semantic;
		}
	}

	shader->info.nr_param_exports = param_count;

	/* We need to add the position output manually if it's missing. */
	if (!pos_args[0][0]) {
		pos_args[0][0] = lp_build_const_int32(base->gallivm, 0xf); /* writemask */
		pos_args[0][1] = uint->zero; /* EXEC mask */
		pos_args[0][2] = uint->zero; /* last export? */
		pos_args[0][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS);
		pos_args[0][4] = uint->zero; /* COMPR flag */
		pos_args[0][5] = base->zero; /* X */
		pos_args[0][6] = base->zero; /* Y */
		pos_args[0][7] = base->zero; /* Z */
		pos_args[0][8] = base->one;  /* W */
	}

	/* Write the misc vector (point size, edgeflag, layer, viewport). */
	if (shader->selector->info.writes_psize ||
	    shader->selector->info.writes_edgeflag ||
	    shader->selector->info.writes_viewport_index ||
	    shader->selector->info.writes_layer) {
		pos_args[1][0] = lp_build_const_int32(base->gallivm, /* writemask */
						      shader->selector->info.writes_psize |
						      (shader->selector->info.writes_edgeflag << 1) |
						      (shader->selector->info.writes_layer << 2) |
						      (shader->selector->info.writes_viewport_index << 3));
		pos_args[1][1] = uint->zero; /* EXEC mask */
		pos_args[1][2] = uint->zero; /* last export? */
		pos_args[1][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS + 1);
		pos_args[1][4] = uint->zero; /* COMPR flag */
		pos_args[1][5] = base->zero; /* X */
		pos_args[1][6] = base->zero; /* Y */
		pos_args[1][7] = base->zero; /* Z */
		pos_args[1][8] = base->zero; /* W */

		if (shader->selector->info.writes_psize)
			pos_args[1][5] = psize_value;

		if (shader->selector->info.writes_edgeflag) {
			/* The output is a float, but the hw expects an integer
			 * with the first bit containing the edge flag. */
			edgeflag_value = LLVMBuildFPToUI(base->gallivm->builder,
							 edgeflag_value,
							 ctx->i32, "");
			edgeflag_value = lp_build_min(&bld_base->int_bld,
						      edgeflag_value,
						      bld_base->int_bld.one);

			/* The LLVM intrinsic expects a float. */
			pos_args[1][6] = LLVMBuildBitCast(base->gallivm->builder,
							  edgeflag_value,
							  ctx->f32, "");
		}

		if (shader->selector->info.writes_layer)
			pos_args[1][7] = layer_value;

		if (shader->selector->info.writes_viewport_index)
			pos_args[1][8] = viewport_index_value;
	}

	for (i = 0; i < 4; i++)
		if (pos_args[i][0])
			shader->info.nr_pos_exports++;

	pos_idx = 0;
	for (i = 0; i < 4; i++) {
		if (!pos_args[i][0])
			continue;

		/* Specify the target we are exporting */
		pos_args[i][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS + pos_idx++);

		if (pos_idx == shader->info.nr_pos_exports)
			/* Specify that this is the last export */
			pos_args[i][2] = uint->one;

		lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
				   ctx->voidt, pos_args[i], 9, 0);
	}
}
2487
/* Copy TCS inputs (read from LDS) to the off-chip tessellation buffer,
 * for the inputs listed in key.tcs.epilog.inputs_to_copy. */
static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef invocation_id, rw_buffers, buffer, buffer_offset;
	LLVMValueRef lds_vertex_stride, lds_vertex_offset, lds_base;
	uint64_t inputs;

	/* Bits [12:8] of REL_IDS hold the invocation (vertex) index. */
	invocation_id = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);

	rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
	buffer = build_indexed_load_const(ctx, rw_buffers,
	                lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));

	buffer_offset = LLVMGetParam(ctx->radeon_bld.main_fn, ctx->param_oc_lds);

	/* LDS address of this invocation's vertex within the current patch. */
	lds_vertex_stride = unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 13, 8);
	lds_vertex_offset = LLVMBuildMul(gallivm->builder, invocation_id,
	                                 lds_vertex_stride, "");
	lds_base = get_tcs_in_current_patch_offset(ctx);
	lds_base = LLVMBuildAdd(gallivm->builder, lds_base, lds_vertex_offset, "");

	/* Iterate over the set bits of the 64-bit input mask. */
	inputs = ctx->shader->key.tcs.epilog.inputs_to_copy;
	while (inputs) {
		unsigned i = u_bit_scan64(&inputs);

		LLVMValueRef lds_ptr = LLVMBuildAdd(gallivm->builder, lds_base,
		                        lp_build_const_int32(gallivm, 4 * i),
		                         "");

		LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx,
					      invocation_id,
					      lp_build_const_int32(gallivm, i));

		/* Load a full vec4 (~0 = all channels) and store it. */
		LLVMValueRef value = lds_load(bld_base, TGSI_TYPE_SIGNED, ~0,
		                              lds_ptr);

		build_tbuffer_store_dwords(ctx, buffer, value, 4, buffer_addr,
		                           buffer_offset, 0);
	}
}
2529
/* Read the tessellation levels from LDS and store them into the
 * tess-factor ring buffer, once per patch (invocation 0 only). */
static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
				  LLVMValueRef rel_patch_id,
				  LLVMValueRef invocation_id,
				  LLVMValueRef tcs_out_current_patch_data_offset)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	struct si_shader *shader = ctx->shader;
	unsigned tess_inner_index, tess_outer_index;
	LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
	LLVMValueRef out[6], vec0, vec1, rw_buffers, tf_base;
	unsigned stride, outer_comps, inner_comps, i;
	struct lp_build_if_state if_ctx, inner_if_ctx;

	/* Make sure all invocations have finished their LDS stores. */
	si_llvm_emit_barrier(NULL, bld_base, NULL);

	/* Do this only for invocation 0, because the tess levels are per-patch,
	 * not per-vertex.
	 *
	 * This can't jump, because invocation 0 executes this. It should
	 * at least mask out the loads and stores for other invocations.
	 */
	lp_build_if(&if_ctx, gallivm,
		    LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
				  invocation_id, bld_base->uint_bld.zero, ""));

	/* Determine the layout of one tess factor element in the buffer. */
	switch (shader->key.tcs.epilog.prim_mode) {
	case PIPE_PRIM_LINES:
		stride = 2; /* 2 dwords, 1 vec2 store */
		outer_comps = 2;
		inner_comps = 0;
		break;
	case PIPE_PRIM_TRIANGLES:
		stride = 4; /* 4 dwords, 1 vec4 store */
		outer_comps = 3;
		inner_comps = 1;
		break;
	case PIPE_PRIM_QUADS:
		stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
		outer_comps = 4;
		inner_comps = 2;
		break;
	default:
		assert(0);
		return;
	}

	/* Load tess_inner and tess_outer from LDS.
	 * Any invocation can write them, so we can't get them from a temporary.
	 */
	tess_inner_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSINNER, 0);
	tess_outer_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSOUTER, 0);

	lds_base = tcs_out_current_patch_data_offset;
	lds_inner = LLVMBuildAdd(gallivm->builder, lds_base,
				 lp_build_const_int32(gallivm,
						      tess_inner_index * 4), "");
	lds_outer = LLVMBuildAdd(gallivm->builder, lds_base,
				 lp_build_const_int32(gallivm,
						      tess_outer_index * 4), "");

	/* Outer levels first, then inner levels, packed contiguously. */
	for (i = 0; i < outer_comps; i++)
		out[i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer);
	for (i = 0; i < inner_comps; i++)
		out[outer_comps+i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_inner);

	/* Convert the outputs to vectors for stores. */
	vec0 = lp_build_gather_values(gallivm, out, MIN2(stride, 4));
	vec1 = NULL;

	if (stride > 4)
		vec1 = lp_build_gather_values(gallivm, out+4, stride - 4);

	/* Get the buffer. */
	rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
				  SI_PARAM_RW_BUFFERS);
	buffer = build_indexed_load_const(ctx, rw_buffers,
			lp_build_const_int32(gallivm, SI_HS_RING_TESS_FACTOR));

	/* Get the offset. */
	tf_base = LLVMGetParam(ctx->radeon_bld.main_fn,
			       SI_PARAM_TESS_FACTOR_OFFSET);
	byteoffset = LLVMBuildMul(gallivm->builder, rel_patch_id,
				  lp_build_const_int32(gallivm, 4 * stride), "");

	/* Only patch 0 writes the control word at the start of the ring. */
	lp_build_if(&inner_if_ctx, gallivm,
		    LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
				  rel_patch_id, bld_base->uint_bld.zero, ""));

	/* Store the dynamic HS control word. */
	build_tbuffer_store_dwords(ctx, buffer,
				   lp_build_const_int32(gallivm, 0x80000000),
				   1, lp_build_const_int32(gallivm, 0), tf_base, 0);

	lp_build_endif(&inner_if_ctx);

	/* Store the tessellation factors. */
	build_tbuffer_store_dwords(ctx, buffer, vec0,
				   MIN2(stride, 4), byteoffset, tf_base, 4);
	if (vec1)
		build_tbuffer_store_dwords(ctx, buffer, vec1,
					   stride - 4, byteoffset, tf_base, 20);
	lp_build_endif(&if_ctx);
}
2635
/* This only writes the tessellation factor levels. */
static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;

	rel_patch_id = get_rel_patch_id(ctx);
	invocation_id = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);
	tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);

	if (!ctx->is_monolithic) {
		/* Return epilog parameters from this function so the
		 * separately-compiled epilog part can pick them up.
		 * The insert positions below are the SGPR/VGPR slots of
		 * the return value and must stay in sync with the epilog. */
		LLVMBuilderRef builder = bld_base->base.gallivm->builder;
		LLVMValueRef ret = ctx->return_value;
		LLVMValueRef rw_buffers, rw0, rw1, tf_soffset;
		unsigned vgpr;

		/* RW_BUFFERS pointer, split into two i32 SGPRs. */
		rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
					  SI_PARAM_RW_BUFFERS);
		rw_buffers = LLVMBuildPtrToInt(builder, rw_buffers, ctx->i64, "");
		rw_buffers = LLVMBuildBitCast(builder, rw_buffers, ctx->v2i32, "");
		rw0 = LLVMBuildExtractElement(builder, rw_buffers,
					      bld_base->uint_bld.zero, "");
		rw1 = LLVMBuildExtractElement(builder, rw_buffers,
					      bld_base->uint_bld.one, "");
		ret = LLVMBuildInsertValue(builder, ret, rw0, 0, "");
		ret = LLVMBuildInsertValue(builder, ret, rw1, 1, "");

		/* Tess factor buffer soffset is after user SGPRs. */
		tf_soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
					  SI_PARAM_TESS_FACTOR_OFFSET);
		ret = LLVMBuildInsertValue(builder, ret, tf_soffset,
					   SI_TCS_NUM_USER_SGPR + 1, "");

		/* VGPRs (bitcast to float to match the return type). */
		rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id);
		invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id);
		tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset);

		vgpr = SI_TCS_NUM_USER_SGPR + 2;
		ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
		ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
		ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
		ctx->return_value = ret;
		return;
	}

	/* Monolithic shader: run the epilog code inline. */
	si_copy_tcs_inputs(bld_base);
	si_write_tess_factors(bld_base, rel_patch_id, invocation_id, tf_lds_offset);
}
2687
/* LS epilog: write all vertex outputs to LDS so the TCS can read them. */
static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct tgsi_shader_info *info = &shader->selector->info;
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	unsigned i, chan;
	LLVMValueRef vertex_id = LLVMGetParam(ctx->radeon_bld.main_fn,
					      ctx->param_rel_auto_id);
	/* Per-vertex LDS stride, bits [20:13] of LS_OUT_LAYOUT. */
	LLVMValueRef vertex_dw_stride =
		unpack_param(ctx, SI_PARAM_LS_OUT_LAYOUT, 13, 8);
	LLVMValueRef base_dw_addr = LLVMBuildMul(gallivm->builder, vertex_id,
						 vertex_dw_stride, "");

	/* Write outputs to LDS. The next shader (TCS aka HS) will read
	 * its inputs from it. */
	for (i = 0; i < info->num_outputs; i++) {
		LLVMValueRef *out_ptr = ctx->radeon_bld.soa.outputs[i];
		unsigned name = info->output_semantic_name[i];
		unsigned index = info->output_semantic_index[i];
		/* Each output slot occupies 4 dwords. */
		int param = si_shader_io_get_unique_index(name, index);
		LLVMValueRef dw_addr = LLVMBuildAdd(gallivm->builder, base_dw_addr,
					lp_build_const_int32(gallivm, param * 4), "");

		for (chan = 0; chan < 4; chan++) {
			lds_store(bld_base, chan, dw_addr,
				  LLVMBuildLoad(gallivm->builder, out_ptr[chan], ""));
		}
	}
}
2718
/* ES epilog: write all outputs to the ESGS ring so the GS can read them. */
static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	struct si_shader *es = ctx->shader;
	struct tgsi_shader_info *info = &es->selector->info;
	LLVMValueRef soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
					    ctx->param_es2gs_offset);
	unsigned chan;
	int i;

	for (i = 0; i < info->num_outputs; i++) {
		LLVMValueRef *out_ptr =
			ctx->radeon_bld.soa.outputs[i];
		int param_index;

		/* These are handled by the misc position export, not the ring. */
		if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
		    info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
			continue;

		param_index = si_shader_io_get_unique_index(info->output_semantic_name[i],
							    info->output_semantic_index[i]);

		/* Store each channel as a raw dword; slot = 4 dwords. */
		for (chan = 0; chan < 4; chan++) {
			LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
			out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");

			build_tbuffer_store(ctx,
					    ctx->esgs_ring,
					    out_val, 1,
					    LLVMGetUndef(ctx->i32), soffset,
					    (4 * param_index + chan) * 4,
					    V_008F0C_BUF_DATA_FORMAT_32,
					    V_008F0C_BUF_NUM_FORMAT_UINT,
					    0, 0, 1, 1, 0);
		}
	}
}
2757
2758 static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
2759 {
2760 struct si_shader_context *ctx = si_shader_context(bld_base);
2761 struct gallivm_state *gallivm = bld_base->base.gallivm;
2762 LLVMValueRef args[2];
2763
2764 args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_NOP | SENDMSG_GS_DONE);
2765 args[1] = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
2766 lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
2767 ctx->voidt, args, 2, LLVMNoUnwindAttribute);
2768 }
2769
/* VS epilog: optional color clamping, then export all outputs. */
static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	struct si_shader_output_values *outputs = NULL;
	int i,j;

	assert(!ctx->is_gs_copy_shader);

	/* +1 reserves room for the PrimitiveID output appended below. */
	outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));

	/* Vertex color clamping.
	 *
	 * This uses a state constant loaded in a user data SGPR and
	 * an IF statement is added that clamps all colors if the constant
	 * is true.
	 */
	if (ctx->type == PIPE_SHADER_VERTEX) {
		struct lp_build_if_state if_ctx;
		LLVMValueRef cond = NULL;
		LLVMValueRef addr, val;

		for (i = 0; i < info->num_outputs; i++) {
			if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR &&
			    info->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR)
				continue;

			/* We've found a color. */
			if (!cond) {
				/* The state is in the first bit of the user SGPR. */
				cond = LLVMGetParam(ctx->radeon_bld.main_fn,
						    SI_PARAM_VS_STATE_BITS);
				cond = LLVMBuildTrunc(gallivm->builder, cond,
						      ctx->i1, "");
				lp_build_if(&if_ctx, gallivm, cond);
			}

			/* Clamp all four channels to [0, 1] in place. */
			for (j = 0; j < 4; j++) {
				addr = ctx->radeon_bld.soa.outputs[i][j];
				val = LLVMBuildLoad(gallivm->builder, addr, "");
				val = radeon_llvm_saturate(bld_base, val);
				LLVMBuildStore(gallivm->builder, val, addr);
			}
		}

		if (cond)
			lp_build_endif(&if_ctx);
	}

	/* Gather the output values for the export code. */
	for (i = 0; i < info->num_outputs; i++) {
		outputs[i].name = info->output_semantic_name[i];
		outputs[i].sid = info->output_semantic_index[i];

		for (j = 0; j < 4; j++)
			outputs[i].values[j] =
				LLVMBuildLoad(gallivm->builder,
					      ctx->radeon_bld.soa.outputs[i][j],
					      "");
	}

	if (ctx->is_monolithic) {
		/* Export PrimitiveID when PS needs it. */
		if (si_vs_exports_prim_id(ctx->shader)) {
			outputs[i].name = TGSI_SEMANTIC_PRIMID;
			outputs[i].sid = 0;
			outputs[i].values[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
						       get_primitive_id(bld_base, 0));
			outputs[i].values[1] = bld_base->base.undef;
			outputs[i].values[2] = bld_base->base.undef;
			outputs[i].values[3] = bld_base->base.undef;
			i++;
		}
	} else {
		/* Return the primitive ID from the LLVM function. */
		ctx->return_value =
			LLVMBuildInsertValue(gallivm->builder,
					     ctx->return_value,
					     bitcast(bld_base, TGSI_TYPE_FLOAT,
						     get_primitive_id(bld_base, 0)),
					     VS_EPILOG_PRIMID_LOC, "");
	}

	si_llvm_export_vs(bld_base, outputs, i);
	FREE(outputs);
}
2856
/* Emit the MRTZ export carrying depth, stencil and/or sample mask.
 * At least one of the three values must be non-NULL. The 9-element args
 * array follows the llvm.SI.export convention:
 * [0]=enabled channel mask, [1]=EXEC valid, [2]=DONE, [3]=target,
 * [4]=COMPR flag, [5..8]=R,G,B,A. */
2857 static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
2858 LLVMValueRef depth, LLVMValueRef stencil,
2859 LLVMValueRef samplemask)
2860 {
2861 struct si_shader_context *ctx = si_shader_context(bld_base);
2862 struct lp_build_context *base = &bld_base->base;
2863 struct lp_build_context *uint = &bld_base->uint_bld;
2864 LLVMValueRef args[9];
2865 unsigned mask = 0;
2866
2867 assert(depth || stencil || samplemask);
2868
2869 args[1] = uint->one; /* whether the EXEC mask is valid */
2870 args[2] = uint->one; /* DONE bit */
2871
2872 /* Specify the target we are exporting */
2873 args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_MRTZ);
2874
2875 args[4] = uint->zero; /* COMP flag */
2876 args[5] = base->undef; /* R, depth */
2877 args[6] = base->undef; /* G, stencil test value[0:7], stencil op value[8:15] */
2878 args[7] = base->undef; /* B, sample mask */
2879 args[8] = base->undef; /* A, alpha to mask */
2880
/* Fill only the channels actually written; 'mask' tracks which. */
2881 if (depth) {
2882 args[5] = depth;
2883 mask |= 0x1;
2884 }
2885
2886 if (stencil) {
2887 args[6] = stencil;
2888 mask |= 0x2;
2889 }
2890
2891 if (samplemask) {
2892 args[7] = samplemask;
2893 mask |= 0x4;
2894 }
2895
2896 /* SI (except OLAND) has a bug that it only looks
2897 * at the X writemask component. */
2898 if (ctx->screen->b.chip_class == SI &&
2899 ctx->screen->b.family != CHIP_OLAND)
2900 mask |= 0x1;
2901
2902 /* Specify which components to enable */
2903 args[0] = lp_build_const_int32(base->gallivm, mask);
2904
2905 lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
2906 ctx->voidt, args, 9, 0);
2907 }
2908
/* Export one fragment color output (MRT index), applying the PS epilog key
 * transforms first: color clamping, alpha-to-one, alpha test, and
 * line/polygon smoothing. 'is_last' marks the final export so the DONE bit
 * and EXEC-valid flags can be set on it. */
2909 static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
2910 LLVMValueRef *color, unsigned index,
2911 unsigned samplemask_param,
2912 bool is_last)
2913 {
2914 struct si_shader_context *ctx = si_shader_context(bld_base);
2915 struct lp_build_context *base = &bld_base->base;
2916 int i;
2917
2918 /* Clamp color */
2919 if (ctx->shader->key.ps.epilog.clamp_color)
2920 for (i = 0; i < 4; i++)
2921 color[i] = radeon_llvm_saturate(bld_base, color[i]);
2922
2923 /* Alpha to one */
2924 if (ctx->shader->key.ps.epilog.alpha_to_one)
2925 color[3] = base->one;
2926
/* Alpha test only applies to color output 0. */
2927 /* Alpha test */
2928 if (index == 0 &&
2929 ctx->shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
2930 si_alpha_test(bld_base, color[3]);
2931
2932 /* Line & polygon smoothing */
2933 if (ctx->shader->key.ps.epilog.poly_line_smoothing)
2934 color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3],
2935 samplemask_param);
2936
2937 /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
2938 if (ctx->shader->key.ps.epilog.last_cbuf > 0) {
/* Broadcast: replicate the same color to every enabled colorbuffer. */
2939 LLVMValueRef args[8][9];
2940 int c, last = -1;
2941
2942 /* Get the export arguments, also find out what the last one is. */
2943 for (c = 0; c <= ctx->shader->key.ps.epilog.last_cbuf; c++) {
2944 si_llvm_init_export_args(bld_base, color,
2945 V_008DFC_SQ_EXP_MRT + c, args[c]);
/* args[c][0] == 0 means "no channels enabled" (a NULL export). */
2946 if (args[c][0] != bld_base->uint_bld.zero)
2947 last = c;
2948 }
2949
2950 /* Emit all exports. */
2951 for (c = 0; c <= ctx->shader->key.ps.epilog.last_cbuf; c++) {
2952 if (is_last && last == c) {
2953 args[c][1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */
2954 args[c][2] = bld_base->uint_bld.one; /* DONE bit */
2955 } else if (args[c][0] == bld_base->uint_bld.zero)
2956 continue; /* unnecessary NULL export */
2957
2958 lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
2959 ctx->voidt, args[c], 9, 0);
2960 }
2961 } else {
2962 LLVMValueRef args[9];
2963
2964 /* Export */
2965 si_llvm_init_export_args(bld_base, color, V_008DFC_SQ_EXP_MRT + index,
2966 args);
2967 if (is_last) {
2968 args[1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */
2969 args[2] = bld_base->uint_bld.one; /* DONE bit */
2970 } else if (args[0] == bld_base->uint_bld.zero)
2971 return; /* unnecessary NULL export */
2972
2973 lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
2974 ctx->voidt, args, 9, 0);
2975 }
2976 }
2977
2978 static void si_export_null(struct lp_build_tgsi_context *bld_base)
2979 {
2980 struct si_shader_context *ctx = si_shader_context(bld_base);
2981 struct lp_build_context *base = &bld_base->base;
2982 struct lp_build_context *uint = &bld_base->uint_bld;
2983 LLVMValueRef args[9];
2984
2985 args[0] = lp_build_const_int32(base->gallivm, 0x0); /* enabled channels */
2986 args[1] = uint->one; /* whether the EXEC mask is valid */
2987 args[2] = uint->one; /* DONE bit */
2988 args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_NULL);
2989 args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */
2990 args[5] = uint->undef; /* R */
2991 args[6] = uint->undef; /* G */
2992 args[7] = uint->undef; /* B */
2993 args[8] = uint->undef; /* A */
2994
2995 lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
2996 ctx->voidt, args, 9, 0);
2997 }
2998
/* Monolithic fragment shader epilogue: read all PS outputs and emit color
 * exports plus a final MRTZ export for depth/stencil/samplemask. If nothing
 * would be exported, emits a NULL export instead. */
2999 static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context *bld_base)
3000 {
3001 struct si_shader_context *ctx = si_shader_context(bld_base);
3002 struct si_shader *shader = ctx->shader;
3003 struct lp_build_context *base = &bld_base->base;
3004 struct tgsi_shader_info *info = &shader->selector->info;
3005 LLVMBuilderRef builder = base->gallivm->builder;
3006 LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
3007 int last_color_export = -1;
3008 int i;
3009
3010 /* Determine the last export. If MRTZ is present, it's always last.
3011 * Otherwise, find the last color export.
3012 */
3013 if (!info->writes_z && !info->writes_stencil && !info->writes_samplemask) {
3014 unsigned spi_format = shader->key.ps.epilog.spi_shader_col_format;
3015
3016 /* Don't export NULL and return if alpha-test is enabled. */
3017 if (shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS &&
3018 shader->key.ps.epilog.alpha_func != PIPE_FUNC_NEVER &&
3019 (spi_format & 0xf) == 0)
/* Force an export of MRT0 alpha so the alpha test has a value to read. */
3020 spi_format |= V_028714_SPI_SHADER_32_AR;
3021
3022 for (i = 0; i < info->num_outputs; i++) {
3023 unsigned index = info->output_semantic_index[i];
3024
3025 if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR)
3026 continue;
3027
3028 /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
3029 if (shader->key.ps.epilog.last_cbuf > 0) {
3030 /* Just set this if any of the colorbuffers are enabled. */
3031 if (spi_format &
3032 ((1llu << (4 * (shader->key.ps.epilog.last_cbuf + 1))) - 1))
3033 last_color_export = i;
3034 continue;
3035 }
3036
/* 4 format bits per colorbuffer; nonzero means the export is kept. */
3037 if ((spi_format >> (index * 4)) & 0xf)
3038 last_color_export = i;
3039 }
3040
3041 /* If there are no outputs, export NULL. */
3042 if (last_color_export == -1) {
3043 si_export_null(bld_base);
3044 return;
3045 }
3046 }
3047
3048 for (i = 0; i < info->num_outputs; i++) {
3049 unsigned semantic_name = info->output_semantic_name[i];
3050 unsigned semantic_index = info->output_semantic_index[i];
3051 unsigned j;
3052 LLVMValueRef color[4] = {};
3053
3054 /* Select the correct target */
3055 switch (semantic_name) {
3056 case TGSI_SEMANTIC_POSITION:
/* Fragment depth lives in the .z channel of the POSITION output. */
3057 depth = LLVMBuildLoad(builder,
3058 ctx->radeon_bld.soa.outputs[i][2], "");
3059 break;
3060 case TGSI_SEMANTIC_STENCIL:
3061 stencil = LLVMBuildLoad(builder,
3062 ctx->radeon_bld.soa.outputs[i][1], "");
3063 break;
3064 case TGSI_SEMANTIC_SAMPLEMASK:
3065 samplemask = LLVMBuildLoad(builder,
3066 ctx->radeon_bld.soa.outputs[i][0], "");
3067 break;
3068 case TGSI_SEMANTIC_COLOR:
3069 for (j = 0; j < 4; j++)
3070 color[j] = LLVMBuildLoad(builder,
3071 ctx->radeon_bld.soa.outputs[i][j], "");
3072
3073 si_export_mrt_color(bld_base, color, semantic_index,
3074 SI_PARAM_SAMPLE_COVERAGE,
3075 last_color_export == i);
3076 break;
3077 default:
3078 fprintf(stderr,
3079 "Warning: SI unhandled fs output type:%d\n",
3080 semantic_name);
3081 }
3082 }
3083
/* MRTZ is emitted last; see the comment at the top of the function. */
3084 if (depth || stencil || samplemask)
3085 si_export_mrt_z(bld_base, depth, stencil, samplemask);
3086 }
3087
3088 /**
3089 * Return PS outputs in this order:
3090 *
3091 * v[0:3] = color0.xyzw
3092 * v[4:7] = color1.xyzw
3093 * ...
3094 * vN+0 = Depth
3095 * vN+1 = Stencil
3096 * vN+2 = SampleMask
3097 * vN+3 = SampleMaskIn (used for OpenGL smoothing)
3098 *
3099 * The alpha-ref SGPR is returned via its original location.
3100 */
/* Non-monolithic fragment shader main-part return: instead of exporting,
 * hand all PS outputs to the separately-compiled epilog via the function's
 * return value. See the layout comment above this function. */
3101 static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base)
3102 {
3103 struct si_shader_context *ctx = si_shader_context(bld_base);
3104 struct si_shader *shader = ctx->shader;
3105 struct lp_build_context *base = &bld_base->base;
3106 struct tgsi_shader_info *info = &shader->selector->info;
3107 LLVMBuilderRef builder = base->gallivm->builder;
3108 unsigned i, j, first_vgpr, vgpr;
3109
3110 LLVMValueRef color[8][4] = {};
3111 LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
3112 LLVMValueRef ret;
3113
3114 /* Read the output values. */
3115 for (i = 0; i < info->num_outputs; i++) {
3116 unsigned semantic_name = info->output_semantic_name[i];
3117 unsigned semantic_index = info->output_semantic_index[i];
3118
3119 switch (semantic_name) {
3120 case TGSI_SEMANTIC_COLOR:
3121 assert(semantic_index < 8);
3122 for (j = 0; j < 4; j++) {
3123 LLVMValueRef ptr = ctx->radeon_bld.soa.outputs[i][j];
3124 LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
3125 color[semantic_index][j] = result;
3126 }
3127 break;
3128 case TGSI_SEMANTIC_POSITION:
/* Fragment depth is the .z channel of POSITION. */
3129 depth = LLVMBuildLoad(builder,
3130 ctx->radeon_bld.soa.outputs[i][2], "");
3131 break;
3132 case TGSI_SEMANTIC_STENCIL:
3133 stencil = LLVMBuildLoad(builder,
3134 ctx->radeon_bld.soa.outputs[i][1], "");
3135 break;
3136 case TGSI_SEMANTIC_SAMPLEMASK:
3137 samplemask = LLVMBuildLoad(builder,
3138 ctx->radeon_bld.soa.outputs[i][0], "");
3139 break;
3140 default:
3141 fprintf(stderr, "Warning: SI unhandled fs output type:%d\n",
3142 semantic_name);
3143 }
3144 }
3145
3146 /* Fill the return structure. */
3147 ret = ctx->return_value;
3148
/* The alpha-ref SGPR is passed through in its original location. */
3149 /* Set SGPRs. */
3150 ret = LLVMBuildInsertValue(builder, ret,
3151 bitcast(bld_base, TGSI_TYPE_SIGNED,
3152 LLVMGetParam(ctx->radeon_bld.main_fn,
3153 SI_PARAM_ALPHA_REF)),
3154 SI_SGPR_ALPHA_REF, "");
3155
/* VGPRs are packed: written colors first, then depth/stencil/samplemask. */
3156 /* Set VGPRs */
3157 first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
3158 for (i = 0; i < ARRAY_SIZE(color); i++) {
3159 if (!color[i][0])
3160 continue;
3161
3162 for (j = 0; j < 4; j++)
3163 ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
3164 }
3165 if (depth)
3166 ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
3167 if (stencil)
3168 ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
3169 if (samplemask)
3170 ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");
3171
3172 /* Add the input sample mask for smoothing at the end. */
3173 if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
3174 vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
3175 ret = LLVMBuildInsertValue(builder, ret,
3176 LLVMGetParam(ctx->radeon_bld.main_fn,
3177 SI_PARAM_SAMPLE_COVERAGE), vgpr++, "");
3178
3179 ctx->return_value = ret;
3180 }
3181
3182 /**
3183 * Given a v8i32 resource descriptor for a buffer, extract the size of the
3184 * buffer in number of elements and return it as an i32.
3185 */
3186 static LLVMValueRef get_buffer_size(
3187 struct lp_build_tgsi_context *bld_base,
3188 LLVMValueRef descriptor)
3189 {
3190 struct si_shader_context *ctx = si_shader_context(bld_base);
3191 struct gallivm_state *gallivm = bld_base->base.gallivm;
3192 LLVMBuilderRef builder = gallivm->builder;
/* Dword 6 of the v8i32 descriptor holds NUM_RECORDS (the size field). */
3193 LLVMValueRef size =
3194 LLVMBuildExtractElement(builder, descriptor,
3195 lp_build_const_int32(gallivm, 6), "");
3196
3197 if (ctx->screen->b.chip_class >= VI) {
3198 /* On VI, the descriptor contains the size in bytes,
3199 * but TXQ must return the size in elements.
3200 * The stride is always non-zero for resources using TXQ.
3201 */
/* Stride is bits [29:16] of dword 5; mask to 14 bits after the shift. */
3202 LLVMValueRef stride =
3203 LLVMBuildExtractElement(builder, descriptor,
3204 lp_build_const_int32(gallivm, 5), "");
3205 stride = LLVMBuildLShr(builder, stride,
3206 lp_build_const_int32(gallivm, 16), "");
3207 stride = LLVMBuildAnd(builder, stride,
3208 lp_build_const_int32(gallivm, 0x3FFF), "");
3209
3210 size = LLVMBuildUDiv(builder, size, stride, "");
3211 }
3212
3213 return size;
3214 }
3215
3216 /**
3217 * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
3218 * intrinsic names).
3219 */
3220 static void build_int_type_name(
3221 LLVMTypeRef type,
3222 char *buf, unsigned bufsize)
3223 {
3224 assert(bufsize >= 6);
3225
3226 if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
3227 snprintf(buf, bufsize, "v%ui32",
3228 LLVMGetVectorSize(type));
3229 else
3230 strcpy(buf, "i32");
3231 }
3232
3233 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
3234 struct lp_build_tgsi_context *bld_base,
3235 struct lp_build_emit_data *emit_data);
3236
3237 /* Prevent optimizations (at least of memory accesses) across the current
3238 * point in the program by emitting empty inline assembly that is marked as
3239 * having side effects.
3240 */
3241 static void emit_optimization_barrier(struct si_shader_context *ctx)
3242 {
3243 LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
/* An empty inline-asm call with side effects: LLVM may not move memory
 * accesses across it, which is exactly the barrier we need. */
3244 LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
3245 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, "", "", true, false);
3246 LLVMBuildCall(builder, inlineasm, NULL, 0, "");
3247 }
3248
/* Emit an s_waitcnt to wait for outstanding memory operations. */
3249 static void emit_waitcnt(struct si_shader_context *ctx)
3250 {
3251 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3252 LLVMBuilderRef builder = gallivm->builder;
3253 LLVMValueRef args[1] = {
/* 0xf70 is the encoded waitcnt immediate; presumably it waits for vmcnt(0)
 * while leaving the other counters unconstrained — verify against the ISA
 * s_waitcnt encoding if this is changed. */
3254 lp_build_const_int32(gallivm, 0xf70)
3255 };
3256 lp_build_intrinsic(builder, "llvm.amdgcn.s.waitcnt",
3257 ctx->voidt, args, 1, LLVMNoUnwindAttribute);
3258 }
3259
/* TGSI MEMBAR: implemented by waiting on outstanding memory operations. */
static void membar_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	emit_waitcnt(si_shader_context(bld_base));
}
3269
/* Return the v4i32 resource descriptor for a shader storage buffer source.
 * Directly-indexed buffers use the preloaded descriptors; indirect indexing
 * loads the descriptor from the SHADER_BUFFERS array with a bounded index. */
3270 static LLVMValueRef
3271 shader_buffer_fetch_rsrc(struct si_shader_context *ctx,
3272 const struct tgsi_full_src_register *reg)
3273 {
3274 LLVMValueRef ind_index;
3275 LLVMValueRef rsrc_ptr;
3276
3277 if (!reg->Register.Indirect)
3278 return ctx->shader_buffers[reg->Register.Index];
3279
/* Clamp the index so out-of-bounds access can't escape the array. */
3280 ind_index = get_bounded_indirect_index(ctx, &reg->Indirect,
3281 reg->Register.Index,
3282 SI_NUM_SHADER_BUFFERS);
3283
3284 rsrc_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_SHADER_BUFFERS);
3285 return build_indexed_load_const(ctx, rsrc_ptr, ind_index);
3286 }
3287
3288 static bool tgsi_is_array_sampler(unsigned target)
3289 {
3290 return target == TGSI_TEXTURE_1D_ARRAY ||
3291 target == TGSI_TEXTURE_SHADOW1D_ARRAY ||
3292 target == TGSI_TEXTURE_2D_ARRAY ||
3293 target == TGSI_TEXTURE_SHADOW2D_ARRAY ||
3294 target == TGSI_TEXTURE_CUBE_ARRAY ||
3295 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY ||
3296 target == TGSI_TEXTURE_2D_ARRAY_MSAA;
3297 }
3298
3299 static bool tgsi_is_array_image(unsigned target)
3300 {
3301 return target == TGSI_TEXTURE_3D ||
3302 target == TGSI_TEXTURE_CUBE ||
3303 target == TGSI_TEXTURE_1D_ARRAY ||
3304 target == TGSI_TEXTURE_2D_ARRAY ||
3305 target == TGSI_TEXTURE_CUBE_ARRAY ||
3306 target == TGSI_TEXTURE_2D_ARRAY_MSAA;
3307 }
3308
3309 /**
3310 * Given a 256-bit resource descriptor, force the DCC enable bit to off.
3311 *
3312 * At least on Tonga, executing image stores on images with DCC enabled and
3313 * non-trivial can eventually lead to lockups. This can occur when an
3314 * application binds an image as read-only but then uses a shader that writes
3315 * to it. The OpenGL spec allows almost arbitrarily bad behavior (including
3316 * program termination) in this case, but it doesn't cost much to be a bit
3317 * nicer: disabling DCC in the shader still leads to undefined results but
3318 * avoids the lockup.
3319 */
3320 static LLVMValueRef force_dcc_off(struct si_shader_context *ctx,
3321 LLVMValueRef rsrc)
3322 {
/* DCC only exists on VI+; older chips need no change to the descriptor. */
3323 if (ctx->screen->b.chip_class <= CIK) {
3324 return rsrc;
3325 } else {
3326 LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
3327 LLVMValueRef i32_6 = LLVMConstInt(ctx->i32, 6, 0);
3328 LLVMValueRef i32_C = LLVMConstInt(ctx->i32, C_008F28_COMPRESSION_EN, 0);
3329 LLVMValueRef tmp;
3330
/* Clear the COMPRESSION_EN bit in dword 6 of the 256-bit descriptor. */
3331 tmp = LLVMBuildExtractElement(builder, rsrc, i32_6, "");
3332 tmp = LLVMBuildAnd(builder, tmp, i32_C, "");
3333 return LLVMBuildInsertElement(builder, rsrc, tmp, i32_6, "");
3334 }
3335 }
3336
3337 /**
3338 * Load the resource descriptor for \p image.
3339 */
/* Load the resource descriptor for \p image into *rsrc.
 * \param dcc_off  if true, clear the DCC-enable bit (needed for stores;
 *                 see force_dcc_off). Only applied on the indirect path;
 *                 preloaded descriptors are assumed to already be correct. */
3340 static void
3341 image_fetch_rsrc(
3342 struct lp_build_tgsi_context *bld_base,
3343 const struct tgsi_full_src_register *image,
3344 bool dcc_off,
3345 LLVMValueRef *rsrc)
3346 {
3347 struct si_shader_context *ctx = si_shader_context(bld_base);
3348
3349 assert(image->Register.File == TGSI_FILE_IMAGE);
3350
3351 if (!image->Register.Indirect) {
3352 /* Fast path: use preloaded resources */
3353 *rsrc = ctx->images[image->Register.Index];
3354 } else {
3355 /* Indexing and manual load */
3356 LLVMValueRef ind_index;
3357 LLVMValueRef rsrc_ptr;
3358 LLVMValueRef tmp;
3359
3360 /* From the GL_ARB_shader_image_load_store extension spec:
3361 *
3362 * If a shader performs an image load, store, or atomic
3363 * operation using an image variable declared as an array,
3364 * and if the index used to select an individual element is
3365 * negative or greater than or equal to the size of the
3366 * array, the results of the operation are undefined but may
3367 * not lead to termination.
3368 */
3369 ind_index = get_bounded_indirect_index(ctx, &image->Indirect,
3370 image->Register.Index,
3371 SI_NUM_IMAGES);
3372
3373 rsrc_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_IMAGES);
3374 tmp = build_indexed_load_const(ctx, rsrc_ptr, ind_index);
3375 if (dcc_off)
3376 tmp = force_dcc_off(ctx, tmp);
3377 *rsrc = tmp;
3378 }
3379 }
3380
/* Fetch and pack the coordinate operands of an image instruction.
 * Returns a single i32 for 1-coordinate targets, otherwise a vector of
 * coordinates bitcast to i32 (3-element vectors are padded to 4). */
3381 static LLVMValueRef image_fetch_coords(
3382 struct lp_build_tgsi_context *bld_base,
3383 const struct tgsi_full_instruction *inst,
3384 unsigned src)
3385 {
3386 struct gallivm_state *gallivm = bld_base->base.gallivm;
3387 LLVMBuilderRef builder = gallivm->builder;
3388 unsigned target = inst->Memory.Texture;
3389 unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
3390 LLVMValueRef coords[4];
3391 LLVMValueRef tmp;
3392 int chan;
3393
3394 for (chan = 0; chan < num_coords; ++chan) {
3395 tmp = lp_build_emit_fetch(bld_base, inst, src, chan);
3396 tmp = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
3397 coords[chan] = tmp;
3398 }
3399
3400 if (num_coords == 1)
3401 return coords[0];
3402
3403 if (num_coords == 3) {
3404 /* LLVM has difficulties lowering 3-element vectors. */
3405 coords[3] = bld_base->uint_bld.undef;
3406 num_coords = 4;
3407 }
3408
3409 return lp_build_gather_values(gallivm, coords, num_coords);
3410 }
3411
3412 /**
3413 * Append the extra mode bits that are used by image load and store.
3414 */
3415 static void image_append_args(
3416 struct si_shader_context *ctx,
3417 struct lp_build_emit_data * emit_data,
3418 unsigned target,
3419 bool atomic)
3420 {
3421 const struct tgsi_full_instruction *inst = emit_data->inst;
3422 LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
3423 LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
3424
3425 emit_data->args[emit_data->arg_count++] = i1false; /* r128 */
3426 emit_data->args[emit_data->arg_count++] =
3427 tgsi_is_array_image(target) ? i1true : i1false; /* da */
/* Atomics have no glc operand in the intrinsic signature. */
3428 if (!atomic) {
3429 emit_data->args[emit_data->arg_count++] =
3430 inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
3431 i1true : i1false; /* glc */
3432 }
3433 emit_data->args[emit_data->arg_count++] = i1false; /* slc */
3434 }
3435
3436 /**
3437 * Given a 256 bit resource, extract the top half (which stores the buffer
3438 * resource in the case of textures and images).
3439 */
3440 static LLVMValueRef extract_rsrc_top_half(
3441 struct si_shader_context *ctx,
3442 LLVMValueRef rsrc)
3443 {
3444 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3445 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
3446 LLVMTypeRef v2i128 = LLVMVectorType(ctx->i128, 2);
3447
/* View the 256-bit descriptor as two i128 halves, take the upper one,
 * and reinterpret it as a v4i32 buffer descriptor. */
3448 rsrc = LLVMBuildBitCast(gallivm->builder, rsrc, v2i128, "");
3449 rsrc = LLVMBuildExtractElement(gallivm->builder, rsrc, bld_base->uint_bld.one, "");
3450 rsrc = LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v4i32, "");
3451
3452 return rsrc;
3453 }
3454
3455 /**
3456 * Append the resource and indexing arguments for buffer intrinsics.
3457 *
3458 * \param rsrc the v4i32 buffer resource
3459 * \param index index into the buffer (stride-based)
3460 * \param offset byte offset into the buffer
3461 */
3462 static void buffer_append_args(
3463 struct si_shader_context *ctx,
3464 struct lp_build_emit_data *emit_data,
3465 LLVMValueRef rsrc,
3466 LLVMValueRef index,
3467 LLVMValueRef offset,
3468 bool atomic)
3469 {
3470 const struct tgsi_full_instruction *inst = emit_data->inst;
3471 LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
3472 LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
3473
3474 emit_data->args[emit_data->arg_count++] = rsrc;
3475 emit_data->args[emit_data->arg_count++] = index; /* vindex */
3476 emit_data->args[emit_data->arg_count++] = offset; /* voffset */
/* Atomics have no glc operand in the intrinsic signature. */
3477 if (!atomic) {
3478 emit_data->args[emit_data->arg_count++] =
3479 inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
3480 i1true : i1false; /* glc */
3481 }
3482 emit_data->args[emit_data->arg_count++] = i1false; /* slc */
3483 }
3484
/* Gather the intrinsic arguments for TGSI LOAD, for buffer, image-buffer,
 * and image targets. The corresponding emit is done in load_emit(). */
3485 static void load_fetch_args(
3486 struct lp_build_tgsi_context * bld_base,
3487 struct lp_build_emit_data * emit_data)
3488 {
3489 struct si_shader_context *ctx = si_shader_context(bld_base);
3490 struct gallivm_state *gallivm = bld_base->base.gallivm;
3491 const struct tgsi_full_instruction * inst = emit_data->inst;
3492 unsigned target = inst->Memory.Texture;
3493 LLVMValueRef rsrc;
3494
3495 emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
3496
3497 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
3498 LLVMBuilderRef builder = gallivm->builder;
3499 LLVMValueRef offset;
3500 LLVMValueRef tmp;
3501
3502 rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);
3503
/* Src[1].x holds the byte offset into the buffer. */
3504 tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
3505 offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
3506
3507 buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
3508 offset, false);
3509 } else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
3510 LLVMValueRef coords;
3511
3512 image_fetch_rsrc(bld_base, &inst->Src[0], false, &rsrc);
3513 coords = image_fetch_coords(bld_base, inst, 1);
3514
3515 if (target == TGSI_TEXTURE_BUFFER) {
/* Buffer images use the buffer half of the descriptor and the
 * coordinate as vindex. */
3516 rsrc = extract_rsrc_top_half(ctx, rsrc);
3517 buffer_append_args(ctx, emit_data, rsrc, coords,
3518 bld_base->uint_bld.zero, false);
3519 } else {
3520 emit_data->args[0] = coords;
3521 emit_data->args[1] = rsrc;
3522 emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */
3523 emit_data->arg_count = 3;
3524
3525 image_append_args(ctx, emit_data, target, false);
3526 }
3527 }
3528 }
3529
/* Emit a buffer load for TGSI LOAD, choosing the intrinsic width from the
 * destination writemask (f32 / v2f32 / v4f32). */
3530 static void load_emit_buffer(struct si_shader_context *ctx,
3531 struct lp_build_emit_data *emit_data)
3532 {
3533 const struct tgsi_full_instruction *inst = emit_data->inst;
3534 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3535 LLVMBuilderRef builder = gallivm->builder;
3536 uint writemask = inst->Dst[0].Register.WriteMask;
3537 uint count = util_last_bit(writemask);
3538 const char *intrinsic_name;
3539 LLVMTypeRef dst_type;
3540
3541 switch (count) {
3542 case 1:
3543 intrinsic_name = "llvm.amdgcn.buffer.load.f32";
3544 dst_type = ctx->f32;
3545 break;
3546 case 2:
3547 intrinsic_name = "llvm.amdgcn.buffer.load.v2f32";
3548 dst_type = LLVMVectorType(ctx->f32, 2);
3549 break;
/* 3-element loads are rounded up to 4 (no v3f32 intrinsic). */
3550 default: // 3 & 4
3551 intrinsic_name = "llvm.amdgcn.buffer.load.v4f32";
3552 dst_type = ctx->v4f32;
3553 count = 4;
3554 }
3555
3556 emit_data->output[emit_data->chan] = lp_build_intrinsic(
3557 builder, intrinsic_name, dst_type,
3558 emit_data->args, emit_data->arg_count,
3559 LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
3560 }
3561
/* Build a typed pointer into LDS (shared memory) for a TGSI MEMORY operand.
 * \param arg  index of the instruction source holding the dword offset. */
3562 static LLVMValueRef get_memory_ptr(struct si_shader_context *ctx,
3563 const struct tgsi_full_instruction *inst,
3564 LLVMTypeRef type, int arg)
3565 {
3566 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3567 LLVMBuilderRef builder = gallivm->builder;
3568 LLVMValueRef offset, ptr;
3569 int addr_space;
3570
3571 offset = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base, inst, arg, 0);
3572 offset = LLVMBuildBitCast(builder, offset, ctx->i32, "");
3573
3574 ptr = ctx->shared_memory;
3575 ptr = LLVMBuildGEP(builder, ptr, &offset, 1, "");
/* Recast the GEP result so the caller gets a pointer of the requested
 * element type, preserving the LDS address space. */
3576 addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
3577 ptr = LLVMBuildBitCast(builder, ptr, LLVMPointerType(type, addr_space), "");
3578
3579 return ptr;
3580 }
3581
/* Emit a TGSI LOAD from shared (LDS) memory: load each channel enabled in
 * the writemask and gather them into a 4-element vector. */
3582 static void load_emit_memory(
3583 struct si_shader_context *ctx,
3584 struct lp_build_emit_data *emit_data)
3585 {
3586 const struct tgsi_full_instruction *inst = emit_data->inst;
3587 struct lp_build_context *base = &ctx->radeon_bld.soa.bld_base.base;
3588 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3589 LLVMBuilderRef builder = gallivm->builder;
3590 unsigned writemask = inst->Dst[0].Register.WriteMask;
3591 LLVMValueRef channels[4], ptr, derived_ptr, index;
3592 int chan;
3593
3594 ptr = get_memory_ptr(ctx, inst, base->elem_type, 1);
3595
3596 for (chan = 0; chan < 4; ++chan) {
/* Channels not in the writemask are left undef. */
3597 if (!(writemask & (1 << chan))) {
3598 channels[chan] = LLVMGetUndef(base->elem_type);
3599 continue;
3600 }
3601
3602 index = lp_build_const_int32(gallivm, chan);
3603 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
3604 channels[chan] = LLVMBuildLoad(builder, derived_ptr, "");
3605 }
3606 emit_data->output[emit_data->chan] = lp_build_gather_values(gallivm, channels, 4);
3607 }
3608
/* Emit a TGSI LOAD, dispatching on the source file: shared memory, SSBO,
 * buffer image, or regular image. Volatile loads are preceded by a wait on
 * outstanding memory operations. */
3609 static void load_emit(
3610 const struct lp_build_tgsi_action *action,
3611 struct lp_build_tgsi_context *bld_base,
3612 struct lp_build_emit_data *emit_data)
3613 {
3614 struct si_shader_context *ctx = si_shader_context(bld_base);
3615 struct gallivm_state *gallivm = bld_base->base.gallivm;
3616 LLVMBuilderRef builder = gallivm->builder;
3617 const struct tgsi_full_instruction * inst = emit_data->inst;
3618 char intrinsic_name[32];
3619 char coords_type[8];
3620
/* Shared memory is handled first: it has no Memory.Qualifier handling
 * and does not use the args gathered by load_fetch_args. */
3621 if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
3622 load_emit_memory(ctx, emit_data);
3623 return;
3624 }
3625
3626 if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
3627 emit_waitcnt(ctx);
3628
3629 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
3630 load_emit_buffer(ctx, emit_data);
3631 return;
3632 }
3633
3634 if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
3635 emit_data->output[emit_data->chan] =
3636 lp_build_intrinsic(
3637 builder, "llvm.amdgcn.buffer.load.format.v4f32", emit_data->dst_type,
3638 emit_data->args, emit_data->arg_count,
3639 LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
3640 } else {
/* Image loads: the intrinsic name is overloaded on the coordinate type
 * (i32/v2i32/v4i32), see build_int_type_name. */
3641 build_int_type_name(LLVMTypeOf(emit_data->args[0]),
3642 coords_type, sizeof(coords_type));
3643
3644 snprintf(intrinsic_name, sizeof(intrinsic_name),
3645 "llvm.amdgcn.image.load.%s", coords_type);
3646
3647 emit_data->output[emit_data->chan] =
3648 lp_build_intrinsic(
3649 builder, intrinsic_name, emit_data->dst_type,
3650 emit_data->args, emit_data->arg_count,
3651 LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
3652 }
3653 }
3654
/* Gather the intrinsic arguments for TGSI STORE. The value to store (Src[1])
 * always becomes args[0]; the remaining arguments depend on whether the
 * destination is an SSBO, a buffer image, or a regular image. */
3655 static void store_fetch_args(
3656 struct lp_build_tgsi_context * bld_base,
3657 struct lp_build_emit_data * emit_data)
3658 {
3659 struct si_shader_context *ctx = si_shader_context(bld_base);
3660 struct gallivm_state *gallivm = bld_base->base.gallivm;
3661 LLVMBuilderRef builder = gallivm->builder;
3662 const struct tgsi_full_instruction * inst = emit_data->inst;
3663 struct tgsi_full_src_register memory;
3664 LLVMValueRef chans[4];
3665 LLVMValueRef data;
3666 LLVMValueRef rsrc;
3667 unsigned chan;
3668
3669 emit_data->dst_type = LLVMVoidTypeInContext(gallivm->context);
3670
3671 for (chan = 0; chan < 4; ++chan) {
3672 chans[chan] = lp_build_emit_fetch(bld_base, inst, 1, chan);
3673 }
3674 data = lp_build_gather_values(gallivm, chans, 4);
3675
3676 emit_data->args[emit_data->arg_count++] = data;
3677
/* The destination register describes the memory resource; turn it into a
 * source register so the rsrc-fetch helpers can be reused. */
3678 memory = tgsi_full_src_register_from_dst(&inst->Dst[0]);
3679
3680 if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
3681 LLVMValueRef offset;
3682 LLVMValueRef tmp;
3683
3684 rsrc = shader_buffer_fetch_rsrc(ctx, &memory);
3685
/* Src[0].x holds the byte offset into the buffer. */
3686 tmp = lp_build_emit_fetch(bld_base, inst, 0, 0);
3687 offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
3688
3689 buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
3690 offset, false);
3691 } else if (inst->Dst[0].Register.File == TGSI_FILE_IMAGE) {
3692 unsigned target = inst->Memory.Texture;
3693 LLVMValueRef coords;
3694
3695 coords = image_fetch_coords(bld_base, inst, 0);
3696
3697 if (target == TGSI_TEXTURE_BUFFER) {
3698 image_fetch_rsrc(bld_base, &memory, false, &rsrc);
3699
3700 rsrc = extract_rsrc_top_half(ctx, rsrc);
3701 buffer_append_args(ctx, emit_data, rsrc, coords,
3702 bld_base->uint_bld.zero, false);
3703 } else {
/* Image stores fetch the descriptor with DCC forced off; see the
 * comment above force_dcc_off for the hardware rationale. */
3704 emit_data->args[1] = coords;
3705 image_fetch_rsrc(bld_base, &memory, true, &emit_data->args[2]);
3706 emit_data->args[3] = lp_build_const_int32(gallivm, 15); /* dmask */
3707 emit_data->arg_count = 4;
3708
3709 image_append_args(ctx, emit_data, target, false);
3710 }
3711 }
3712 }
3713
/* Emit buffer stores for TGSI STORE, splitting the writemask into runs of
 * consecutive channels and issuing one store intrinsic per run (f32, v2f32,
 * or v4f32), each at the appropriate byte offset. */
3714 static void store_emit_buffer(
3715 struct si_shader_context *ctx,
3716 struct lp_build_emit_data *emit_data)
3717 {
3718 const struct tgsi_full_instruction *inst = emit_data->inst;
3719 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3720 LLVMBuilderRef builder = gallivm->builder;
3721 struct lp_build_context *uint_bld = &ctx->radeon_bld.soa.bld_base.uint_bld;
/* args[0]/args[3] were set up by store_fetch_args/buffer_append_args and
 * are rewritten per-iteration below, so keep the originals. */
3722 LLVMValueRef base_data = emit_data->args[0];
3723 LLVMValueRef base_offset = emit_data->args[3];
3724 unsigned writemask = inst->Dst[0].Register.WriteMask;
3725
3726 while (writemask) {
3727 int start, count;
3728 const char *intrinsic_name;
3729 LLVMValueRef data;
3730 LLVMValueRef offset;
3731 LLVMValueRef tmp;
3732
3733 u_bit_scan_consecutive_range(&writemask, &start, &count);
3734
3735 /* Due to an LLVM limitation, split 3-element writes
3736 * into a 2-element and a 1-element write. */
3737 if (count == 3) {
/* Push the third channel back into the writemask for the next pass. */
3738 writemask |= 1 << (start + 2);
3739 count = 2;
3740 }
3741
3742 if (count == 4) {
3743 data = base_data;
3744 intrinsic_name = "llvm.amdgcn.buffer.store.v4f32";
3745 } else if (count == 2) {
3746 LLVMTypeRef v2f32 = LLVMVectorType(ctx->f32, 2);
3747
/* Build a v2f32 from channels start and start+1. */
3748 tmp = LLVMBuildExtractElement(
3749 builder, base_data,
3750 lp_build_const_int32(gallivm, start), "");
3751 data = LLVMBuildInsertElement(
3752 builder, LLVMGetUndef(v2f32), tmp,
3753 uint_bld->zero, "");
3754
3755 tmp = LLVMBuildExtractElement(
3756 builder, base_data,
3757 lp_build_const_int32(gallivm, start + 1), "");
3758 data = LLVMBuildInsertElement(
3759 builder, data, tmp, uint_bld->one, "");
3760
3761 intrinsic_name = "llvm.amdgcn.buffer.store.v2f32";
3762 } else {
3763 assert(count == 1);
3764 data = LLVMBuildExtractElement(
3765 builder, base_data,
3766 lp_build_const_int32(gallivm, start), "");
3767 intrinsic_name = "llvm.amdgcn.buffer.store.f32";
3768 }
3769
/* Advance the byte offset by 4 bytes per skipped channel. */
3770 offset = base_offset;
3771 if (start != 0) {
3772 offset = LLVMBuildAdd(
3773 builder, offset,
3774 lp_build_const_int32(gallivm, start * 4), "");
3775 }
3776
3777 emit_data->args[0] = data;
3778 emit_data->args[3] = offset;
3779
3780 lp_build_intrinsic(
3781 builder, intrinsic_name, emit_data->dst_type,
3782 emit_data->args, emit_data->arg_count,
3783 LLVMNoUnwindAttribute);
3784 }
3785 }
3786
3787 static void store_emit_memory(
3788 struct si_shader_context *ctx,
3789 struct lp_build_emit_data *emit_data)
3790 {
3791 const struct tgsi_full_instruction *inst = emit_data->inst;
3792 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3793 struct lp_build_context *base = &ctx->radeon_bld.soa.bld_base.base;
3794 LLVMBuilderRef builder = gallivm->builder;
3795 unsigned writemask = inst->Dst[0].Register.WriteMask;
3796 LLVMValueRef ptr, derived_ptr, data, index;
3797 int chan;
3798
3799 ptr = get_memory_ptr(ctx, inst, base->elem_type, 0);
3800
3801 for (chan = 0; chan < 4; ++chan) {
3802 if (!(writemask & (1 << chan))) {
3803 continue;
3804 }
3805 data = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base, inst, 1, chan);
3806 index = lp_build_const_int32(gallivm, chan);
3807 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
3808 LLVMBuildStore(builder, data, derived_ptr);
3809 }
3810 }
3811
3812 static void store_emit(
3813 const struct lp_build_tgsi_action *action,
3814 struct lp_build_tgsi_context *bld_base,
3815 struct lp_build_emit_data *emit_data)
3816 {
3817 struct si_shader_context *ctx = si_shader_context(bld_base);
3818 struct gallivm_state *gallivm = bld_base->base.gallivm;
3819 LLVMBuilderRef builder = gallivm->builder;
3820 const struct tgsi_full_instruction * inst = emit_data->inst;
3821 unsigned target = inst->Memory.Texture;
3822 char intrinsic_name[32];
3823 char coords_type[8];
3824
3825 if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY) {
3826 store_emit_memory(ctx, emit_data);
3827 return;
3828 }
3829
3830 if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
3831 emit_waitcnt(ctx);
3832
3833 if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
3834 store_emit_buffer(ctx, emit_data);
3835 return;
3836 }
3837
3838 if (target == TGSI_TEXTURE_BUFFER) {
3839 emit_data->output[emit_data->chan] = lp_build_intrinsic(
3840 builder, "llvm.amdgcn.buffer.store.format.v4f32",
3841 emit_data->dst_type, emit_data->args, emit_data->arg_count,
3842 LLVMNoUnwindAttribute);
3843 } else {
3844 build_int_type_name(LLVMTypeOf(emit_data->args[1]),
3845 coords_type, sizeof(coords_type));
3846 snprintf(intrinsic_name, sizeof(intrinsic_name),
3847 "llvm.amdgcn.image.store.%s", coords_type);
3848
3849 emit_data->output[emit_data->chan] =
3850 lp_build_intrinsic(
3851 builder, intrinsic_name, emit_data->dst_type,
3852 emit_data->args, emit_data->arg_count,
3853 LLVMNoUnwindAttribute);
3854 }
3855 }
3856
/**
 * Gather operands for a TGSI atomic opcode (ATOM*).
 *
 * Src[0] is the buffer/image/memory resource, Src[1] the address,
 * Src[2] the data operand, and (for ATOMCAS only) Src[3] the
 * compare value.  Fills emit_data->args for the buffer/image atomic
 * intrinsics; shared-memory atomics are handled in atomic_emit_memory
 * and ignore most of this.
 */
static void atomic_fetch_args(
		struct lp_build_tgsi_context * bld_base,
		struct lp_build_emit_data * emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	LLVMValueRef data1, data2;
	LLVMValueRef rsrc;
	LLVMValueRef tmp;

	emit_data->dst_type = bld_base->base.elem_type;

	/* Data operand (Src[2]), reinterpreted as i32 for the intrinsic. */
	tmp = lp_build_emit_fetch(bld_base, inst, 2, 0);
	data1 = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");

	if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
		/* Compare value (Src[3]) for compare-and-swap. */
		tmp = lp_build_emit_fetch(bld_base, inst, 3, 0);
		data2 = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
	}

	/* llvm.amdgcn.image/buffer.atomic.cmpswap reflect the hardware order
	 * of arguments, which is reversed relative to TGSI (and GLSL)
	 */
	if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
		emit_data->args[emit_data->arg_count++] = data2;
	emit_data->args[emit_data->arg_count++] = data1;

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		LLVMValueRef offset;

		rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);

		/* Byte offset (Src[1].x) into the buffer. */
		tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
		offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");

		buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
				   offset, true);
	} else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
		unsigned target = inst->Memory.Texture;
		LLVMValueRef coords;

		/* Buffer images use the buffer path below, so they only need
		 * the first half of the descriptor. */
		image_fetch_rsrc(bld_base, &inst->Src[0],
				 target != TGSI_TEXTURE_BUFFER, &rsrc);
		coords = image_fetch_coords(bld_base, inst, 1);

		if (target == TGSI_TEXTURE_BUFFER) {
			rsrc = extract_rsrc_top_half(ctx, rsrc);
			buffer_append_args(ctx, emit_data, rsrc, coords,
					   bld_base->uint_bld.zero, true);
		} else {
			emit_data->args[emit_data->arg_count++] = coords;
			emit_data->args[emit_data->arg_count++] = rsrc;

			image_append_args(ctx, emit_data, target, true);
		}
	}
}
3916
3917 static void atomic_emit_memory(struct si_shader_context *ctx,
3918 struct lp_build_emit_data *emit_data) {
3919 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3920 LLVMBuilderRef builder = gallivm->builder;
3921 const struct tgsi_full_instruction * inst = emit_data->inst;
3922 LLVMValueRef ptr, result, arg;
3923
3924 ptr = get_memory_ptr(ctx, inst, ctx->i32, 1);
3925
3926 arg = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base, inst, 2, 0);
3927 arg = LLVMBuildBitCast(builder, arg, ctx->i32, "");
3928
3929 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
3930 LLVMValueRef new_data;
3931 new_data = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base,
3932 inst, 3, 0);
3933
3934 new_data = LLVMBuildBitCast(builder, new_data, ctx->i32, "");
3935
3936 #if HAVE_LLVM >= 0x309
3937 result = LLVMBuildAtomicCmpXchg(builder, ptr, arg, new_data,
3938 LLVMAtomicOrderingSequentiallyConsistent,
3939 LLVMAtomicOrderingSequentiallyConsistent,
3940 false);
3941 #endif
3942
3943 result = LLVMBuildExtractValue(builder, result, 0, "");
3944 } else {
3945 LLVMAtomicRMWBinOp op;
3946
3947 switch(inst->Instruction.Opcode) {
3948 case TGSI_OPCODE_ATOMUADD:
3949 op = LLVMAtomicRMWBinOpAdd;
3950 break;
3951 case TGSI_OPCODE_ATOMXCHG:
3952 op = LLVMAtomicRMWBinOpXchg;
3953 break;
3954 case TGSI_OPCODE_ATOMAND:
3955 op = LLVMAtomicRMWBinOpAnd;
3956 break;
3957 case TGSI_OPCODE_ATOMOR:
3958 op = LLVMAtomicRMWBinOpOr;
3959 break;
3960 case TGSI_OPCODE_ATOMXOR:
3961 op = LLVMAtomicRMWBinOpXor;
3962 break;
3963 case TGSI_OPCODE_ATOMUMIN:
3964 op = LLVMAtomicRMWBinOpUMin;
3965 break;
3966 case TGSI_OPCODE_ATOMUMAX:
3967 op = LLVMAtomicRMWBinOpUMax;
3968 break;
3969 case TGSI_OPCODE_ATOMIMIN:
3970 op = LLVMAtomicRMWBinOpMin;
3971 break;
3972 case TGSI_OPCODE_ATOMIMAX:
3973 op = LLVMAtomicRMWBinOpMax;
3974 break;
3975 default:
3976 unreachable("unknown atomic opcode");
3977 }
3978
3979 result = LLVMBuildAtomicRMW(builder, op, ptr, arg,
3980 LLVMAtomicOrderingSequentiallyConsistent,
3981 false);
3982 }
3983 emit_data->output[emit_data->chan] = LLVMBuildBitCast(builder, result, emit_data->dst_type, "");
3984 }
3985
3986 static void atomic_emit(
3987 const struct lp_build_tgsi_action *action,
3988 struct lp_build_tgsi_context *bld_base,
3989 struct lp_build_emit_data *emit_data)
3990 {
3991 struct si_shader_context *ctx = si_shader_context(bld_base);
3992 struct gallivm_state *gallivm = bld_base->base.gallivm;
3993 LLVMBuilderRef builder = gallivm->builder;
3994 const struct tgsi_full_instruction * inst = emit_data->inst;
3995 char intrinsic_name[40];
3996 LLVMValueRef tmp;
3997
3998 if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
3999 atomic_emit_memory(ctx, emit_data);
4000 return;
4001 }
4002
4003 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
4004 inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
4005 snprintf(intrinsic_name, sizeof(intrinsic_name),
4006 "llvm.amdgcn.buffer.atomic.%s", action->intr_name);
4007 } else {
4008 char coords_type[8];
4009
4010 build_int_type_name(LLVMTypeOf(emit_data->args[1]),
4011 coords_type, sizeof(coords_type));
4012 snprintf(intrinsic_name, sizeof(intrinsic_name),
4013 "llvm.amdgcn.image.atomic.%s.%s",
4014 action->intr_name, coords_type);
4015 }
4016
4017 tmp = lp_build_intrinsic(
4018 builder, intrinsic_name, bld_base->uint_bld.elem_type,
4019 emit_data->args, emit_data->arg_count,
4020 LLVMNoUnwindAttribute);
4021 emit_data->output[emit_data->chan] =
4022 LLVMBuildBitCast(builder, tmp, bld_base->base.elem_type, "");
4023 }
4024
/**
 * Gather operands for TGSI RESQ (resource size query).
 *
 * Buffers only need the resource descriptor (the size is read from it
 * directly in resq_emit); buffer images likewise.  Other images set up
 * the full llvm.SI.getresinfo.i32 argument list with mip level 0.
 */
static void resq_fetch_args(
		struct lp_build_tgsi_context * bld_base,
		struct lp_build_emit_data * emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	const struct tgsi_full_src_register *reg = &inst->Src[0];

	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);

	if (reg->Register.File == TGSI_FILE_BUFFER) {
		emit_data->args[0] = shader_buffer_fetch_rsrc(ctx, reg);
		emit_data->arg_count = 1;
	} else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
		image_fetch_rsrc(bld_base, reg, false, &emit_data->args[0]);
		emit_data->arg_count = 1;
	} else {
		emit_data->args[0] = bld_base->uint_bld.zero; /* mip level */
		image_fetch_rsrc(bld_base, reg, false, &emit_data->args[1]);
		emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */
		emit_data->args[3] = bld_base->uint_bld.zero; /* unorm */
		emit_data->args[4] = bld_base->uint_bld.zero; /* r128 */
		emit_data->args[5] = tgsi_is_array_image(inst->Memory.Texture) ?
			bld_base->uint_bld.one : bld_base->uint_bld.zero; /* da */
		emit_data->args[6] = bld_base->uint_bld.zero; /* glc */
		emit_data->args[7] = bld_base->uint_bld.zero; /* slc */
		emit_data->args[8] = bld_base->uint_bld.zero; /* tfe */
		emit_data->args[9] = bld_base->uint_bld.zero; /* lwe */
		emit_data->arg_count = 10;
	}
}
4057
/**
 * Emit TGSI RESQ using the operands prepared by resq_fetch_args.
 *
 * Buffers: read the element count from dword 2 of the descriptor.
 * Buffer images: compute the size from the descriptor fields.
 * Other images: call llvm.SI.getresinfo.i32, then fix up cube arrays
 * (hardware reports layers, TGSI wants cubes).
 */
static void resq_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	LLVMValueRef out;

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		/* NUM_RECORDS is dword 2 of the buffer descriptor. */
		out = LLVMBuildExtractElement(builder, emit_data->args[0],
					      lp_build_const_int32(gallivm, 2), "");
	} else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
		out = get_buffer_size(bld_base, emit_data->args[0]);
	} else {
		out = lp_build_intrinsic(
			builder, "llvm.SI.getresinfo.i32", emit_data->dst_type,
			emit_data->args, emit_data->arg_count,
			LLVMReadNoneAttribute | LLVMNoUnwindAttribute);

		/* Divide the number of layers by 6 to get the number of cubes. */
		if (inst->Memory.Texture == TGSI_TEXTURE_CUBE_ARRAY) {
			LLVMValueRef imm2 = lp_build_const_int32(gallivm, 2);
			LLVMValueRef imm6 = lp_build_const_int32(gallivm, 6);

			LLVMValueRef z = LLVMBuildExtractElement(builder, out, imm2, "");
			z = LLVMBuildBitCast(builder, z, bld_base->uint_bld.elem_type, "");
			z = LLVMBuildSDiv(builder, z, imm6, "");
			z = LLVMBuildBitCast(builder, z, bld_base->base.elem_type, "");
			out = LLVMBuildInsertElement(builder, out, z, imm2, "");
		}
	}

	emit_data->output[emit_data->chan] = out;
}
4094
/**
 * Build the common argument list for the llvm.SI.image.* / texture
 * intrinsics: coordinate vector, resource, optional sampler, then the
 * dmask/unorm/r128/da/glc/slc/tfe/lwe immediates.
 *
 * \param opcode  TGSI texture opcode (TXF/TXQ take no sampler and return
 *                integers; all others return v4f32)
 * \param target  TGSI texture target
 * \param param   coordinate/operand channels, padded here to a
 *                power-of-two vector with undefs
 * \param count   number of valid entries in \p param
 * \param dmask   channel mask for the intrinsic's DMASK field
 */
static void set_tex_fetch_args(struct si_shader_context *ctx,
			       struct lp_build_emit_data *emit_data,
			       unsigned opcode, unsigned target,
			       LLVMValueRef res_ptr, LLVMValueRef samp_ptr,
			       LLVMValueRef *param, unsigned count,
			       unsigned dmask)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	unsigned num_args;
	unsigned is_rect = target == TGSI_TEXTURE_RECT;

	/* Pad to power of two vector */
	while (count < util_next_power_of_two(count))
		param[count++] = LLVMGetUndef(ctx->i32);

	/* Texture coordinates. */
	if (count > 1)
		emit_data->args[0] = lp_build_gather_values(gallivm, param, count);
	else
		emit_data->args[0] = param[0];

	/* Resource. */
	emit_data->args[1] = res_ptr;
	num_args = 2;

	/* TXF/TXQ return raw integer data and take no sampler state. */
	if (opcode == TGSI_OPCODE_TXF || opcode == TGSI_OPCODE_TXQ)
		emit_data->dst_type = ctx->v4i32;
	else {
		emit_data->dst_type = ctx->v4f32;

		emit_data->args[num_args++] = samp_ptr;
	}

	emit_data->args[num_args++] = lp_build_const_int32(gallivm, dmask);
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, is_rect); /* unorm */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* r128 */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm,
					tgsi_is_array_sampler(target)); /* da */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* glc */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* slc */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* tfe */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* lwe */

	emit_data->arg_count = num_args;
}
4140
4141 static const struct lp_build_tgsi_action tex_action;
4142
/* Which descriptor to load from a combined sampler-view list entry
 * (see get_sampler_desc_custom for the slot layout). */
enum desc_type {
	DESC_IMAGE,	/* image view descriptor, slots [0:7] */
	DESC_FMASK,	/* FMASK descriptor, slots [8:15] */
	DESC_SAMPLER	/* sampler state, slots [12:15] */
};
4148
4149 static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements)
4150 {
4151 return LLVMPointerType(LLVMArrayType(elem_type, num_elements),
4152 CONST_ADDR_SPACE);
4153 }
4154
/**
 * Load an image view, fmask view, or sampler state descriptor from
 * \p list at the given per-texture \p index.
 *
 * Each texture unit occupies two 8-dword slots in the list: the image
 * view at [0:7] and the FMASK at [8:15]; the sampler state aliases the
 * last 4 dwords ([12:15]), so the list is re-cast to v4i32 elements for
 * DESC_SAMPLER before indexing.
 */
static LLVMValueRef get_sampler_desc_custom(struct si_shader_context *ctx,
					    LLVMValueRef list, LLVMValueRef index,
					    enum desc_type type)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;

	switch (type) {
	case DESC_IMAGE:
		/* The image is at [0:7]. */
		index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
		break;
	case DESC_FMASK:
		/* The FMASK is at [8:15]. */
		index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
		index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 1, 0), "");
		break;
	case DESC_SAMPLER:
		/* The sampler state is at [12:15]. */
		index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
		index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 3, 0), "");
		list = LLVMBuildPointerCast(builder, list,
					    const_array(ctx->v4i32, 0), "");
		break;
	}

	return build_indexed_load_const(ctx, list, index);
}
4186
4187 static LLVMValueRef get_sampler_desc(struct si_shader_context *ctx,
4188 LLVMValueRef index, enum desc_type type)
4189 {
4190 LLVMValueRef list = LLVMGetParam(ctx->radeon_bld.main_fn,
4191 SI_PARAM_SAMPLERS);
4192
4193 return get_sampler_desc_custom(ctx, list, index, type);
4194 }
4195
/* Disable anisotropic filtering if BASE_LEVEL == LAST_LEVEL.
 *
 * SI-CI:
 *   If BASE_LEVEL == LAST_LEVEL, the shader must disable anisotropic
 *   filtering manually. The driver sets img7 to a mask clearing
 *   MAX_ANISO_RATIO if BASE_LEVEL == LAST_LEVEL. The shader must do:
 *     s_and_b32 samp0, samp0, img7
 *
 * VI:
 *   The ANISO_OVERRIDE sampler field enables this fix in TA.
 *
 * \param res   image view descriptor (dword 7 holds the driver's mask)
 * \param samp  sampler state descriptor (dword 0 holds MAX_ANISO_RATIO)
 * \return the sampler state, with dword 0 masked on SI-CI
 */
static LLVMValueRef sici_fix_sampler_aniso(struct si_shader_context *ctx,
					   LLVMValueRef res, LLVMValueRef samp)
{
	LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
	LLVMValueRef img7, samp0;

	/* VI and later handle this in hardware via ANISO_OVERRIDE. */
	if (ctx->screen->b.chip_class >= VI)
		return samp;

	img7 = LLVMBuildExtractElement(builder, res,
				       LLVMConstInt(ctx->i32, 7, 0), "");
	samp0 = LLVMBuildExtractElement(builder, samp,
					LLVMConstInt(ctx->i32, 0, 0), "");
	samp0 = LLVMBuildAnd(builder, samp0, img7, "");
	return LLVMBuildInsertElement(builder, samp, samp0,
				      LLVMConstInt(ctx->i32, 0, 0), "");
}
4224
/**
 * Fetch the resource, sampler-state, and FMASK descriptors for a texture
 * instruction.  The sampler operand is always the last source register.
 *
 * For indirect sampler indices the descriptors are loaded from the
 * sampler list at runtime; for direct indices the pre-loaded values in
 * ctx->sampler_views/sampler_states/fmasks are used.  MSAA targets get
 * an FMASK but no sampler state; all other targets the reverse.
 * Any of samp_ptr/fmask_ptr may be NULL if the caller doesn't need them.
 */
static void tex_fetch_ptrs(
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data,
	LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr, LLVMValueRef *fmask_ptr)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	const struct tgsi_full_instruction *inst = emit_data->inst;
	unsigned target = inst->Texture.Texture;
	unsigned sampler_src;
	unsigned sampler_index;

	sampler_src = emit_data->inst->Instruction.NumSrcRegs - 1;
	sampler_index = emit_data->inst->Src[sampler_src].Register.Index;

	if (emit_data->inst->Src[sampler_src].Register.Indirect) {
		const struct tgsi_full_src_register *reg = &emit_data->inst->Src[sampler_src];
		LLVMValueRef ind_index;

		/* Clamp the runtime index so out-of-bounds accesses can't
		 * read past the descriptor array. */
		ind_index = get_bounded_indirect_index(ctx,
						       &reg->Indirect,
						       reg->Register.Index,
						       SI_NUM_SAMPLERS);

		*res_ptr = get_sampler_desc(ctx, ind_index, DESC_IMAGE);

		if (target == TGSI_TEXTURE_2D_MSAA ||
		    target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
			if (samp_ptr)
				*samp_ptr = NULL;
			if (fmask_ptr)
				*fmask_ptr = get_sampler_desc(ctx, ind_index, DESC_FMASK);
		} else {
			if (samp_ptr) {
				*samp_ptr = get_sampler_desc(ctx, ind_index, DESC_SAMPLER);
				/* Apply the SI-CI anisotropic filtering fix. */
				*samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr);
			}
			if (fmask_ptr)
				*fmask_ptr = NULL;
		}
	} else {
		*res_ptr = ctx->sampler_views[sampler_index];
		if (samp_ptr)
			*samp_ptr = ctx->sampler_states[sampler_index];
		if (fmask_ptr)
			*fmask_ptr = ctx->fmasks[sampler_index];
	}
}
4272
/**
 * Gather operands for TGSI TXQ (texture size query).
 *
 * Buffer textures bypass the intrinsic entirely: the size comes straight
 * from the buffer descriptor.  Other targets pass the mip level (Src[0].x)
 * to llvm.SI.getresinfo.i32 via set_tex_fetch_args.
 */
static void txq_fetch_args(
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	unsigned target = inst->Texture.Texture;
	LLVMValueRef res_ptr;
	LLVMValueRef address;

	tex_fetch_ptrs(bld_base, emit_data, &res_ptr, NULL, NULL);

	if (target == TGSI_TEXTURE_BUFFER) {
		/* Read the size from the buffer descriptor directly. */
		LLVMValueRef res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, "");
		emit_data->args[0] = get_buffer_size(bld_base, res);
		return;
	}

	/* Textures - set the mip level. */
	address = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_X);

	set_tex_fetch_args(ctx, emit_data, TGSI_OPCODE_TXQ, target, res_ptr,
			   NULL, &address, 1, 0xf);
}
4300
/**
 * Emit TGSI TXQ using the operands prepared by txq_fetch_args.
 *
 * Buffer textures return the pre-computed size from args[0]; other
 * targets call llvm.SI.getresinfo.i32 and then convert the layer count
 * of cube arrays into a cube count.
 */
static void txq_emit(const struct lp_build_tgsi_action *action,
		     struct lp_build_tgsi_context *bld_base,
		     struct lp_build_emit_data *emit_data)
{
	struct lp_build_context *base = &bld_base->base;
	unsigned target = emit_data->inst->Texture.Texture;

	if (target == TGSI_TEXTURE_BUFFER) {
		/* Just return the buffer size. */
		emit_data->output[emit_data->chan] = emit_data->args[0];
		return;
	}

	emit_data->output[emit_data->chan] = lp_build_intrinsic(
		base->gallivm->builder, "llvm.SI.getresinfo.i32",
		emit_data->dst_type, emit_data->args, emit_data->arg_count,
		LLVMReadNoneAttribute | LLVMNoUnwindAttribute);

	/* Divide the number of layers by 6 to get the number of cubes. */
	if (target == TGSI_TEXTURE_CUBE_ARRAY ||
	    target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
		LLVMBuilderRef builder = bld_base->base.gallivm->builder;
		LLVMValueRef two = lp_build_const_int32(bld_base->base.gallivm, 2);
		LLVMValueRef six = lp_build_const_int32(bld_base->base.gallivm, 6);

		/* The layer count is in the Z channel (element 2). */
		LLVMValueRef v4 = emit_data->output[emit_data->chan];
		LLVMValueRef z = LLVMBuildExtractElement(builder, v4, two, "");
		z = LLVMBuildSDiv(builder, z, six, "");

		emit_data->output[emit_data->chan] =
			LLVMBuildInsertElement(builder, v4, z, two, "");
	}
}
4334
/**
 * Gather and pack all operands for a TGSI texture instruction.
 *
 * Builds the hardware address vector in the order the image instructions
 * expect: packed offsets, LOD bias, depth-compare reference, user
 * derivatives, coordinates, then LOD/sample index.  Also performs
 * projection (TXP), cube coordinate preparation, FMASK-based MSAA
 * sample-index remapping, TXF texel offsets, and the TG4 component
 * selection via DMASK, before handing everything to set_tex_fetch_args.
 */
static void tex_fetch_args(
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	unsigned opcode = inst->Instruction.Opcode;
	unsigned target = inst->Texture.Texture;
	LLVMValueRef coords[5], derivs[6];
	LLVMValueRef address[16];
	unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
	int ref_pos = tgsi_util_get_shadow_ref_src_index(target);
	unsigned count = 0;
	unsigned chan;
	unsigned num_deriv_channels = 0;
	bool has_offset = inst->Texture.NumOffsets > 0;
	LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
	unsigned dmask = 0xf;

	tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);

	if (target == TGSI_TEXTURE_BUFFER) {
		LLVMTypeRef v2i128 = LLVMVectorType(ctx->i128, 2);

		/* Bitcast and truncate v8i32 to v16i8. */
		LLVMValueRef res = res_ptr;
		res = LLVMBuildBitCast(gallivm->builder, res, v2i128, "");
		res = LLVMBuildExtractElement(gallivm->builder, res, bld_base->uint_bld.one, "");
		res = LLVMBuildBitCast(gallivm->builder, res, ctx->v16i8, "");

		/* Buffer fetches go through the vertex-fetch path. */
		emit_data->dst_type = ctx->v4f32;
		emit_data->args[0] = res;
		emit_data->args[1] = bld_base->uint_bld.zero;
		emit_data->args[2] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
		emit_data->arg_count = 3;
		return;
	}

	/* Fetch and project texture coordinates */
	coords[3] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_W);
	for (chan = 0; chan < 3; chan++ ) {
		coords[chan] = lp_build_emit_fetch(bld_base,
						   emit_data->inst, 0,
						   chan);
		if (opcode == TGSI_OPCODE_TXP)
			coords[chan] = lp_build_emit_llvm_binary(bld_base,
								 TGSI_OPCODE_DIV,
								 coords[chan],
								 coords[3]);
	}

	if (opcode == TGSI_OPCODE_TXP)
		coords[3] = bld_base->base.one;

	/* Pack offsets. */
	if (has_offset && opcode != TGSI_OPCODE_TXF) {
		/* The offsets are six-bit signed integers packed like this:
		 * X=[5:0], Y=[13:8], and Z=[21:16].
		 */
		LLVMValueRef offset[3], pack;

		assert(inst->Texture.NumOffsets == 1);

		for (chan = 0; chan < 3; chan++) {
			offset[chan] = lp_build_emit_fetch_texoffset(bld_base,
								     emit_data->inst, 0, chan);
			offset[chan] = LLVMBuildAnd(gallivm->builder, offset[chan],
						    lp_build_const_int32(gallivm, 0x3f), "");
			if (chan)
				offset[chan] = LLVMBuildShl(gallivm->builder, offset[chan],
							    lp_build_const_int32(gallivm, chan*8), "");
		}

		pack = LLVMBuildOr(gallivm->builder, offset[0], offset[1], "");
		pack = LLVMBuildOr(gallivm->builder, pack, offset[2], "");
		address[count++] = pack;
	}

	/* Pack LOD bias value */
	if (opcode == TGSI_OPCODE_TXB)
		address[count++] = coords[3];
	if (opcode == TGSI_OPCODE_TXB2)
		address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);

	/* Pack depth comparison value */
	if (tgsi_is_shadow_target(target) && opcode != TGSI_OPCODE_LODQ) {
		if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
			/* Cube-array shadow: the reference is in Src[1].x. */
			address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
		} else {
			assert(ref_pos >= 0);
			address[count++] = coords[ref_pos];
		}
	}

	/* Pack user derivatives */
	if (opcode == TGSI_OPCODE_TXD) {
		int param, num_src_deriv_channels;

		switch (target) {
		case TGSI_TEXTURE_3D:
			num_src_deriv_channels = 3;
			num_deriv_channels = 3;
			break;
		case TGSI_TEXTURE_2D:
		case TGSI_TEXTURE_SHADOW2D:
		case TGSI_TEXTURE_RECT:
		case TGSI_TEXTURE_SHADOWRECT:
		case TGSI_TEXTURE_2D_ARRAY:
		case TGSI_TEXTURE_SHADOW2D_ARRAY:
			num_src_deriv_channels = 2;
			num_deriv_channels = 2;
			break;
		case TGSI_TEXTURE_CUBE:
		case TGSI_TEXTURE_SHADOWCUBE:
		case TGSI_TEXTURE_CUBE_ARRAY:
		case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
			/* Cube derivatives will be converted to 2D. */
			num_src_deriv_channels = 3;
			num_deriv_channels = 2;
			break;
		case TGSI_TEXTURE_1D:
		case TGSI_TEXTURE_SHADOW1D:
		case TGSI_TEXTURE_1D_ARRAY:
		case TGSI_TEXTURE_SHADOW1D_ARRAY:
			num_src_deriv_channels = 1;
			num_deriv_channels = 1;
			break;
		default:
			unreachable("invalid target");
		}

		/* Src[1] holds ddx, Src[2] holds ddy. */
		for (param = 0; param < 2; param++)
			for (chan = 0; chan < num_src_deriv_channels; chan++)
				derivs[param * num_src_deriv_channels + chan] =
					lp_build_emit_fetch(bld_base, inst, param+1, chan);
	}

	if (target == TGSI_TEXTURE_CUBE ||
	    target == TGSI_TEXTURE_CUBE_ARRAY ||
	    target == TGSI_TEXTURE_SHADOWCUBE ||
	    target == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
		radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords, derivs);

	if (opcode == TGSI_OPCODE_TXD)
		for (int i = 0; i < num_deriv_channels * 2; i++)
			address[count++] = derivs[i];

	/* Pack texture coordinates */
	address[count++] = coords[0];
	if (num_coords > 1)
		address[count++] = coords[1];
	if (num_coords > 2)
		address[count++] = coords[2];

	/* Pack LOD or sample index */
	if (opcode == TGSI_OPCODE_TXL || opcode == TGSI_OPCODE_TXF)
		address[count++] = coords[3];
	else if (opcode == TGSI_OPCODE_TXL2)
		address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);

	if (count > 16) {
		assert(!"Cannot handle more than 16 texture address parameters");
		count = 16;
	}

	/* The intrinsic takes an i32 vector; reinterpret all channels. */
	for (chan = 0; chan < count; chan++ ) {
		address[chan] = LLVMBuildBitCast(gallivm->builder,
						 address[chan], ctx->i32, "");
	}

	/* Adjust the sample index according to FMASK.
	 *
	 * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
	 * which is the identity mapping. Each nibble says which physical sample
	 * should be fetched to get that sample.
	 *
	 * For example, 0x11111100 means there are only 2 samples stored and
	 * the second sample covers 3/4 of the pixel. When reading samples 0
	 * and 1, return physical sample 0 (determined by the first two 0s
	 * in FMASK), otherwise return physical sample 1.
	 *
	 * The sample index should be adjusted as follows:
	 *   sample_index = (fmask >> (sample_index * 4)) & 0xF;
	 */
	if (target == TGSI_TEXTURE_2D_MSAA ||
	    target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
		struct lp_build_context *uint_bld = &bld_base->uint_bld;
		struct lp_build_emit_data txf_emit_data = *emit_data;
		LLVMValueRef txf_address[4];
		unsigned txf_count = count;
		/* NOTE: this local shadows the outer 'inst' pointer; it is a
		 * synthesized TXF instruction used only to read the FMASK. */
		struct tgsi_full_instruction inst = {};

		memcpy(txf_address, address, sizeof(txf_address));

		if (target == TGSI_TEXTURE_2D_MSAA) {
			txf_address[2] = bld_base->uint_bld.zero;
		}
		txf_address[3] = bld_base->uint_bld.zero;

		/* Read FMASK using TXF. */
		inst.Instruction.Opcode = TGSI_OPCODE_TXF;
		inst.Texture.Texture = target;
		txf_emit_data.inst = &inst;
		txf_emit_data.chan = 0;
		set_tex_fetch_args(ctx, &txf_emit_data, TGSI_OPCODE_TXF,
				   target, fmask_ptr, NULL,
				   txf_address, txf_count, 0xf);
		build_tex_intrinsic(&tex_action, bld_base, &txf_emit_data);

		/* Initialize some constants. */
		LLVMValueRef four = LLVMConstInt(ctx->i32, 4, 0);
		LLVMValueRef F = LLVMConstInt(ctx->i32, 0xF, 0);

		/* Apply the formula. */
		LLVMValueRef fmask =
			LLVMBuildExtractElement(gallivm->builder,
						txf_emit_data.output[0],
						uint_bld->zero, "");

		unsigned sample_chan = target == TGSI_TEXTURE_2D_MSAA ? 2 : 3;

		LLVMValueRef sample_index4 =
			LLVMBuildMul(gallivm->builder, address[sample_chan], four, "");

		LLVMValueRef shifted_fmask =
			LLVMBuildLShr(gallivm->builder, fmask, sample_index4, "");

		LLVMValueRef final_sample =
			LLVMBuildAnd(gallivm->builder, shifted_fmask, F, "");

		/* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
		 * resource descriptor is 0 (invalid),
		 */
		LLVMValueRef fmask_desc =
			LLVMBuildBitCast(gallivm->builder, fmask_ptr,
					 ctx->v8i32, "");

		LLVMValueRef fmask_word1 =
			LLVMBuildExtractElement(gallivm->builder, fmask_desc,
						uint_bld->one, "");

		LLVMValueRef word1_is_nonzero =
			LLVMBuildICmp(gallivm->builder, LLVMIntNE,
				      fmask_word1, uint_bld->zero, "");

		/* Replace the MSAA sample index. */
		address[sample_chan] =
			LLVMBuildSelect(gallivm->builder, word1_is_nonzero,
					final_sample, address[sample_chan], "");
	}

	if (opcode == TGSI_OPCODE_TXF) {
		/* add tex offsets */
		if (inst->Texture.NumOffsets) {
			struct lp_build_context *uint_bld = &bld_base->uint_bld;
			struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
			const struct tgsi_texture_offset *off = inst->TexOffsets;

			assert(inst->Texture.NumOffsets == 1);

			/* Add each offset immediate to the matching address
			 * channel; lower-dimension targets fall through. */
			switch (target) {
			case TGSI_TEXTURE_3D:
				address[2] = lp_build_add(uint_bld, address[2],
						bld->immediates[off->Index][off->SwizzleZ]);
				/* fall through */
			case TGSI_TEXTURE_2D:
			case TGSI_TEXTURE_SHADOW2D:
			case TGSI_TEXTURE_RECT:
			case TGSI_TEXTURE_SHADOWRECT:
			case TGSI_TEXTURE_2D_ARRAY:
			case TGSI_TEXTURE_SHADOW2D_ARRAY:
				address[1] =
					lp_build_add(uint_bld, address[1],
						bld->immediates[off->Index][off->SwizzleY]);
				/* fall through */
			case TGSI_TEXTURE_1D:
			case TGSI_TEXTURE_SHADOW1D:
			case TGSI_TEXTURE_1D_ARRAY:
			case TGSI_TEXTURE_SHADOW1D_ARRAY:
				address[0] =
					lp_build_add(uint_bld, address[0],
						bld->immediates[off->Index][off->SwizzleX]);
				break;
				/* texture offsets do not apply to other texture targets */
			}
		}
	}

	if (opcode == TGSI_OPCODE_TG4) {
		unsigned gather_comp = 0;

		/* DMASK was repurposed for GATHER4. 4 components are always
		 * returned and DMASK works like a swizzle - it selects
		 * the component to fetch. The only valid DMASK values are
		 * 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
		 * (red,red,red,red) etc.) The ISA document doesn't mention
		 * this.
		 */

		/* Get the component index from src1.x for Gather4. */
		if (!tgsi_is_shadow_target(target)) {
			LLVMValueRef (*imms)[4] = lp_soa_context(bld_base)->immediates;
			LLVMValueRef comp_imm;
			struct tgsi_src_register src1 = inst->Src[1].Register;

			assert(src1.File == TGSI_FILE_IMMEDIATE);

			comp_imm = imms[src1.Index][src1.SwizzleX];
			gather_comp = LLVMConstIntGetZExtValue(comp_imm);
			gather_comp = CLAMP(gather_comp, 0, 3);
		}

		dmask = 1 << gather_comp;
	}

	set_tex_fetch_args(ctx, emit_data, opcode, target, res_ptr,
			   samp_ptr, address, count, dmask);
}
4654
4655 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
4656 struct lp_build_tgsi_context *bld_base,
4657 struct lp_build_emit_data *emit_data)
4658 {
4659 struct si_shader_context *ctx = si_shader_context(bld_base);
4660 struct lp_build_context *base = &bld_base->base;
4661 unsigned opcode = emit_data->inst->Instruction.Opcode;
4662 unsigned target = emit_data->inst->Texture.Texture;
4663 char intr_name[127];
4664 bool has_offset = emit_data->inst->Texture.NumOffsets > 0;
4665 bool is_shadow = tgsi_is_shadow_target(target);
4666 char type[64];
4667 const char *name = "llvm.SI.image.sample";
4668 const char *infix = "";
4669
4670 if (target == TGSI_TEXTURE_BUFFER) {
4671 emit_data->output[emit_data->chan] = lp_build_intrinsic(
4672 base->gallivm->builder,
4673 "llvm.SI.vs.load.input", emit_data->dst_type,
4674 emit_data->args, emit_data->arg_count,
4675 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
4676 return;
4677 }
4678
4679 switch (opcode) {
4680 case TGSI_OPCODE_TXF:
4681 name = target == TGSI_TEXTURE_2D_MSAA ||
4682 target == TGSI_TEXTURE_2D_ARRAY_MSAA ?
4683 "llvm.SI.image.load" :
4684 "llvm.SI.image.load.mip";
4685 is_shadow = false;
4686 has_offset = false;
4687 break;
4688 case TGSI_OPCODE_LODQ:
4689 name = "llvm.SI.getlod";
4690 is_shadow = false;
4691 has_offset = false;
4692 break;
4693 case TGSI_OPCODE_TEX:
4694 case TGSI_OPCODE_TEX2:
4695 case TGSI_OPCODE_TXP:
4696 if (ctx->type != PIPE_SHADER_FRAGMENT)
4697 infix = ".lz";
4698 break;
4699 case TGSI_OPCODE_TXB:
4700 case TGSI_OPCODE_TXB2:
4701 assert(ctx->type == PIPE_SHADER_FRAGMENT);
4702 infix = ".b";
4703 break;
4704 case TGSI_OPCODE_TXL:
4705 case TGSI_OPCODE_TXL2:
4706 infix = ".l";
4707 break;
4708 case TGSI_OPCODE_TXD:
4709 infix = ".d";
4710 break;
4711 case TGSI_OPCODE_TG4:
4712 name = "llvm.SI.gather4";
4713 infix = ".lz";
4714 break;
4715 default:
4716 assert(0);
4717 return;
4718 }
4719
4720 /* Add the type and suffixes .c, .o if needed. */
4721 build_int_type_name(LLVMTypeOf(emit_data->args[0]), type, sizeof(type));
4722 sprintf(intr_name, "%s%s%s%s.%s",
4723 name, is_shadow ? ".c" : "", infix,
4724 has_offset ? ".o" : "", type);
4725
4726 emit_data->output[emit_data->chan] = lp_build_intrinsic(
4727 base->gallivm->builder, intr_name, emit_data->dst_type,
4728 emit_data->args, emit_data->arg_count,
4729 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
4730 }
4731
4732 static void si_llvm_emit_txqs(
4733 const struct lp_build_tgsi_action *action,
4734 struct lp_build_tgsi_context *bld_base,
4735 struct lp_build_emit_data *emit_data)
4736 {
4737 struct si_shader_context *ctx = si_shader_context(bld_base);
4738 struct gallivm_state *gallivm = bld_base->base.gallivm;
4739 LLVMBuilderRef builder = gallivm->builder;
4740 LLVMValueRef res, samples;
4741 LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
4742
4743 tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
4744
4745
4746 /* Read the samples from the descriptor directly. */
4747 res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, "");
4748 samples = LLVMBuildExtractElement(
4749 builder, res,
4750 lp_build_const_int32(gallivm, 3), "");
4751 samples = LLVMBuildLShr(builder, samples,
4752 lp_build_const_int32(gallivm, 16), "");
4753 samples = LLVMBuildAnd(builder, samples,
4754 lp_build_const_int32(gallivm, 0xf), "");
4755 samples = LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1),
4756 samples, "");
4757
4758 emit_data->output[emit_data->chan] = samples;
4759 }
4760
4761 /*
4762 * SI implements derivatives using the local data store (LDS)
4763 * All writes to the LDS happen in all executing threads at
4764 * the same time. TID is the Thread ID for the current
4765 * thread and is a value between 0 and 63, representing
4766 * the thread's position in the wavefront.
4767 *
4768 * For the pixel shader threads are grouped into quads of four pixels.
4769 * The TIDs of the pixels of a quad are:
4770 *
4771 * +------+------+
4772 * |4n + 0|4n + 1|
4773 * +------+------+
4774 * |4n + 2|4n + 3|
4775 * +------+------+
4776 *
4777 * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
4778 * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
4779 * the current pixel's column, and masking with 0xfffffffe yields the TID
4780 * of the left pixel of the current pixel's row.
4781 *
4782 * Adding 1 yields the TID of the pixel to the right of the left pixel, and
4783 * adding 2 yields the TID of the pixel below the top pixel.
4784 */
4785 /* masks for thread ID. */
4786 #define TID_MASK_TOP_LEFT 0xfffffffc
4787 #define TID_MASK_TOP 0xfffffffd
4788 #define TID_MASK_LEFT 0xfffffffe
4789
/* Emit DDX/DDY and the _FINE variants.
 *
 * Each thread shares its value with the other threads of its pixel quad
 * (via ds_bpermute on VI+ with a new enough LLVM, otherwise via an LDS
 * store/load round trip), then computes the derivative as the difference
 * between the neighbor's value and the quad-corner value selected by the
 * TID masks above.
 */
static void si_llvm_emit_ddxy(
	const struct lp_build_tgsi_action *action,
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	unsigned opcode = inst->Instruction.Opcode;
	LLVMValueRef indices[2];
	LLVMValueRef store_ptr, load_ptr0, load_ptr1;
	LLVMValueRef tl, trbl, result[4];
	LLVMValueRef tl_tid, trbl_tid;
	unsigned swizzle[4];
	unsigned c;
	int idx;
	unsigned mask;

	/* LDS slot of the current thread: lds[0][tid]. */
	indices[0] = bld_base->uint_bld.zero;
	indices[1] = get_thread_id(ctx);
	store_ptr = LLVMBuildGEP(gallivm->builder, ctx->lds,
				 indices, 2, "");

	/* Pick the reference pixel of the quad: the coarse opcodes always
	 * use the top-left pixel; the _FINE variants use the left pixel of
	 * the row (DDX) or the top pixel of the column (DDY). */
	if (opcode == TGSI_OPCODE_DDX_FINE)
		mask = TID_MASK_LEFT;
	else if (opcode == TGSI_OPCODE_DDY_FINE)
		mask = TID_MASK_TOP;
	else
		mask = TID_MASK_TOP_LEFT;

	tl_tid = LLVMBuildAnd(gallivm->builder, indices[1],
			      lp_build_const_int32(gallivm, mask), "");
	indices[1] = tl_tid;
	load_ptr0 = LLVMBuildGEP(gallivm->builder, ctx->lds,
				 indices, 2, "");

	/* For DDX we want the next X pixel (tid+1), for DDY the next
	 * Y pixel (tid+2); see the quad TID layout above. */
	idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;
	trbl_tid = LLVMBuildAdd(gallivm->builder, indices[1],
				lp_build_const_int32(gallivm, idx), "");
	indices[1] = trbl_tid;
	load_ptr1 = LLVMBuildGEP(gallivm->builder, ctx->lds,
				 indices, 2, "");

	for (c = 0; c < 4; ++c) {
		unsigned i;
		LLVMValueRef val;
		LLVMValueRef args[2];

		/* Channels with a duplicate source swizzle reuse the
		 * already-computed derivative. */
		swizzle[c] = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], c);
		for (i = 0; i < c; ++i) {
			if (swizzle[i] == swizzle[c]) {
				result[c] = result[i];
				break;
			}
		}
		if (i != c)
			continue;

		val = LLVMBuildBitCast(gallivm->builder,
				       lp_build_emit_fetch(bld_base, inst, 0, c),
				       ctx->i32, "");

		if ((HAVE_LLVM >= 0x0309) && ctx->screen->b.family >= CHIP_TONGA) {

			/* ds_bpermute addresses lanes in bytes, hence tid*4. */
			args[0] = LLVMBuildMul(gallivm->builder, tl_tid,
					       lp_build_const_int32(gallivm, 4), "");
			args[1] = val;
			tl = lp_build_intrinsic(gallivm->builder,
						"llvm.amdgcn.ds.bpermute", ctx->i32,
						args, 2, LLVMReadNoneAttribute);

			args[0] = LLVMBuildMul(gallivm->builder, trbl_tid,
					       lp_build_const_int32(gallivm, 4), "");
			trbl = lp_build_intrinsic(gallivm->builder,
						  "llvm.amdgcn.ds.bpermute", ctx->i32,
						  args, 2, LLVMReadNoneAttribute);
		} else {
			/* Fallback: exchange the value through LDS. */
			LLVMBuildStore(gallivm->builder, val, store_ptr);
			tl = LLVMBuildLoad(gallivm->builder, load_ptr0, "");
			trbl = LLVMBuildLoad(gallivm->builder, load_ptr1, "");
		}
		tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");
		trbl = LLVMBuildBitCast(gallivm->builder, trbl, ctx->f32, "");
		result[c] = LLVMBuildFSub(gallivm->builder, trbl, tl, "");
	}

	emit_data->output[0] = lp_build_gather_values(gallivm, result, 4);
}
4879
/*
 * Take an (I, J) barycentric coordinate pair and compute its screen-space
 * derivatives via the same LDS quad-exchange scheme as si_llvm_emit_ddxy.
 *
 * Returns a 4-element vector: DDX(I), DDX(J), DDY(I), DDY(J).
 */
static LLVMValueRef si_llvm_emit_ddxy_interp(
	struct lp_build_tgsi_context *bld_base,
	LLVMValueRef interp_ij)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef indices[2];
	LLVMValueRef store_ptr, load_ptr_x, load_ptr_y, load_ptr_ddx, load_ptr_ddy, temp, temp2;
	LLVMValueRef tl, tr, bl, result[4];
	unsigned c;

	/* LDS slot of the current thread: lds[0][tid]. */
	indices[0] = bld_base->uint_bld.zero;
	indices[1] = get_thread_id(ctx);
	store_ptr = LLVMBuildGEP(gallivm->builder, ctx->lds,
				 indices, 2, "");

	/* temp  = TID of the left pixel of the row,
	 * temp2 = TID of the top pixel of the column. */
	temp = LLVMBuildAnd(gallivm->builder, indices[1],
			    lp_build_const_int32(gallivm, TID_MASK_LEFT), "");

	temp2 = LLVMBuildAnd(gallivm->builder, indices[1],
			     lp_build_const_int32(gallivm, TID_MASK_TOP), "");

	indices[1] = temp;
	load_ptr_x = LLVMBuildGEP(gallivm->builder, ctx->lds,
				  indices, 2, "");

	indices[1] = temp2;
	load_ptr_y = LLVMBuildGEP(gallivm->builder, ctx->lds,
				  indices, 2, "");

	/* +1 = pixel to the right of the left pixel (for DDX). */
	indices[1] = LLVMBuildAdd(gallivm->builder, temp,
				  lp_build_const_int32(gallivm, 1), "");
	load_ptr_ddx = LLVMBuildGEP(gallivm->builder, ctx->lds,
				   indices, 2, "");

	/* +2 = pixel below the top pixel (for DDY). */
	indices[1] = LLVMBuildAdd(gallivm->builder, temp2,
				  lp_build_const_int32(gallivm, 2), "");
	load_ptr_ddy = LLVMBuildGEP(gallivm->builder, ctx->lds,
				   indices, 2, "");

	/* One pass per coordinate: c == 0 is I, c == 1 is J. */
	for (c = 0; c < 2; ++c) {
		LLVMValueRef store_val;
		LLVMValueRef c_ll = lp_build_const_int32(gallivm, c);

		store_val = LLVMBuildExtractElement(gallivm->builder,
						    interp_ij, c_ll, "");
		LLVMBuildStore(gallivm->builder,
			       store_val,
			       store_ptr);

		/* DDX(coord) = right neighbor - left pixel. */
		tl = LLVMBuildLoad(gallivm->builder, load_ptr_x, "");
		tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");

		tr = LLVMBuildLoad(gallivm->builder, load_ptr_ddx, "");
		tr = LLVMBuildBitCast(gallivm->builder, tr, ctx->f32, "");

		result[c] = LLVMBuildFSub(gallivm->builder, tr, tl, "");

		/* DDY(coord) = bottom neighbor - top pixel. */
		tl = LLVMBuildLoad(gallivm->builder, load_ptr_y, "");
		tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");

		bl = LLVMBuildLoad(gallivm->builder, load_ptr_ddy, "");
		bl = LLVMBuildBitCast(gallivm->builder, bl, ctx->f32, "");

		result[c + 2] = LLVMBuildFSub(gallivm->builder, bl, tl, "");
	}

	return lp_build_gather_values(gallivm, result, 4);
}
4954
4955 static void interp_fetch_args(
4956 struct lp_build_tgsi_context *bld_base,
4957 struct lp_build_emit_data *emit_data)
4958 {
4959 struct si_shader_context *ctx = si_shader_context(bld_base);
4960 struct gallivm_state *gallivm = bld_base->base.gallivm;
4961 const struct tgsi_full_instruction *inst = emit_data->inst;
4962
4963 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
4964 /* offset is in second src, first two channels */
4965 emit_data->args[0] = lp_build_emit_fetch(bld_base,
4966 emit_data->inst, 1,
4967 TGSI_CHAN_X);
4968 emit_data->args[1] = lp_build_emit_fetch(bld_base,
4969 emit_data->inst, 1,
4970 TGSI_CHAN_Y);
4971 emit_data->arg_count = 2;
4972 } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
4973 LLVMValueRef sample_position;
4974 LLVMValueRef sample_id;
4975 LLVMValueRef halfval = lp_build_const_float(gallivm, 0.5f);
4976
4977 /* fetch sample ID, then fetch its sample position,
4978 * and place into first two channels.
4979 */
4980 sample_id = lp_build_emit_fetch(bld_base,
4981 emit_data->inst, 1, TGSI_CHAN_X);
4982 sample_id = LLVMBuildBitCast(gallivm->builder, sample_id,
4983 ctx->i32, "");
4984 sample_position = load_sample_position(&ctx->radeon_bld, sample_id);
4985
4986 emit_data->args[0] = LLVMBuildExtractElement(gallivm->builder,
4987 sample_position,
4988 lp_build_const_int32(gallivm, 0), "");
4989
4990 emit_data->args[0] = LLVMBuildFSub(gallivm->builder, emit_data->args[0], halfval, "");
4991 emit_data->args[1] = LLVMBuildExtractElement(gallivm->builder,
4992 sample_position,
4993 lp_build_const_int32(gallivm, 1), "");
4994 emit_data->args[1] = LLVMBuildFSub(gallivm->builder, emit_data->args[1], halfval, "");
4995 emit_data->arg_count = 2;
4996 }
4997 }
4998
4999 static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
5000 struct lp_build_tgsi_context *bld_base,
5001 struct lp_build_emit_data *emit_data)
5002 {
5003 struct si_shader_context *ctx = si_shader_context(bld_base);
5004 struct si_shader *shader = ctx->shader;
5005 struct gallivm_state *gallivm = bld_base->base.gallivm;
5006 LLVMValueRef interp_param;
5007 const struct tgsi_full_instruction *inst = emit_data->inst;
5008 const char *intr_name;
5009 int input_index = inst->Src[0].Register.Index;
5010 int chan;
5011 int i;
5012 LLVMValueRef attr_number;
5013 LLVMValueRef params = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_PRIM_MASK);
5014 int interp_param_idx;
5015 unsigned interp = shader->selector->info.input_interpolate[input_index];
5016 unsigned location;
5017
5018 assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);
5019
5020 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
5021 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE)
5022 location = TGSI_INTERPOLATE_LOC_CENTER;
5023 else
5024 location = TGSI_INTERPOLATE_LOC_CENTROID;
5025
5026 interp_param_idx = lookup_interp_param_index(interp, location);
5027 if (interp_param_idx == -1)
5028 return;
5029 else if (interp_param_idx)
5030 interp_param = LLVMGetParam(ctx->radeon_bld.main_fn, interp_param_idx);
5031 else
5032 interp_param = NULL;
5033
5034 attr_number = lp_build_const_int32(gallivm, input_index);
5035
5036 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
5037 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
5038 LLVMValueRef ij_out[2];
5039 LLVMValueRef ddxy_out = si_llvm_emit_ddxy_interp(bld_base, interp_param);
5040
5041 /*
5042 * take the I then J parameters, and the DDX/Y for it, and
5043 * calculate the IJ inputs for the interpolator.
5044 * temp1 = ddx * offset/sample.x + I;
5045 * interp_param.I = ddy * offset/sample.y + temp1;
5046 * temp1 = ddx * offset/sample.x + J;
5047 * interp_param.J = ddy * offset/sample.y + temp1;
5048 */
5049 for (i = 0; i < 2; i++) {
5050 LLVMValueRef ix_ll = lp_build_const_int32(gallivm, i);
5051 LLVMValueRef iy_ll = lp_build_const_int32(gallivm, i + 2);
5052 LLVMValueRef ddx_el = LLVMBuildExtractElement(gallivm->builder,
5053 ddxy_out, ix_ll, "");
5054 LLVMValueRef ddy_el = LLVMBuildExtractElement(gallivm->builder,
5055 ddxy_out, iy_ll, "");
5056 LLVMValueRef interp_el = LLVMBuildExtractElement(gallivm->builder,
5057 interp_param, ix_ll, "");
5058 LLVMValueRef temp1, temp2;
5059
5060 interp_el = LLVMBuildBitCast(gallivm->builder, interp_el,
5061 ctx->f32, "");
5062
5063 temp1 = LLVMBuildFMul(gallivm->builder, ddx_el, emit_data->args[0], "");
5064
5065 temp1 = LLVMBuildFAdd(gallivm->builder, temp1, interp_el, "");
5066
5067 temp2 = LLVMBuildFMul(gallivm->builder, ddy_el, emit_data->args[1], "");
5068
5069 temp2 = LLVMBuildFAdd(gallivm->builder, temp2, temp1, "");
5070
5071 ij_out[i] = LLVMBuildBitCast(gallivm->builder,
5072 temp2, ctx->i32, "");
5073 }
5074 interp_param = lp_build_gather_values(bld_base->base.gallivm, ij_out, 2);
5075 }
5076
5077 intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant";
5078 for (chan = 0; chan < 2; chan++) {
5079 LLVMValueRef args[4];
5080 LLVMValueRef llvm_chan;
5081 unsigned schan;
5082
5083 schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan);
5084 llvm_chan = lp_build_const_int32(gallivm, schan);
5085
5086 args[0] = llvm_chan;
5087 args[1] = attr_number;
5088 args[2] = params;
5089 args[3] = interp_param;
5090
5091 emit_data->output[chan] =
5092 lp_build_intrinsic(gallivm->builder, intr_name,
5093 ctx->f32, args, args[3] ? 4 : 3,
5094 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
5095 }
5096 }
5097
5098 static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base,
5099 struct lp_build_emit_data *emit_data)
5100 {
5101 LLVMValueRef (*imms)[4] = lp_soa_context(bld_base)->immediates;
5102 struct tgsi_src_register src0 = emit_data->inst->Src[0].Register;
5103 unsigned stream;
5104
5105 assert(src0.File == TGSI_FILE_IMMEDIATE);
5106
5107 stream = LLVMConstIntGetZExtValue(imms[src0.Index][src0.SwizzleX]) & 0x3;
5108 return stream;
5109 }
5110
5111 /* Emit one vertex from the geometry shader */
5112 static void si_llvm_emit_vertex(
5113 const struct lp_build_tgsi_action *action,
5114 struct lp_build_tgsi_context *bld_base,
5115 struct lp_build_emit_data *emit_data)
5116 {
5117 struct si_shader_context *ctx = si_shader_context(bld_base);
5118 struct lp_build_context *uint = &bld_base->uint_bld;
5119 struct si_shader *shader = ctx->shader;
5120 struct tgsi_shader_info *info = &shader->selector->info;
5121 struct gallivm_state *gallivm = bld_base->base.gallivm;
5122 LLVMValueRef soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
5123 SI_PARAM_GS2VS_OFFSET);
5124 LLVMValueRef gs_next_vertex;
5125 LLVMValueRef can_emit, kill;
5126 LLVMValueRef args[2];
5127 unsigned chan;
5128 int i;
5129 unsigned stream;
5130
5131 stream = si_llvm_get_stream(bld_base, emit_data);
5132
5133 /* Write vertex attribute values to GSVS ring */
5134 gs_next_vertex = LLVMBuildLoad(gallivm->builder,
5135 ctx->gs_next_vertex[stream],
5136 "");
5137
5138 /* If this thread has already emitted the declared maximum number of
5139 * vertices, kill it: excessive vertex emissions are not supposed to
5140 * have any effect, and GS threads have no externally observable
5141 * effects other than emitting vertices.
5142 */
5143 can_emit = LLVMBuildICmp(gallivm->builder, LLVMIntULE, gs_next_vertex,
5144 lp_build_const_int32(gallivm,
5145 shader->selector->gs_max_out_vertices), "");
5146 kill = lp_build_select(&bld_base->base, can_emit,
5147 lp_build_const_float(gallivm, 1.0f),
5148 lp_build_const_float(gallivm, -1.0f));
5149
5150 lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
5151 ctx->voidt, &kill, 1, 0);
5152
5153 for (i = 0; i < info->num_outputs; i++) {
5154 LLVMValueRef *out_ptr =
5155 ctx->radeon_bld.soa.outputs[i];
5156
5157 for (chan = 0; chan < 4; chan++) {
5158 LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
5159 LLVMValueRef voffset =
5160 lp_build_const_int32(gallivm, (i * 4 + chan) *
5161 shader->selector->gs_max_out_vertices);
5162
5163 voffset = lp_build_add(uint, voffset, gs_next_vertex);
5164 voffset = lp_build_mul_imm(uint, voffset, 4);
5165
5166 out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");
5167
5168 build_tbuffer_store(ctx,
5169 ctx->gsvs_ring[stream],
5170 out_val, 1,
5171 voffset, soffset, 0,
5172 V_008F0C_BUF_DATA_FORMAT_32,
5173 V_008F0C_BUF_NUM_FORMAT_UINT,
5174 1, 0, 1, 1, 0);
5175 }
5176 }
5177 gs_next_vertex = lp_build_add(uint, gs_next_vertex,
5178 lp_build_const_int32(gallivm, 1));
5179
5180 LLVMBuildStore(gallivm->builder, gs_next_vertex, ctx->gs_next_vertex[stream]);
5181
5182 /* Signal vertex emission */
5183 args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_EMIT | SENDMSG_GS | (stream << 8));
5184 args[1] = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
5185 lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
5186 ctx->voidt, args, 2, LLVMNoUnwindAttribute);
5187 }
5188
5189 /* Cut one primitive from the geometry shader */
5190 static void si_llvm_emit_primitive(
5191 const struct lp_build_tgsi_action *action,
5192 struct lp_build_tgsi_context *bld_base,
5193 struct lp_build_emit_data *emit_data)
5194 {
5195 struct si_shader_context *ctx = si_shader_context(bld_base);
5196 struct gallivm_state *gallivm = bld_base->base.gallivm;
5197 LLVMValueRef args[2];
5198 unsigned stream;
5199
5200 /* Signal primitive cut */
5201 stream = si_llvm_get_stream(bld_base, emit_data);
5202 args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_CUT | SENDMSG_GS | (stream << 8));
5203 args[1] = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
5204 lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
5205 ctx->voidt, args, 2, LLVMNoUnwindAttribute);
5206 }
5207
/* TGSI BARRIER: synchronize all invocations of the thread group. */
static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
				 struct lp_build_tgsi_context *bld_base,
				 struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;

	/* The real barrier instruction isn't needed, because an entire patch
	 * always fits into a single wave.
	 */
	if (ctx->type == PIPE_SHADER_TESS_CTRL) {
		/* NOTE(review): emit_optimization_barrier presumably still
		 * prevents LLVM from moving memory accesses across this
		 * point - confirm against its definition. */
		emit_optimization_barrier(ctx);
		return;
	}

	lp_build_intrinsic(gallivm->builder,
			   HAVE_LLVM >= 0x0309 ? "llvm.amdgcn.s.barrier"
			   : "llvm.AMDGPU.barrier.local",
			   ctx->voidt, NULL, 0, LLVMNoUnwindAttribute);
}
5228
/* Action table entry shared by all texture sampling/fetch opcodes. */
static const struct lp_build_tgsi_action tex_action = {
	.fetch_args = tex_fetch_args,
	.emit = build_tex_intrinsic,
};
5233
/* Action table entry for the INTERP_* opcodes. */
static const struct lp_build_tgsi_action interp_action = {
	.fetch_args = interp_fetch_args,
	.emit = build_interp_intrinsic,
};
5238
/* Create the main LLVM function of the shader and set up its
 * parameter attributes.
 *
 * \param returns             return types (used by non-monolithic shader
 *                            parts that pass values on to an epilog)
 * \param params              parameter types
 * \param last_array_pointer  index of the last descriptor-array pointer
 *                            parameter; parameters up to it get "byval"
 * \param last_sgpr           index of the last SGPR parameter; the
 *                            remaining SGPRs get "inreg"
 */
static void si_create_function(struct si_shader_context *ctx,
			       LLVMTypeRef *returns, unsigned num_returns,
			       LLVMTypeRef *params, unsigned num_params,
			       int last_array_pointer, int last_sgpr)
{
	int i;

	radeon_llvm_create_func(&ctx->radeon_bld, returns, num_returns,
				params, num_params);
	radeon_llvm_shader_type(ctx->radeon_bld.main_fn, ctx->type);
	ctx->return_value = LLVMGetUndef(ctx->radeon_bld.return_type);

	for (i = 0; i <= last_sgpr; ++i) {
		LLVMValueRef P = LLVMGetParam(ctx->radeon_bld.main_fn, i);

		/* We tell llvm that array inputs are passed by value to allow Sinking pass
		 * to move load. Inputs are constant so this is fine. */
		if (i <= last_array_pointer)
			LLVMAddAttribute(P, LLVMByValAttribute);
		else
			LLVMAddAttribute(P, LLVMInRegAttribute);
	}

	if (ctx->screen->b.debug_flags & DBG_UNSAFE_MATH) {
		/* These were copied from some LLVM test. */
		LLVMAddTargetDependentFunctionAttr(ctx->radeon_bld.main_fn,
						   "less-precise-fpmad",
						   "true");
		LLVMAddTargetDependentFunctionAttr(ctx->radeon_bld.main_fn,
						   "no-infs-fp-math",
						   "true");
		LLVMAddTargetDependentFunctionAttr(ctx->radeon_bld.main_fn,
						   "no-nans-fp-math",
						   "true");
		LLVMAddTargetDependentFunctionAttr(ctx->radeon_bld.main_fn,
						   "unsafe-fp-math",
						   "true");
	}
}
5278
5279 static void create_meta_data(struct si_shader_context *ctx)
5280 {
5281 struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
5282 LLVMValueRef args[3];
5283
5284 args[0] = LLVMMDStringInContext(gallivm->context, "const", 5);
5285 args[1] = 0;
5286 args[2] = lp_build_const_int32(gallivm, 1);
5287
5288 ctx->const_md = LLVMMDNodeInContext(gallivm->context, args, 3);
5289
5290 ctx->uniform_md_kind = LLVMGetMDKindIDInContext(gallivm->context,
5291 "amdgpu.uniform", 14);
5292
5293 ctx->empty_md = LLVMMDNodeInContext(gallivm->context, NULL, 0);
5294 }
5295
/* Declare the SGPR parameters needed for streamout and record their
 * indices in ctx.
 *
 * Appends to params[] starting at *num_params: the streamout config,
 * the write index, and one buffer offset per buffer with a non-zero
 * stride. *num_params is advanced past the added parameters.
 */
static void declare_streamout_params(struct si_shader_context *ctx,
				     struct pipe_stream_output_info *so,
				     LLVMTypeRef *params, LLVMTypeRef i32,
				     unsigned *num_params)
{
	int i;

	/* Streamout SGPRs. */
	if (so->num_outputs) {
		/* TES reuses the tess_offchip SGPR slot for the streamout
		 * config instead of adding a new parameter. */
		if (ctx->type != PIPE_SHADER_TESS_EVAL)
			params[ctx->param_streamout_config = (*num_params)++] = i32;
		else
			ctx->param_streamout_config = ctx->param_tess_offchip;

		params[ctx->param_streamout_write_index = (*num_params)++] = i32;
	}
	/* A streamout buffer offset is loaded if the stride is non-zero. */
	for (i = 0; i < 4; i++) {
		if (!so->stride[i])
			continue;

		params[ctx->param_streamout_offset[i] = (*num_params)++] = i32;
	}
}
5320
5321 static unsigned llvm_get_type_size(LLVMTypeRef type)
5322 {
5323 LLVMTypeKind kind = LLVMGetTypeKind(type);
5324
5325 switch (kind) {
5326 case LLVMIntegerTypeKind:
5327 return LLVMGetIntTypeWidth(type) / 8;
5328 case LLVMFloatTypeKind:
5329 return 4;
5330 case LLVMPointerTypeKind:
5331 return 8;
5332 case LLVMVectorTypeKind:
5333 return LLVMGetVectorSize(type) *
5334 llvm_get_type_size(LLVMGetElementType(type));
5335 default:
5336 assert(0);
5337 return 0;
5338 }
5339 }
5340
5341 static void declare_tess_lds(struct si_shader_context *ctx)
5342 {
5343 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
5344 LLVMTypeRef i32 = ctx->radeon_bld.soa.bld_base.uint_bld.elem_type;
5345 unsigned lds_size = ctx->screen->b.chip_class >= CIK ? 65536 : 32768;
5346
5347 /* The actual size is computed outside of the shader to reduce
5348 * the number of shader variants. */
5349 ctx->lds =
5350 LLVMAddGlobalInAddressSpace(gallivm->module,
5351 LLVMArrayType(i32, lds_size / 4),
5352 "tess_lds",
5353 LOCAL_ADDR_SPACE);
5354 }
5355
5356 static void create_function(struct si_shader_context *ctx)
5357 {
5358 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5359 struct gallivm_state *gallivm = bld_base->base.gallivm;
5360 struct si_shader *shader = ctx->shader;
5361 LLVMTypeRef params[SI_NUM_PARAMS + SI_NUM_VERTEX_BUFFERS], v3i32;
5362 LLVMTypeRef returns[16+32*4];
5363 unsigned i, last_array_pointer, last_sgpr, num_params, num_return_sgprs;
5364 unsigned num_returns = 0;
5365
5366 v3i32 = LLVMVectorType(ctx->i32, 3);
5367
5368 params[SI_PARAM_RW_BUFFERS] = const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
5369 params[SI_PARAM_CONST_BUFFERS] = const_array(ctx->v16i8, SI_NUM_CONST_BUFFERS);
5370 params[SI_PARAM_SAMPLERS] = const_array(ctx->v8i32, SI_NUM_SAMPLERS);
5371 params[SI_PARAM_IMAGES] = const_array(ctx->v8i32, SI_NUM_IMAGES);
5372 params[SI_PARAM_SHADER_BUFFERS] = const_array(ctx->v4i32, SI_NUM_SHADER_BUFFERS);
5373 last_array_pointer = SI_PARAM_SHADER_BUFFERS;
5374
5375 switch (ctx->type) {
5376 case PIPE_SHADER_VERTEX:
5377 params[SI_PARAM_VERTEX_BUFFERS] = const_array(ctx->v16i8, SI_NUM_VERTEX_BUFFERS);
5378 last_array_pointer = SI_PARAM_VERTEX_BUFFERS;
5379 params[SI_PARAM_BASE_VERTEX] = ctx->i32;
5380 params[SI_PARAM_START_INSTANCE] = ctx->i32;
5381 num_params = SI_PARAM_START_INSTANCE+1;
5382
5383 if (shader->key.vs.as_es) {
5384 params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
5385 } else if (shader->key.vs.as_ls) {
5386 params[SI_PARAM_LS_OUT_LAYOUT] = ctx->i32;
5387 num_params = SI_PARAM_LS_OUT_LAYOUT+1;
5388 } else {
5389 if (ctx->is_gs_copy_shader) {
5390 last_array_pointer = SI_PARAM_RW_BUFFERS;
5391 num_params = SI_PARAM_RW_BUFFERS+1;
5392 } else {
5393 params[SI_PARAM_VS_STATE_BITS] = ctx->i32;
5394 num_params = SI_PARAM_VS_STATE_BITS+1;
5395 }
5396
5397 /* The locations of the other parameters are assigned dynamically. */
5398 declare_streamout_params(ctx, &shader->selector->so,
5399 params, ctx->i32, &num_params);
5400 }
5401
5402 last_sgpr = num_params-1;
5403
5404 /* VGPRs */
5405 params[ctx->param_vertex_id = num_params++] = ctx->i32;
5406 params[ctx->param_rel_auto_id = num_params++] = ctx->i32;
5407 params[ctx->param_vs_prim_id = num_params++] = ctx->i32;
5408 params[ctx->param_instance_id = num_params++] = ctx->i32;
5409
5410 if (!ctx->is_monolithic &&
5411 !ctx->is_gs_copy_shader) {
5412 /* Vertex load indices. */
5413 ctx->param_vertex_index0 = num_params;
5414
5415 for (i = 0; i < shader->selector->info.num_inputs; i++)
5416 params[num_params++] = ctx->i32;
5417
5418 /* PrimitiveID output. */
5419 if (!shader->key.vs.as_es && !shader->key.vs.as_ls)
5420 for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
5421 returns[num_returns++] = ctx->f32;
5422 }
5423 break;
5424
5425 case PIPE_SHADER_TESS_CTRL:
5426 params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx->i32;
5427 params[SI_PARAM_TCS_OUT_OFFSETS] = ctx->i32;
5428 params[SI_PARAM_TCS_OUT_LAYOUT] = ctx->i32;
5429 params[SI_PARAM_TCS_IN_LAYOUT] = ctx->i32;
5430 params[ctx->param_oc_lds = SI_PARAM_TCS_OC_LDS] = ctx->i32;
5431 params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx->i32;
5432 last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET;
5433
5434 /* VGPRs */
5435 params[SI_PARAM_PATCH_ID] = ctx->i32;
5436 params[SI_PARAM_REL_IDS] = ctx->i32;
5437 num_params = SI_PARAM_REL_IDS+1;
5438
5439 if (!ctx->is_monolithic) {
5440 /* SI_PARAM_TCS_OC_LDS and PARAM_TESS_FACTOR_OFFSET are
5441 * placed after the user SGPRs.
5442 */
5443 for (i = 0; i < SI_TCS_NUM_USER_SGPR + 2; i++)
5444 returns[num_returns++] = ctx->i32; /* SGPRs */
5445
5446 for (i = 0; i < 3; i++)
5447 returns[num_returns++] = ctx->f32; /* VGPRs */
5448 }
5449 break;
5450
5451 case PIPE_SHADER_TESS_EVAL:
5452 params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx->i32;
5453 num_params = SI_PARAM_TCS_OFFCHIP_LAYOUT+1;
5454
5455 if (shader->key.tes.as_es) {
5456 params[ctx->param_oc_lds = num_params++] = ctx->i32;
5457 params[ctx->param_tess_offchip = num_params++] = ctx->i32;
5458 params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
5459 } else {
5460 params[ctx->param_tess_offchip = num_params++] = ctx->i32;
5461 declare_streamout_params(ctx, &shader->selector->so,
5462 params, ctx->i32, &num_params);
5463 params[ctx->param_oc_lds = num_params++] = ctx->i32;
5464 }
5465 last_sgpr = num_params - 1;
5466
5467 /* VGPRs */
5468 params[ctx->param_tes_u = num_params++] = ctx->f32;
5469 params[ctx->param_tes_v = num_params++] = ctx->f32;
5470 params[ctx->param_tes_rel_patch_id = num_params++] = ctx->i32;
5471 params[ctx->param_tes_patch_id = num_params++] = ctx->i32;
5472
5473 /* PrimitiveID output. */
5474 if (!ctx->is_monolithic && !shader->key.tes.as_es)
5475 for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
5476 returns[num_returns++] = ctx->f32;
5477 break;
5478
5479 case PIPE_SHADER_GEOMETRY:
5480 params[SI_PARAM_GS2VS_OFFSET] = ctx->i32;
5481 params[SI_PARAM_GS_WAVE_ID] = ctx->i32;
5482 last_sgpr = SI_PARAM_GS_WAVE_ID;
5483
5484 /* VGPRs */
5485 params[SI_PARAM_VTX0_OFFSET] = ctx->i32;
5486 params[SI_PARAM_VTX1_OFFSET] = ctx->i32;
5487 params[SI_PARAM_PRIMITIVE_ID] = ctx->i32;
5488 params[SI_PARAM_VTX2_OFFSET] = ctx->i32;
5489 params[SI_PARAM_VTX3_OFFSET] = ctx->i32;
5490 params[SI_PARAM_VTX4_OFFSET] = ctx->i32;
5491 params[SI_PARAM_VTX5_OFFSET] = ctx->i32;
5492 params[SI_PARAM_GS_INSTANCE_ID] = ctx->i32;
5493 num_params = SI_PARAM_GS_INSTANCE_ID+1;
5494 break;
5495
5496 case PIPE_SHADER_FRAGMENT:
5497 params[SI_PARAM_ALPHA_REF] = ctx->f32;
5498 params[SI_PARAM_PRIM_MASK] = ctx->i32;
5499 last_sgpr = SI_PARAM_PRIM_MASK;
5500 params[SI_PARAM_PERSP_SAMPLE] = ctx->v2i32;
5501 params[SI_PARAM_PERSP_CENTER] = ctx->v2i32;
5502 params[SI_PARAM_PERSP_CENTROID] = ctx->v2i32;
5503 params[SI_PARAM_PERSP_PULL_MODEL] = v3i32;
5504 params[SI_PARAM_LINEAR_SAMPLE] = ctx->v2i32;
5505 params[SI_PARAM_LINEAR_CENTER] = ctx->v2i32;
5506 params[SI_PARAM_LINEAR_CENTROID] = ctx->v2i32;
5507 params[SI_PARAM_LINE_STIPPLE_TEX] = ctx->f32;
5508 params[SI_PARAM_POS_X_FLOAT] = ctx->f32;
5509 params[SI_PARAM_POS_Y_FLOAT] = ctx->f32;
5510 params[SI_PARAM_POS_Z_FLOAT] = ctx->f32;
5511 params[SI_PARAM_POS_W_FLOAT] = ctx->f32;
5512 params[SI_PARAM_FRONT_FACE] = ctx->i32;
5513 params[SI_PARAM_ANCILLARY] = ctx->i32;
5514 params[SI_PARAM_SAMPLE_COVERAGE] = ctx->f32;
5515 params[SI_PARAM_POS_FIXED_PT] = ctx->i32;
5516 num_params = SI_PARAM_POS_FIXED_PT+1;
5517
5518 if (!ctx->is_monolithic) {
5519 /* Color inputs from the prolog. */
5520 if (shader->selector->info.colors_read) {
5521 unsigned num_color_elements =
5522 util_bitcount(shader->selector->info.colors_read);
5523
5524 assert(num_params + num_color_elements <= ARRAY_SIZE(params));
5525 for (i = 0; i < num_color_elements; i++)
5526 params[num_params++] = ctx->f32;
5527 }
5528
5529 /* Outputs for the epilog. */
5530 num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
5531 num_returns =
5532 num_return_sgprs +
5533 util_bitcount(shader->selector->info.colors_written) * 4 +
5534 shader->selector->info.writes_z +
5535 shader->selector->info.writes_stencil +
5536 shader->selector->info.writes_samplemask +
5537 1 /* SampleMaskIn */;
5538
5539 num_returns = MAX2(num_returns,
5540 num_return_sgprs +
5541 PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
5542
5543 for (i = 0; i < num_return_sgprs; i++)
5544 returns[i] = ctx->i32;
5545 for (; i < num_returns; i++)
5546 returns[i] = ctx->f32;
5547 }
5548 break;
5549
5550 case PIPE_SHADER_COMPUTE:
5551 params[SI_PARAM_GRID_SIZE] = v3i32;
5552 params[SI_PARAM_BLOCK_ID] = v3i32;
5553 last_sgpr = SI_PARAM_BLOCK_ID;
5554
5555 params[SI_PARAM_THREAD_ID] = v3i32;
5556 num_params = SI_PARAM_THREAD_ID + 1;
5557 break;
5558 default:
5559 assert(0 && "unimplemented shader");
5560 return;
5561 }
5562
5563 assert(num_params <= ARRAY_SIZE(params));
5564
5565 si_create_function(ctx, returns, num_returns, params,
5566 num_params, last_array_pointer, last_sgpr);
5567
5568 /* Reserve register locations for VGPR inputs the PS prolog may need. */
5569 if (ctx->type == PIPE_SHADER_FRAGMENT &&
5570 !ctx->is_monolithic) {
5571 radeon_llvm_add_attribute(ctx->radeon_bld.main_fn,
5572 "InitialPSInputAddr",
5573 S_0286D0_PERSP_SAMPLE_ENA(1) |
5574 S_0286D0_PERSP_CENTER_ENA(1) |
5575 S_0286D0_PERSP_CENTROID_ENA(1) |
5576 S_0286D0_LINEAR_SAMPLE_ENA(1) |
5577 S_0286D0_LINEAR_CENTER_ENA(1) |
5578 S_0286D0_LINEAR_CENTROID_ENA(1) |
5579 S_0286D0_FRONT_FACE_ENA(1) |
5580 S_0286D0_POS_FIXED_PT_ENA(1));
5581 } else if (ctx->type == PIPE_SHADER_COMPUTE) {
5582 const unsigned *properties = shader->selector->info.properties;
5583 unsigned max_work_group_size =
5584 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
5585 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
5586 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
5587
5588 assert(max_work_group_size);
5589
5590 radeon_llvm_add_attribute(ctx->radeon_bld.main_fn,
5591 "amdgpu-max-work-group-size",
5592 max_work_group_size);
5593 }
5594
5595 shader->info.num_input_sgprs = 0;
5596 shader->info.num_input_vgprs = 0;
5597
5598 for (i = 0; i <= last_sgpr; ++i)
5599 shader->info.num_input_sgprs += llvm_get_type_size(params[i]) / 4;
5600
5601 /* Unused fragment shader inputs are eliminated by the compiler,
5602 * so we don't know yet how many there will be.
5603 */
5604 if (ctx->type != PIPE_SHADER_FRAGMENT)
5605 for (; i < num_params; ++i)
5606 shader->info.num_input_vgprs += llvm_get_type_size(params[i]) / 4;
5607
5608 if (bld_base->info &&
5609 (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 ||
5610 bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 ||
5611 bld_base->info->opcode_count[TGSI_OPCODE_DDX_FINE] > 0 ||
5612 bld_base->info->opcode_count[TGSI_OPCODE_DDY_FINE] > 0 ||
5613 bld_base->info->opcode_count[TGSI_OPCODE_INTERP_OFFSET] > 0 ||
5614 bld_base->info->opcode_count[TGSI_OPCODE_INTERP_SAMPLE] > 0))
5615 ctx->lds =
5616 LLVMAddGlobalInAddressSpace(gallivm->module,
5617 LLVMArrayType(ctx->i32, 64),
5618 "ddxy_lds",
5619 LOCAL_ADDR_SPACE);
5620
5621 if ((ctx->type == PIPE_SHADER_VERTEX && shader->key.vs.as_ls) ||
5622 ctx->type == PIPE_SHADER_TESS_CTRL ||
5623 ctx->type == PIPE_SHADER_TESS_EVAL)
5624 declare_tess_lds(ctx);
5625 }
5626
5627 static void preload_constants(struct si_shader_context *ctx)
5628 {
5629 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5630 struct gallivm_state *gallivm = bld_base->base.gallivm;
5631 const struct tgsi_shader_info *info = bld_base->info;
5632 unsigned buf;
5633 LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
5634
5635 for (buf = 0; buf < SI_NUM_CONST_BUFFERS; buf++) {
5636 unsigned i, num_const = info->const_file_max[buf] + 1;
5637
5638 if (num_const == 0)
5639 continue;
5640
5641 /* Allocate space for the constant values */
5642 ctx->constants[buf] = CALLOC(num_const * 4, sizeof(LLVMValueRef));
5643
5644 /* Load the resource descriptor */
5645 ctx->const_buffers[buf] =
5646 build_indexed_load_const(ctx, ptr, lp_build_const_int32(gallivm, buf));
5647
5648 /* Load the constants, we rely on the code sinking to do the rest */
5649 for (i = 0; i < num_const * 4; ++i) {
5650 ctx->constants[buf][i] =
5651 buffer_load_const(gallivm->builder,
5652 ctx->const_buffers[buf],
5653 lp_build_const_int32(gallivm, i * 4),
5654 ctx->f32);
5655 }
5656 }
5657 }
5658
5659 static void preload_shader_buffers(struct si_shader_context *ctx)
5660 {
5661 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
5662 LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_SHADER_BUFFERS);
5663 int buf, maxbuf;
5664
5665 maxbuf = MIN2(ctx->shader->selector->info.file_max[TGSI_FILE_BUFFER],
5666 SI_NUM_SHADER_BUFFERS - 1);
5667 for (buf = 0; buf <= maxbuf; ++buf) {
5668 ctx->shader_buffers[buf] =
5669 build_indexed_load_const(
5670 ctx, ptr, lp_build_const_int32(gallivm, buf));
5671 }
5672 }
5673
/* Pre-load the texture-resource, FMASK, and sampler-state descriptors for
 * all samplers declared by the shader. We rely on LLVM's code sinking to
 * move unused descriptor loads next to their uses (or drop them).
 */
static void preload_samplers(struct si_shader_context *ctx)
{
	struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_shader_info *info = bld_base->info;
	unsigned i, num_samplers = info->file_max[TGSI_FILE_SAMPLER] + 1;
	LLVMValueRef offset;

	if (num_samplers == 0)
		return;

	/* Load the resources and samplers, we rely on the code sinking to do the rest */
	for (i = 0; i < num_samplers; ++i) {
		/* Resource */
		offset = lp_build_const_int32(gallivm, i);
		ctx->sampler_views[i] =
			get_sampler_desc(ctx, offset, DESC_IMAGE);

		/* FMASK resource */
		if (info->is_msaa_sampler[i])
			ctx->fmasks[i] =
				get_sampler_desc(ctx, offset, DESC_FMASK);
		else {
			/* Non-MSAA samplers also get a sampler state,
			 * patched for aniso filtering based on the view
			 * (presumably an SI/CI quirk — see the helper).
			 */
			ctx->sampler_states[i] =
				get_sampler_desc(ctx, offset, DESC_SAMPLER);
			ctx->sampler_states[i] =
				sici_fix_sampler_aniso(ctx, ctx->sampler_views[i],
						       ctx->sampler_states[i]);
		}
	}
}
5705
5706 static void preload_images(struct si_shader_context *ctx)
5707 {
5708 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5709 struct tgsi_shader_info *info = &ctx->shader->selector->info;
5710 struct gallivm_state *gallivm = bld_base->base.gallivm;
5711 unsigned num_images = bld_base->info->file_max[TGSI_FILE_IMAGE] + 1;
5712 LLVMValueRef res_ptr;
5713 unsigned i;
5714
5715 if (num_images == 0)
5716 return;
5717
5718 res_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_IMAGES);
5719
5720 for (i = 0; i < num_images; ++i) {
5721 /* Rely on LLVM to shrink the load for buffer resources. */
5722 LLVMValueRef rsrc =
5723 build_indexed_load_const(ctx, res_ptr,
5724 lp_build_const_int32(gallivm, i));
5725
5726 if (info->images_writemask & (1 << i) &&
5727 !(info->images_buffers & (1 << i)))
5728 rsrc = force_dcc_off(ctx, rsrc);
5729
5730 ctx->images[i] = rsrc;
5731 }
5732 }
5733
5734 static void preload_streamout_buffers(struct si_shader_context *ctx)
5735 {
5736 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5737 struct gallivm_state *gallivm = bld_base->base.gallivm;
5738 unsigned i;
5739
5740 /* Streamout can only be used if the shader is compiled as VS. */
5741 if (!ctx->shader->selector->so.num_outputs ||
5742 (ctx->type == PIPE_SHADER_VERTEX &&
5743 (ctx->shader->key.vs.as_es ||
5744 ctx->shader->key.vs.as_ls)) ||
5745 (ctx->type == PIPE_SHADER_TESS_EVAL &&
5746 ctx->shader->key.tes.as_es))
5747 return;
5748
5749 LLVMValueRef buf_ptr = LLVMGetParam(ctx->radeon_bld.main_fn,
5750 SI_PARAM_RW_BUFFERS);
5751
5752 /* Load the resources, we rely on the code sinking to do the rest */
5753 for (i = 0; i < 4; ++i) {
5754 if (ctx->shader->selector->so.stride[i]) {
5755 LLVMValueRef offset = lp_build_const_int32(gallivm,
5756 SI_VS_STREAMOUT_BUF0 + i);
5757
5758 ctx->so_buffers[i] = build_indexed_load_const(ctx, buf_ptr, offset);
5759 }
5760 }
5761 }
5762
/**
 * Load ESGS and GSVS ring buffer resource descriptors and save the variables
 * for later use.
 */
static void preload_ring_buffers(struct si_shader_context *ctx)
{
	struct gallivm_state *gallivm =
		ctx->radeon_bld.soa.bld_base.base.gallivm;

	LLVMValueRef buf_ptr = LLVMGetParam(ctx->radeon_bld.main_fn,
					    SI_PARAM_RW_BUFFERS);

	/* The ESGS ring is used by ES variants (VS-as-ES, TES-as-ES) and
	 * by GS; the descriptor slot differs between the two sides.
	 */
	if ((ctx->type == PIPE_SHADER_VERTEX &&
	     ctx->shader->key.vs.as_es) ||
	    (ctx->type == PIPE_SHADER_TESS_EVAL &&
	     ctx->shader->key.tes.as_es) ||
	    ctx->type == PIPE_SHADER_GEOMETRY) {
		unsigned ring =
			ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS
							  : SI_ES_RING_ESGS;
		LLVMValueRef offset = lp_build_const_int32(gallivm, ring);

		ctx->esgs_ring =
			build_indexed_load_const(ctx, buf_ptr, offset);
	}

	/* The GS copy shader (compiled as a VS) reads the GSVS ring
	 * through a single descriptor.
	 */
	if (ctx->is_gs_copy_shader) {
		LLVMValueRef offset = lp_build_const_int32(gallivm, SI_VS_RING_GSVS);

		ctx->gsvs_ring[0] =
			build_indexed_load_const(ctx, buf_ptr, offset);
	}
	/* GS writes the GSVS ring through four descriptors
	 * (SI_GS_RING_GSVS0..3 — presumably one per output slot group).
	 */
	if (ctx->type == PIPE_SHADER_GEOMETRY) {
		int i;
		for (i = 0; i < 4; i++) {
			LLVMValueRef offset = lp_build_const_int32(gallivm, SI_GS_RING_GSVS0 + i);

			ctx->gsvs_ring[i] =
				build_indexed_load_const(ctx, buf_ptr, offset);
		}
	}
}
5805
/* Emit IR that kills the fragment when the corresponding bit of the 32x32
 * polygon stipple pattern is 0.
 *
 * \param param_rw_buffers    function parameter holding the RW buffer
 *                            descriptor array (contains the stipple buffer)
 * \param param_pos_fixed_pt  function parameter index of the fixed-point
 *                            fragment position
 */
static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
					 LLVMValueRef param_rw_buffers,
					 unsigned param_pos_fixed_pt)
{
	struct lp_build_tgsi_context *bld_base =
		&ctx->radeon_bld.soa.bld_base;
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	LLVMValueRef slot, desc, offset, row, bit, address[2];

	/* Use the fixed-point gl_FragCoord input.
	 * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
	 * per coordinate to get the repeating effect.
	 */
	address[0] = unpack_param(ctx, param_pos_fixed_pt, 0, 5);
	address[1] = unpack_param(ctx, param_pos_fixed_pt, 16, 5);

	/* Load the buffer descriptor. */
	slot = lp_build_const_int32(gallivm, SI_PS_CONST_POLY_STIPPLE);
	desc = build_indexed_load_const(ctx, param_rw_buffers, slot);

	/* The stipple pattern is 32x32, each row has 32 bits. */
	offset = LLVMBuildMul(builder, address[1],
			      LLVMConstInt(ctx->i32, 4, 0), "");
	row = buffer_load_const(builder, desc, offset, ctx->i32);
	bit = LLVMBuildLShr(builder, row, address[0], "");
	bit = LLVMBuildTrunc(builder, bit, ctx->i1, "");

	/* The intrinsic kills the thread if arg < 0. */
	bit = LLVMBuildSelect(builder, bit, LLVMConstReal(ctx->f32, 0),
			      LLVMConstReal(ctx->f32, -1), "");
	lp_build_intrinsic(builder, "llvm.AMDGPU.kill", ctx->voidt, &bit, 1, 0);
}
5839
5840 void si_shader_binary_read_config(struct radeon_shader_binary *binary,
5841 struct si_shader_config *conf,
5842 unsigned symbol_offset)
5843 {
5844 unsigned i;
5845 const unsigned char *config =
5846 radeon_shader_binary_config_start(binary, symbol_offset);
5847 bool really_needs_scratch = false;
5848
5849 /* LLVM adds SGPR spills to the scratch size.
5850 * Find out if we really need the scratch buffer.
5851 */
5852 for (i = 0; i < binary->reloc_count; i++) {
5853 const struct radeon_shader_reloc *reloc = &binary->relocs[i];
5854
5855 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
5856 !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
5857 really_needs_scratch = true;
5858 break;
5859 }
5860 }
5861
5862 /* XXX: We may be able to emit some of these values directly rather than
5863 * extracting fields to be emitted later.
5864 */
5865
5866 for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
5867 unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
5868 unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
5869 switch (reg) {
5870 case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
5871 case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
5872 case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
5873 case R_00B848_COMPUTE_PGM_RSRC1:
5874 conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
5875 conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
5876 conf->float_mode = G_00B028_FLOAT_MODE(value);
5877 conf->rsrc1 = value;
5878 break;
5879 case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
5880 conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
5881 break;
5882 case R_00B84C_COMPUTE_PGM_RSRC2:
5883 conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
5884 conf->rsrc2 = value;
5885 break;
5886 case R_0286CC_SPI_PS_INPUT_ENA:
5887 conf->spi_ps_input_ena = value;
5888 break;
5889 case R_0286D0_SPI_PS_INPUT_ADDR:
5890 conf->spi_ps_input_addr = value;
5891 break;
5892 case R_0286E8_SPI_TMPRING_SIZE:
5893 case R_00B860_COMPUTE_TMPRING_SIZE:
5894 /* WAVESIZE is in units of 256 dwords. */
5895 if (really_needs_scratch)
5896 conf->scratch_bytes_per_wave =
5897 G_00B860_WAVESIZE(value) * 256 * 4;
5898 break;
5899 default:
5900 {
5901 static bool printed;
5902
5903 if (!printed) {
5904 fprintf(stderr, "Warning: LLVM emitted unknown "
5905 "config register: 0x%x\n", reg);
5906 printed = true;
5907 }
5908 }
5909 break;
5910 }
5911
5912 if (!conf->spi_ps_input_addr)
5913 conf->spi_ps_input_addr = conf->spi_ps_input_ena;
5914 }
5915 }
5916
5917 void si_shader_apply_scratch_relocs(struct si_context *sctx,
5918 struct si_shader *shader,
5919 struct si_shader_config *config,
5920 uint64_t scratch_va)
5921 {
5922 unsigned i;
5923 uint32_t scratch_rsrc_dword0 = scratch_va;
5924 uint32_t scratch_rsrc_dword1 =
5925 S_008F04_BASE_ADDRESS_HI(scratch_va >> 32);
5926
5927 /* Enable scratch coalescing if LLVM sets ELEMENT_SIZE & INDEX_STRIDE
5928 * correctly.
5929 */
5930 if (HAVE_LLVM >= 0x0309)
5931 scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1);
5932 else
5933 scratch_rsrc_dword1 |=
5934 S_008F04_STRIDE(config->scratch_bytes_per_wave / 64);
5935
5936 for (i = 0 ; i < shader->binary.reloc_count; i++) {
5937 const struct radeon_shader_reloc *reloc =
5938 &shader->binary.relocs[i];
5939 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
5940 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
5941 &scratch_rsrc_dword0, 4);
5942 } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
5943 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
5944 &scratch_rsrc_dword1, 4);
5945 }
5946 }
5947 }
5948
5949 static unsigned si_get_shader_binary_size(struct si_shader *shader)
5950 {
5951 unsigned size = shader->binary.code_size;
5952
5953 if (shader->prolog)
5954 size += shader->prolog->binary.code_size;
5955 if (shader->epilog)
5956 size += shader->epilog->binary.code_size;
5957 return size;
5958 }
5959
5960 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
5961 {
5962 const struct radeon_shader_binary *prolog =
5963 shader->prolog ? &shader->prolog->binary : NULL;
5964 const struct radeon_shader_binary *epilog =
5965 shader->epilog ? &shader->epilog->binary : NULL;
5966 const struct radeon_shader_binary *mainb = &shader->binary;
5967 unsigned bo_size = si_get_shader_binary_size(shader) +
5968 (!epilog ? mainb->rodata_size : 0);
5969 unsigned char *ptr;
5970
5971 assert(!prolog || !prolog->rodata_size);
5972 assert((!prolog && !epilog) || !mainb->rodata_size);
5973 assert(!epilog || !epilog->rodata_size);
5974
5975 r600_resource_reference(&shader->bo, NULL);
5976 shader->bo = si_resource_create_custom(&sscreen->b.b,
5977 PIPE_USAGE_IMMUTABLE,
5978 bo_size);
5979 if (!shader->bo)
5980 return -ENOMEM;
5981
5982 /* Upload. */
5983 ptr = sscreen->b.ws->buffer_map(shader->bo->buf, NULL,
5984 PIPE_TRANSFER_READ_WRITE);
5985
5986 if (prolog) {
5987 util_memcpy_cpu_to_le32(ptr, prolog->code, prolog->code_size);
5988 ptr += prolog->code_size;
5989 }
5990
5991 util_memcpy_cpu_to_le32(ptr, mainb->code, mainb->code_size);
5992 ptr += mainb->code_size;
5993
5994 if (epilog)
5995 util_memcpy_cpu_to_le32(ptr, epilog->code, epilog->code_size);
5996 else if (mainb->rodata_size > 0)
5997 util_memcpy_cpu_to_le32(ptr, mainb->rodata, mainb->rodata_size);
5998
5999 sscreen->b.ws->buffer_unmap(shader->bo->buf);
6000 return 0;
6001 }
6002
/* Print a shader part's disassembly to \p file (falling back to a hex dump
 * of the machine code when no disassembly string is available) and forward
 * the disassembly line-by-line through the \p debug callback.
 */
static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary,
				       struct pipe_debug_callback *debug,
				       const char *name, FILE *file)
{
	char *line, *p;
	unsigned i, count;

	if (binary->disasm_string) {
		fprintf(file, "Shader %s disassembly:\n", name);
		fprintf(file, "%s", binary->disasm_string);

		if (debug && debug->debug_message) {
			/* Very long debug messages are cut off, so send the
			 * disassembly one line at a time. This causes more
			 * overhead, but on the plus side it simplifies
			 * parsing of resulting logs.
			 */
			pipe_debug_message(debug, SHADER_INFO,
					   "Shader Disassembly Begin");

			line = binary->disasm_string;
			while (*line) {
				p = util_strchrnul(line, '\n');
				count = p - line;

				if (count) {
					pipe_debug_message(debug, SHADER_INFO,
							   "%.*s", count, line);
				}

				/* Stop at the final (unterminated) line. */
				if (!*p)
					break;
				line = p + 1;
			}

			pipe_debug_message(debug, SHADER_INFO,
					   "Shader Disassembly End");
		}
	} else {
		/* No disassembly: dump raw dwords, least-significant byte last. */
		fprintf(file, "Shader %s binary:\n", name);
		for (i = 0; i < binary->code_size; i += 4) {
			fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i,
				binary->code[i + 3], binary->code[i + 2],
				binary->code[i + 1], binary->code[i]);
		}
	}
}
6050
/* Print shader resource statistics (register usage, LDS, scratch, and the
 * estimated maximum number of waves per SIMD) to \p file and to the debug
 * callback.
 */
static void si_shader_dump_stats(struct si_screen *sscreen,
			         struct si_shader_config *conf,
				 unsigned num_inputs,
				 unsigned code_size,
				 struct pipe_debug_callback *debug,
				 unsigned processor,
				 FILE *file)
{
	/* LDS allocation granularity differs between GFX generations. */
	unsigned lds_increment = sscreen->b.chip_class >= CIK ? 512 : 256;
	unsigned lds_per_wave = 0;
	/* Start from the hardware maximum; each limit below can reduce it. */
	unsigned max_simd_waves = 10;
	/* Assuming SGPRs aren't spilled. */
	unsigned spilled_vgprs = conf->scratch_bytes_per_wave / 64 / 4;

	/* Compute LDS usage for PS. */
	if (processor == PIPE_SHADER_FRAGMENT) {
		/* The minimum usage per wave is (num_inputs * 48). The maximum
		 * usage is (num_inputs * 48 * 16).
		 * We can get anything in between and it varies between waves.
		 *
		 * The 48 bytes per input for a single primitive is equal to
		 * 4 bytes/component * 4 components/input * 3 points.
		 *
		 * Other stages don't know the size at compile time or don't
		 * allocate LDS per wave, but instead they do it per thread group.
		 */
		lds_per_wave = conf->lds_size * lds_increment +
			       align(num_inputs * 48, lds_increment);
	}

	/* Compute the per-SIMD wave counts. */
	if (conf->num_sgprs) {
		/* The SGPR budget per SIMD differs between VI and older chips. */
		if (sscreen->b.chip_class >= VI)
			max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs);
		else
			max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs);
	}

	if (conf->num_vgprs)
		max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs);

	/* LDS is 64KB per CU (4 SIMDs), divided into 16KB blocks per SIMD
	 * that PS can use.
	 */
	if (lds_per_wave)
		max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);

	if (file != stderr ||
	    r600_can_dump_shader(&sscreen->b, processor)) {
		if (processor == PIPE_SHADER_FRAGMENT) {
			fprintf(file, "*** SHADER CONFIG ***\n"
				"SPI_PS_INPUT_ADDR = 0x%04x\n"
				"SPI_PS_INPUT_ENA  = 0x%04x\n",
				conf->spi_ps_input_addr, conf->spi_ps_input_ena);
		}

		fprintf(file, "*** SHADER STATS ***\n"
			"SGPRS: %d\n"
			"VGPRS: %d\n"
			"Spilled VGPRs: %d\n"
			"Code Size: %d bytes\n"
			"LDS: %d blocks\n"
			"Scratch: %d bytes per wave\n"
			"Max Waves: %d\n"
			"********************\n",
			conf->num_sgprs, conf->num_vgprs, spilled_vgprs, code_size,
			conf->lds_size, conf->scratch_bytes_per_wave,
			max_simd_waves);
	}

	/* Also report the stats through the state-tracker debug callback. */
	pipe_debug_message(debug, SHADER_INFO,
			   "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
			   "LDS: %d Scratch: %d Max Waves: %d Spilled VGPRs: %d",
			   conf->num_sgprs, conf->num_vgprs, code_size,
			   conf->lds_size, conf->scratch_bytes_per_wave,
			   max_simd_waves, spilled_vgprs);
}
6128
6129 static const char *si_get_shader_name(struct si_shader *shader,
6130 unsigned processor)
6131 {
6132 switch (processor) {
6133 case PIPE_SHADER_VERTEX:
6134 if (shader->key.vs.as_es)
6135 return "Vertex Shader as ES";
6136 else if (shader->key.vs.as_ls)
6137 return "Vertex Shader as LS";
6138 else
6139 return "Vertex Shader as VS";
6140 case PIPE_SHADER_TESS_CTRL:
6141 return "Tessellation Control Shader";
6142 case PIPE_SHADER_TESS_EVAL:
6143 if (shader->key.tes.as_es)
6144 return "Tessellation Evaluation Shader as ES";
6145 else
6146 return "Tessellation Evaluation Shader as VS";
6147 case PIPE_SHADER_GEOMETRY:
6148 if (shader->gs_copy_shader == NULL)
6149 return "GS Copy Shader as VS";
6150 else
6151 return "Geometry Shader";
6152 case PIPE_SHADER_FRAGMENT:
6153 return "Pixel Shader";
6154 case PIPE_SHADER_COMPUTE:
6155 return "Compute Shader";
6156 default:
6157 return "Unknown Shader";
6158 }
6159 }
6160
6161 void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
6162 struct pipe_debug_callback *debug, unsigned processor,
6163 FILE *file)
6164 {
6165 if (file != stderr ||
6166 (r600_can_dump_shader(&sscreen->b, processor) &&
6167 !(sscreen->b.debug_flags & DBG_NO_ASM))) {
6168 fprintf(file, "\n%s:\n", si_get_shader_name(shader, processor));
6169
6170 if (shader->prolog)
6171 si_shader_dump_disassembly(&shader->prolog->binary,
6172 debug, "prolog", file);
6173
6174 si_shader_dump_disassembly(&shader->binary, debug, "main", file);
6175
6176 if (shader->epilog)
6177 si_shader_dump_disassembly(&shader->epilog->binary,
6178 debug, "epilog", file);
6179 fprintf(file, "\n");
6180 }
6181
6182 si_shader_dump_stats(sscreen, &shader->config,
6183 shader->selector ? shader->selector->info.num_inputs : 0,
6184 si_get_shader_binary_size(shader), debug, processor,
6185 file);
6186 }
6187
/* Compile the LLVM module \p mod to machine code in \p binary and parse the
 * emitted config registers into \p conf.
 *
 * \param name  human-readable shader name used in debug output
 * \return 0 on success, negative errno-style code on failure.
 */
int si_compile_llvm(struct si_screen *sscreen,
		    struct radeon_shader_binary *binary,
		    struct si_shader_config *conf,
		    LLVMTargetMachineRef tm,
		    LLVMModuleRef mod,
		    struct pipe_debug_callback *debug,
		    unsigned processor,
		    const char *name)
{
	int r = 0;
	unsigned count = p_atomic_inc_return(&sscreen->b.num_compilations);

	if (r600_can_dump_shader(&sscreen->b, processor)) {
		fprintf(stderr, "radeonsi: Compiling shader %d\n", count);

		if (!(sscreen->b.debug_flags & (DBG_NO_IR | DBG_PREOPT_IR))) {
			fprintf(stderr, "%s LLVM IR:\n\n", name);
			LLVMDumpModule(mod);
			fprintf(stderr, "\n");
		}
	}

	/* si_replace_shader presumably substitutes a binary from disk for
	 * debugging; only run the real compile when it doesn't.
	 */
	if (!si_replace_shader(count, binary)) {
		r = radeon_llvm_compile(mod, binary, tm, debug);
		if (r)
			return r;
	}

	si_shader_binary_read_config(binary, conf, 0);

	/* Enable 64-bit and 16-bit denormals, because there is no performance
	 * cost.
	 *
	 * If denormals are enabled, all floating-point output modifiers are
	 * ignored.
	 *
	 * Don't enable denormals for 32-bit floats, because:
	 * - Floating-point output modifiers would be ignored by the hw.
	 * - Some opcodes don't support denormals, such as v_mad_f32. We would
	 *   have to stop using those.
	 * - SI & CI would be very slow.
	 */
	conf->float_mode |= V_00B028_FP_64_DENORMS;

	/* The parsed config and symbol offsets are no longer needed. */
	FREE(binary->config);
	FREE(binary->global_symbol_offsets);
	binary->config = NULL;
	binary->global_symbol_offsets = NULL;

	/* Some shaders can't have rodata because their binaries can be
	 * concatenated.
	 */
	if (binary->rodata_size &&
	    (processor == PIPE_SHADER_VERTEX ||
	     processor == PIPE_SHADER_TESS_CTRL ||
	     processor == PIPE_SHADER_TESS_EVAL ||
	     processor == PIPE_SHADER_FRAGMENT)) {
		fprintf(stderr, "radeonsi: The shader can't have rodata.");
		return -EINVAL;
	}

	return r;
}
6251
6252 static void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret)
6253 {
6254 if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
6255 LLVMBuildRetVoid(ctx->radeon_bld.gallivm.builder);
6256 else
6257 LLVMBuildRet(ctx->radeon_bld.gallivm.builder, ret);
6258 }
6259
/* Generate code for the hardware VS shader stage to go with a geometry shader */
static int si_generate_gs_copy_shader(struct si_screen *sscreen,
				      struct si_shader_context *ctx,
				      struct si_shader *gs,
				      struct pipe_debug_callback *debug)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
	struct lp_build_context *uint = &bld_base->uint_bld;
	struct si_shader_output_values *outputs;
	struct tgsi_shader_info *gsinfo = &gs->selector->info;
	LLVMValueRef args[9];
	int i, r;

	outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0]));

	/* The copy shader is compiled as a regular hardware VS that only
	 * reads the GSVS ring.
	 */
	si_init_shader_ctx(ctx, sscreen, ctx->shader, ctx->tm);
	ctx->type = PIPE_SHADER_VERTEX;
	ctx->is_gs_copy_shader = true;

	create_meta_data(ctx);
	create_function(ctx);
	preload_streamout_buffers(ctx);
	preload_ring_buffers(ctx);

	/* Fixed arguments for llvm.SI.buffer.load.dword; args[2] (the
	 * per-channel offset) is filled in inside the loop below.
	 */
	args[0] = ctx->gsvs_ring[0];
	args[1] = lp_build_mul_imm(uint,
				   LLVMGetParam(ctx->radeon_bld.main_fn,
						ctx->param_vertex_id),
				   4);
	args[3] = uint->zero;
	args[4] = uint->one;  /* OFFEN */
	args[5] = uint->zero; /* IDXEN */
	args[6] = uint->one;  /* GLC */
	args[7] = uint->one;  /* SLC */
	args[8] = uint->zero; /* TFE */

	/* Fetch vertex data from GSVS ring */
	for (i = 0; i < gsinfo->num_outputs; ++i) {
		unsigned chan;

		outputs[i].name = gsinfo->output_semantic_name[i];
		outputs[i].sid = gsinfo->output_semantic_index[i];

		for (chan = 0; chan < 4; chan++) {
			/* Ring layout: each output channel occupies a run of
			 * gs_max_out_vertices dwords.
			 */
			args[2] = lp_build_const_int32(gallivm,
						       (i * 4 + chan) *
						       gs->selector->gs_max_out_vertices * 16 * 4);

			outputs[i].values[chan] =
				LLVMBuildBitCast(gallivm->builder,
						 lp_build_intrinsic(gallivm->builder,
								 "llvm.SI.buffer.load.dword.i32.i32",
								 ctx->i32, args, 9,
								 LLVMReadOnlyAttribute | LLVMNoUnwindAttribute),
						 ctx->f32, "");
		}
	}

	si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs);

	LLVMBuildRetVoid(gallivm->builder);

	/* Dump LLVM IR before any optimization passes */
	if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
	    r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
		LLVMDumpModule(bld_base->base.gallivm->module);

	radeon_llvm_finalize_module(&ctx->radeon_bld);

	r = si_compile_llvm(sscreen, &ctx->shader->binary,
			    &ctx->shader->config, ctx->tm,
			    bld_base->base.gallivm->module,
			    debug, PIPE_SHADER_GEOMETRY,
			    "GS Copy Shader");
	if (!r) {
		if (r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
			fprintf(stderr, "GS Copy Shader:\n");
		si_shader_dump(sscreen, ctx->shader, debug,
			       PIPE_SHADER_GEOMETRY, stderr);
		r = si_shader_binary_upload(sscreen, ctx->shader);
	}

	radeon_llvm_dispose(&ctx->radeon_bld);

	FREE(outputs);
	return r;
}
6348
/* Print the per-stage shader key to \p f for debugging. */
void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f)
{
	int i;

	fprintf(f, "SHADER KEY\n");

	switch (shader) {
	case PIPE_SHADER_VERTEX:
		fprintf(f, "  instance_divisors = {");
		for (i = 0; i < ARRAY_SIZE(key->vs.prolog.instance_divisors); i++)
			fprintf(f, !i ? "%u" : ", %u",
				key->vs.prolog.instance_divisors[i]);
		fprintf(f, "}\n");
		fprintf(f, "  as_es = %u\n", key->vs.as_es);
		fprintf(f, "  as_ls = %u\n", key->vs.as_ls);
		fprintf(f, "  export_prim_id = %u\n", key->vs.epilog.export_prim_id);
		break;

	case PIPE_SHADER_TESS_CTRL:
		fprintf(f, "  prim_mode = %u\n", key->tcs.epilog.prim_mode);
		break;

	case PIPE_SHADER_TESS_EVAL:
		fprintf(f, "  as_es = %u\n", key->tes.as_es);
		fprintf(f, "  export_prim_id = %u\n", key->tes.epilog.export_prim_id);
		break;

	/* GS and CS have no key bits to print. */
	case PIPE_SHADER_GEOMETRY:
	case PIPE_SHADER_COMPUTE:
		break;

	case PIPE_SHADER_FRAGMENT:
		fprintf(f, "  prolog.color_two_side = %u\n", key->ps.prolog.color_two_side);
		fprintf(f, "  prolog.poly_stipple = %u\n", key->ps.prolog.poly_stipple);
		fprintf(f, "  prolog.force_persp_sample_interp = %u\n", key->ps.prolog.force_persp_sample_interp);
		fprintf(f, "  prolog.force_linear_sample_interp = %u\n", key->ps.prolog.force_linear_sample_interp);
		fprintf(f, "  epilog.spi_shader_col_format = 0x%x\n", key->ps.epilog.spi_shader_col_format);
		fprintf(f, "  epilog.color_is_int8 = 0x%X\n", key->ps.epilog.color_is_int8);
		fprintf(f, "  epilog.last_cbuf = %u\n", key->ps.epilog.last_cbuf);
		fprintf(f, "  epilog.alpha_func = %u\n", key->ps.epilog.alpha_func);
		fprintf(f, "  epilog.alpha_to_one = %u\n", key->ps.epilog.alpha_to_one);
		fprintf(f, "  epilog.poly_line_smoothing = %u\n", key->ps.epilog.poly_line_smoothing);
		fprintf(f, "  epilog.clamp_color = %u\n", key->ps.epilog.clamp_color);
		break;

	default:
		assert(0);
	}
}
6398
6399 static void si_init_shader_ctx(struct si_shader_context *ctx,
6400 struct si_screen *sscreen,
6401 struct si_shader *shader,
6402 LLVMTargetMachineRef tm)
6403 {
6404 struct lp_build_tgsi_context *bld_base;
6405 struct lp_build_tgsi_action tmpl = {};
6406
6407 memset(ctx, 0, sizeof(*ctx));
6408 radeon_llvm_context_init(&ctx->radeon_bld, "amdgcn--");
6409 ctx->tm = tm;
6410 ctx->screen = sscreen;
6411 if (shader && shader->selector)
6412 ctx->type = shader->selector->info.processor;
6413 else
6414 ctx->type = -1;
6415 ctx->shader = shader;
6416
6417 ctx->voidt = LLVMVoidTypeInContext(ctx->radeon_bld.gallivm.context);
6418 ctx->i1 = LLVMInt1TypeInContext(ctx->radeon_bld.gallivm.context);
6419 ctx->i8 = LLVMInt8TypeInContext(ctx->radeon_bld.gallivm.context);
6420 ctx->i32 = LLVMInt32TypeInContext(ctx->radeon_bld.gallivm.context);
6421 ctx->i64 = LLVMInt64TypeInContext(ctx->radeon_bld.gallivm.context);
6422 ctx->i128 = LLVMIntTypeInContext(ctx->radeon_bld.gallivm.context, 128);
6423 ctx->f32 = LLVMFloatTypeInContext(ctx->radeon_bld.gallivm.context);
6424 ctx->v16i8 = LLVMVectorType(ctx->i8, 16);
6425 ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
6426 ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
6427 ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
6428 ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
6429
6430 bld_base = &ctx->radeon_bld.soa.bld_base;
6431 if (shader && shader->selector)
6432 bld_base->info = &shader->selector->info;
6433 bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;
6434
6435 bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action;
6436 bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE] = interp_action;
6437 bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET] = interp_action;
6438
6439 bld_base->op_actions[TGSI_OPCODE_TEX] = tex_action;
6440 bld_base->op_actions[TGSI_OPCODE_TEX2] = tex_action;
6441 bld_base->op_actions[TGSI_OPCODE_TXB] = tex_action;
6442 bld_base->op_actions[TGSI_OPCODE_TXB2] = tex_action;
6443 bld_base->op_actions[TGSI_OPCODE_TXD] = tex_action;
6444 bld_base->op_actions[TGSI_OPCODE_TXF] = tex_action;
6445 bld_base->op_actions[TGSI_OPCODE_TXL] = tex_action;
6446 bld_base->op_actions[TGSI_OPCODE_TXL2] = tex_action;
6447 bld_base->op_actions[TGSI_OPCODE_TXP] = tex_action;
6448 bld_base->op_actions[TGSI_OPCODE_TXQ].fetch_args = txq_fetch_args;
6449 bld_base->op_actions[TGSI_OPCODE_TXQ].emit = txq_emit;
6450 bld_base->op_actions[TGSI_OPCODE_TG4] = tex_action;
6451 bld_base->op_actions[TGSI_OPCODE_LODQ] = tex_action;
6452 bld_base->op_actions[TGSI_OPCODE_TXQS].emit = si_llvm_emit_txqs;
6453
6454 bld_base->op_actions[TGSI_OPCODE_LOAD].fetch_args = load_fetch_args;
6455 bld_base->op_actions[TGSI_OPCODE_LOAD].emit = load_emit;
6456 bld_base->op_actions[TGSI_OPCODE_STORE].fetch_args = store_fetch_args;
6457 bld_base->op_actions[TGSI_OPCODE_STORE].emit = store_emit;
6458 bld_base->op_actions[TGSI_OPCODE_RESQ].fetch_args = resq_fetch_args;
6459 bld_base->op_actions[TGSI_OPCODE_RESQ].emit = resq_emit;
6460
6461 tmpl.fetch_args = atomic_fetch_args;
6462 tmpl.emit = atomic_emit;
6463 bld_base->op_actions[TGSI_OPCODE_ATOMUADD] = tmpl;
6464 bld_base->op_actions[TGSI_OPCODE_ATOMUADD].intr_name = "add";
6465 bld_base->op_actions[TGSI_OPCODE_ATOMXCHG] = tmpl;
6466 bld_base->op_actions[TGSI_OPCODE_ATOMXCHG].intr_name = "swap";
6467 bld_base->op_actions[TGSI_OPCODE_ATOMCAS] = tmpl;
6468 bld_base->op_actions[TGSI_OPCODE_ATOMCAS].intr_name = "cmpswap";
6469 bld_base->op_actions[TGSI_OPCODE_ATOMAND] = tmpl;
6470 bld_base->op_actions[TGSI_OPCODE_ATOMAND].intr_name = "and";
6471 bld_base->op_actions[TGSI_OPCODE_ATOMOR] = tmpl;
6472 bld_base->op_actions[TGSI_OPCODE_ATOMOR].intr_name = "or";
6473 bld_base->op_actions[TGSI_OPCODE_ATOMXOR] = tmpl;
6474 bld_base->op_actions[TGSI_OPCODE_ATOMXOR].intr_name = "xor";
6475 bld_base->op_actions[TGSI_OPCODE_ATOMUMIN] = tmpl;
6476 bld_base->op_actions[TGSI_OPCODE_ATOMUMIN].intr_name = "umin";
6477 bld_base->op_actions[TGSI_OPCODE_ATOMUMAX] = tmpl;
6478 bld_base->op_actions[TGSI_OPCODE_ATOMUMAX].intr_name = "umax";
6479 bld_base->op_actions[TGSI_OPCODE_ATOMIMIN] = tmpl;
6480 bld_base->op_actions[TGSI_OPCODE_ATOMIMIN].intr_name = "smin";
6481 bld_base->op_actions[TGSI_OPCODE_ATOMIMAX] = tmpl;
6482 bld_base->op_actions[TGSI_OPCODE_ATOMIMAX].intr_name = "smax";
6483
6484 bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit;
6485
6486 bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy;
6487 bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy;
6488 bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy;
6489 bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy;
6490
6491 bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_llvm_emit_vertex;
6492 bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive;
6493 bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier;
6494
6495 bld_base->op_actions[TGSI_OPCODE_MAX].emit = build_tgsi_intrinsic_nomem;
6496 bld_base->op_actions[TGSI_OPCODE_MAX].intr_name = "llvm.maxnum.f32";
6497 bld_base->op_actions[TGSI_OPCODE_MIN].emit = build_tgsi_intrinsic_nomem;
6498 bld_base->op_actions[TGSI_OPCODE_MIN].intr_name = "llvm.minnum.f32";
6499 }
6500
6501 int si_compile_tgsi_shader(struct si_screen *sscreen,
6502 LLVMTargetMachineRef tm,
6503 struct si_shader *shader,
6504 bool is_monolithic,
6505 struct pipe_debug_callback *debug)
6506 {
6507 struct si_shader_selector *sel = shader->selector;
6508 struct si_shader_context ctx;
6509 struct lp_build_tgsi_context *bld_base;
6510 LLVMModuleRef mod;
6511 int r = 0;
6512
6513 /* Dump TGSI code before doing TGSI->LLVM conversion in case the
6514 * conversion fails. */
6515 if (r600_can_dump_shader(&sscreen->b, sel->info.processor) &&
6516 !(sscreen->b.debug_flags & DBG_NO_TGSI)) {
6517 if (is_monolithic)
6518 si_dump_shader_key(sel->type, &shader->key, stderr);
6519 tgsi_dump(sel->tokens, 0);
6520 si_dump_streamout(&sel->so);
6521 }
6522
6523 si_init_shader_ctx(&ctx, sscreen, shader, tm);
6524 ctx.is_monolithic = is_monolithic;
6525
6526 shader->info.uses_instanceid = sel->info.uses_instanceid;
6527
6528 bld_base = &ctx.radeon_bld.soa.bld_base;
6529 ctx.radeon_bld.load_system_value = declare_system_value;
6530
6531 switch (ctx.type) {
6532 case PIPE_SHADER_VERTEX:
6533 ctx.radeon_bld.load_input = declare_input_vs;
6534 if (shader->key.vs.as_ls)
6535 bld_base->emit_epilogue = si_llvm_emit_ls_epilogue;
6536 else if (shader->key.vs.as_es)
6537 bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
6538 else
6539 bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
6540 break;
6541 case PIPE_SHADER_TESS_CTRL:
6542 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs;
6543 bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = fetch_output_tcs;
6544 bld_base->emit_store = store_output_tcs;
6545 bld_base->emit_epilogue = si_llvm_emit_tcs_epilogue;
6546 break;
6547 case PIPE_SHADER_TESS_EVAL:
6548 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes;
6549 if (shader->key.tes.as_es)
6550 bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
6551 else
6552 bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
6553 break;
6554 case PIPE_SHADER_GEOMETRY:
6555 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs;
6556 bld_base->emit_epilogue = si_llvm_emit_gs_epilogue;
6557 break;
6558 case PIPE_SHADER_FRAGMENT:
6559 ctx.radeon_bld.load_input = declare_input_fs;
6560 if (is_monolithic)
6561 bld_base->emit_epilogue = si_llvm_emit_fs_epilogue;
6562 else
6563 bld_base->emit_epilogue = si_llvm_return_fs_outputs;
6564 break;
6565 case PIPE_SHADER_COMPUTE:
6566 ctx.radeon_bld.declare_memory_region = declare_compute_memory;
6567 break;
6568 default:
6569 assert(!"Unsupported shader type");
6570 return -1;
6571 }
6572
6573 create_meta_data(&ctx);
6574 create_function(&ctx);
6575 preload_constants(&ctx);
6576 preload_shader_buffers(&ctx);
6577 preload_samplers(&ctx);
6578 preload_images(&ctx);
6579 preload_streamout_buffers(&ctx);
6580 preload_ring_buffers(&ctx);
6581
6582 if (ctx.is_monolithic && sel->type == PIPE_SHADER_FRAGMENT &&
6583 shader->key.ps.prolog.poly_stipple) {
6584 LLVMValueRef list = LLVMGetParam(ctx.radeon_bld.main_fn,
6585 SI_PARAM_RW_BUFFERS);
6586 si_llvm_emit_polygon_stipple(&ctx, list,
6587 SI_PARAM_POS_FIXED_PT);
6588 }
6589
6590 if (ctx.type == PIPE_SHADER_GEOMETRY) {
6591 int i;
6592 for (i = 0; i < 4; i++) {
6593 ctx.gs_next_vertex[i] =
6594 lp_build_alloca(bld_base->base.gallivm,
6595 ctx.i32, "");
6596 }
6597 }
6598
6599 if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) {
6600 fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
6601 goto out;
6602 }
6603
6604 si_llvm_build_ret(&ctx, ctx.return_value);
6605 mod = bld_base->base.gallivm->module;
6606
6607 /* Dump LLVM IR before any optimization passes */
6608 if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
6609 r600_can_dump_shader(&sscreen->b, ctx.type))
6610 LLVMDumpModule(mod);
6611
6612 radeon_llvm_finalize_module(&ctx.radeon_bld);
6613
6614 r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm,
6615 mod, debug, ctx.type, "TGSI shader");
6616 if (r) {
6617 fprintf(stderr, "LLVM failed to compile shader\n");
6618 goto out;
6619 }
6620
6621 radeon_llvm_dispose(&ctx.radeon_bld);
6622
6623 /* Add the scratch offset to input SGPRs. */
6624 if (shader->config.scratch_bytes_per_wave)
6625 shader->info.num_input_sgprs += 1; /* scratch byte offset */
6626
6627 /* Calculate the number of fragment input VGPRs. */
6628 if (ctx.type == PIPE_SHADER_FRAGMENT) {
6629 shader->info.num_input_vgprs = 0;
6630 shader->info.face_vgpr_index = -1;
6631
6632 if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr))
6633 shader->info.num_input_vgprs += 2;
6634 if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr))
6635 shader->info.num_input_vgprs += 2;
6636 if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr))
6637 shader->info.num_input_vgprs += 2;
6638 if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr))
6639 shader->info.num_input_vgprs += 3;
6640 if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr))
6641 shader->info.num_input_vgprs += 2;
6642 if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr))
6643 shader->info.num_input_vgprs += 2;
6644 if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr))
6645 shader->info.num_input_vgprs += 2;
6646 if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr))
6647 shader->info.num_input_vgprs += 1;
6648 if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr))
6649 shader->info.num_input_vgprs += 1;
6650 if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr))
6651 shader->info.num_input_vgprs += 1;
6652 if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr))
6653 shader->info.num_input_vgprs += 1;
6654 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr))
6655 shader->info.num_input_vgprs += 1;
6656 if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) {
6657 shader->info.face_vgpr_index = shader->info.num_input_vgprs;
6658 shader->info.num_input_vgprs += 1;
6659 }
6660 if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr))
6661 shader->info.num_input_vgprs += 1;
6662 if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr))
6663 shader->info.num_input_vgprs += 1;
6664 if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr))
6665 shader->info.num_input_vgprs += 1;
6666 }
6667
6668 if (ctx.type == PIPE_SHADER_GEOMETRY) {
6669 shader->gs_copy_shader = CALLOC_STRUCT(si_shader);
6670 shader->gs_copy_shader->selector = shader->selector;
6671 ctx.shader = shader->gs_copy_shader;
6672 if ((r = si_generate_gs_copy_shader(sscreen, &ctx,
6673 shader, debug))) {
6674 free(shader->gs_copy_shader);
6675 shader->gs_copy_shader = NULL;
6676 goto out;
6677 }
6678 }
6679
6680 out:
6681 for (int i = 0; i < SI_NUM_CONST_BUFFERS; i++)
6682 FREE(ctx.constants[i]);
6683 return r;
6684 }
6685
6686 /**
6687 * Create, compile and return a shader part (prolog or epilog).
6688 *
6689 * \param sscreen screen
6690 * \param list list of shader parts of the same category
6691 * \param key shader part key
6692 * \param tm LLVM target machine
6693 * \param debug debug callback
6694 * \param compile the callback responsible for compilation
6695 * \return non-NULL on success
6696 */
6697 static struct si_shader_part *
6698 si_get_shader_part(struct si_screen *sscreen,
6699 struct si_shader_part **list,
6700 union si_shader_part_key *key,
6701 LLVMTargetMachineRef tm,
6702 struct pipe_debug_callback *debug,
6703 bool (*compile)(struct si_screen *,
6704 LLVMTargetMachineRef,
6705 struct pipe_debug_callback *,
6706 struct si_shader_part *))
6707 {
6708 struct si_shader_part *result;
6709
6710 pipe_mutex_lock(sscreen->shader_parts_mutex);
6711
6712 /* Find existing. */
6713 for (result = *list; result; result = result->next) {
6714 if (memcmp(&result->key, key, sizeof(*key)) == 0) {
6715 pipe_mutex_unlock(sscreen->shader_parts_mutex);
6716 return result;
6717 }
6718 }
6719
6720 /* Compile a new one. */
6721 result = CALLOC_STRUCT(si_shader_part);
6722 result->key = *key;
6723 if (!compile(sscreen, tm, debug, result)) {
6724 FREE(result);
6725 pipe_mutex_unlock(sscreen->shader_parts_mutex);
6726 return NULL;
6727 }
6728
6729 result->next = *list;
6730 *list = result;
6731 pipe_mutex_unlock(sscreen->shader_parts_mutex);
6732 return result;
6733 }
6734
6735 /**
6736 * Create a vertex shader prolog.
6737 *
6738 * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
6739 * All inputs are returned unmodified. The vertex load indices are
6740 * stored after them, which will used by the API VS for fetching inputs.
6741 *
6742 * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
6743 * input_v0,
6744 * input_v1,
6745 * input_v2,
6746 * input_v3,
6747 * (VertexID + BaseVertex),
6748 * (InstanceID + StartInstance),
6749 * (InstanceID / 2 + StartInstance)
6750 */
6751 static bool si_compile_vs_prolog(struct si_screen *sscreen,
6752 LLVMTargetMachineRef tm,
6753 struct pipe_debug_callback *debug,
6754 struct si_shader_part *out)
6755 {
6756 union si_shader_part_key *key = &out->key;
6757 struct si_shader shader = {};
6758 struct si_shader_context ctx;
6759 struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
6760 LLVMTypeRef *params, *returns;
6761 LLVMValueRef ret, func;
6762 int last_sgpr, num_params, num_returns, i;
6763 bool status = true;
6764
6765 si_init_shader_ctx(&ctx, sscreen, &shader, tm);
6766 ctx.type = PIPE_SHADER_VERTEX;
6767 ctx.param_vertex_id = key->vs_prolog.num_input_sgprs;
6768 ctx.param_instance_id = key->vs_prolog.num_input_sgprs + 3;
6769
6770 /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
6771 params = alloca((key->vs_prolog.num_input_sgprs + 4) *
6772 sizeof(LLVMTypeRef));
6773 returns = alloca((key->vs_prolog.num_input_sgprs + 4 +
6774 key->vs_prolog.last_input + 1) *
6775 sizeof(LLVMTypeRef));
6776 num_params = 0;
6777 num_returns = 0;
6778
6779 /* Declare input and output SGPRs. */
6780 num_params = 0;
6781 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
6782 params[num_params++] = ctx.i32;
6783 returns[num_returns++] = ctx.i32;
6784 }
6785 last_sgpr = num_params - 1;
6786
6787 /* 4 preloaded VGPRs (outputs must be floats) */
6788 for (i = 0; i < 4; i++) {
6789 params[num_params++] = ctx.i32;
6790 returns[num_returns++] = ctx.f32;
6791 }
6792
6793 /* Vertex load indices. */
6794 for (i = 0; i <= key->vs_prolog.last_input; i++)
6795 returns[num_returns++] = ctx.f32;
6796
6797 /* Create the function. */
6798 si_create_function(&ctx, returns, num_returns, params,
6799 num_params, -1, last_sgpr);
6800 func = ctx.radeon_bld.main_fn;
6801
6802 /* Copy inputs to outputs. This should be no-op, as the registers match,
6803 * but it will prevent the compiler from overwriting them unintentionally.
6804 */
6805 ret = ctx.return_value;
6806 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
6807 LLVMValueRef p = LLVMGetParam(func, i);
6808 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
6809 }
6810 for (i = num_params - 4; i < num_params; i++) {
6811 LLVMValueRef p = LLVMGetParam(func, i);
6812 p = LLVMBuildBitCast(gallivm->builder, p, ctx.f32, "");
6813 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
6814 }
6815
6816 /* Compute vertex load indices from instance divisors. */
6817 for (i = 0; i <= key->vs_prolog.last_input; i++) {
6818 unsigned divisor = key->vs_prolog.states.instance_divisors[i];
6819 LLVMValueRef index;
6820
6821 if (divisor) {
6822 /* InstanceID / Divisor + StartInstance */
6823 index = get_instance_index_for_fetch(&ctx.radeon_bld,
6824 SI_SGPR_START_INSTANCE,
6825 divisor);
6826 } else {
6827 /* VertexID + BaseVertex */
6828 index = LLVMBuildAdd(gallivm->builder,
6829 LLVMGetParam(func, ctx.param_vertex_id),
6830 LLVMGetParam(func, SI_SGPR_BASE_VERTEX), "");
6831 }
6832
6833 index = LLVMBuildBitCast(gallivm->builder, index, ctx.f32, "");
6834 ret = LLVMBuildInsertValue(gallivm->builder, ret, index,
6835 num_params++, "");
6836 }
6837
6838 /* Compile. */
6839 si_llvm_build_ret(&ctx, ret);
6840 radeon_llvm_finalize_module(&ctx.radeon_bld);
6841
6842 if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
6843 gallivm->module, debug, ctx.type,
6844 "Vertex Shader Prolog"))
6845 status = false;
6846
6847 radeon_llvm_dispose(&ctx.radeon_bld);
6848 return status;
6849 }
6850
/**
 * Compile the vertex shader epilog. This is also used by the tessellation
 * evaluation shader compiled as VS.
 *
 * The input is PrimitiveID.
 *
 * If PrimitiveID is required by the pixel shader, export it.
 * Otherwise, do nothing.
 */
static bool si_compile_vs_epilog(struct si_screen *sscreen,
				 LLVMTargetMachineRef tm,
				 struct pipe_debug_callback *debug,
				 struct si_shader_part *out)
{
	union si_shader_part_key *key = &out->key;
	struct si_shader_context ctx;
	/* Only the address is taken here; ctx is initialized below. */
	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
	struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
	LLVMTypeRef params[5];
	int num_params, i;
	bool status = true;

	si_init_shader_ctx(&ctx, sscreen, NULL, tm);
	ctx.type = PIPE_SHADER_VERTEX;

	/* Declare input VGPRs. When PrimitiveID isn't exported, the epilog
	 * takes no parameters at all. */
	num_params = key->vs_epilog.states.export_prim_id ?
		     (VS_EPILOG_PRIMID_LOC + 1) : 0;
	assert(num_params <= ARRAY_SIZE(params));

	for (i = 0; i < num_params; i++)
		params[i] = ctx.f32;

	/* Create the function. */
	si_create_function(&ctx, NULL, 0, params, num_params,
			   -1, -1);

	/* Emit exports. */
	if (key->vs_epilog.states.export_prim_id) {
		struct lp_build_context *base = &bld_base->base;
		struct lp_build_context *uint = &bld_base->uint_bld;
		LLVMValueRef args[9];

		args[0] = lp_build_const_int32(base->gallivm, 0x0); /* enabled channels */
		args[1] = uint->zero; /* whether the EXEC mask is valid */
		args[2] = uint->zero; /* DONE bit */
		args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_PARAM +
					       key->vs_epilog.prim_id_param_offset);
		args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */
		args[5] = LLVMGetParam(ctx.radeon_bld.main_fn,
				       VS_EPILOG_PRIMID_LOC); /* X */
		args[6] = uint->undef; /* Y */
		args[7] = uint->undef; /* Z */
		args[8] = uint->undef; /* W */

		lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
				   LLVMVoidTypeInContext(base->gallivm->context),
				   args, 9, 0);
	}

	/* Compile. */
	LLVMBuildRetVoid(gallivm->builder);
	radeon_llvm_finalize_module(&ctx.radeon_bld);

	if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
			    gallivm->module, debug, ctx.type,
			    "Vertex Shader Epilog"))
		status = false;

	radeon_llvm_dispose(&ctx.radeon_bld);
	return status;
}
6923
6924 /**
6925 * Create & compile a vertex shader epilog. This a helper used by VS and TES.
6926 */
6927 static bool si_get_vs_epilog(struct si_screen *sscreen,
6928 LLVMTargetMachineRef tm,
6929 struct si_shader *shader,
6930 struct pipe_debug_callback *debug,
6931 struct si_vs_epilog_bits *states)
6932 {
6933 union si_shader_part_key epilog_key;
6934
6935 memset(&epilog_key, 0, sizeof(epilog_key));
6936 epilog_key.vs_epilog.states = *states;
6937
6938 /* Set up the PrimitiveID output. */
6939 if (shader->key.vs.epilog.export_prim_id) {
6940 unsigned index = shader->selector->info.num_outputs;
6941 unsigned offset = shader->info.nr_param_exports++;
6942
6943 epilog_key.vs_epilog.prim_id_param_offset = offset;
6944 assert(index < ARRAY_SIZE(shader->info.vs_output_param_offset));
6945 shader->info.vs_output_param_offset[index] = offset;
6946 }
6947
6948 shader->epilog = si_get_shader_part(sscreen, &sscreen->vs_epilogs,
6949 &epilog_key, tm, debug,
6950 si_compile_vs_epilog);
6951 return shader->epilog != NULL;
6952 }
6953
6954 /**
6955 * Select and compile (or reuse) vertex shader parts (prolog & epilog).
6956 */
6957 static bool si_shader_select_vs_parts(struct si_screen *sscreen,
6958 LLVMTargetMachineRef tm,
6959 struct si_shader *shader,
6960 struct pipe_debug_callback *debug)
6961 {
6962 struct tgsi_shader_info *info = &shader->selector->info;
6963 union si_shader_part_key prolog_key;
6964 unsigned i;
6965
6966 /* Get the prolog. */
6967 memset(&prolog_key, 0, sizeof(prolog_key));
6968 prolog_key.vs_prolog.states = shader->key.vs.prolog;
6969 prolog_key.vs_prolog.num_input_sgprs = shader->info.num_input_sgprs;
6970 prolog_key.vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
6971
6972 /* The prolog is a no-op if there are no inputs. */
6973 if (info->num_inputs) {
6974 shader->prolog =
6975 si_get_shader_part(sscreen, &sscreen->vs_prologs,
6976 &prolog_key, tm, debug,
6977 si_compile_vs_prolog);
6978 if (!shader->prolog)
6979 return false;
6980 }
6981
6982 /* Get the epilog. */
6983 if (!shader->key.vs.as_es && !shader->key.vs.as_ls &&
6984 !si_get_vs_epilog(sscreen, tm, shader, debug,
6985 &shader->key.vs.epilog))
6986 return false;
6987
6988 /* Set the instanceID flag. */
6989 for (i = 0; i < info->num_inputs; i++)
6990 if (prolog_key.vs_prolog.states.instance_divisors[i])
6991 shader->info.uses_instanceid = true;
6992
6993 return true;
6994 }
6995
6996 /**
6997 * Select and compile (or reuse) TES parts (epilog).
6998 */
6999 static bool si_shader_select_tes_parts(struct si_screen *sscreen,
7000 LLVMTargetMachineRef tm,
7001 struct si_shader *shader,
7002 struct pipe_debug_callback *debug)
7003 {
7004 if (shader->key.tes.as_es)
7005 return true;
7006
7007 /* TES compiled as VS. */
7008 return si_get_vs_epilog(sscreen, tm, shader, debug,
7009 &shader->key.tes.epilog);
7010 }
7011
/**
 * Compile the TCS epilog. This writes tessellation factors to memory based on
 * the output primitive type of the tessellator (determined by TES).
 */
static bool si_compile_tcs_epilog(struct si_screen *sscreen,
				  LLVMTargetMachineRef tm,
				  struct pipe_debug_callback *debug,
				  struct si_shader_part *out)
{
	union si_shader_part_key *key = &out->key;
	struct si_shader shader = {};
	struct si_shader_context ctx;
	/* Only the address is taken here; ctx is initialized below. */
	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
	struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
	LLVMTypeRef params[16];
	LLVMValueRef func;
	int last_array_pointer, last_sgpr, num_params;
	bool status = true;

	si_init_shader_ctx(&ctx, sscreen, &shader, tm);
	ctx.type = PIPE_SHADER_TESS_CTRL;
	/* Make the epilog states visible via ctx.shader->key. */
	shader.key.tcs.epilog = key->tcs_epilog.states;

	/* Declare inputs. Only RW_BUFFERS and TESS_FACTOR_OFFSET are used. */
	params[SI_PARAM_RW_BUFFERS] = const_array(ctx.v16i8, SI_NUM_RW_BUFFERS);
	last_array_pointer = SI_PARAM_RW_BUFFERS;
	params[SI_PARAM_CONST_BUFFERS] = ctx.i64;
	params[SI_PARAM_SAMPLERS] = ctx.i64;
	params[SI_PARAM_IMAGES] = ctx.i64;
	params[SI_PARAM_SHADER_BUFFERS] = ctx.i64;
	params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx.i32;
	params[SI_PARAM_TCS_OUT_OFFSETS] = ctx.i32;
	params[SI_PARAM_TCS_OUT_LAYOUT] = ctx.i32;
	params[SI_PARAM_TCS_IN_LAYOUT] = ctx.i32;
	params[ctx.param_oc_lds = SI_PARAM_TCS_OC_LDS] = ctx.i32;
	params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx.i32;
	last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET;
	num_params = last_sgpr + 1;

	/* Three VGPR inputs follow the SGPRs. */
	params[num_params++] = ctx.i32; /* patch index within the wave (REL_PATCH_ID) */
	params[num_params++] = ctx.i32; /* invocation ID within the patch */
	params[num_params++] = ctx.i32; /* LDS offset where tess factors should be loaded from */

	/* Create the function. */
	si_create_function(&ctx, NULL, 0, params, num_params,
			   last_array_pointer, last_sgpr);
	declare_tess_lds(&ctx);
	func = ctx.radeon_bld.main_fn;

	/* Pass the three VGPRs declared above straight through. */
	si_write_tess_factors(bld_base,
			      LLVMGetParam(func, last_sgpr + 1),
			      LLVMGetParam(func, last_sgpr + 2),
			      LLVMGetParam(func, last_sgpr + 3));

	/* Compile. */
	LLVMBuildRetVoid(gallivm->builder);
	radeon_llvm_finalize_module(&ctx.radeon_bld);

	if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
			    gallivm->module, debug, ctx.type,
			    "Tessellation Control Shader Epilog"))
		status = false;

	radeon_llvm_dispose(&ctx.radeon_bld);
	return status;
}
7078
7079 /**
7080 * Select and compile (or reuse) TCS parts (epilog).
7081 */
7082 static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
7083 LLVMTargetMachineRef tm,
7084 struct si_shader *shader,
7085 struct pipe_debug_callback *debug)
7086 {
7087 union si_shader_part_key epilog_key;
7088
7089 /* Get the epilog. */
7090 memset(&epilog_key, 0, sizeof(epilog_key));
7091 epilog_key.tcs_epilog.states = shader->key.tcs.epilog;
7092
7093 shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
7094 &epilog_key, tm, debug,
7095 si_compile_tcs_epilog);
7096 return shader->epilog != NULL;
7097 }
7098
/**
 * Compile the pixel shader prolog. This handles:
 * - two-side color selection and interpolation
 * - overriding interpolation parameters for the API PS
 * - polygon stippling
 *
 * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
 * overridden by other states. (e.g. per-sample interpolation)
 * Interpolated colors are stored after the preloaded VGPRs.
 */
static bool si_compile_ps_prolog(struct si_screen *sscreen,
				 LLVMTargetMachineRef tm,
				 struct pipe_debug_callback *debug,
				 struct si_shader_part *out)
{
	union si_shader_part_key *key = &out->key;
	struct si_shader shader = {};
	struct si_shader_context ctx;
	/* Only the address is taken here; ctx is initialized below. */
	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
	LLVMTypeRef *params;
	LLVMValueRef ret, func;
	int last_sgpr, num_params, num_returns, i, num_color_channels;
	bool status = true;

	si_init_shader_ctx(&ctx, sscreen, &shader, tm);
	ctx.type = PIPE_SHADER_FRAGMENT;
	shader.key.ps.prolog = key->ps_prolog.states;

	/* Number of inputs + 8 color elements. */
	params = alloca((key->ps_prolog.num_input_sgprs +
			 key->ps_prolog.num_input_vgprs + 8) *
			sizeof(LLVMTypeRef));

	/* Declare inputs. */
	num_params = 0;
	for (i = 0; i < key->ps_prolog.num_input_sgprs; i++)
		params[num_params++] = ctx.i32;
	last_sgpr = num_params - 1;

	for (i = 0; i < key->ps_prolog.num_input_vgprs; i++)
		params[num_params++] = ctx.f32;

	/* Declare outputs (same as inputs + add colors if needed).
	 * The "params" array doubles as the return-type list: entries
	 * 0..num_params-1 mirror the inputs, and the interpolated color
	 * channels are appended after them. */
	num_returns = num_params;
	num_color_channels = util_bitcount(key->ps_prolog.colors_read);
	for (i = 0; i < num_color_channels; i++)
		params[num_returns++] = ctx.f32;

	/* Create the function. */
	si_create_function(&ctx, params, num_returns, params,
			   num_params, -1, last_sgpr);
	func = ctx.radeon_bld.main_fn;

	/* Copy inputs to outputs. This should be no-op, as the registers match,
	 * but it will prevent the compiler from overwriting them unintentionally.
	 */
	ret = ctx.return_value;
	for (i = 0; i < num_params; i++) {
		LLVMValueRef p = LLVMGetParam(func, i);
		ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
	}

	/* Polygon stippling. */
	if (key->ps_prolog.states.poly_stipple) {
		/* POS_FIXED_PT is always last. */
		unsigned pos = key->ps_prolog.num_input_sgprs +
			       key->ps_prolog.num_input_vgprs - 1;
		LLVMValueRef ptr[2], list;

		/* Get the pointer to rw buffers. */
		ptr[0] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS);
		ptr[1] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS_HI);
		list = lp_build_gather_values(gallivm, ptr, 2);
		list = LLVMBuildBitCast(gallivm->builder, list, ctx.i64, "");
		list = LLVMBuildIntToPtr(gallivm->builder, list,
					 const_array(ctx.v16i8, SI_NUM_RW_BUFFERS), "");

		si_llvm_emit_polygon_stipple(&ctx, list, pos);
	}

	/* Interpolate colors. One iteration per COLOR semantic (up to 2). */
	for (i = 0; i < 2; i++) {
		unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
		unsigned face_vgpr = key->ps_prolog.num_input_sgprs +
				     key->ps_prolog.face_vgpr_index;
		LLVMValueRef interp[2], color[4];
		LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL;

		if (!writemask)
			continue;

		/* If the interpolation qualifier is not CONSTANT (-1). */
		if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
			unsigned interp_vgpr = key->ps_prolog.num_input_sgprs +
					       key->ps_prolog.color_interp_vgpr_index[i];

			/* Load the (i,j) barycentric pair for this color. */
			interp[0] = LLVMGetParam(func, interp_vgpr);
			interp[1] = LLVMGetParam(func, interp_vgpr + 1);
			interp_ij = lp_build_gather_values(gallivm, interp, 2);
			interp_ij = LLVMBuildBitCast(gallivm->builder, interp_ij,
						     ctx.v2i32, "");
		}

		/* Use the absolute location of the input. */
		prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);

		if (key->ps_prolog.states.color_two_side) {
			face = LLVMGetParam(func, face_vgpr);
			face = LLVMBuildBitCast(gallivm->builder, face, ctx.i32, "");
		}

		interp_fs_input(&ctx,
				key->ps_prolog.color_attr_index[i],
				TGSI_SEMANTIC_COLOR, i,
				key->ps_prolog.num_interp_inputs,
				key->ps_prolog.colors_read, interp_ij,
				prim_mask, face, color);

		/* Append each written channel to the prolog's return value. */
		while (writemask) {
			unsigned chan = u_bit_scan(&writemask);
			ret = LLVMBuildInsertValue(gallivm->builder, ret, color[chan],
						   num_params++, "");
		}
	}

	/* Force per-sample interpolation.
	 *
	 * The interpolation VGPRs after the input SGPRs are laid out as
	 * (inferred from the offsets used below — NOTE(review): confirm
	 * against the SPI hardware register docs):
	 *   base+0..1   PERSP_SAMPLE (i,j)
	 *   base+2..3   PERSP_CENTER (i,j)
	 *   base+4..5   PERSP_CENTROID (i,j)
	 *   base+6..7   LINEAR_SAMPLE (i,j)
	 *   base+8..9   LINEAR_CENTER (i,j)
	 *   base+10..11 LINEAR_CENTROID (i,j)
	 */
	if (key->ps_prolog.states.force_persp_sample_interp) {
		unsigned i, base = key->ps_prolog.num_input_sgprs;
		LLVMValueRef persp_sample[2];

		/* Read PERSP_SAMPLE. */
		for (i = 0; i < 2; i++)
			persp_sample[i] = LLVMGetParam(func, base + i);
		/* Overwrite PERSP_CENTER. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   persp_sample[i], base + 2 + i, "");
		/* Overwrite PERSP_CENTROID. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   persp_sample[i], base + 4 + i, "");
	}
	if (key->ps_prolog.states.force_linear_sample_interp) {
		unsigned i, base = key->ps_prolog.num_input_sgprs;
		LLVMValueRef linear_sample[2];

		/* Read LINEAR_SAMPLE. */
		for (i = 0; i < 2; i++)
			linear_sample[i] = LLVMGetParam(func, base + 6 + i);
		/* Overwrite LINEAR_CENTER. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   linear_sample[i], base + 8 + i, "");
		/* Overwrite LINEAR_CENTROID. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   linear_sample[i], base + 10 + i, "");
	}

	/* Tell LLVM to insert WQM instruction sequence when needed. */
	if (key->ps_prolog.wqm) {
		LLVMAddTargetDependentFunctionAttr(func,
						   "amdgpu-ps-wqm-outputs", "");
	}

	/* Compile. */
	si_llvm_build_ret(&ctx, ret);
	radeon_llvm_finalize_module(&ctx.radeon_bld);

	if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
			    gallivm->module, debug, ctx.type,
			    "Fragment Shader Prolog"))
		status = false;

	radeon_llvm_dispose(&ctx.radeon_bld);
	return status;
}
7276
/**
 * Compile the pixel shader epilog. This handles everything that must be
 * emulated for pixel shader exports. (alpha-test, format conversions, etc)
 */
static bool si_compile_ps_epilog(struct si_screen *sscreen,
				 LLVMTargetMachineRef tm,
				 struct pipe_debug_callback *debug,
				 struct si_shader_part *out)
{
	union si_shader_part_key *key = &out->key;
	struct si_shader shader = {};
	struct si_shader_context ctx;
	/* Only the address is taken here; ctx is initialized below. */
	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
	struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
	/* 16 SGPR slots + 8 colors * 4 channels + Z/stencil/samplemask */
	LLVMTypeRef params[16+8*4+3];
	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
	int last_array_pointer, last_sgpr, num_params, i;
	bool status = true;

	si_init_shader_ctx(&ctx, sscreen, &shader, tm);
	ctx.type = PIPE_SHADER_FRAGMENT;
	shader.key.ps.epilog = key->ps_epilog.states;

	/* Declare input SGPRs. */
	params[SI_PARAM_RW_BUFFERS] = ctx.i64;
	params[SI_PARAM_CONST_BUFFERS] = ctx.i64;
	params[SI_PARAM_SAMPLERS] = ctx.i64;
	params[SI_PARAM_IMAGES] = ctx.i64;
	params[SI_PARAM_SHADER_BUFFERS] = ctx.i64;
	params[SI_PARAM_ALPHA_REF] = ctx.f32;
	last_array_pointer = -1;
	last_sgpr = SI_PARAM_ALPHA_REF;

	/* Declare input VGPRs: one per written color channel, plus the
	 * optional Z, stencil and samplemask values. */
	num_params = (last_sgpr + 1) +
		     util_bitcount(key->ps_epilog.colors_written) * 4 +
		     key->ps_epilog.writes_z +
		     key->ps_epilog.writes_stencil +
		     key->ps_epilog.writes_samplemask;

	num_params = MAX2(num_params,
			  last_sgpr + 1 + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);

	assert(num_params <= ARRAY_SIZE(params));

	for (i = last_sgpr + 1; i < num_params; i++)
		params[i] = ctx.f32;

	/* Create the function. */
	si_create_function(&ctx, NULL, 0, params, num_params,
			   last_array_pointer, last_sgpr);
	/* Disable elimination of unused inputs. */
	radeon_llvm_add_attribute(ctx.radeon_bld.main_fn,
				  "InitialPSInputAddr", 0xffffff);

	/* Process colors. */
	unsigned vgpr = last_sgpr + 1;
	unsigned colors_written = key->ps_epilog.colors_written;
	int last_color_export = -1;

	/* Find the last color export. Only relevant when nothing else
	 * (Z/stencil/samplemask) will be exported after the colors. */
	if (!key->ps_epilog.writes_z &&
	    !key->ps_epilog.writes_stencil &&
	    !key->ps_epilog.writes_samplemask) {
		unsigned spi_format = key->ps_epilog.states.spi_shader_col_format;

		/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
		if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
			/* Just set this if any of the colorbuffers are enabled. */
			if (spi_format &
			    ((1llu << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
				last_color_export = 0;
		} else {
			for (i = 0; i < 8; i++)
				if (colors_written & (1 << i) &&
				    (spi_format >> (i * 4)) & 0xf)
					last_color_export = i;
		}
	}

	/* Export each written MRT; consume 4 VGPRs per color. */
	while (colors_written) {
		LLVMValueRef color[4];
		int mrt = u_bit_scan(&colors_written);

		for (i = 0; i < 4; i++)
			color[i] = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);

		si_export_mrt_color(bld_base, color, mrt,
				    num_params - 1,
				    mrt == last_color_export);
	}

	/* Process depth, stencil, samplemask. */
	if (key->ps_epilog.writes_z)
		depth = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
	if (key->ps_epilog.writes_stencil)
		stencil = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
	if (key->ps_epilog.writes_samplemask)
		samplemask = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);

	/* If nothing at all was exported, emit a null export. */
	if (depth || stencil || samplemask)
		si_export_mrt_z(bld_base, depth, stencil, samplemask);
	else if (last_color_export == -1)
		si_export_null(bld_base);

	/* Compile. */
	LLVMBuildRetVoid(gallivm->builder);
	radeon_llvm_finalize_module(&ctx.radeon_bld);

	if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
			    gallivm->module, debug, ctx.type,
			    "Fragment Shader Epilog"))
		status = false;

	radeon_llvm_dispose(&ctx.radeon_bld);
	return status;
}
7394
/**
 * Select and compile (or reuse) pixel shader parts (prolog & epilog).
 *
 * Builds the prolog and epilog part keys from the shader key and TGSI info,
 * fetches (or compiles) the matching parts from the screen-wide caches, and
 * then fixes up SPI_PS_INPUT_ENA so the enabled input weights match what the
 * prolog/epilog actually read.
 *
 * \return true on success, false if a part failed to compile.
 */
static bool si_shader_select_ps_parts(struct si_screen *sscreen,
				      LLVMTargetMachineRef tm,
				      struct si_shader *shader,
				      struct pipe_debug_callback *debug)
{
	struct tgsi_shader_info *info = &shader->selector->info;
	union si_shader_part_key prolog_key;
	union si_shader_part_key epilog_key;
	unsigned i;

	/* Get the prolog. */
	memset(&prolog_key, 0, sizeof(prolog_key));
	prolog_key.ps_prolog.states = shader->key.ps.prolog;
	prolog_key.ps_prolog.colors_read = info->colors_read;
	prolog_key.ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
	prolog_key.ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
	/* The prolog needs WQM when the main part uses derivatives and the
	 * prolog itself does interpolation work (color loads or forced
	 * per-sample interpolation). */
	prolog_key.ps_prolog.wqm = info->uses_derivatives &&
		(prolog_key.ps_prolog.colors_read ||
		 prolog_key.ps_prolog.states.force_persp_sample_interp ||
		 prolog_key.ps_prolog.states.force_linear_sample_interp);

	if (info->colors_read) {
		unsigned *color = shader->selector->color_attr_index;

		if (shader->key.ps.prolog.color_two_side) {
			/* BCOLORs are stored after the last input. */
			prolog_key.ps_prolog.num_interp_inputs = info->num_inputs;
			prolog_key.ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
			shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
		}

		/* Describe how each of the two color inputs is interpolated
		 * and which input VGPR pair carries its I,J weights. */
		for (i = 0; i < 2; i++) {
			unsigned location = info->input_interpolate_loc[color[i]];

			if (!(info->colors_read & (0xf << i*4)))
				continue;

			prolog_key.ps_prolog.color_attr_index[i] = color[i];

			/* The color_interp_vgpr_index values below follow the
			 * hw order of the I,J weight pairs (2 VGPRs each):
			 * persp sample/center/centroid = 0/2/4,
			 * linear sample/center/centroid = 6/8/10. */
			switch (info->input_interpolate[color[i]]) {
			case TGSI_INTERPOLATE_CONSTANT:
				/* -1: no interpolation weights needed. */
				prolog_key.ps_prolog.color_interp_vgpr_index[i] = -1;
				break;
			case TGSI_INTERPOLATE_PERSPECTIVE:
			case TGSI_INTERPOLATE_COLOR:
				/* Force the interpolation location for colors here. */
				if (shader->key.ps.prolog.force_persp_sample_interp)
					location = TGSI_INTERPOLATE_LOC_SAMPLE;

				switch (location) {
				case TGSI_INTERPOLATE_LOC_SAMPLE:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 0;
					shader->config.spi_ps_input_ena |=
						S_0286CC_PERSP_SAMPLE_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTER:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 2;
					shader->config.spi_ps_input_ena |=
						S_0286CC_PERSP_CENTER_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTROID:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 4;
					shader->config.spi_ps_input_ena |=
						S_0286CC_PERSP_CENTROID_ENA(1);
					break;
				default:
					assert(0);
				}
				break;
			case TGSI_INTERPOLATE_LINEAR:
				/* Force the interpolation location for colors here. */
				if (shader->key.ps.prolog.force_linear_sample_interp)
					location = TGSI_INTERPOLATE_LOC_SAMPLE;

				switch (location) {
				case TGSI_INTERPOLATE_LOC_SAMPLE:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 6;
					shader->config.spi_ps_input_ena |=
						S_0286CC_LINEAR_SAMPLE_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTER:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 8;
					shader->config.spi_ps_input_ena |=
						S_0286CC_LINEAR_CENTER_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTROID:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 10;
					shader->config.spi_ps_input_ena |=
						S_0286CC_LINEAR_CENTROID_ENA(1);
					break;
				default:
					assert(0);
				}
				break;
			default:
				assert(0);
			}
		}
	}

	/* The prolog is a no-op if these aren't set. */
	if (prolog_key.ps_prolog.colors_read ||
	    prolog_key.ps_prolog.states.force_persp_sample_interp ||
	    prolog_key.ps_prolog.states.force_linear_sample_interp ||
	    prolog_key.ps_prolog.states.poly_stipple) {
		shader->prolog =
			si_get_shader_part(sscreen, &sscreen->ps_prologs,
					   &prolog_key, tm, debug,
					   si_compile_ps_prolog);
		if (!shader->prolog)
			return false;
	}

	/* Get the epilog. */
	memset(&epilog_key, 0, sizeof(epilog_key));
	epilog_key.ps_epilog.colors_written = info->colors_written;
	epilog_key.ps_epilog.writes_z = info->writes_z;
	epilog_key.ps_epilog.writes_stencil = info->writes_stencil;
	epilog_key.ps_epilog.writes_samplemask = info->writes_samplemask;
	epilog_key.ps_epilog.states = shader->key.ps.epilog;

	/* The epilog is always required (it contains the exports). */
	shader->epilog =
		si_get_shader_part(sscreen, &sscreen->ps_epilogs,
				   &epilog_key, tm, debug,
				   si_compile_ps_epilog);
	if (!shader->epilog)
		return false;

	/* Enable POS_FIXED_PT if polygon stippling is enabled. */
	if (shader->key.ps.prolog.poly_stipple) {
		shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
		assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
	}

	/* Set up the enable bits for per-sample shading if needed.
	 * When interpolation is forced to the sample location, request only
	 * the sample weights and drop the center/centroid ones. */
	if (shader->key.ps.prolog.force_persp_sample_interp &&
	    (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
	     G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
		shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
	}
	if (shader->key.ps.prolog.force_linear_sample_interp &&
	    (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
	     G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
		shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
	}

	/* POS_W_FLOAT requires that one of the perspective weights is enabled. */
	if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
	    !(shader->config.spi_ps_input_ena & 0xf)) {
		shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
		assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
	}

	/* At least one pair of interpolation weights must be enabled. */
	if (!(shader->config.spi_ps_input_ena & 0x7f)) {
		shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
		assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
	}

	/* The sample mask input is always enabled, because the API shader always
	 * passes it through to the epilog. Disable it here if it's unused.
	 */
	if (!shader->key.ps.epilog.poly_line_smoothing &&
	    !shader->selector->info.reads_samplemask)
		shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;

	return true;
}
7570
7571 static void si_fix_num_sgprs(struct si_shader *shader)
7572 {
7573 unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
7574
7575 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
7576 }
7577
/**
 * Create a shader variant: either compile the TGSI shader monolithically,
 * or reuse the precompiled main part and attach prolog/epilog parts.
 * The resulting binary is dumped (for debugging) and uploaded to the GPU.
 *
 * \return 0 on success, non-zero on compile or upload failure.
 */
int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
		     struct si_shader *shader,
		     struct pipe_debug_callback *debug)
{
	struct si_shader *mainp = shader->selector->main_shader_part;
	int r;

	/* LS, ES, VS are compiled on demand if the main part hasn't been
	 * compiled for that stage.
	 */
	if (!mainp ||
	    (shader->selector->type == PIPE_SHADER_VERTEX &&
	     (shader->key.vs.as_es != mainp->key.vs.as_es ||
	      shader->key.vs.as_ls != mainp->key.vs.as_ls)) ||
	    (shader->selector->type == PIPE_SHADER_TESS_EVAL &&
	     shader->key.tes.as_es != mainp->key.tes.as_es) ||
	    (shader->selector->type == PIPE_SHADER_TESS_CTRL &&
	     shader->key.tcs.epilog.inputs_to_copy) ||
	    shader->selector->type == PIPE_SHADER_COMPUTE) {
		/* Monolithic shader (compiled as a whole, has many variants,
		 * may take a long time to compile).
		 */
		r = si_compile_tgsi_shader(sscreen, tm, shader, true, debug);
		if (r)
			return r;
	} else {
		/* The shader consists of 2-3 parts:
		 *
		 * - the middle part is the user shader, it has 1 variant only
		 *   and it was compiled during the creation of the shader
		 *   selector
		 * - the prolog part is inserted at the beginning
		 * - the epilog part is inserted at the end
		 *
		 * The prolog and epilog have many (but simple) variants.
		 */

		/* Copy the compiled TGSI shader data over. */
		shader->is_binary_shared = true; /* mainp owns the binary */
		shader->binary = mainp->binary;
		shader->config = mainp->config;
		shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
		shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
		shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
		memcpy(shader->info.vs_output_param_offset,
		       mainp->info.vs_output_param_offset,
		       sizeof(mainp->info.vs_output_param_offset));
		shader->info.uses_instanceid = mainp->info.uses_instanceid;
		shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
		shader->info.nr_param_exports = mainp->info.nr_param_exports;

		/* Select prologs and/or epilogs. */
		switch (shader->selector->type) {
		case PIPE_SHADER_VERTEX:
			if (!si_shader_select_vs_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_TESS_CTRL:
			if (!si_shader_select_tcs_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_TESS_EVAL:
			if (!si_shader_select_tes_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_FRAGMENT:
			if (!si_shader_select_ps_parts(sscreen, tm, shader, debug))
				return -1;

			/* Make sure we have at least as many VGPRs as there
			 * are allocated inputs.
			 */
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->info.num_input_vgprs);
			break;
		}

		/* Update SGPR and VGPR counts.
		 * The combined shader must reserve the maximum each part needs. */
		if (shader->prolog) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->prolog->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->prolog->config.num_vgprs);
		}
		if (shader->epilog) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->epilog->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->epilog->config.num_vgprs);
		}
	}

	si_fix_num_sgprs(shader);
	si_shader_dump(sscreen, shader, debug, shader->selector->info.processor,
		       stderr);

	/* Upload. */
	r = si_shader_binary_upload(sscreen, shader);
	if (r) {
		fprintf(stderr, "LLVM failed to upload shader\n");
		return r;
	}

	return 0;
}
7683
7684 void si_shader_destroy(struct si_shader *shader)
7685 {
7686 if (shader->gs_copy_shader) {
7687 si_shader_destroy(shader->gs_copy_shader);
7688 FREE(shader->gs_copy_shader);
7689 }
7690
7691 if (shader->scratch_bo)
7692 r600_resource_reference(&shader->scratch_bo, NULL);
7693
7694 r600_resource_reference(&shader->bo, NULL);
7695
7696 if (!shader->is_binary_shared)
7697 radeon_shader_binary_clean(&shader->binary);
7698 }