src/gallium/drivers/radeonsi/si_shader.c

   1 /*
   2  * Copyright 2012 Advanced Micro Devices, Inc.
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * on the rights to use, copy, modify, merge, publish, distribute, sub
   8  * license, and/or sell copies of the Software, and to permit persons to whom
   9  * the Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  18  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
  19  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  20  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  21  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  22  *
  23  * Authors:
  24  *      Tom Stellard <thomas.stellard@amd.com>
  25  *      Michel Dänzer <michel.daenzer@amd.com>
  26  *      Christian König <christian.koenig@amd.com>
  27  */
  28
  29 #include "gallivm/lp_bld_const.h"
  30 #include "gallivm/lp_bld_gather.h"
  31 #include "gallivm/lp_bld_intr.h"
  32 #include "gallivm/lp_bld_logic.h"
  33 #include "gallivm/lp_bld_arit.h"
  34 #include "gallivm/lp_bld_bitarit.h"
  35 #include "gallivm/lp_bld_flow.h"
  36 #include "radeon/r600_cs.h"
  37 #include "radeon/radeon_llvm.h"
  38 #include "radeon/radeon_elf_util.h"
  39 #include "radeon/radeon_llvm_emit.h"
  40 #include "util/u_memory.h"
  41 #include "util/u_pstipple.h"
  42 #include "util/u_string.h"
  43 #include "tgsi/tgsi_parse.h"
  44 #include "tgsi/tgsi_build.h"
  45 #include "tgsi/tgsi_util.h"
  46 #include "tgsi/tgsi_dump.h"
  47
  48 #include "si_pipe.h"
  49 #include "si_shader.h"
  50 #include "sid.h"
  51
  52 #include <errno.h>
  53
  54 static const char *scratch_rsrc_dword0_symbol =
  55         "SCRATCH_RSRC_DWORD0";
  56
  57 static const char *scratch_rsrc_dword1_symbol =
  58         "SCRATCH_RSRC_DWORD1";
  59
  60 struct si_shader_output_values
  61 {
  62         LLVMValueRef values[4];
  63         unsigned name;
  64         unsigned sid;
  65 };
  66
  67 struct si_shader_context
  68 {
  69         struct radeon_llvm_context radeon_bld;
  70         struct si_shader *shader;
  71         struct si_screen *screen;
  72
  73         unsigned type; /* PIPE_SHADER_* specifies the type of shader. */
  74         bool is_gs_copy_shader;
  75
  76         /* Whether to generate the optimized shader variant compiled as a whole
  77          * (without a prolog and epilog)
  78          */
  79         bool is_monolithic;
  80
  81         int param_streamout_config;
  82         int param_streamout_write_index;
  83         int param_streamout_offset[4];
  84         int param_vertex_id;
  85         int param_rel_auto_id;
  86         int param_vs_prim_id;
  87         int param_instance_id;
  88         int param_vertex_index0;
  89         int param_tes_u;
  90         int param_tes_v;
  91         int param_tes_rel_patch_id;
  92         int param_tes_patch_id;
  93         int param_es2gs_offset;
  94         int param_oc_lds;
  95
  96         /* Sets a bit if the dynamic HS control word was 0x80000000. The bit is
  97          * 0x800000 for VS, 0x1 for ES.
  98          */
  99         int param_tess_offchip;
 100
 101         LLVMTargetMachineRef tm;
 102
 103         unsigned uniform_md_kind;
 104         LLVMValueRef const_md;
 105         LLVMValueRef empty_md;
 106         LLVMValueRef const_buffers[SI_NUM_CONST_BUFFERS];
 107         LLVMValueRef lds;
 108         LLVMValueRef *constants[SI_NUM_CONST_BUFFERS];
 109         LLVMValueRef shader_buffers[SI_NUM_SHADER_BUFFERS];
 110         LLVMValueRef sampler_views[SI_NUM_SAMPLERS];
 111         LLVMValueRef sampler_states[SI_NUM_SAMPLERS];
 112         LLVMValueRef fmasks[SI_NUM_SAMPLERS];
 113         LLVMValueRef images[SI_NUM_IMAGES];
 114         LLVMValueRef so_buffers[4];
 115         LLVMValueRef esgs_ring;
 116         LLVMValueRef gsvs_ring[4];
 117         LLVMValueRef gs_next_vertex[4];
 118         LLVMValueRef return_value;
 119
 120         LLVMTypeRef voidt;
 121         LLVMTypeRef i1;
 122         LLVMTypeRef i8;
 123         LLVMTypeRef i32;
 124         LLVMTypeRef i64;
 125         LLVMTypeRef i128;
 126         LLVMTypeRef f32;
 127         LLVMTypeRef v16i8;
 128         LLVMTypeRef v2i32;
 129         LLVMTypeRef v4i32;
 130         LLVMTypeRef v4f32;
 131         LLVMTypeRef v8i32;
 132
 133         LLVMValueRef shared_memory;
 134 };
 135
 136 static struct si_shader_context *si_shader_context(
 137         struct lp_build_tgsi_context *bld_base)
 138 {
 139         return (struct si_shader_context *)bld_base;
 140 }
 141
 142 static void si_init_shader_ctx(struct si_shader_context *ctx,
 143                                struct si_screen *sscreen,
 144                                struct si_shader *shader,
 145                                LLVMTargetMachineRef tm);
 146
 147 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
 148                                  struct lp_build_tgsi_context *bld_base,
 149                                  struct lp_build_emit_data *emit_data);
 150
 151 /* Ideally pass the sample mask input to the PS epilog as v13, which
 152  * is its usual location, so that the shader doesn't have to add v_mov.
 153  */
 154 #define PS_EPILOG_SAMPLEMASK_MIN_LOC 13
 155
 156 /* The VS location of the PrimitiveID input is the same in the epilog,
 157  * so that the main shader part doesn't have to move it.
 158  */
 159 #define VS_EPILOG_PRIMID_LOC 2
 160
 161 #define PERSPECTIVE_BASE 0
 162 #define LINEAR_BASE 9
 163
 164 #define SAMPLE_OFFSET 0
 165 #define CENTER_OFFSET 2
 166 #define CENTROID_OFSET 4
 167
 168 #define USE_SGPR_MAX_SUFFIX_LEN 5
 169 #define CONST_ADDR_SPACE 2
 170 #define LOCAL_ADDR_SPACE 3
 171 #define USER_SGPR_ADDR_SPACE 8
 172
 173
 174 #define SENDMSG_GS 2
 175 #define SENDMSG_GS_DONE 3
 176
 177 #define SENDMSG_GS_OP_NOP      (0 << 4)
 178 #define SENDMSG_GS_OP_CUT      (1 << 4)
 179 #define SENDMSG_GS_OP_EMIT     (2 << 4)
 180 #define SENDMSG_GS_OP_EMIT_CUT (3 << 4)
 181
 182 /**
 183  * Returns a unique index for a semantic name and index. The index must be
 184  * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
 185  * calculated.
 186  */
 187 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
 188 {
 189         switch (semantic_name) {
 190         case TGSI_SEMANTIC_POSITION:
 191                 return 0;
 192         case TGSI_SEMANTIC_PSIZE:
 193                 return 1;
 194         case TGSI_SEMANTIC_CLIPDIST:
 195                 assert(index <= 1);
 196                 return 2 + index;
 197         case TGSI_SEMANTIC_GENERIC:
 198                 if (index <= 63-4)
 199                         return 4 + index;
 200                 else
 201                         /* same explanation as in the default statement,
 202                          * the only user hitting this is st/nine.
 203                          */
 204                         return 0;
 205
 206         /* patch indices are completely separate and thus start from 0 */
 207         case TGSI_SEMANTIC_TESSOUTER:
 208                 return 0;
 209         case TGSI_SEMANTIC_TESSINNER:
 210                 return 1;
 211         case TGSI_SEMANTIC_PATCH:
 212                 return 2 + index;
 213
 214         default:
 215                 /* Don't fail here. The result of this function is only used
 216                  * for LS, TCS, TES, and GS, where legacy GL semantics can't
 217                  * occur, but this function is called for all vertex shaders
 218                  * before it's known whether LS will be compiled or not.
 219                  */
 220                 return 0;
 221         }
 222 }
 223
 224 /**
 225  * Get the value of a shader input parameter and extract a bitfield.
 226  */
 227 static LLVMValueRef unpack_param(struct si_shader_context *ctx,
 228                                  unsigned param, unsigned rshift,
 229                                  unsigned bitwidth)
 230 {
 231         struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
 232         LLVMValueRef value = LLVMGetParam(ctx->radeon_bld.main_fn,
 233                                           param);
 234
 235         if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
 236                 value = bitcast(&ctx->radeon_bld.soa.bld_base,
 237                                 TGSI_TYPE_UNSIGNED, value);
 238
 239         if (rshift)
 240                 value = LLVMBuildLShr(gallivm->builder, value,
 241                                       lp_build_const_int32(gallivm, rshift), "");
 242
 243         if (rshift + bitwidth < 32) {
 244                 unsigned mask = (1 << bitwidth) - 1;
 245                 value = LLVMBuildAnd(gallivm->builder, value,
 246                                      lp_build_const_int32(gallivm, mask), "");
 247         }
 248
 249         return value;
 250 }
 251
 252 static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx)
 253 {
 254         switch (ctx->type) {
 255         case PIPE_SHADER_TESS_CTRL:
 256                 return unpack_param(ctx, SI_PARAM_REL_IDS, 0, 8);
 257
 258         case PIPE_SHADER_TESS_EVAL:
 259                 return LLVMGetParam(ctx->radeon_bld.main_fn,
 260                                     ctx->param_tes_rel_patch_id);
 261
 262         default:
 263                 assert(0);
 264                 return NULL;
 265         }
 266 }
 267
 268 /* Tessellation shaders pass outputs to the next shader using LDS.
 269  *
 270  * LS outputs = TCS inputs
 271  * TCS outputs = TES inputs
 272  *
 273  * The LDS layout is:
 274  * - TCS inputs for patch 0
 275  * - TCS inputs for patch 1
 276  * - TCS inputs for patch 2             = get_tcs_in_current_patch_offset (if RelPatchID==2)
 277  * - ...
 278  * - TCS outputs for patch 0            = get_tcs_out_patch0_offset
 279  * - Per-patch TCS outputs for patch 0  = get_tcs_out_patch0_patch_data_offset
 280  * - TCS outputs for patch 1
 281  * - Per-patch TCS outputs for patch 1
 282  * - TCS outputs for patch 2            = get_tcs_out_current_patch_offset (if RelPatchID==2)
 283  * - Per-patch TCS outputs for patch 2  = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
 284  * - ...
 285  *
 286  * All three shaders VS(LS), TCS, TES share the same LDS space.
 287  */
 288
 289 static LLVMValueRef
 290 get_tcs_in_patch_stride(struct si_shader_context *ctx)
 291 {
 292         if (ctx->type == PIPE_SHADER_VERTEX)
 293                 return unpack_param(ctx, SI_PARAM_LS_OUT_LAYOUT, 0, 13);
 294         else if (ctx->type == PIPE_SHADER_TESS_CTRL)
 295                 return unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 0, 13);
 296         else {
 297                 assert(0);
 298                 return NULL;
 299         }
 300 }
 301
 302 static LLVMValueRef
 303 get_tcs_out_patch_stride(struct si_shader_context *ctx)
 304 {
 305         return unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 0, 13);
 306 }
 307
 308 static LLVMValueRef
 309 get_tcs_out_patch0_offset(struct si_shader_context *ctx)
 310 {
 311         return lp_build_mul_imm(&ctx->radeon_bld.soa.bld_base.uint_bld,
 312                                 unpack_param(ctx,
 313                                              SI_PARAM_TCS_OUT_OFFSETS,
 314                                              0, 16),
 315                                 4);
 316 }
 317
 318 static LLVMValueRef
 319 get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
 320 {
 321         return lp_build_mul_imm(&ctx->radeon_bld.soa.bld_base.uint_bld,
 322                                 unpack_param(ctx,
 323                                              SI_PARAM_TCS_OUT_OFFSETS,
 324                                              16, 16),
 325                                 4);
 326 }
 327
 328 static LLVMValueRef
 329 get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
 330 {
 331         struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
 332         LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
 333         LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
 334
 335         return LLVMBuildMul(gallivm->builder, patch_stride, rel_patch_id, "");
 336 }
 337
 338 static LLVMValueRef
 339 get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
 340 {
 341         struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
 342         LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
 343         LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
 344         LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
 345
 346         return LLVMBuildAdd(gallivm->builder, patch0_offset,
 347                             LLVMBuildMul(gallivm->builder, patch_stride,
 348                                          rel_patch_id, ""),
 349                             "");
 350 }
 351
 352 static LLVMValueRef
 353 get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
 354 {
 355         struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
 356         LLVMValueRef patch0_patch_data_offset =
 357                 get_tcs_out_patch0_patch_data_offset(ctx);
 358         LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
 359         LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
 360
 361         return LLVMBuildAdd(gallivm->builder, patch0_patch_data_offset,
 362                             LLVMBuildMul(gallivm->builder, patch_stride,
 363                                          rel_patch_id, ""),
 364                             "");
 365 }
 366
 367 static void build_indexed_store(struct si_shader_context *ctx,
 368                                 LLVMValueRef base_ptr, LLVMValueRef index,
 369                                 LLVMValueRef value)
 370 {
 371         struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
 372         struct gallivm_state *gallivm = bld_base->base.gallivm;
 373         LLVMValueRef indices[2], pointer;
 374
 375         indices[0] = bld_base->uint_bld.zero;
 376         indices[1] = index;
 377
 378         pointer = LLVMBuildGEP(gallivm->builder, base_ptr, indices, 2, "");
 379         LLVMBuildStore(gallivm->builder, value, pointer);
 380 }
 381
 382 /**
 383  * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad.
 384  * It's equivalent to doing a load from &base_ptr[index].
 385  *
 386  * \param base_ptr  Where the array starts.
 387  * \param index     The element index into the array.
 388  * \param uniform   Whether the base_ptr and index can be assumed to be
 389  *                  dynamically uniform
 390  */
 391 static LLVMValueRef build_indexed_load(struct si_shader_context *ctx,
 392                                        LLVMValueRef base_ptr, LLVMValueRef index,
 393                                        bool uniform)
 394 {
 395         struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
 396         struct gallivm_state *gallivm = bld_base->base.gallivm;
 397         LLVMValueRef indices[2], pointer;
 398
 399         indices[0] = bld_base->uint_bld.zero;
 400         indices[1] = index;
 401
 402         pointer = LLVMBuildGEP(gallivm->builder, base_ptr, indices, 2, "");
 403         if (uniform)
 404                 LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md);
 405         return LLVMBuildLoad(gallivm->builder, pointer, "");
 406 }
 407
 408 /**
 409  * Do a load from &base_ptr[index], but also add a flag that it's loading
 410  * a constant from a dynamically uniform index.
 411  */
 412 static LLVMValueRef build_indexed_load_const(
 413         struct si_shader_context *ctx,
 414         LLVMValueRef base_ptr, LLVMValueRef index)
 415 {
 416         LLVMValueRef result = build_indexed_load(ctx, base_ptr, index, true);
 417         LLVMSetMetadata(result, 1, ctx->const_md);
 418         return result;
 419 }
 420
 421 static LLVMValueRef get_instance_index_for_fetch(
 422         struct radeon_llvm_context *radeon_bld,
 423         unsigned param_start_instance, unsigned divisor)
 424 {
 425         struct si_shader_context *ctx =
 426                 si_shader_context(&radeon_bld->soa.bld_base);
 427         struct gallivm_state *gallivm = radeon_bld->soa.bld_base.base.gallivm;
 428
 429         LLVMValueRef result = LLVMGetParam(radeon_bld->main_fn,
 430                                            ctx->param_instance_id);
 431
 432         /* The division must be done before START_INSTANCE is added. */
 433         if (divisor > 1)
 434                 result = LLVMBuildUDiv(gallivm->builder, result,
 435                                 lp_build_const_int32(gallivm, divisor), "");
 436
 437         return LLVMBuildAdd(gallivm->builder, result,
 438                             LLVMGetParam(radeon_bld->main_fn, param_start_instance), "");
 439 }
 440
 441 static void declare_input_vs(
 442         struct radeon_llvm_context *radeon_bld,
 443         unsigned input_index,
 444         const struct tgsi_full_declaration *decl)
 445 {
 446         struct lp_build_context *base = &radeon_bld->soa.bld_base.base;
 447         struct gallivm_state *gallivm = base->gallivm;
 448         struct si_shader_context *ctx =
 449                 si_shader_context(&radeon_bld->soa.bld_base);
 450         unsigned divisor =
 451                 ctx->shader->key.vs.prolog.instance_divisors[input_index];
 452
 453         unsigned chan;
 454
 455         LLVMValueRef t_list_ptr;
 456         LLVMValueRef t_offset;
 457         LLVMValueRef t_list;
 458         LLVMValueRef attribute_offset;
 459         LLVMValueRef buffer_index;
 460         LLVMValueRef args[3];
 461         LLVMValueRef input;
 462
 463         /* Load the T list */
 464         t_list_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_VERTEX_BUFFERS);
 465
 466         t_offset = lp_build_const_int32(gallivm, input_index);
 467
 468         t_list = build_indexed_load_const(ctx, t_list_ptr, t_offset);
 469
 470         /* Build the attribute offset */
 471         attribute_offset = lp_build_const_int32(gallivm, 0);
 472
 473         if (!ctx->is_monolithic) {
 474                 buffer_index = LLVMGetParam(radeon_bld->main_fn,
 475                                             ctx->param_vertex_index0 +
 476                                             input_index);
 477         } else if (divisor) {
 478                 /* Build index from instance ID, start instance and divisor */
 479                 ctx->shader->info.uses_instanceid = true;
 480                 buffer_index = get_instance_index_for_fetch(&ctx->radeon_bld,
 481                                                             SI_PARAM_START_INSTANCE,
 482                                                             divisor);
 483         } else {
 484                 /* Load the buffer index for vertices. */
 485                 LLVMValueRef vertex_id = LLVMGetParam(ctx->radeon_bld.main_fn,
 486                                                       ctx->param_vertex_id);
 487                 LLVMValueRef base_vertex = LLVMGetParam(radeon_bld->main_fn,
 488                                                         SI_PARAM_BASE_VERTEX);
 489                 buffer_index = LLVMBuildAdd(gallivm->builder, base_vertex, vertex_id, "");
 490         }
 491
 492         args[0] = t_list;
 493         args[1] = attribute_offset;
 494         args[2] = buffer_index;
 495         input = lp_build_intrinsic(gallivm->builder,
 496                 "llvm.SI.vs.load.input", ctx->v4f32, args, 3,
 497                 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
 498
 499         /* Break up the vec4 into individual components */
 500         for (chan = 0; chan < 4; chan++) {
 501                 LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
 502                 /* XXX: Use a helper function for this.  There is one in
 503                  * tgsi_llvm.c. */
 504                 ctx->radeon_bld.inputs[radeon_llvm_reg_index_soa(input_index, chan)] =
 505                                 LLVMBuildExtractElement(gallivm->builder,
 506                                 input, llvm_chan, "");
 507         }
 508 }
 509
 510 static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base,
 511                                      unsigned swizzle)
 512 {
 513         struct si_shader_context *ctx = si_shader_context(bld_base);
 514
 515         if (swizzle > 0)
 516                 return bld_base->uint_bld.zero;
 517
 518         switch (ctx->type) {
 519         case PIPE_SHADER_VERTEX:
 520                 return LLVMGetParam(ctx->radeon_bld.main_fn,
 521                                     ctx->param_vs_prim_id);
 522         case PIPE_SHADER_TESS_CTRL:
 523                 return LLVMGetParam(ctx->radeon_bld.main_fn,
 524                                     SI_PARAM_PATCH_ID);
 525         case PIPE_SHADER_TESS_EVAL:
 526                 return LLVMGetParam(ctx->radeon_bld.main_fn,
 527                                     ctx->param_tes_patch_id);
 528         case PIPE_SHADER_GEOMETRY:
 529                 return LLVMGetParam(ctx->radeon_bld.main_fn,
 530                                     SI_PARAM_PRIMITIVE_ID);
 531         default:
 532                 assert(0);
 533                 return bld_base->uint_bld.zero;
 534         }
 535 }
 536
 537 /**
 538  * Return the value of tgsi_ind_register for indexing.
 539  * This is the indirect index with the constant offset added to it.
 540  */
 541 static LLVMValueRef get_indirect_index(struct si_shader_context *ctx,
 542                                        const struct tgsi_ind_register *ind,
 543                                        int rel_index)
 544 {
 545         struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
 546         LLVMValueRef result;
 547
 548         result = ctx->radeon_bld.soa.addr[ind->Index][ind->Swizzle];
 549         result = LLVMBuildLoad(gallivm->builder, result, "");
 550         result = LLVMBuildAdd(gallivm->builder, result,
 551                               lp_build_const_int32(gallivm, rel_index), "");
 552         return result;
 553 }
 554
 555 /**
 556  * Like get_indirect_index, but restricts the return value to a (possibly
 557  * undefined) value inside [0..num).
 558  */
 559 static LLVMValueRef get_bounded_indirect_index(struct si_shader_context *ctx,
 560                                                const struct tgsi_ind_register *ind,
 561                                                int rel_index, unsigned num)
 562 {
 563         struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
 564         LLVMBuilderRef builder = gallivm->builder;
 565         LLVMValueRef result = get_indirect_index(ctx, ind, rel_index);
 566         LLVMValueRef c_max = LLVMConstInt(ctx->i32, num - 1, 0);
 567         LLVMValueRef cc;
 568
 569         /* LLVM 3.8: If indirect resource indexing is used:
 570          * - SI & CIK hang
 571          * - VI crashes
 572          */
 573         if (HAVE_LLVM <= 0x0308)
 574                 return LLVMGetUndef(ctx->i32);
 575
 576         if (util_is_power_of_two(num)) {
 577                 result = LLVMBuildAnd(builder, result, c_max, "");
 578         } else {
 579                 /* In theory, this MAX pattern should result in code that is
 580                  * as good as the bit-wise AND above.
 581                  *
 582                  * In practice, LLVM generates worse code (at the time of
 583                  * writing), because its value tracking is not strong enough.
 584                  */
 585                 cc = LLVMBuildICmp(builder, LLVMIntULE, result, c_max, "");
 586                 result = LLVMBuildSelect(builder, cc, result, c_max, "");
 587         }
 588
 589         return result;
 590 }
 591
 592
 593 /**
 594  * Calculate a dword address given an input or output register and a stride.
 595  */
 596 static LLVMValueRef get_dw_address(struct si_shader_context *ctx,
 597                                    const struct tgsi_full_dst_register *dst,
 598                                    const struct tgsi_full_src_register *src,
 599                                    LLVMValueRef vertex_dw_stride,
 600                                    LLVMValueRef base_addr)
 601 {
 602         struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
 603         struct tgsi_shader_info *info = &ctx->shader->selector->info;
 604         ubyte *name, *index, *array_first;
 605         int first, param;
 606         struct tgsi_full_dst_register reg;
 607
 608         /* Set the register description. The address computation is the same
 609          * for sources and destinations. */
 610         if (src) {
 611                 reg.Register.File = src->Register.File;
 612                 reg.Register.Index = src->Register.Index;
 613                 reg.Register.Indirect = src->Register.Indirect;
 614                 reg.Register.Dimension = src->Register.Dimension;
 615                 reg.Indirect = src->Indirect;
 616                 reg.Dimension = src->Dimension;
 617                 reg.DimIndirect = src->DimIndirect;
 618         } else
 619                 reg = *dst;
 620
 621         /* If the register is 2-dimensional (e.g. an array of vertices
 622          * in a primitive), calculate the base address of the vertex. */
 623         if (reg.Register.Dimension) {
 624                 LLVMValueRef index;
 625
 626                 if (reg.Dimension.Indirect)
 627                         index = get_indirect_index(ctx, &reg.DimIndirect,
 628                                                    reg.Dimension.Index);
 629                 else
 630                         index = lp_build_const_int32(gallivm, reg.Dimension.Index);
 631
 632                 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
 633                                          LLVMBuildMul(gallivm->builder, index,
 634                                                       vertex_dw_stride, ""), "");
 635         }
 636
 637         /* Get information about the register. */
 638         if (reg.Register.File == TGSI_FILE_INPUT) {
 639                 name = info->input_semantic_name;
 640                 index = info->input_semantic_index;
 641                 array_first = info->input_array_first;
 642         } else if (reg.Register.File == TGSI_FILE_OUTPUT) {
 643                 name = info->output_semantic_name;
 644                 index = info->output_semantic_index;
 645                 array_first = info->output_array_first;
 646         } else {
 647                 assert(0);
 648                 return NULL;
 649         }
 650
 651         if (reg.Register.Indirect) {
 652                 /* Add the relative address of the element. */
 653                 LLVMValueRef ind_index;
 654
 655                 if (reg.Indirect.ArrayID)
 656                         first = array_first[reg.Indirect.ArrayID];
 657                 else
 658                         first = reg.Register.Index;
 659
 660                 ind_index = get_indirect_index(ctx, &reg.Indirect,
 661                                            reg.Register.Index - first);
 662
 663                 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
 664                                     LLVMBuildMul(gallivm->builder, ind_index,
 665                                                  lp_build_const_int32(gallivm, 4), ""), "");
 666
 667                 param = si_shader_io_get_unique_index(name[first], index[first]);
 668         } else {
 669                 param = si_shader_io_get_unique_index(name[reg.Register.Index],
 670                                                       index[reg.Register.Index]);
 671         }
 672
 673         /* Add the base address of the element. */
 674         return LLVMBuildAdd(gallivm->builder, base_addr,
 675                             lp_build_const_int32(gallivm, param * 4), "");
 676 }
 677
 678 /* The offchip buffer layout for TCS->TES is
 679  *
 680  * - attribute 0 of patch 0 vertex 0
 681  * - attribute 0 of patch 0 vertex 1
 682  * - attribute 0 of patch 0 vertex 2
 683  *   ...
 684  * - attribute 0 of patch 1 vertex 0
 685  * - attribute 0 of patch 1 vertex 1
 686  *   ...
 687  * - attribute 1 of patch 0 vertex 0
 688  * - attribute 1 of patch 0 vertex 1
 689  *   ...
 690  * - per patch attribute 0 of patch 0
 691  * - per patch attribute 0 of patch 1
 692  *   ...
 693  *
 694  * Note that every attribute has 4 components.
 695  */
 696 static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
 697                                                LLVMValueRef vertex_index,
 698                                                LLVMValueRef param_index)
 699 {
 700         struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
 701         LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
 702         LLVMValueRef param_stride, constant16;
 703
 704         vertices_per_patch = unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 9, 6);
 705         num_patches = unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 0, 9);
 706         total_vertices = LLVMBuildMul(gallivm->builder, vertices_per_patch,
 707                                       num_patches, "");
 708
 709         constant16 = lp_build_const_int32(gallivm, 16);
 710         if (vertex_index) {
 711                 base_addr = LLVMBuildMul(gallivm->builder, get_rel_patch_id(ctx),
 712                                          vertices_per_patch, "");
 713
 714                 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
 715                                          vertex_index, "");
 716
 717                 param_stride = total_vertices;
 718         } else {
 719                 base_addr = get_rel_patch_id(ctx);
 720                 param_stride = num_patches;
 721         }
 722
 723         base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
 724                                  LLVMBuildMul(gallivm->builder, param_index,
 725                                               param_stride, ""), "");
 726
 727         base_addr = LLVMBuildMul(gallivm->builder, base_addr, constant16, "");
 728
 729         if (!vertex_index) {
 730                 LLVMValueRef patch_data_offset =
 731                            unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 16, 16);
 732
 733                 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
 734                                          patch_data_offset, "");
 735         }
 736         return base_addr;
 737 }
 738
 739 static LLVMValueRef get_tcs_tes_buffer_address_from_reg(
 740                                        struct si_shader_context *ctx,
 741                                        const struct tgsi_full_dst_register *dst,
 742                                        const struct tgsi_full_src_register *src)
 743 {
 744         struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
 745         struct tgsi_shader_info *info = &ctx->shader->selector->info;
 746         ubyte *name, *index, *array_first;
 747         struct tgsi_full_src_register reg;
 748         LLVMValueRef vertex_index = NULL;
 749         LLVMValueRef param_index = NULL;
 750         unsigned param_index_base, param_base;
 751
 752         reg = src ? *src : tgsi_full_src_register_from_dst(dst);
 753
 754         if (reg.Register.Dimension) {
 755
 756                 if (reg.Dimension.Indirect)
 757                         vertex_index = get_indirect_index(ctx, &reg.DimIndirect,
 758                                                           reg.Dimension.Index);
 759                 else
 760                         vertex_index = lp_build_const_int32(gallivm,
 761                                                             reg.Dimension.Index);
 762         }
 763
 764         /* Get information about the register. */
 765         if (reg.Register.File == TGSI_FILE_INPUT) {
 766                 name = info->input_semantic_name;
 767                 index = info->input_semantic_index;
 768                 array_first = info->input_array_first;
 769         } else if (reg.Register.File == TGSI_FILE_OUTPUT) {
 770                 name = info->output_semantic_name;
 771                 index = info->output_semantic_index;
 772                 array_first = info->output_array_first;
 773         } else {
 774                 assert(0);
 775                 return NULL;
 776         }
 777
 778         if (reg.Register.Indirect) {
 779                 if (reg.Indirect.ArrayID)
 780                         param_base = array_first[reg.Indirect.ArrayID];
 781                 else
 782                         param_base = reg.Register.Index;
 783
 784                 param_index = get_indirect_index(ctx, &reg.Indirect,
 785                                                  reg.Register.Index - param_base);
 786
 787         } else {
 788                 param_base = reg.Register.Index;
 789                 param_index = lp_build_const_int32(gallivm, 0);
 790         }
 791
 792         param_index_base = si_shader_io_get_unique_index(name[param_base],
 793                                                          index[param_base]);
 794
 795         param_index = LLVMBuildAdd(gallivm->builder, param_index,
 796                                    lp_build_const_int32(gallivm, param_index_base),
 797                                    "");
 798
 799         return get_tcs_tes_buffer_address(ctx, vertex_index, param_index);
 800 }
 801
 802 /* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4.
 803  * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2),
 804  * or v4i32 (num_channels=3,4). */
 805 static void build_tbuffer_store(struct si_shader_context *ctx,
 806                                 LLVMValueRef rsrc,
 807                                 LLVMValueRef vdata,
 808                                 unsigned num_channels,
 809                                 LLVMValueRef vaddr,
 810                                 LLVMValueRef soffset,
 811                                 unsigned inst_offset,
 812                                 unsigned dfmt,
 813                                 unsigned nfmt,
 814                                 unsigned offen,
 815                                 unsigned idxen,
 816                                 unsigned glc,
 817                                 unsigned slc,
 818                                 unsigned tfe)
 819 {
 820         struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
 821         LLVMValueRef args[] = {
 822                 rsrc,
 823                 vdata,
 824                 LLVMConstInt(ctx->i32, num_channels, 0),
 825                 vaddr,
 826                 soffset,
 827                 LLVMConstInt(ctx->i32, inst_offset, 0),
 828                 LLVMConstInt(ctx->i32, dfmt, 0),
 829                 LLVMConstInt(ctx->i32, nfmt, 0),
 830                 LLVMConstInt(ctx->i32, offen, 0),
 831                 LLVMConstInt(ctx->i32, idxen, 0),
 832                 LLVMConstInt(ctx->i32, glc, 0),
 833                 LLVMConstInt(ctx->i32, slc, 0),
 834                 LLVMConstInt(ctx->i32, tfe, 0)
 835         };
 836
 837         /* The instruction offset field has 12 bits */
 838         assert(offen || inst_offset < (1 << 12));
 839
 840         /* The intrinsic is overloaded, we need to add a type suffix for overloading to work. */
 841         unsigned func = CLAMP(num_channels, 1, 3) - 1;
 842         const char *types[] = {"i32", "v2i32", "v4i32"};
 843         char name[256];
 844         snprintf(name, sizeof(name), "llvm.SI.tbuffer.store.%s", types[func]);
 845
 846         lp_build_intrinsic(gallivm->builder, name, ctx->voidt,
 847                            args, ARRAY_SIZE(args), 0);
 848 }
 849
 850 static void build_tbuffer_store_dwords(struct si_shader_context *ctx,
 851                                      LLVMValueRef rsrc,
 852                                      LLVMValueRef vdata,
 853                                      unsigned num_channels,
 854                                      LLVMValueRef vaddr,
 855                                      LLVMValueRef soffset,
 856                                      unsigned inst_offset)
 857 {
 858         static unsigned dfmt[] = {
 859                 V_008F0C_BUF_DATA_FORMAT_32,
 860                 V_008F0C_BUF_DATA_FORMAT_32_32,
 861                 V_008F0C_BUF_DATA_FORMAT_32_32_32,
 862                 V_008F0C_BUF_DATA_FORMAT_32_32_32_32
 863         };
 864         assert(num_channels >= 1 && num_channels <= 4);
 865
 866         build_tbuffer_store(ctx, rsrc, vdata, num_channels, vaddr, soffset,
 867                             inst_offset, dfmt[num_channels-1],
 868                             V_008F0C_BUF_NUM_FORMAT_UINT, 1, 0, 1, 1, 0);
 869 }
 870
 871 static LLVMValueRef build_buffer_load(struct si_shader_context *ctx,
 872                                       LLVMValueRef rsrc,
 873                                       int num_channels,
 874                                       LLVMValueRef vindex,
 875                                       LLVMValueRef voffset,
 876                                       LLVMValueRef soffset,
 877                                       unsigned inst_offset,
 878                                       unsigned glc,
 879                                       unsigned slc)
 880 {
 881         struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
 882         unsigned func = CLAMP(num_channels, 1, 3) - 1;
 883
 884         if (HAVE_LLVM >= 0x309) {
 885                 LLVMValueRef args[] = {
 886                         LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v4i32, ""),
 887                         vindex ? vindex : LLVMConstInt(ctx->i32, 0, 0),
 888                         LLVMConstInt(ctx->i32, inst_offset, 0),
 889                         LLVMConstInt(ctx->i1, glc, 0),
 890                         LLVMConstInt(ctx->i1, slc, 0)
 891                 };
 892
 893                 LLVMTypeRef types[] = {ctx->f32, LLVMVectorType(ctx->f32, 2),
 894                                        ctx->v4f32};
 895                 const char *type_names[] = {"f32", "v2f32", "v4f32"};
 896                 char name[256];
 897
 898                 if (voffset) {
 899                         args[2] = LLVMBuildAdd(gallivm->builder, args[2], voffset,
 900                                                "");
 901                 }
 902
 903                 if (soffset) {
 904                         args[2] = LLVMBuildAdd(gallivm->builder, args[2], soffset,
 905                                                "");
 906                 }
 907
 908                 snprintf(name, sizeof(name), "llvm.amdgcn.buffer.load.%s",
 909                          type_names[func]);
 910
 911                 return lp_build_intrinsic(gallivm->builder, name, types[func], args,
 912                                           ARRAY_SIZE(args), LLVMReadOnlyAttribute |
 913                                           LLVMNoUnwindAttribute);
 914         } else {
 915                 LLVMValueRef args[] = {
 916                         LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v16i8, ""),
 917                         voffset ? voffset : vindex,
 918                         soffset,
 919                         LLVMConstInt(ctx->i32, inst_offset, 0),
 920                         LLVMConstInt(ctx->i32, voffset ? 1 : 0, 0), // offen
 921                         LLVMConstInt(ctx->i32, vindex ? 1 : 0, 0), //idxen
 922                         LLVMConstInt(ctx->i32, glc, 0),
 923                         LLVMConstInt(ctx->i32, slc, 0),
 924                         LLVMConstInt(ctx->i32, 0, 0), // TFE
 925                 };
 926
 927                 LLVMTypeRef types[] = {ctx->i32, LLVMVectorType(ctx->i32, 2),
 928                                        ctx->v4i32};
 929                 const char *type_names[] = {"i32", "v2i32", "v4i32"};
 930                 const char *arg_type = "i32";
 931                 char name[256];
 932
 933                 if (voffset && vindex) {
 934                         LLVMValueRef vaddr[] = {vindex, voffset};
 935
 936                         arg_type = "v2i32";
 937                         args[1] = lp_build_gather_values(gallivm, vaddr, 2);
 938                 }
 939
 940                 snprintf(name, sizeof(name), "llvm.SI.buffer.load.dword.%s.%s",
 941                          type_names[func], arg_type);
 942
 943                 return lp_build_intrinsic(gallivm->builder, name, types[func], args,
 944                                           ARRAY_SIZE(args), LLVMReadOnlyAttribute |
 945                                           LLVMNoUnwindAttribute);
 946         }
 947 }
 948
 949 static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base,
 950                                 enum tgsi_opcode_type type, unsigned swizzle,
 951                                 LLVMValueRef buffer, LLVMValueRef offset,
 952                                 LLVMValueRef base)
 953 {
 954         struct si_shader_context *ctx = si_shader_context(bld_base);
 955         struct gallivm_state *gallivm = bld_base->base.gallivm;
 956         LLVMValueRef value, value2;
 957         LLVMTypeRef llvm_type = tgsi2llvmtype(bld_base, type);
 958         LLVMTypeRef vec_type = LLVMVectorType(llvm_type, 4);
 959
 960         if (swizzle == ~0) {
 961                 value = build_buffer_load(ctx, buffer, 4, NULL, base, offset,
 962                                           0, 1, 0);
 963
 964                 return LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
 965         }
 966
 967         if (!tgsi_type_is_64bit(type)) {
 968                 value = build_buffer_load(ctx, buffer, 4, NULL, base, offset,
 969                                           0, 1, 0);
 970
 971                 value = LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
 972                 return LLVMBuildExtractElement(gallivm->builder, value,
 973                                     lp_build_const_int32(gallivm, swizzle), "");
 974         }
 975
 976         value = build_buffer_load(ctx, buffer, 1, NULL, base, offset,
 977                                   swizzle * 4, 1, 0);
 978
 979         value2 = build_buffer_load(ctx, buffer, 1, NULL, base, offset,
 980                                    swizzle * 4 + 4, 1, 0);
 981
 982         return radeon_llvm_emit_fetch_64bit(bld_base, type, value, value2);
 983 }
 984
 985 /**
 986  * Load from LDS.
 987  *
 988  * \param type          output value type
 989  * \param swizzle       offset (typically 0..3); it can be ~0, which loads a vec4
 990  * \param dw_addr       address in dwords
 991  */
 992 static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
 993                              enum tgsi_opcode_type type, unsigned swizzle,
 994                              LLVMValueRef dw_addr)
 995 {
 996         struct si_shader_context *ctx = si_shader_context(bld_base);
 997         struct gallivm_state *gallivm = bld_base->base.gallivm;
 998         LLVMValueRef value;
 999
1000         if (swizzle == ~0) {
1001                 LLVMValueRef values[TGSI_NUM_CHANNELS];
1002
1003                 for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++)
1004                         values[chan] = lds_load(bld_base, type, chan, dw_addr);
1005
1006                 return lp_build_gather_values(bld_base->base.gallivm, values,
1007                                               TGSI_NUM_CHANNELS);
1008         }
1009
1010         dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
1011                             lp_build_const_int32(gallivm, swizzle));
1012
1013         value = build_indexed_load(ctx, ctx->lds, dw_addr, false);
1014         if (tgsi_type_is_64bit(type)) {
1015                 LLVMValueRef value2;
1016                 dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
1017                                        lp_build_const_int32(gallivm, swizzle + 1));
1018                 value2 = build_indexed_load(ctx, ctx->lds, dw_addr, false);
1019                 return radeon_llvm_emit_fetch_64bit(bld_base, type, value, value2);
1020         }
1021
1022         return LLVMBuildBitCast(gallivm->builder, value,
1023                                 tgsi2llvmtype(bld_base, type), "");
1024 }
1025
1026 /**
1027  * Store to LDS.
1028  *
1029  * \param swizzle       offset (typically 0..3)
1030  * \param dw_addr       address in dwords
1031  * \param value         value to store
1032  */
1033 static void lds_store(struct lp_build_tgsi_context *bld_base,
1034                       unsigned swizzle, LLVMValueRef dw_addr,
1035                       LLVMValueRef value)
1036 {
1037         struct si_shader_context *ctx = si_shader_context(bld_base);
1038         struct gallivm_state *gallivm = bld_base->base.gallivm;
1039
1040         dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
1041                             lp_build_const_int32(gallivm, swizzle));
1042
1043         value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
1044         build_indexed_store(ctx, ctx->lds,
1045                             dw_addr, value);
1046 }
1047
1048 static LLVMValueRef fetch_input_tcs(
1049         struct lp_build_tgsi_context *bld_base,
1050         const struct tgsi_full_src_register *reg,
1051         enum tgsi_opcode_type type, unsigned swizzle)
1052 {
1053         struct si_shader_context *ctx = si_shader_context(bld_base);
1054         LLVMValueRef dw_addr, stride;
1055
1056         stride = unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 13, 8);
1057         dw_addr = get_tcs_in_current_patch_offset(ctx);
1058         dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
1059
1060         return lds_load(bld_base, type, swizzle, dw_addr);
1061 }
1062
1063 static LLVMValueRef fetch_output_tcs(
1064                 struct lp_build_tgsi_context *bld_base,
1065                 const struct tgsi_full_src_register *reg,
1066                 enum tgsi_opcode_type type, unsigned swizzle)
1067 {
1068         struct si_shader_context *ctx = si_shader_context(bld_base);
1069         LLVMValueRef dw_addr, stride;
1070
1071         if (reg->Register.Dimension) {
1072                 stride = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
1073                 dw_addr = get_tcs_out_current_patch_offset(ctx);
1074                 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
1075         } else {
1076                 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
1077                 dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr);
1078         }
1079
1080         return lds_load(bld_base, type, swizzle, dw_addr);
1081 }
1082
1083 static LLVMValueRef fetch_input_tes(
1084         struct lp_build_tgsi_context *bld_base,
1085         const struct tgsi_full_src_register *reg,
1086         enum tgsi_opcode_type type, unsigned swizzle)
1087 {
1088         struct si_shader_context *ctx = si_shader_context(bld_base);
1089         struct gallivm_state *gallivm = bld_base->base.gallivm;
1090         LLVMValueRef rw_buffers, buffer, base, addr;
1091
1092         rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
1093                                   SI_PARAM_RW_BUFFERS);
1094         buffer = build_indexed_load_const(ctx, rw_buffers,
1095                         lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));
1096
1097         base = LLVMGetParam(ctx->radeon_bld.main_fn, ctx->param_oc_lds);
1098         addr = get_tcs_tes_buffer_address_from_reg(ctx, NULL, reg);
1099
1100         return buffer_load(bld_base, type, swizzle, buffer, base, addr);
1101 }
1102
1103 static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
1104                              const struct tgsi_full_instruction *inst,
1105                              const struct tgsi_opcode_info *info,
1106                              LLVMValueRef dst[4])
1107 {
1108         struct si_shader_context *ctx = si_shader_context(bld_base);
1109         struct gallivm_state *gallivm = bld_base->base.gallivm;
1110         const struct tgsi_full_dst_register *reg = &inst->Dst[0];
1111         unsigned chan_index;
1112         LLVMValueRef dw_addr, stride;
1113         LLVMValueRef rw_buffers, buffer, base, buf_addr;
1114         LLVMValueRef values[4];
1115
1116         /* Only handle per-patch and per-vertex outputs here.
1117          * Vectors will be lowered to scalars and this function will be called again.
1118          */
1119         if (reg->Register.File != TGSI_FILE_OUTPUT ||
1120             (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) {
1121                 radeon_llvm_emit_store(bld_base, inst, info, dst);
1122                 return;
1123         }
1124
1125         if (reg->Register.Dimension) {
1126                 stride = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
1127                 dw_addr = get_tcs_out_current_patch_offset(ctx);
1128                 dw_addr = get_dw_address(ctx, reg, NULL, stride, dw_addr);
1129         } else {
1130                 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
1131                 dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr);
1132         }
1133
1134         rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
1135                                   SI_PARAM_RW_BUFFERS);
1136         buffer = build_indexed_load_const(ctx, rw_buffers,
1137                         lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));
1138
1139         base = LLVMGetParam(ctx->radeon_bld.main_fn, ctx->param_oc_lds);
1140         buf_addr = get_tcs_tes_buffer_address_from_reg(ctx, reg, NULL);
1141
1142
1143         TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan_index) {
1144                 LLVMValueRef value = dst[chan_index];
1145
1146                 if (inst->Instruction.Saturate)
1147                         value = radeon_llvm_saturate(bld_base, value);
1148
1149                 lds_store(bld_base, chan_index, dw_addr, value);
1150
1151                 value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
1152                 values[chan_index] = value;
1153
1154                 if (inst->Dst[0].Register.WriteMask != 0xF) {
1155                         build_tbuffer_store_dwords(ctx, buffer, value, 1,
1156                                                    buf_addr, base,
1157                                                    4 * chan_index);
1158                 }
1159         }
1160
1161         if (inst->Dst[0].Register.WriteMask == 0xF) {
1162                 LLVMValueRef value = lp_build_gather_values(bld_base->base.gallivm,
1163                                                             values, 4);
1164                 build_tbuffer_store_dwords(ctx, buffer, value, 4, buf_addr,
1165                                            base, 0);
1166         }
1167 }
1168
1169 static LLVMValueRef fetch_input_gs(
1170         struct lp_build_tgsi_context *bld_base,
1171         const struct tgsi_full_src_register *reg,
1172         enum tgsi_opcode_type type,
1173         unsigned swizzle)
1174 {
1175         struct lp_build_context *base = &bld_base->base;
1176         struct si_shader_context *ctx = si_shader_context(bld_base);
1177         struct si_shader *shader = ctx->shader;
1178         struct lp_build_context *uint = &ctx->radeon_bld.soa.bld_base.uint_bld;
1179         struct gallivm_state *gallivm = base->gallivm;
1180         LLVMValueRef vtx_offset;
1181         LLVMValueRef args[9];
1182         unsigned vtx_offset_param;
1183         struct tgsi_shader_info *info = &shader->selector->info;
1184         unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
1185         unsigned semantic_index = info->input_semantic_index[reg->Register.Index];
1186         unsigned param;
1187         LLVMValueRef value;
1188
1189         if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
1190                 return get_primitive_id(bld_base, swizzle);
1191
1192         if (!reg->Register.Dimension)
1193                 return NULL;
1194
1195         if (swizzle == ~0) {
1196                 LLVMValueRef values[TGSI_NUM_CHANNELS];
1197                 unsigned chan;
1198                 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1199                         values[chan] = fetch_input_gs(bld_base, reg, type, chan);
1200                 }
1201                 return lp_build_gather_values(bld_base->base.gallivm, values,
1202                                               TGSI_NUM_CHANNELS);
1203         }
1204
1205         /* Get the vertex offset parameter */
1206         vtx_offset_param = reg->Dimension.Index;
1207         if (vtx_offset_param < 2) {
1208                 vtx_offset_param += SI_PARAM_VTX0_OFFSET;
1209         } else {
1210                 assert(vtx_offset_param < 6);
1211                 vtx_offset_param += SI_PARAM_VTX2_OFFSET - 2;
1212         }
1213         vtx_offset = lp_build_mul_imm(uint,
1214                                       LLVMGetParam(ctx->radeon_bld.main_fn,
1215                                                    vtx_offset_param),
1216                                       4);
1217
1218         param = si_shader_io_get_unique_index(semantic_name, semantic_index);
1219         args[0] = ctx->esgs_ring;
1220         args[1] = vtx_offset;
1221         args[2] = lp_build_const_int32(gallivm, (param * 4 + swizzle) * 256);
1222         args[3] = uint->zero;
1223         args[4] = uint->one;  /* OFFEN */
1224         args[5] = uint->zero; /* IDXEN */
1225         args[6] = uint->one;  /* GLC */
1226         args[7] = uint->zero; /* SLC */
1227         args[8] = uint->zero; /* TFE */
1228
1229         value = lp_build_intrinsic(gallivm->builder,
1230                                    "llvm.SI.buffer.load.dword.i32.i32",
1231                                    ctx->i32, args, 9,
1232                                    LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
1233         if (tgsi_type_is_64bit(type)) {
1234                 LLVMValueRef value2;
1235                 args[2] = lp_build_const_int32(gallivm, (param * 4 + swizzle + 1) * 256);
1236                 value2 = lp_build_intrinsic(gallivm->builder,
1237                                             "llvm.SI.buffer.load.dword.i32.i32",
1238                                             ctx->i32, args, 9,
1239                                             LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
1240                 return radeon_llvm_emit_fetch_64bit(bld_base, type,
1241                                                     value, value2);
1242         }
1243         return LLVMBuildBitCast(gallivm->builder,
1244                                 value,
1245                                 tgsi2llvmtype(bld_base, type), "");
1246 }
1247
1248 static int lookup_interp_param_index(unsigned interpolate, unsigned location)
1249 {
1250         switch (interpolate) {
1251         case TGSI_INTERPOLATE_CONSTANT:
1252                 return 0;
1253
1254         case TGSI_INTERPOLATE_LINEAR:
1255                 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1256                         return SI_PARAM_LINEAR_SAMPLE;
1257                 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1258                         return SI_PARAM_LINEAR_CENTROID;
1259                 else
1260                         return SI_PARAM_LINEAR_CENTER;
1261                 break;
1262         case TGSI_INTERPOLATE_COLOR:
1263         case TGSI_INTERPOLATE_PERSPECTIVE:
1264                 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1265                         return SI_PARAM_PERSP_SAMPLE;
1266                 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1267                         return SI_PARAM_PERSP_CENTROID;
1268                 else
1269                         return SI_PARAM_PERSP_CENTER;
1270                 break;
1271         default:
1272                 fprintf(stderr, "Warning: Unhandled interpolation mode.\n");
1273                 return -1;
1274         }
1275 }
1276
1277 /* This shouldn't be used by explicit INTERP opcodes. */
1278 static unsigned select_interp_param(struct si_shader_context *ctx,
1279                                     unsigned param)
1280 {
1281         if (!ctx->is_monolithic)
1282                 return param;
1283
1284         /* If the shader doesn't use center/centroid, just return the parameter.
1285          *
1286          * If the shader only uses one set of (i,j), "si_emit_spi_ps_input" can
1287          * switch between center/centroid and sample without shader changes.
1288          */
1289         if (ctx->shader->key.ps.prolog.force_persp_sample_interp) {
1290                 switch (param) {
1291                 case SI_PARAM_PERSP_CENTROID:
1292                 case SI_PARAM_PERSP_CENTER:
1293                         return SI_PARAM_PERSP_SAMPLE;
1294                 }
1295         }
1296         if (ctx->shader->key.ps.prolog.force_linear_sample_interp) {
1297                 switch (param) {
1298                 case SI_PARAM_LINEAR_CENTROID:
1299                 case SI_PARAM_LINEAR_CENTER:
1300                         return SI_PARAM_LINEAR_SAMPLE;
1301                 }
1302         }
1303         if (ctx->shader->key.ps.prolog.force_persp_center_interp) {
1304                 switch (param) {
1305                 case SI_PARAM_PERSP_CENTROID:
1306                 case SI_PARAM_PERSP_SAMPLE:
1307                         return SI_PARAM_PERSP_CENTER;
1308                 }
1309         }
1310         if (ctx->shader->key.ps.prolog.force_linear_center_interp) {
1311                 switch (param) {
1312                 case SI_PARAM_LINEAR_CENTROID:
1313                 case SI_PARAM_LINEAR_SAMPLE:
1314                         return SI_PARAM_PERSP_CENTER;
1315                 }
1316         }
1317
1318         return param;
1319 }
1320
1321 /**
1322  * Interpolate a fragment shader input.
1323  *
1324  * @param ctx           context
1325  * @param input_index           index of the input in hardware
1326  * @param semantic_name         TGSI_SEMANTIC_*
1327  * @param semantic_index        semantic index
1328  * @param num_interp_inputs     number of all interpolated inputs (= BCOLOR offset)
1329  * @param colors_read_mask      color components read (4 bits for each color, 8 bits in total)
1330  * @param interp_param          interpolation weights (i,j)
1331  * @param prim_mask             SI_PARAM_PRIM_MASK
1332  * @param face                  SI_PARAM_FRONT_FACE
1333  * @param result                the return value (4 components)
1334  */
1335 static void interp_fs_input(struct si_shader_context *ctx,
1336                             unsigned input_index,
1337                             unsigned semantic_name,
1338                             unsigned semantic_index,
1339                             unsigned num_interp_inputs,
1340                             unsigned colors_read_mask,
1341                             LLVMValueRef interp_param,
1342                             LLVMValueRef prim_mask,
1343                             LLVMValueRef face,
1344                             LLVMValueRef result[4])
1345 {
1346         struct lp_build_context *base = &ctx->radeon_bld.soa.bld_base.base;
1347         struct lp_build_context *uint = &ctx->radeon_bld.soa.bld_base.uint_bld;
1348         struct gallivm_state *gallivm = base->gallivm;
1349         const char *intr_name;
1350         LLVMValueRef attr_number;
1351
1352         unsigned chan;
1353
1354         attr_number = lp_build_const_int32(gallivm, input_index);
1355
1356         /* fs.constant returns the param from the middle vertex, so it's not
1357          * really useful for flat shading. It's meant to be used for custom
1358          * interpolation (but the intrinsic can't fetch from the other two
1359          * vertices).
1360          *
1361          * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
1362          * to do the right thing. The only reason we use fs.constant is that
1363          * fs.interp cannot be used on integers, because they can be equal
1364          * to NaN.
1365          */
1366         intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant";
1367
1368         if (semantic_name == TGSI_SEMANTIC_COLOR &&
1369             ctx->shader->key.ps.prolog.color_two_side) {
1370                 LLVMValueRef args[4];
1371                 LLVMValueRef is_face_positive;
1372                 LLVMValueRef back_attr_number;
1373
1374                 /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
1375                  * otherwise it's at offset "num_inputs".
1376                  */
1377                 unsigned back_attr_offset = num_interp_inputs;
1378                 if (semantic_index == 1 && colors_read_mask & 0xf)
1379                         back_attr_offset += 1;
1380
1381                 back_attr_number = lp_build_const_int32(gallivm, back_attr_offset);
1382
1383                 is_face_positive = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
1384                                                  face, uint->zero, "");
1385
1386                 args[2] = prim_mask;
1387                 args[3] = interp_param;
1388                 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1389                         LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
1390                         LLVMValueRef front, back;
1391
1392                         args[0] = llvm_chan;
1393                         args[1] = attr_number;
1394                         front = lp_build_intrinsic(gallivm->builder, intr_name,
1395                                                 ctx->f32, args, args[3] ? 4 : 3,
1396                                                 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
1397
1398                         args[1] = back_attr_number;
1399                         back = lp_build_intrinsic(gallivm->builder, intr_name,
1400                                                ctx->f32, args, args[3] ? 4 : 3,
1401                                                LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
1402
1403                         result[chan] = LLVMBuildSelect(gallivm->builder,
1404                                                 is_face_positive,
1405                                                 front,
1406                                                 back,
1407                                                 "");
1408                 }
1409         } else if (semantic_name == TGSI_SEMANTIC_FOG) {
1410                 LLVMValueRef args[4];
1411
1412                 args[0] = uint->zero;
1413                 args[1] = attr_number;
1414                 args[2] = prim_mask;
1415                 args[3] = interp_param;
1416                 result[0] = lp_build_intrinsic(gallivm->builder, intr_name,
1417                                         ctx->f32, args, args[3] ? 4 : 3,
1418                                         LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
1419                 result[1] =
1420                 result[2] = lp_build_const_float(gallivm, 0.0f);
1421                 result[3] = lp_build_const_float(gallivm, 1.0f);
1422         } else {
1423                 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1424                         LLVMValueRef args[4];
1425                         LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
1426
1427                         args[0] = llvm_chan;
1428                         args[1] = attr_number;
1429                         args[2] = prim_mask;
1430                         args[3] = interp_param;
1431                         result[chan] = lp_build_intrinsic(gallivm->builder, intr_name,
1432                                                 ctx->f32, args, args[3] ? 4 : 3,
1433                                                 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
1434                 }
1435         }
1436 }
1437
1438 /* LLVMGetParam with bc_optimize resolved. */
1439 static LLVMValueRef get_interp_param(struct si_shader_context *ctx,
1440                                      int interp_param_idx)
1441 {
1442         LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
1443         LLVMValueRef main_fn = ctx->radeon_bld.main_fn;
1444         LLVMValueRef param = NULL;
1445
1446         /* Handle PRIM_MASK[31] (bc_optimize). */
1447         if (ctx->is_monolithic &&
1448             ((ctx->shader->key.ps.prolog.bc_optimize_for_persp &&
1449               interp_param_idx == SI_PARAM_PERSP_CENTROID) ||
1450              (ctx->shader->key.ps.prolog.bc_optimize_for_linear &&
1451               interp_param_idx == SI_PARAM_LINEAR_CENTROID))) {
1452                 /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
1453                  * The hw doesn't compute CENTROID if the whole wave only
1454                  * contains fully-covered quads.
1455                  */
1456                 LLVMValueRef bc_optimize =
1457                         LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK);
1458                 bc_optimize = LLVMBuildLShr(builder,
1459                                             bc_optimize,
1460                                             LLVMConstInt(ctx->i32, 31, 0), "");
1461                 bc_optimize = LLVMBuildTrunc(builder, bc_optimize, ctx->i1, "");
1462
1463                 if (ctx->shader->key.ps.prolog.bc_optimize_for_persp &&
1464                     interp_param_idx == SI_PARAM_PERSP_CENTROID) {
1465                         param = LLVMBuildSelect(builder, bc_optimize,
1466                                                 LLVMGetParam(main_fn,
1467                                                              SI_PARAM_PERSP_CENTER),
1468                                                 LLVMGetParam(main_fn,
1469                                                              SI_PARAM_PERSP_CENTROID),
1470                                                 "");
1471                 }
1472                 if (ctx->shader->key.ps.prolog.bc_optimize_for_linear &&
1473                     interp_param_idx == SI_PARAM_LINEAR_CENTROID) {
1474                         param = LLVMBuildSelect(builder, bc_optimize,
1475                                                 LLVMGetParam(main_fn,
1476                                                              SI_PARAM_LINEAR_CENTER),
1477                                                 LLVMGetParam(main_fn,
1478                                                              SI_PARAM_LINEAR_CENTROID),
1479                                                 "");
1480                 }
1481         }
1482
1483         if (!param)
1484                 param = LLVMGetParam(main_fn, interp_param_idx);
1485         return param;
1486 }
1487
1488 static void declare_input_fs(
1489         struct radeon_llvm_context *radeon_bld,
1490         unsigned input_index,
1491         const struct tgsi_full_declaration *decl)
1492 {
1493         struct lp_build_context *base = &radeon_bld->soa.bld_base.base;
1494         struct si_shader_context *ctx =
1495                 si_shader_context(&radeon_bld->soa.bld_base);
1496         struct si_shader *shader = ctx->shader;
1497         LLVMValueRef main_fn = radeon_bld->main_fn;
1498         LLVMValueRef interp_param = NULL;
1499         int interp_param_idx;
1500
1501         /* Get colors from input VGPRs (set by the prolog). */
1502         if (!ctx->is_monolithic &&
1503             decl->Semantic.Name == TGSI_SEMANTIC_COLOR) {
1504                 unsigned i = decl->Semantic.Index;
1505                 unsigned colors_read = shader->selector->info.colors_read;
1506                 unsigned mask = colors_read >> (i * 4);
1507                 unsigned offset = SI_PARAM_POS_FIXED_PT + 1 +
1508                                   (i ? util_bitcount(colors_read & 0xf) : 0);
1509
1510                 radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)] =
1511                         mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef;
1512                 radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 1)] =
1513                         mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef;
1514                 radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 2)] =
1515                         mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef;
1516                 radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 3)] =
1517                         mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef;
1518                 return;
1519         }
1520
1521         interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate,
1522                                                      decl->Interp.Location);
1523         if (interp_param_idx == -1)
1524                 return;
1525         else if (interp_param_idx) {
1526                 interp_param_idx = select_interp_param(ctx,
1527                                                        interp_param_idx);
1528                 interp_param = get_interp_param(ctx, interp_param_idx);
1529         }
1530
1531         if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR &&
1532             decl->Interp.Interpolate == TGSI_INTERPOLATE_COLOR &&
1533             ctx->shader->key.ps.prolog.flatshade_colors)
1534                 interp_param = NULL; /* load the constant color */
1535
1536         interp_fs_input(ctx, input_index, decl->Semantic.Name,
1537                         decl->Semantic.Index, shader->selector->info.num_inputs,
1538                         shader->selector->info.colors_read, interp_param,
1539                         LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK),
1540                         LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE),
1541                         &radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)]);
1542 }
1543
1544 static LLVMValueRef get_sample_id(struct radeon_llvm_context *radeon_bld)
1545 {
1546         return unpack_param(si_shader_context(&radeon_bld->soa.bld_base),
1547                             SI_PARAM_ANCILLARY, 8, 4);
1548 }
1549
1550 /**
1551  * Set range metadata on an instruction.  This can only be used on load and
1552  * call instructions.  If you know an instruction can only produce the values
1553  * 0, 1, 2, you would do set_range_metadata(value, 0, 3);
1554  * \p lo is the minimum value inclusive.
1555  * \p hi is the maximum value exclusive.
1556  */
1557 static void set_range_metadata(LLVMValueRef value, unsigned lo, unsigned hi)
1558 {
1559         const char *range_md_string = "range";
1560         LLVMValueRef range_md, md_args[2];
1561         LLVMTypeRef type = LLVMTypeOf(value);
1562         LLVMContextRef context = LLVMGetTypeContext(type);
1563         unsigned md_range_id = LLVMGetMDKindIDInContext(context,
1564                                 range_md_string, strlen(range_md_string));
1565
1566         md_args[0] = LLVMConstInt(type, lo, false);
1567         md_args[1] = LLVMConstInt(type, hi, false);
1568         range_md = LLVMMDNodeInContext(context, md_args, 2);
1569         LLVMSetMetadata(value, md_range_id, range_md);
1570 }
1571
1572 static LLVMValueRef get_thread_id(struct si_shader_context *ctx)
1573 {
1574         struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
1575         LLVMValueRef tid;
1576
1577         if (HAVE_LLVM < 0x0308) {
1578                 tid = lp_build_intrinsic(gallivm->builder, "llvm.SI.tid",
1579                                 ctx->i32,   NULL, 0, LLVMReadNoneAttribute);
1580         } else {
1581                 LLVMValueRef tid_args[2];
1582                 tid_args[0] = lp_build_const_int32(gallivm, 0xffffffff);
1583                 tid_args[1] = lp_build_const_int32(gallivm, 0);
1584                 tid_args[1] = lp_build_intrinsic(gallivm->builder,
1585                                         "llvm.amdgcn.mbcnt.lo", ctx->i32,
1586                                         tid_args, 2, LLVMReadNoneAttribute);
1587
1588                 tid = lp_build_intrinsic(gallivm->builder,
1589                                         "llvm.amdgcn.mbcnt.hi", ctx->i32,
1590                                         tid_args, 2, LLVMReadNoneAttribute);
1591         }
1592         set_range_metadata(tid, 0, 64);
1593         return tid;
1594 }
1595
1596 /**
1597  * Load a dword from a constant buffer.
1598  */
1599 static LLVMValueRef buffer_load_const(LLVMBuilderRef builder, LLVMValueRef resource,
1600                                       LLVMValueRef offset, LLVMTypeRef return_type)
1601 {
1602         LLVMValueRef args[2] = {resource, offset};
1603
1604         return lp_build_intrinsic(builder, "llvm.SI.load.const", return_type, args, 2,
1605                                LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
1606 }
1607
1608 static LLVMValueRef load_sample_position(struct radeon_llvm_context *radeon_bld, LLVMValueRef sample_id)
1609 {
1610         struct si_shader_context *ctx =
1611                 si_shader_context(&radeon_bld->soa.bld_base);
1612         struct lp_build_context *uint_bld = &radeon_bld->soa.bld_base.uint_bld;
1613         struct gallivm_state *gallivm = &radeon_bld->gallivm;
1614         LLVMBuilderRef builder = gallivm->builder;
1615         LLVMValueRef desc = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
1616         LLVMValueRef buf_index = lp_build_const_int32(gallivm, SI_PS_CONST_SAMPLE_POSITIONS);
1617         LLVMValueRef resource = build_indexed_load_const(ctx, desc, buf_index);
1618
1619         /* offset = sample_id * 8  (8 = 2 floats containing samplepos.xy) */
1620         LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, sample_id, 8);
1621         LLVMValueRef offset1 = LLVMBuildAdd(builder, offset0, lp_build_const_int32(gallivm, 4), "");
1622
1623         LLVMValueRef pos[4] = {
1624                 buffer_load_const(builder, resource, offset0, ctx->f32),
1625                 buffer_load_const(builder, resource, offset1, ctx->f32),
1626                 lp_build_const_float(gallivm, 0),
1627                 lp_build_const_float(gallivm, 0)
1628         };
1629
1630         return lp_build_gather_values(gallivm, pos, 4);
1631 }
1632
1633 static void declare_system_value(
1634         struct radeon_llvm_context *radeon_bld,
1635         unsigned index,
1636         const struct tgsi_full_declaration *decl)
1637 {
1638         struct si_shader_context *ctx =
1639                 si_shader_context(&radeon_bld->soa.bld_base);
1640         struct lp_build_context *bld = &radeon_bld->soa.bld_base.base;
1641         struct gallivm_state *gallivm = &radeon_bld->gallivm;
1642         LLVMValueRef value = 0;
1643
1644         switch (decl->Semantic.Name) {
1645         case TGSI_SEMANTIC_INSTANCEID:
1646                 value = LLVMGetParam(radeon_bld->main_fn,
1647                                      ctx->param_instance_id);
1648                 break;
1649
1650         case TGSI_SEMANTIC_VERTEXID:
1651                 value = LLVMBuildAdd(gallivm->builder,
1652                                      LLVMGetParam(radeon_bld->main_fn,
1653                                                   ctx->param_vertex_id),
1654                                      LLVMGetParam(radeon_bld->main_fn,
1655                                                   SI_PARAM_BASE_VERTEX), "");
1656                 break;
1657
1658         case TGSI_SEMANTIC_VERTEXID_NOBASE:
1659                 value = LLVMGetParam(radeon_bld->main_fn,
1660                                      ctx->param_vertex_id);
1661                 break;
1662
1663         case TGSI_SEMANTIC_BASEVERTEX:
1664                 value = LLVMGetParam(radeon_bld->main_fn,
1665                                      SI_PARAM_BASE_VERTEX);
1666                 break;
1667
1668         case TGSI_SEMANTIC_INVOCATIONID:
1669                 if (ctx->type == PIPE_SHADER_TESS_CTRL)
1670                         value = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);
1671                 else if (ctx->type == PIPE_SHADER_GEOMETRY)
1672                         value = LLVMGetParam(radeon_bld->main_fn,
1673                                              SI_PARAM_GS_INSTANCE_ID);
1674                 else
1675                         assert(!"INVOCATIONID not implemented");
1676                 break;
1677
1678         case TGSI_SEMANTIC_POSITION:
1679         {
1680                 LLVMValueRef pos[4] = {
1681                         LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_X_FLOAT),
1682                         LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Y_FLOAT),
1683                         LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Z_FLOAT),
1684                         lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base, TGSI_OPCODE_RCP,
1685                                                  LLVMGetParam(radeon_bld->main_fn,
1686                                                               SI_PARAM_POS_W_FLOAT)),
1687                 };
1688                 value = lp_build_gather_values(gallivm, pos, 4);
1689                 break;
1690         }
1691
1692         case TGSI_SEMANTIC_FACE:
1693                 value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_FRONT_FACE);
1694                 break;
1695
1696         case TGSI_SEMANTIC_SAMPLEID:
1697                 value = get_sample_id(radeon_bld);
1698                 break;
1699
1700         case TGSI_SEMANTIC_SAMPLEPOS: {
1701                 LLVMValueRef pos[4] = {
1702                         LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_X_FLOAT),
1703                         LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Y_FLOAT),
1704                         lp_build_const_float(gallivm, 0),
1705                         lp_build_const_float(gallivm, 0)
1706                 };
1707                 pos[0] = lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base,
1708                                                   TGSI_OPCODE_FRC, pos[0]);
1709                 pos[1] = lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base,
1710                                                   TGSI_OPCODE_FRC, pos[1]);
1711                 value = lp_build_gather_values(gallivm, pos, 4);
1712                 break;
1713         }
1714
1715         case TGSI_SEMANTIC_SAMPLEMASK:
1716                 /* This can only occur with the OpenGL Core profile, which
1717                  * doesn't support smoothing.
1718                  */
1719                 value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_SAMPLE_COVERAGE);
1720                 break;
1721
1722         case TGSI_SEMANTIC_TESSCOORD:
1723         {
1724                 LLVMValueRef coord[4] = {
1725                         LLVMGetParam(radeon_bld->main_fn, ctx->param_tes_u),
1726                         LLVMGetParam(radeon_bld->main_fn, ctx->param_tes_v),
1727                         bld->zero,
1728                         bld->zero
1729                 };
1730
1731                 /* For triangles, the vector should be (u, v, 1-u-v). */
1732                 if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
1733                     PIPE_PRIM_TRIANGLES)
1734                         coord[2] = lp_build_sub(bld, bld->one,
1735                                                 lp_build_add(bld, coord[0], coord[1]));
1736
1737                 value = lp_build_gather_values(gallivm, coord, 4);
1738                 break;
1739         }
1740
1741         case TGSI_SEMANTIC_VERTICESIN:
1742                 value = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 26, 6);
1743                 break;
1744
1745         case TGSI_SEMANTIC_TESSINNER:
1746         case TGSI_SEMANTIC_TESSOUTER:
1747         {
1748                 LLVMValueRef rw_buffers, buffer, base, addr;
1749                 int param = si_shader_io_get_unique_index(decl->Semantic.Name, 0);
1750
1751                 rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
1752                                         SI_PARAM_RW_BUFFERS);
1753                 buffer = build_indexed_load_const(ctx, rw_buffers,
1754                         lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));
1755
1756                 base = LLVMGetParam(ctx->radeon_bld.main_fn, ctx->param_oc_lds);
1757                 addr = get_tcs_tes_buffer_address(ctx, NULL,
1758                                           lp_build_const_int32(gallivm, param));
1759
1760                 value = buffer_load(&radeon_bld->soa.bld_base, TGSI_TYPE_FLOAT,
1761                                     ~0, buffer, base, addr);
1762
1763                 break;
1764         }
1765
1766         case TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI:
1767         case TGSI_SEMANTIC_DEFAULT_TESSINNER_SI:
1768         {
1769                 LLVMValueRef buf, slot, val[4];
1770                 int i, offset;
1771
1772                 slot = lp_build_const_int32(gallivm, SI_HS_CONST_DEFAULT_TESS_LEVELS);
1773                 buf = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
1774                 buf = build_indexed_load_const(ctx, buf, slot);
1775                 offset = decl->Semantic.Name == TGSI_SEMANTIC_DEFAULT_TESSINNER_SI ? 4 : 0;
1776
1777                 for (i = 0; i < 4; i++)
1778                         val[i] = buffer_load_const(gallivm->builder, buf,
1779                                                    lp_build_const_int32(gallivm, (offset + i) * 4),
1780                                                    ctx->f32);
1781                 value = lp_build_gather_values(gallivm, val, 4);
1782                 break;
1783         }
1784
1785         case TGSI_SEMANTIC_PRIMID:
1786                 value = get_primitive_id(&radeon_bld->soa.bld_base, 0);
1787                 break;
1788
1789         case TGSI_SEMANTIC_GRID_SIZE:
1790                 value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_GRID_SIZE);
1791                 break;
1792
1793         case TGSI_SEMANTIC_BLOCK_SIZE:
1794         {
1795                 LLVMValueRef values[3];
1796                 unsigned i;
1797                 unsigned *properties = ctx->shader->selector->info.properties;
1798                 unsigned sizes[3] = {
1799                         properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
1800                         properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
1801                         properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
1802                 };
1803
1804                 for (i = 0; i < 3; ++i)
1805                         values[i] = lp_build_const_int32(gallivm, sizes[i]);
1806
1807                 value = lp_build_gather_values(gallivm, values, 3);
1808                 break;
1809         }
1810
1811         case TGSI_SEMANTIC_BLOCK_ID:
1812                 value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_BLOCK_ID);
1813                 break;
1814
1815         case TGSI_SEMANTIC_THREAD_ID:
1816                 value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_THREAD_ID);
1817                 break;
1818
1819 #if HAVE_LLVM >= 0x0309
1820         case TGSI_SEMANTIC_HELPER_INVOCATION:
1821                 value = lp_build_intrinsic(gallivm->builder,
1822                                            "llvm.amdgcn.ps.live",
1823                                            ctx->i1, NULL, 0,
1824                                            LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
1825                 value = LLVMBuildNot(gallivm->builder, value, "");
1826                 value = LLVMBuildSExt(gallivm->builder, value, ctx->i32, "");
1827                 break;
1828 #endif
1829
1830         default:
1831                 assert(!"unknown system value");
1832                 return;
1833         }
1834
1835         radeon_bld->system_values[index] = value;
1836 }
1837
1838 static void declare_compute_memory(struct radeon_llvm_context *radeon_bld,
1839                                    const struct tgsi_full_declaration *decl)
1840 {
1841         struct si_shader_context *ctx =
1842                 si_shader_context(&radeon_bld->soa.bld_base);
1843         struct si_shader_selector *sel = ctx->shader->selector;
1844         struct gallivm_state *gallivm = &radeon_bld->gallivm;
1845
1846         LLVMTypeRef i8p = LLVMPointerType(ctx->i8, LOCAL_ADDR_SPACE);
1847         LLVMValueRef var;
1848
1849         assert(decl->Declaration.MemType == TGSI_MEMORY_TYPE_SHARED);
1850         assert(decl->Range.First == decl->Range.Last);
1851         assert(!ctx->shared_memory);
1852
1853         var = LLVMAddGlobalInAddressSpace(gallivm->module,
1854                                           LLVMArrayType(ctx->i8, sel->local_size),
1855                                           "compute_lds",
1856                                           LOCAL_ADDR_SPACE);
1857         LLVMSetAlignment(var, 4);
1858
1859         ctx->shared_memory = LLVMBuildBitCast(gallivm->builder, var, i8p, "");
1860 }
1861
1862 static LLVMValueRef fetch_constant(
1863         struct lp_build_tgsi_context *bld_base,
1864         const struct tgsi_full_src_register *reg,
1865         enum tgsi_opcode_type type,
1866         unsigned swizzle)
1867 {
1868         struct si_shader_context *ctx = si_shader_context(bld_base);
1869         struct lp_build_context *base = &bld_base->base;
1870         const struct tgsi_ind_register *ireg = &reg->Indirect;
1871         unsigned buf, idx;
1872
1873         LLVMValueRef addr, bufp;
1874         LLVMValueRef result;
1875
1876         if (swizzle == LP_CHAN_ALL) {
1877                 unsigned chan;
1878                 LLVMValueRef values[4];
1879                 for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
1880                         values[chan] = fetch_constant(bld_base, reg, type, chan);
1881
1882                 return lp_build_gather_values(bld_base->base.gallivm, values, 4);
1883         }
1884
1885         buf = reg->Register.Dimension ? reg->Dimension.Index : 0;
1886         idx = reg->Register.Index * 4 + swizzle;
1887
1888         if (!reg->Register.Indirect && !reg->Dimension.Indirect) {
1889                 if (!tgsi_type_is_64bit(type))
1890                         return bitcast(bld_base, type, ctx->constants[buf][idx]);
1891                 else {
1892                         return radeon_llvm_emit_fetch_64bit(bld_base, type,
1893                                                             ctx->constants[buf][idx],
1894                                                             ctx->constants[buf][idx + 1]);
1895                 }
1896         }
1897
1898         if (reg->Register.Dimension && reg->Dimension.Indirect) {
1899                 LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
1900                 LLVMValueRef index;
1901                 index = get_bounded_indirect_index(ctx, &reg->DimIndirect,
1902                                                    reg->Dimension.Index,
1903                                                    SI_NUM_CONST_BUFFERS);
1904                 bufp = build_indexed_load_const(ctx, ptr, index);
1905         } else
1906                 bufp = ctx->const_buffers[buf];
1907
1908         addr = ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle];
1909         addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg");
1910         addr = lp_build_mul_imm(&bld_base->uint_bld, addr, 16);
1911         addr = lp_build_add(&bld_base->uint_bld, addr,
1912                             lp_build_const_int32(base->gallivm, idx * 4));
1913
1914         result = buffer_load_const(base->gallivm->builder, bufp,
1915                                    addr, ctx->f32);
1916
1917         if (!tgsi_type_is_64bit(type))
1918                 result = bitcast(bld_base, type, result);
1919         else {
1920                 LLVMValueRef addr2, result2;
1921                 addr2 = ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle + 1];
1922                 addr2 = LLVMBuildLoad(base->gallivm->builder, addr2, "load addr reg2");
1923                 addr2 = lp_build_mul_imm(&bld_base->uint_bld, addr2, 16);
1924                 addr2 = lp_build_add(&bld_base->uint_bld, addr2,
1925                                      lp_build_const_int32(base->gallivm, idx * 4));
1926
1927                 result2 = buffer_load_const(base->gallivm->builder, ctx->const_buffers[buf],
1928                                    addr2, ctx->f32);
1929
1930                 result = radeon_llvm_emit_fetch_64bit(bld_base, type,
1931                                                       result, result2);
1932         }
1933         return result;
1934 }
1935
1936 /* Upper 16 bits must be zero. */
1937 static LLVMValueRef si_llvm_pack_two_int16(struct gallivm_state *gallivm,
1938                                            LLVMValueRef val[2])
1939 {
1940         return LLVMBuildOr(gallivm->builder, val[0],
1941                            LLVMBuildShl(gallivm->builder, val[1],
1942                                         lp_build_const_int32(gallivm, 16),
1943                                         ""), "");
1944 }
1945
1946 /* Upper 16 bits are ignored and will be dropped. */
1947 static LLVMValueRef si_llvm_pack_two_int32_as_int16(struct gallivm_state *gallivm,
1948                                                     LLVMValueRef val[2])
1949 {
1950         LLVMValueRef v[2] = {
1951                 LLVMBuildAnd(gallivm->builder, val[0],
1952                              lp_build_const_int32(gallivm, 0xffff), ""),
1953                 val[1],
1954         };
1955         return si_llvm_pack_two_int16(gallivm, v);
1956 }
1957
1958 /* Initialize arguments for the shader export intrinsic */
1959 static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
1960                                      LLVMValueRef *values,
1961                                      unsigned target,
1962                                      LLVMValueRef *args)
1963 {
1964         struct si_shader_context *ctx = si_shader_context(bld_base);
1965         struct lp_build_context *uint =
1966                                 &ctx->radeon_bld.soa.bld_base.uint_bld;
1967         struct lp_build_context *base = &bld_base->base;
1968         struct gallivm_state *gallivm = base->gallivm;
1969         LLVMBuilderRef builder = base->gallivm->builder;
1970         LLVMValueRef val[4];
1971         unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR;
1972         unsigned chan;
1973         bool is_int8;
1974
1975         /* Default is 0xf. Adjusted below depending on the format. */
1976         args[0] = lp_build_const_int32(base->gallivm, 0xf); /* writemask */
1977
1978         /* Specify whether the EXEC mask represents the valid mask */
1979         args[1] = uint->zero;
1980
1981         /* Specify whether this is the last export */
1982         args[2] = uint->zero;
1983
1984         /* Specify the target we are exporting */
1985         args[3] = lp_build_const_int32(base->gallivm, target);
1986
1987         if (ctx->type == PIPE_SHADER_FRAGMENT) {
1988                 const union si_shader_key *key = &ctx->shader->key;
1989                 unsigned col_formats = key->ps.epilog.spi_shader_col_format;
1990                 int cbuf = target - V_008DFC_SQ_EXP_MRT;
1991
1992                 assert(cbuf >= 0 && cbuf < 8);
1993                 spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
1994                 is_int8 = (key->ps.epilog.color_is_int8 >> cbuf) & 0x1;
1995         }
1996
1997         args[4] = uint->zero; /* COMPR flag */
1998         args[5] = base->undef;
1999         args[6] = base->undef;
2000         args[7] = base->undef;
2001         args[8] = base->undef;
2002
2003         switch (spi_shader_col_format) {
2004         case V_028714_SPI_SHADER_ZERO:
2005                 args[0] = uint->zero; /* writemask */
2006                 args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_NULL);
2007                 break;
2008
2009         case V_028714_SPI_SHADER_32_R:
2010                 args[0] = uint->one; /* writemask */
2011                 args[5] = values[0];
2012                 break;
2013
2014         case V_028714_SPI_SHADER_32_GR:
2015                 args[0] = lp_build_const_int32(base->gallivm, 0x3); /* writemask */
2016                 args[5] = values[0];
2017                 args[6] = values[1];
2018                 break;
2019
2020         case V_028714_SPI_SHADER_32_AR:
2021                 args[0] = lp_build_const_int32(base->gallivm, 0x9); /* writemask */
2022                 args[5] = values[0];
2023                 args[8] = values[3];
2024                 break;
2025
2026         case V_028714_SPI_SHADER_FP16_ABGR:
2027                 args[4] = uint->one; /* COMPR flag */
2028
2029                 for (chan = 0; chan < 2; chan++) {
2030                         LLVMValueRef pack_args[2] = {
2031                                 values[2 * chan],
2032                                 values[2 * chan + 1]
2033                         };
2034                         LLVMValueRef packed;
2035
2036                         packed = lp_build_intrinsic(base->gallivm->builder,
2037                                                     "llvm.SI.packf16",
2038                                                     ctx->i32, pack_args, 2,
2039                                                     LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
2040                         args[chan + 5] =
2041                                 LLVMBuildBitCast(base->gallivm->builder,
2042                                                  packed, ctx->f32, "");
2043                 }
2044                 break;
2045
2046         case V_028714_SPI_SHADER_UNORM16_ABGR:
2047                 for (chan = 0; chan < 4; chan++) {
2048                         val[chan] = radeon_llvm_saturate(bld_base, values[chan]);
2049                         val[chan] = LLVMBuildFMul(builder, val[chan],
2050                                                   lp_build_const_float(gallivm, 65535), "");
2051                         val[chan] = LLVMBuildFAdd(builder, val[chan],
2052                                                   lp_build_const_float(gallivm, 0.5), "");
2053                         val[chan] = LLVMBuildFPToUI(builder, val[chan],
2054                                                     ctx->i32, "");
2055                 }
2056
2057                 args[4] = uint->one; /* COMPR flag */
2058                 args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2059                                   si_llvm_pack_two_int16(gallivm, val));
2060                 args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2061                                   si_llvm_pack_two_int16(gallivm, val+2));
2062                 break;
2063
2064         case V_028714_SPI_SHADER_SNORM16_ABGR:
2065                 for (chan = 0; chan < 4; chan++) {
2066                         /* Clamp between [-1, 1]. */
2067                         val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MIN,
2068                                                               values[chan],
2069                                                               lp_build_const_float(gallivm, 1));
2070                         val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MAX,
2071                                                               val[chan],
2072                                                               lp_build_const_float(gallivm, -1));
2073                         /* Convert to a signed integer in [-32767, 32767]. */
2074                         val[chan] = LLVMBuildFMul(builder, val[chan],
2075                                                   lp_build_const_float(gallivm, 32767), "");
2076                         /* If positive, add 0.5, else add -0.5. */
2077                         val[chan] = LLVMBuildFAdd(builder, val[chan],
2078                                         LLVMBuildSelect(builder,
2079                                                 LLVMBuildFCmp(builder, LLVMRealOGE,
2080                                                               val[chan], base->zero, ""),
2081                                                 lp_build_const_float(gallivm, 0.5),
2082                                                 lp_build_const_float(gallivm, -0.5), ""), "");
2083                         val[chan] = LLVMBuildFPToSI(builder, val[chan], ctx->i32, "");
2084                 }
2085
2086                 args[4] = uint->one; /* COMPR flag */
2087                 args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2088                                   si_llvm_pack_two_int32_as_int16(gallivm, val));
2089                 args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2090                                   si_llvm_pack_two_int32_as_int16(gallivm, val+2));
2091                 break;
2092
2093         case V_028714_SPI_SHADER_UINT16_ABGR: {
2094                 LLVMValueRef max = lp_build_const_int32(gallivm, is_int8 ?
2095                                                         255 : 65535);
2096                 /* Clamp. */
2097                 for (chan = 0; chan < 4; chan++) {
2098                         val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
2099                         val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_UMIN,
2100                                                               val[chan], max);
2101                 }
2102
2103                 args[4] = uint->one; /* COMPR flag */
2104                 args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2105                                   si_llvm_pack_two_int16(gallivm, val));
2106                 args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2107                                   si_llvm_pack_two_int16(gallivm, val+2));
2108                 break;
2109         }
2110
2111         case V_028714_SPI_SHADER_SINT16_ABGR: {
2112                 LLVMValueRef max = lp_build_const_int32(gallivm, is_int8 ?
2113                                                         127 : 32767);
2114                 LLVMValueRef min = lp_build_const_int32(gallivm, is_int8 ?
2115                                                         -128 : -32768);
2116                 /* Clamp. */
2117                 for (chan = 0; chan < 4; chan++) {
2118                         val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
2119                         val[chan] = lp_build_emit_llvm_binary(bld_base,
2120                                                               TGSI_OPCODE_IMIN,
2121                                                               val[chan], max);
2122                         val[chan] = lp_build_emit_llvm_binary(bld_base,
2123                                                               TGSI_OPCODE_IMAX,
2124                                                               val[chan], min);
2125                 }
2126
2127                 args[4] = uint->one; /* COMPR flag */
2128                 args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2129                                   si_llvm_pack_two_int32_as_int16(gallivm, val));
2130                 args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2131                                   si_llvm_pack_two_int32_as_int16(gallivm, val+2));
2132                 break;
2133         }
2134
2135         case V_028714_SPI_SHADER_32_ABGR:
2136                 memcpy(&args[5], values, sizeof(values[0]) * 4);
2137                 break;
2138         }
2139 }
2140
2141 static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
2142                           LLVMValueRef alpha)
2143 {
2144         struct si_shader_context *ctx = si_shader_context(bld_base);
2145         struct gallivm_state *gallivm = bld_base->base.gallivm;
2146
2147         if (ctx->shader->key.ps.epilog.alpha_func != PIPE_FUNC_NEVER) {
2148                 LLVMValueRef alpha_ref = LLVMGetParam(ctx->radeon_bld.main_fn,
2149                                 SI_PARAM_ALPHA_REF);
2150
2151                 LLVMValueRef alpha_pass =
2152                         lp_build_cmp(&bld_base->base,
2153                                      ctx->shader->key.ps.epilog.alpha_func,
2154                                      alpha, alpha_ref);
2155                 LLVMValueRef arg =
2156                         lp_build_select(&bld_base->base,
2157                                         alpha_pass,
2158                                         lp_build_const_float(gallivm, 1.0f),
2159                                         lp_build_const_float(gallivm, -1.0f));
2160
2161                 lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
2162                                    ctx->voidt, &arg, 1, 0);
2163         } else {
2164                 lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kilp",
2165                                    ctx->voidt, NULL, 0, 0);
2166         }
2167 }
2168
2169 static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
2170                                                   LLVMValueRef alpha,
2171                                                   unsigned samplemask_param)
2172 {
2173         struct si_shader_context *ctx = si_shader_context(bld_base);
2174         struct gallivm_state *gallivm = bld_base->base.gallivm;
2175         LLVMValueRef coverage;
2176
2177         /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
2178         coverage = LLVMGetParam(ctx->radeon_bld.main_fn,
2179                                 samplemask_param);
2180         coverage = bitcast(bld_base, TGSI_TYPE_SIGNED, coverage);
2181
2182         coverage = lp_build_intrinsic(gallivm->builder, "llvm.ctpop.i32",
2183                                    ctx->i32,
2184                                    &coverage, 1, LLVMReadNoneAttribute);
2185
2186         coverage = LLVMBuildUIToFP(gallivm->builder, coverage,
2187                                    ctx->f32, "");
2188
2189         coverage = LLVMBuildFMul(gallivm->builder, coverage,
2190                                  lp_build_const_float(gallivm,
2191                                         1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
2192
2193         return LLVMBuildFMul(gallivm->builder, alpha, coverage, "");
2194 }
2195
2196 static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context *bld_base,
2197                                     LLVMValueRef (*pos)[9], LLVMValueRef *out_elts)
2198 {
2199         struct si_shader_context *ctx = si_shader_context(bld_base);
2200         struct lp_build_context *base = &bld_base->base;
2201         struct lp_build_context *uint = &ctx->radeon_bld.soa.bld_base.uint_bld;
2202         unsigned reg_index;
2203         unsigned chan;
2204         unsigned const_chan;
2205         LLVMValueRef base_elt;
2206         LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
2207         LLVMValueRef constbuf_index = lp_build_const_int32(base->gallivm,
2208                                                            SI_VS_CONST_CLIP_PLANES);
2209         LLVMValueRef const_resource = build_indexed_load_const(ctx, ptr, constbuf_index);
2210
2211         for (reg_index = 0; reg_index < 2; reg_index ++) {
2212                 LLVMValueRef *args = pos[2 + reg_index];
2213
2214                 args[5] =
2215                 args[6] =
2216                 args[7] =
2217                 args[8] = lp_build_const_float(base->gallivm, 0.0f);
2218
2219                 /* Compute dot products of position and user clip plane vectors */
2220                 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2221                         for (const_chan = 0; const_chan < TGSI_NUM_CHANNELS; const_chan++) {
2222                                 args[1] = lp_build_const_int32(base->gallivm,
2223                                                                ((reg_index * 4 + chan) * 4 +
2224                                                                 const_chan) * 4);
2225                                 base_elt = buffer_load_const(base->gallivm->builder, const_resource,
2226                                                       args[1], ctx->f32);
2227                                 args[5 + chan] =
2228                                         lp_build_add(base, args[5 + chan],
2229                                                      lp_build_mul(base, base_elt,
2230                                                                   out_elts[const_chan]));
2231                         }
2232                 }
2233
2234                 args[0] = lp_build_const_int32(base->gallivm, 0xf);
2235                 args[1] = uint->zero;
2236                 args[2] = uint->zero;
2237                 args[3] = lp_build_const_int32(base->gallivm,
2238                                                V_008DFC_SQ_EXP_POS + 2 + reg_index);
2239                 args[4] = uint->zero;
2240         }
2241 }
2242
2243 static void si_dump_streamout(struct pipe_stream_output_info *so)
2244 {
2245         unsigned i;
2246
2247         if (so->num_outputs)
2248                 fprintf(stderr, "STREAMOUT\n");
2249
2250         for (i = 0; i < so->num_outputs; i++) {
2251                 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
2252                                 so->output[i].start_component;
2253                 fprintf(stderr, "  %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
2254                         i, so->output[i].output_buffer,
2255                         so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
2256                         so->output[i].register_index,
2257                         mask & 1 ? "x" : "",
2258                         mask & 2 ? "y" : "",
2259                         mask & 4 ? "z" : "",
2260                         mask & 8 ? "w" : "");
2261         }
2262 }
2263
2264 /* On SI, the vertex shader is responsible for writing streamout data
2265  * to buffers. */
2266 static void si_llvm_emit_streamout(struct si_shader_context *ctx,
2267                                    struct si_shader_output_values *outputs,
2268                                    unsigned noutput)
2269 {
2270         struct pipe_stream_output_info *so = &ctx->shader->selector->so;
2271         struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
2272         LLVMBuilderRef builder = gallivm->builder;
2273         int i, j;
2274         struct lp_build_if_state if_ctx;
2275
2276         /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
2277         LLVMValueRef so_vtx_count =
2278                 unpack_param(ctx, ctx->param_streamout_config, 16, 7);
2279
2280         LLVMValueRef tid = get_thread_id(ctx);
2281
2282         /* can_emit = tid < so_vtx_count; */
2283         LLVMValueRef can_emit =
2284                 LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");
2285
2286         LLVMValueRef stream_id =
2287                 unpack_param(ctx, ctx->param_streamout_config, 24, 2);
2288
2289         /* Emit the streamout code conditionally. This actually avoids
2290          * out-of-bounds buffer access. The hw tells us via the SGPR
2291          * (so_vtx_count) which threads are allowed to emit streamout data. */
2292         lp_build_if(&if_ctx, gallivm, can_emit);
2293         {
2294                 /* The buffer offset is computed as follows:
2295                  *   ByteOffset = streamout_offset[buffer_id]*4 +
2296                  *                (streamout_write_index + thread_id)*stride[buffer_id] +
2297                  *                attrib_offset
2298                  */
2299
2300                 LLVMValueRef so_write_index =
2301                         LLVMGetParam(ctx->radeon_bld.main_fn,
2302                                      ctx->param_streamout_write_index);
2303
2304                 /* Compute (streamout_write_index + thread_id). */
2305                 so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");
2306
2307                 /* Compute the write offset for each enabled buffer. */
2308                 LLVMValueRef so_write_offset[4] = {};
2309                 for (i = 0; i < 4; i++) {
2310                         if (!so->stride[i])
2311                                 continue;
2312
2313                         LLVMValueRef so_offset = LLVMGetParam(ctx->radeon_bld.main_fn,
2314                                                               ctx->param_streamout_offset[i]);
2315                         so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), "");
2316
2317                         so_write_offset[i] = LLVMBuildMul(builder, so_write_index,
2318                                                           LLVMConstInt(ctx->i32, so->stride[i]*4, 0), "");
2319                         so_write_offset[i] = LLVMBuildAdd(builder, so_write_offset[i], so_offset, "");
2320                 }
2321
2322                 /* Write streamout data. */
2323                 for (i = 0; i < so->num_outputs; i++) {
2324                         unsigned buf_idx = so->output[i].output_buffer;
2325                         unsigned reg = so->output[i].register_index;
2326                         unsigned start = so->output[i].start_component;
2327                         unsigned num_comps = so->output[i].num_components;
2328                         unsigned stream = so->output[i].stream;
2329                         LLVMValueRef out[4];
2330                         struct lp_build_if_state if_ctx_stream;
2331
2332                         assert(num_comps && num_comps <= 4);
2333                         if (!num_comps || num_comps > 4)
2334                                 continue;
2335
2336                         if (reg >= noutput)
2337                                 continue;
2338
2339                         /* Load the output as int. */
2340                         for (j = 0; j < num_comps; j++) {
2341                                 out[j] = LLVMBuildBitCast(builder,
2342                                                           outputs[reg].values[start+j],
2343                                                 ctx->i32, "");
2344                         }
2345
2346                         /* Pack the output. */
2347                         LLVMValueRef vdata = NULL;
2348
2349                         switch (num_comps) {
2350                         case 1: /* as i32 */
2351                                 vdata = out[0];
2352                                 break;
2353                         case 2: /* as v2i32 */
2354                         case 3: /* as v4i32 (aligned to 4) */
2355                         case 4: /* as v4i32 */
2356                                 vdata = LLVMGetUndef(LLVMVectorType(ctx->i32, util_next_power_of_two(num_comps)));
2357                                 for (j = 0; j < num_comps; j++) {
2358                                         vdata = LLVMBuildInsertElement(builder, vdata, out[j],
2359                                                                        LLVMConstInt(ctx->i32, j, 0), "");
2360                                 }
2361                                 break;
2362                         }
2363
2364                         LLVMValueRef can_emit_stream =
2365                                 LLVMBuildICmp(builder, LLVMIntEQ,
2366                                               stream_id,
2367                                               lp_build_const_int32(gallivm, stream), "");
2368
2369                         lp_build_if(&if_ctx_stream, gallivm, can_emit_stream);
2370                         build_tbuffer_store_dwords(ctx, ctx->so_buffers[buf_idx],
2371                                                    vdata, num_comps,
2372                                                    so_write_offset[buf_idx],
2373                                                    LLVMConstInt(ctx->i32, 0, 0),
2374                                                    so->output[i].dst_offset*4);
2375                         lp_build_endif(&if_ctx_stream);
2376                 }
2377         }
2378         lp_build_endif(&if_ctx);
2379 }
2380
2381
2382 /* Generate export instructions for hardware VS shader stage */
2383 static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base,
2384                               struct si_shader_output_values *outputs,
2385                               unsigned noutput)
2386 {
2387         struct si_shader_context *ctx = si_shader_context(bld_base);
2388         struct si_shader *shader = ctx->shader;
2389         struct lp_build_context *base = &bld_base->base;
2390         struct lp_build_context *uint =
2391                                 &ctx->radeon_bld.soa.bld_base.uint_bld;
2392         LLVMValueRef args[9];
2393         LLVMValueRef pos_args[4][9] = { { 0 } };
2394         LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
2395         unsigned semantic_name, semantic_index;
2396         unsigned target;
2397         unsigned param_count = 0;
2398         unsigned pos_idx;
2399         int i;
2400
2401         if (outputs && ctx->shader->selector->so.num_outputs) {
2402                 si_llvm_emit_streamout(ctx, outputs, noutput);
2403         }
2404
2405         for (i = 0; i < noutput; i++) {
2406                 semantic_name = outputs[i].name;
2407                 semantic_index = outputs[i].sid;
2408
2409 handle_semantic:
2410                 /* Select the correct target */
2411                 switch(semantic_name) {
2412                 case TGSI_SEMANTIC_PSIZE:
2413                         psize_value = outputs[i].values[0];
2414                         continue;
2415                 case TGSI_SEMANTIC_EDGEFLAG:
2416                         edgeflag_value = outputs[i].values[0];
2417                         continue;
2418                 case TGSI_SEMANTIC_LAYER:
2419                         layer_value = outputs[i].values[0];
2420                         semantic_name = TGSI_SEMANTIC_GENERIC;
2421                         goto handle_semantic;
2422                 case TGSI_SEMANTIC_VIEWPORT_INDEX:
2423                         viewport_index_value = outputs[i].values[0];
2424                         semantic_name = TGSI_SEMANTIC_GENERIC;
2425                         goto handle_semantic;
2426                 case TGSI_SEMANTIC_POSITION:
2427                         target = V_008DFC_SQ_EXP_POS;
2428                         break;
2429                 case TGSI_SEMANTIC_COLOR:
2430                 case TGSI_SEMANTIC_BCOLOR:
2431                         target = V_008DFC_SQ_EXP_PARAM + param_count;
2432                         assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
2433                         shader->info.vs_output_param_offset[i] = param_count;
2434                         param_count++;
2435                         break;
2436                 case TGSI_SEMANTIC_CLIPDIST:
2437                         target = V_008DFC_SQ_EXP_POS + 2 + semantic_index;
2438                         break;
2439                 case TGSI_SEMANTIC_CLIPVERTEX:
2440                         si_llvm_emit_clipvertex(bld_base, pos_args, outputs[i].values);
2441                         continue;
2442                 case TGSI_SEMANTIC_PRIMID:
2443                 case TGSI_SEMANTIC_FOG:
2444                 case TGSI_SEMANTIC_TEXCOORD:
2445                 case TGSI_SEMANTIC_GENERIC:
2446                         target = V_008DFC_SQ_EXP_PARAM + param_count;
2447                         assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
2448                         shader->info.vs_output_param_offset[i] = param_count;
2449                         param_count++;
2450                         break;
2451                 default:
2452                         target = 0;
2453                         fprintf(stderr,
2454                                 "Warning: SI unhandled vs output type:%d\n",
2455                                 semantic_name);
2456                 }
2457
2458                 si_llvm_init_export_args(bld_base, outputs[i].values, target, args);
2459
2460                 if (target >= V_008DFC_SQ_EXP_POS &&
2461                     target <= (V_008DFC_SQ_EXP_POS + 3)) {
2462                         memcpy(pos_args[target - V_008DFC_SQ_EXP_POS],
2463                                args, sizeof(args));
2464                 } else {
2465                         lp_build_intrinsic(base->gallivm->builder,
2466                                            "llvm.SI.export", ctx->voidt,
2467                                            args, 9, 0);
2468                 }
2469
2470                 if (semantic_name == TGSI_SEMANTIC_CLIPDIST) {
2471                         semantic_name = TGSI_SEMANTIC_GENERIC;
2472                         goto handle_semantic;
2473                 }
2474         }
2475
2476         shader->info.nr_param_exports = param_count;
2477
2478         /* We need to add the position output manually if it's missing. */
2479         if (!pos_args[0][0]) {
2480                 pos_args[0][0] = lp_build_const_int32(base->gallivm, 0xf); /* writemask */
2481                 pos_args[0][1] = uint->zero; /* EXEC mask */
2482                 pos_args[0][2] = uint->zero; /* last export? */
2483                 pos_args[0][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS);
2484                 pos_args[0][4] = uint->zero; /* COMPR flag */
2485                 pos_args[0][5] = base->zero; /* X */
2486                 pos_args[0][6] = base->zero; /* Y */
2487                 pos_args[0][7] = base->zero; /* Z */
2488                 pos_args[0][8] = base->one;  /* W */
2489         }
2490
2491         /* Write the misc vector (point size, edgeflag, layer, viewport). */
2492         if (shader->selector->info.writes_psize ||
2493             shader->selector->info.writes_edgeflag ||
2494             shader->selector->info.writes_viewport_index ||
2495             shader->selector->info.writes_layer) {
2496                 pos_args[1][0] = lp_build_const_int32(base->gallivm, /* writemask */
2497                                                       shader->selector->info.writes_psize |
2498                                                       (shader->selector->info.writes_edgeflag << 1) |
2499                                                       (shader->selector->info.writes_layer << 2) |
2500                                                       (shader->selector->info.writes_viewport_index << 3));
2501                 pos_args[1][1] = uint->zero; /* EXEC mask */
2502                 pos_args[1][2] = uint->zero; /* last export? */
2503                 pos_args[1][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS + 1);
2504                 pos_args[1][4] = uint->zero; /* COMPR flag */
2505                 pos_args[1][5] = base->zero; /* X */
2506                 pos_args[1][6] = base->zero; /* Y */
2507                 pos_args[1][7] = base->zero; /* Z */
2508                 pos_args[1][8] = base->zero; /* W */
2509
2510                 if (shader->selector->info.writes_psize)
2511                         pos_args[1][5] = psize_value;
2512
2513                 if (shader->selector->info.writes_edgeflag) {
2514                         /* The output is a float, but the hw expects an integer
2515                          * with the first bit containing the edge flag. */
2516                         edgeflag_value = LLVMBuildFPToUI(base->gallivm->builder,
2517                                                          edgeflag_value,
2518                                                          ctx->i32, "");
2519                         edgeflag_value = lp_build_min(&bld_base->int_bld,
2520                                                       edgeflag_value,
2521                                                       bld_base->int_bld.one);
2522
2523                         /* The LLVM intrinsic expects a float. */
2524                         pos_args[1][6] = LLVMBuildBitCast(base->gallivm->builder,
2525                                                           edgeflag_value,
2526                                                           ctx->f32, "");
2527                 }
2528
2529                 if (shader->selector->info.writes_layer)
2530                         pos_args[1][7] = layer_value;
2531
2532                 if (shader->selector->info.writes_viewport_index)
2533                         pos_args[1][8] = viewport_index_value;
2534         }
2535
2536         for (i = 0; i < 4; i++)
2537                 if (pos_args[i][0])
2538                         shader->info.nr_pos_exports++;
2539
2540         pos_idx = 0;
2541         for (i = 0; i < 4; i++) {
2542                 if (!pos_args[i][0])
2543                         continue;
2544
2545                 /* Specify the target we are exporting */
2546                 pos_args[i][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS + pos_idx++);
2547
2548                 if (pos_idx == shader->info.nr_pos_exports)
2549                         /* Specify that this is the last export */
2550                         pos_args[i][2] = uint->one;
2551
2552                 lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
2553                                    ctx->voidt, pos_args[i], 9, 0);
2554         }
2555 }
2556
2557 static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
2558 {
2559         struct si_shader_context *ctx = si_shader_context(bld_base);
2560         struct gallivm_state *gallivm = bld_base->base.gallivm;
2561         LLVMValueRef invocation_id, rw_buffers, buffer, buffer_offset;
2562         LLVMValueRef lds_vertex_stride, lds_vertex_offset, lds_base;
2563         uint64_t inputs;
2564
2565         invocation_id = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);
2566
2567         rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
2568         buffer = build_indexed_load_const(ctx, rw_buffers,
2569                         lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));
2570
2571         buffer_offset = LLVMGetParam(ctx->radeon_bld.main_fn, ctx->param_oc_lds);
2572
2573         lds_vertex_stride = unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 13, 8);
2574         lds_vertex_offset = LLVMBuildMul(gallivm->builder, invocation_id,
2575                                          lds_vertex_stride, "");
2576         lds_base = get_tcs_in_current_patch_offset(ctx);
2577         lds_base = LLVMBuildAdd(gallivm->builder, lds_base, lds_vertex_offset, "");
2578
2579         inputs = ctx->shader->key.tcs.epilog.inputs_to_copy;
2580         while (inputs) {
2581                 unsigned i = u_bit_scan64(&inputs);
2582
2583                 LLVMValueRef lds_ptr = LLVMBuildAdd(gallivm->builder, lds_base,
2584                                             lp_build_const_int32(gallivm, 4 * i),
2585                                              "");
2586
2587                 LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx,
2588                                               invocation_id,
2589                                               lp_build_const_int32(gallivm, i));
2590
2591                 LLVMValueRef value = lds_load(bld_base, TGSI_TYPE_SIGNED, ~0,
2592                                               lds_ptr);
2593
2594                 build_tbuffer_store_dwords(ctx, buffer, value, 4, buffer_addr,
2595                                            buffer_offset, 0);
2596         }
2597 }
2598
2599 static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
2600                                   LLVMValueRef rel_patch_id,
2601                                   LLVMValueRef invocation_id,
2602                                   LLVMValueRef tcs_out_current_patch_data_offset)
2603 {
2604         struct si_shader_context *ctx = si_shader_context(bld_base);
2605         struct gallivm_state *gallivm = bld_base->base.gallivm;
2606         struct si_shader *shader = ctx->shader;
2607         unsigned tess_inner_index, tess_outer_index;
2608         LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
2609         LLVMValueRef out[6], vec0, vec1, rw_buffers, tf_base;
2610         unsigned stride, outer_comps, inner_comps, i;
2611         struct lp_build_if_state if_ctx, inner_if_ctx;
2612
2613         si_llvm_emit_barrier(NULL, bld_base, NULL);
2614
2615         /* Do this only for invocation 0, because the tess levels are per-patch,
2616          * not per-vertex.
2617          *
2618          * This can't jump, because invocation 0 executes this. It should
2619          * at least mask out the loads and stores for other invocations.
2620          */
2621         lp_build_if(&if_ctx, gallivm,
2622                     LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
2623                                   invocation_id, bld_base->uint_bld.zero, ""));
2624
2625         /* Determine the layout of one tess factor element in the buffer. */
2626         switch (shader->key.tcs.epilog.prim_mode) {
2627         case PIPE_PRIM_LINES:
2628                 stride = 2; /* 2 dwords, 1 vec2 store */
2629                 outer_comps = 2;
2630                 inner_comps = 0;
2631                 break;
2632         case PIPE_PRIM_TRIANGLES:
2633                 stride = 4; /* 4 dwords, 1 vec4 store */
2634                 outer_comps = 3;
2635                 inner_comps = 1;
2636                 break;
2637         case PIPE_PRIM_QUADS:
2638                 stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
2639                 outer_comps = 4;
2640                 inner_comps = 2;
2641                 break;
2642         default:
2643                 assert(0);
2644                 return;
2645         }
2646
2647         /* Load tess_inner and tess_outer from LDS.
2648          * Any invocation can write them, so we can't get them from a temporary.
2649          */
2650         tess_inner_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSINNER, 0);
2651         tess_outer_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSOUTER, 0);
2652
2653         lds_base = tcs_out_current_patch_data_offset;
2654         lds_inner = LLVMBuildAdd(gallivm->builder, lds_base,
2655                                  lp_build_const_int32(gallivm,
2656                                                       tess_inner_index * 4), "");
2657         lds_outer = LLVMBuildAdd(gallivm->builder, lds_base,
2658                                  lp_build_const_int32(gallivm,
2659                                                       tess_outer_index * 4), "");
2660
2661         for (i = 0; i < outer_comps; i++)
2662                 out[i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer);
2663         for (i = 0; i < inner_comps; i++)
2664                 out[outer_comps+i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_inner);
2665
2666         /* Convert the outputs to vectors for stores. */
2667         vec0 = lp_build_gather_values(gallivm, out, MIN2(stride, 4));
2668         vec1 = NULL;
2669
2670         if (stride > 4)
2671                 vec1 = lp_build_gather_values(gallivm, out+4, stride - 4);
2672
2673         /* Get the buffer. */
2674         rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
2675                                   SI_PARAM_RW_BUFFERS);
2676         buffer = build_indexed_load_const(ctx, rw_buffers,
2677                         lp_build_const_int32(gallivm, SI_HS_RING_TESS_FACTOR));
2678
2679         /* Get the offset. */
2680         tf_base = LLVMGetParam(ctx->radeon_bld.main_fn,
2681                                SI_PARAM_TESS_FACTOR_OFFSET);
2682         byteoffset = LLVMBuildMul(gallivm->builder, rel_patch_id,
2683                                   lp_build_const_int32(gallivm, 4 * stride), "");
2684
2685         lp_build_if(&inner_if_ctx, gallivm,
2686                     LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
2687                                   rel_patch_id, bld_base->uint_bld.zero, ""));
2688
2689         /* Store the dynamic HS control word. */
2690         build_tbuffer_store_dwords(ctx, buffer,
2691                                    lp_build_const_int32(gallivm, 0x80000000),
2692                                    1, lp_build_const_int32(gallivm, 0), tf_base, 0);
2693
2694         lp_build_endif(&inner_if_ctx);
2695
2696         /* Store the tessellation factors. */
2697         build_tbuffer_store_dwords(ctx, buffer, vec0,
2698                                    MIN2(stride, 4), byteoffset, tf_base, 4);
2699         if (vec1)
2700                 build_tbuffer_store_dwords(ctx, buffer, vec1,
2701                                            stride - 4, byteoffset, tf_base, 20);
2702         lp_build_endif(&if_ctx);
2703 }
2704
2705 /**
      * Emit the epilogue of a tessellation control shader (TCS).
      *
      * The relative patch ID, the invocation ID and the LDS offset of the current
      * patch's output data are computed first, in both modes.
      *
      * Non-monolithic shader: no stores are emitted here. The values are inserted
      * into ctx->return_value instead:
      *   - elements 0 and 1: the RW_BUFFERS pointer, split into two i32 halves
      *   - element SI_TCS_NUM_USER_SGPR + 1: the tess factor offset parameter
      *   - the next three elements: rel_patch_id, invocation_id and
      *     tf_lds_offset, each bitcast to float
      *
      * Monolithic shader: si_copy_tcs_inputs() and si_write_tess_factors() are
      * called directly.
      *
      * \param bld_base  TGSI build context of the shader being compiled
      */
2706 static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
2707 {
2708         struct si_shader_context *ctx = si_shader_context(bld_base);
2709         LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;
2710
2711         rel_patch_id = get_rel_patch_id(ctx);
             /* NOTE(review): presumably a 5-bit field at bit 8 of SI_PARAM_REL_IDS;
              * confirm against unpack_param(). */
2712         invocation_id = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);
2713         tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);
2714
2715         if (!ctx->is_monolithic) {
2716                 /* Return epilog parameters from this function. */
2717                 LLVMBuilderRef builder = bld_base->base.gallivm->builder;
2718                 LLVMValueRef ret = ctx->return_value;
2719                 LLVMValueRef rw_buffers, rw0, rw1, tf_soffset;
2720                 unsigned vgpr;
2721
2722                 /* RW_BUFFERS pointer: converted pointer -> i64 -> v2i32 so that
                      * each 32-bit half can be returned in its own element. */
2723                 rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
2724                                           SI_PARAM_RW_BUFFERS);
2725                 rw_buffers = LLVMBuildPtrToInt(builder, rw_buffers, ctx->i64, "");
2726                 rw_buffers = LLVMBuildBitCast(builder, rw_buffers, ctx->v2i32, "");
2727                 rw0 = LLVMBuildExtractElement(builder, rw_buffers,
2728                                               bld_base->uint_bld.zero, "");
2729                 rw1 = LLVMBuildExtractElement(builder, rw_buffers,
2730                                               bld_base->uint_bld.one, "");
2731                 ret = LLVMBuildInsertValue(builder, ret, rw0, 0, "");
2732                 ret = LLVMBuildInsertValue(builder, ret, rw1, 1, "");
2733
2734                 /* Tess factor buffer soffset is after user SGPRs. */
2735                 tf_soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
2736                                           SI_PARAM_TESS_FACTOR_OFFSET);
2737                 ret = LLVMBuildInsertValue(builder, ret, tf_soffset,
2738                                            SI_TCS_NUM_USER_SGPR + 1, "");
2739
2740                 /* VGPRs: the three values are bitcast to float before being
                      * inserted (presumably the VGPR return elements are
                      * float-typed; confirm against the return type setup). */
2741                 rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id);
2742                 invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id);
2743                 tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset);
2744
                     /* They occupy consecutive elements right after tf_soffset. */
2745                 vgpr = SI_TCS_NUM_USER_SGPR + 2;
2746                 ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
2747                 ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
2748                 ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
2749                 ctx->return_value = ret;
2750                 return;
2751         }
2752
             /* Monolithic shader: emit the work inline. */
2753         si_copy_tcs_inputs(bld_base);
2754         si_write_tess_factors(bld_base, rel_patch_id, invocation_id, tf_lds_offset);
2755 }
2756
/**
 * Emit the epilogue of a shader compiled as LS: store every output channel
 * to LDS.
 *
 * The address handed to lds_store() for an output is
 *   rel_vertex * vertex_dw_stride + unique_index * 4
 * and each of the four channels is stored with that address and its channel
 * number.
 *
 * \param bld_base  TGSI build context of the shader being compiled
 */
2757 static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base)
2758 {
2759         struct si_shader_context *ctx = si_shader_context(bld_base);
2760         struct tgsi_shader_info *info = &ctx->shader->selector->info;
2761         struct gallivm_state *gallivm = bld_base->base.gallivm;
2762         LLVMBuilderRef builder = gallivm->builder;
2763         LLVMValueRef rel_vertex, dw_stride, vertex_base;
2764         unsigned out, comp;
2765
2766         rel_vertex = LLVMGetParam(ctx->radeon_bld.main_fn,
2767                                   ctx->param_rel_auto_id);
2768         dw_stride = unpack_param(ctx, SI_PARAM_LS_OUT_LAYOUT, 13, 8);
2769         vertex_base = LLVMBuildMul(builder, rel_vertex, dw_stride, "");
2770
2771         /* The following shader stage (TCS aka HS) reads its inputs
2772          * back from LDS. */
2773         for (out = 0; out < info->num_outputs; out++) {
2774                 LLVMValueRef *slots = ctx->radeon_bld.soa.outputs[out];
2775                 int unique = si_shader_io_get_unique_index(
2776                                 info->output_semantic_name[out],
2777                                 info->output_semantic_index[out]);
2778                 LLVMValueRef rel_offset = lp_build_const_int32(gallivm, unique * 4);
2779                 LLVMValueRef out_addr = LLVMBuildAdd(builder, vertex_base,
2780                                                      rel_offset, "");
2781
2782                 for (comp = 0; comp < 4; comp++) {
2783                         LLVMValueRef value = LLVMBuildLoad(builder, slots[comp], "");
2784                         lds_store(bld_base, comp, out_addr, value);
2785                 }
                     }
2786 }
2787
/**
 * Emit the epilogue of a shader compiled as ES: write all outputs to the
 * ESGS ring, one build_tbuffer_store() call per channel.
 *
 * VIEWPORT_INDEX and LAYER outputs are skipped. Channel c of an output whose
 * unique index is p uses the constant offset (4 * p + c) * 4; the es2gs
 * offset parameter is passed along as soffset.
 *
 * \param bld_base  TGSI build context of the shader being compiled
 */
2788 static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context *bld_base)
2789 {
2790         struct si_shader_context *ctx = si_shader_context(bld_base);
2791         struct tgsi_shader_info *info = &ctx->shader->selector->info;
2792         LLVMBuilderRef builder = bld_base->base.gallivm->builder;
2793         LLVMValueRef ring_offset;
2794         unsigned comp;
2795         int out;
2796
2797         ring_offset = LLVMGetParam(ctx->radeon_bld.main_fn,
2798                                    ctx->param_es2gs_offset);
2799
2800         for (out = 0; out < info->num_outputs; out++) {
2801                 unsigned name = info->output_semantic_name[out];
2802                 LLVMValueRef *slots = ctx->radeon_bld.soa.outputs[out];
2803                 int slot;
2804
2805                 /* These two semantics are not written to the ring. */
2806                 if (name == TGSI_SEMANTIC_VIEWPORT_INDEX ||
2807                     name == TGSI_SEMANTIC_LAYER)
2808                         continue;
2809
2810                 slot = si_shader_io_get_unique_index(name,
2811                                 info->output_semantic_index[out]);
2812
2813                 for (comp = 0; comp < 4; comp++) {
2814                         unsigned byte_offset = (4 * slot + comp) * 4;
2815                         LLVMValueRef value;
2816
2817                         value = LLVMBuildLoad(builder, slots[comp], "");
2818                         value = LLVMBuildBitCast(builder, value, ctx->i32, "");
2819
2820                         build_tbuffer_store(ctx, ctx->esgs_ring, value, 1,
2821                                             LLVMGetUndef(ctx->i32), ring_offset,
2822                                             byte_offset,
2823                                             V_008F0C_BUF_DATA_FORMAT_32,
2824                                             V_008F0C_BUF_NUM_FORMAT_UINT,
                                                 0, 0, 1, 1, 0);
                             }
                     }
2825 }
2826
/**
 * Emit the epilogue of a geometry shader: a single llvm.SI.sendmsg call whose
 * arguments are SENDMSG_GS_OP_NOP | SENDMSG_GS_DONE and the GS wave ID
 * parameter.
 *
 * \param bld_base  TGSI build context of the shader being compiled
 */
2827 static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
2828 {
2829         struct si_shader_context *ctx = si_shader_context(bld_base);
2830         struct gallivm_state *gallivm = bld_base->base.gallivm;
2831         LLVMValueRef msg[2] = {
2832                 lp_build_const_int32(gallivm, SENDMSG_GS_OP_NOP | SENDMSG_GS_DONE),
2833                 LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID),
2834         };
2835         lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg", ctx->voidt,
2836                            msg, 2, LLVMNoUnwindAttribute);
2837 }
2838
/**
 * Emit the epilogue of a shader whose outputs are exported as VS outputs.
 *
 * Steps:
 *  1. PIPE_SHADER_VERTEX only: if the shader has COLOR or BCOLOR outputs, all
 *     of them are saturated inside an IF on bit 0 of the
 *     SI_PARAM_VS_STATE_BITS parameter (vertex color clamping).
 *  2. Every output channel is loaded into a temporary
 *     si_shader_output_values array.
 *  3. Monolithic shader: a PRIMID output is appended when
 *     si_vs_exports_prim_id() says so. Otherwise the primitive ID is placed
 *     in ctx->return_value at VS_EPILOG_PRIMID_LOC.
 *  4. The array is handed to si_llvm_export_vs() and freed.
 *
 * If the temporary array cannot be allocated, an error is printed and the
 * function returns before emitting any epilogue code.
 *
 * \param bld_base  TGSI build context of the shader being compiled
 */
2839 static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base)
2840 {
2841         struct si_shader_context *ctx = si_shader_context(bld_base);
2842         struct gallivm_state *gallivm = bld_base->base.gallivm;
2843         struct tgsi_shader_info *info = &ctx->shader->selector->info;
2844         struct si_shader_output_values *outputs = NULL;
2845         int i,j;
2846
2847         assert(!ctx->is_gs_copy_shader);
2848
             /* One extra element leaves room for the optional PRIMID output. */
2849         outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));
             if (!outputs) {
                     /* Bail out instead of writing through a NULL pointer. */
                     fprintf(stderr, "radeonsi: out of memory in %s\n", __func__);
                     return;
             }
2850
2851         /* Vertex color clamping.
2852          *
2853          * This uses a state constant loaded in a user data SGPR and
2854          * an IF statement is added that clamps all colors if the constant
2855          * is true.
2856          */
2857         if (ctx->type == PIPE_SHADER_VERTEX) {
2858                 struct lp_build_if_state if_ctx;
2859                 LLVMValueRef cond = NULL;
2860                 LLVMValueRef addr, val;
2861
2862                 for (i = 0; i < info->num_outputs; i++) {
2863                         if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR &&
2864                             info->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR)
2865                                 continue;
2866
2867                         /* We've found a color. The IF is opened lazily, only
                              * once, so shaders without colors get no branch. */
2868                         if (!cond) {
2869                                 /* The state is in the first bit of the user SGPR. */
2870                                 cond = LLVMGetParam(ctx->radeon_bld.main_fn,
2871                                                     SI_PARAM_VS_STATE_BITS);
2872                                 cond = LLVMBuildTrunc(gallivm->builder, cond,
2873                                                       ctx->i1, "");
2874                                 lp_build_if(&if_ctx, gallivm, cond);
2875                         }
2876
                             /* Saturate the stored value of each channel in place. */
2877                         for (j = 0; j < 4; j++) {
2878                                 addr = ctx->radeon_bld.soa.outputs[i][j];
2879                                 val = LLVMBuildLoad(gallivm->builder, addr, "");
2880                                 val = radeon_llvm_saturate(bld_base, val);
2881                                 LLVMBuildStore(gallivm->builder, val, addr);
2882                         }
2883                 }
2884
2885                 if (cond)
2886                         lp_build_endif(&if_ctx);
2887         }
2888
             /* Gather the (possibly clamped) output values. */
2889         for (i = 0; i < info->num_outputs; i++) {
2890                 outputs[i].name = info->output_semantic_name[i];
2891                 outputs[i].sid = info->output_semantic_index[i];
2892
2893                 for (j = 0; j < 4; j++)
2894                         outputs[i].values[j] =
2895                                 LLVMBuildLoad(gallivm->builder,
2896                                               ctx->radeon_bld.soa.outputs[i][j],
2897                                               "");
2898         }
2899
             /* Here i == info->num_outputs, i.e. the index of the spare element. */
2900         if (ctx->is_monolithic) {
2901                 /* Export PrimitiveID when PS needs it. */
2902                 if (si_vs_exports_prim_id(ctx->shader)) {
2903                         outputs[i].name = TGSI_SEMANTIC_PRIMID;
2904                         outputs[i].sid = 0;
2905                         outputs[i].values[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2906                                                        get_primitive_id(bld_base, 0));
2907                         outputs[i].values[1] = bld_base->base.undef;
2908                         outputs[i].values[2] = bld_base->base.undef;
2909                         outputs[i].values[3] = bld_base->base.undef;
2910                         i++;
2911                 }
2912         } else {
2913                 /* Return the primitive ID from the LLVM function. */
2914                 ctx->return_value =
2915                         LLVMBuildInsertValue(gallivm->builder,
2916                                              ctx->return_value,
2917                                              bitcast(bld_base, TGSI_TYPE_FLOAT,
2918                                                      get_primitive_id(bld_base, 0)),
2919                                              VS_EPILOG_PRIMID_LOC, "");
2920         }
2921
2922         si_llvm_export_vs(bld_base, outputs, i);
2923         FREE(outputs);
2924 }
2925
/**
 * Emit one llvm.SI.export to the MRTZ target carrying depth, stencil and/or
 * the sample mask.
 *
 * Each non-NULL value is placed in its channel (R = depth, G = stencil,
 * B = sample mask) and sets the matching bit of the enable mask; channels
 * without a value are left undef. The export has the EXEC-valid and DONE
 * arguments set to one.
 *
 * \param bld_base    TGSI build context of the shader being compiled
 * \param depth       depth value, or NULL
 * \param stencil     stencil value, or NULL
 * \param samplemask  sample mask value, or NULL
 *
 * At least one of the three values must be non-NULL (asserted).
 */
2926 static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
2927                            LLVMValueRef depth, LLVMValueRef stencil,
2928                            LLVMValueRef samplemask)
2929 {
2930         struct si_shader_context *ctx = si_shader_context(bld_base);
2931         struct lp_build_context *flt = &bld_base->base;
2932         struct lp_build_context *u32 = &bld_base->uint_bld;
2933         unsigned writemask = 0;
2934         LLVMValueRef exp[9];
2935
2936         assert(depth || stencil || samplemask);
2937
2938         if (depth)
2939                 writemask |= 0x1;
2940         if (stencil)
2941                 writemask |= 0x2;
2942         if (samplemask)
2943                 writemask |= 0x4;
2944
2945         /* SI (except OLAND) has a bug that it only looks
2946          * at the X writemask component. */
2947         if (ctx->screen->b.chip_class == SI &&
2948             ctx->screen->b.family != CHIP_OLAND)
2949                 writemask |= 0x1;
2950
2951         exp[0] = lp_build_const_int32(flt->gallivm, writemask); /* enabled channels */
2952         exp[1] = u32->one; /* whether the EXEC mask is valid */
2953         exp[2] = u32->one; /* DONE bit */
2954         exp[3] = lp_build_const_int32(flt->gallivm, V_008DFC_SQ_EXP_MRTZ); /* target */
2955         exp[4] = u32->zero; /* COMPR flag */
2956
2957         /* R, depth */
2958         exp[5] = depth ? depth : flt->undef;
2959         /* G, stencil test value[0:7], stencil op value[8:15] */
2960         exp[6] = stencil ? stencil : flt->undef;
2961         /* B, sample mask */
2962         exp[7] = samplemask ? samplemask : flt->undef;
2963         /* A, alpha to mask */
2964         exp[8] = flt->undef;
2965
2966         lp_build_intrinsic(flt->gallivm->builder, "llvm.SI.export",
2967                            ctx->voidt, exp, 9, 0);
2976 }
2977
/**
 * Post-process one color output and emit its llvm.SI.export call(s).
 *
 * Depending on the PS epilog key, color[] is modified in place first: all
 * four channels are saturated (clamp_color), alpha is forced to one
 * (alpha_to_one), the alpha test is emitted for output index 0, and alpha is
 * scaled by the sample mask (poly_line_smoothing).
 *
 * Then either
 *  - last_cbuf > 0: the same color is exported to MRT targets 0..last_cbuf, or
 *  - otherwise: the color is exported to the single target MRT + index.
 * Exports whose first argument (the enabled-channel mask) is zero are
 * skipped unless they are the one that has to carry the DONE bit.
 *
 * \param bld_base          TGSI build context of the shader being compiled
 * \param color             four channel values; may be overwritten
 * \param index             color output index
 * \param samplemask_param  parameter index passed on to
 *                          si_scale_alpha_by_sample_mask()
 * \param is_last           true if this output must emit the final export
 *                          (EXEC-valid and DONE arguments set to one)
 */
2978 static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
2979                                 LLVMValueRef *color, unsigned index,
2980                                 unsigned samplemask_param,
2981                                 bool is_last)
2982 {
2983         struct si_shader_context *ctx = si_shader_context(bld_base);
2984         struct lp_build_context *base = &bld_base->base;
2985         int i;
2986
2987         /* Clamp color */
2988         if (ctx->shader->key.ps.epilog.clamp_color)
2989                 for (i = 0; i < 4; i++)
2990                         color[i] = radeon_llvm_saturate(bld_base, color[i]);
2991
2992         /* Alpha to one */
2993         if (ctx->shader->key.ps.epilog.alpha_to_one)
2994                 color[3] = base->one;
2995
2996         /* Alpha test: only emitted for color output 0. */
2997         if (index == 0 &&
2998             ctx->shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
2999                 si_alpha_test(bld_base, color[3]);
3000
3001         /* Line & polygon smoothing */
3002         if (ctx->shader->key.ps.epilog.poly_line_smoothing)
3003                 color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3],
3004                                                          samplemask_param);
3005
3006         /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
3007         if (ctx->shader->key.ps.epilog.last_cbuf > 0) {
                     /* One 9-element argument list per MRT target. */
3008                 LLVMValueRef args[8][9];
                     /* 'last' is the highest target whose enable mask is non-zero. */
3009                 int c, last = -1;
3010
3011                 /* Get the export arguments, also find out what the last one is. */
3012                 for (c = 0; c <= ctx->shader->key.ps.epilog.last_cbuf; c++) {
3013                         si_llvm_init_export_args(bld_base, color,
3014                                                  V_008DFC_SQ_EXP_MRT + c, args[c]);
3015                         if (args[c][0] != bld_base->uint_bld.zero)
3016                                 last = c;
3017                 }
3018
3019                 /* Emit all exports.
                      * NOTE(review): if is_last is set but every mask is zero
                      * (last == -1), no export with the DONE bit is emitted here;
                      * confirm the caller rules that combination out. */
3020                 for (c = 0; c <= ctx->shader->key.ps.epilog.last_cbuf; c++) {
3021                         if (is_last && last == c) {
3022                                 args[c][1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */
3023                                 args[c][2] = bld_base->uint_bld.one; /* DONE bit */
3024                         } else if (args[c][0] == bld_base->uint_bld.zero)
3025                                 continue; /* unnecessary NULL export */
3026
3027                         lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
3028                                            ctx->voidt, args[c], 9, 0);
3029                 }
3030         } else {
3031                 LLVMValueRef args[9];
3032
3033                 /* Export */
3034                 si_llvm_init_export_args(bld_base, color, V_008DFC_SQ_EXP_MRT + index,
3035                                          args);
                     /* The final export is always emitted, even with a zero mask,
                      * because it carries the DONE bit. */
3036                 if (is_last) {
3037                         args[1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */
3038                         args[2] = bld_base->uint_bld.one; /* DONE bit */
3039                 } else if (args[0] == bld_base->uint_bld.zero)
3040                         return; /* unnecessary NULL export */
3041
3042                 lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
3043                                    ctx->voidt, args, 9, 0);
3044         }
3045 }
3046
/**
 * Emit an llvm.SI.export to the NULL target: no channels enabled, all four
 * data arguments undef, EXEC-valid and DONE arguments set to one.
 *
 * \param bld_base  TGSI build context of the shader being compiled
 */
3047 static void si_export_null(struct lp_build_tgsi_context *bld_base)
3048 {
3049         struct si_shader_context *ctx = si_shader_context(bld_base);
3050         struct gallivm_state *gallivm = bld_base->base.gallivm;
3051         struct lp_build_context *u32 = &bld_base->uint_bld;
3052         LLVMValueRef exp[9] = {
3053                 lp_build_const_int32(gallivm, 0x0), /* enabled channels */
3054                 u32->one, /* whether the EXEC mask is valid */
3055                 u32->one, /* DONE bit */
3056                 lp_build_const_int32(gallivm, V_008DFC_SQ_EXP_NULL),
3057                 u32->zero, /* COMPR flag (0 = 32-bit export) */
3058                 u32->undef, /* R */
3059                 u32->undef, /* G */
3060                 u32->undef, /* B */
3061                 u32->undef, /* A */
3062         };
3063
3064         lp_build_intrinsic(gallivm->builder, "llvm.SI.export",
3065                            ctx->voidt, exp, 9, 0);
3066 }
3067
/**
 * Emit the pixel shader exports directly from the outputs.
 *
 * 1. If the shader writes neither Z, stencil nor the sample mask, the last
 *    color output with a non-zero nibble in spi_shader_col_format is looked
 *    up, because that export has to be flagged as the final one. If there is
 *    none, a NULL export is emitted and the function returns.
 * 2. All outputs are walked: POSITION provides depth (channel 2), STENCIL
 *    the stencil value (channel 1), SAMPLEMASK the sample mask (channel 0),
 *    and each COLOR output is passed to si_export_mrt_color().
 * 3. If any of depth/stencil/sample mask was found, si_export_mrt_z() is
 *    called last.
 *
 * \param bld_base  TGSI build context of the shader being compiled
 */
3068 static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context *bld_base)
3069 {
3070         struct si_shader_context *ctx = si_shader_context(bld_base);
3071         struct si_shader *shader = ctx->shader;
3072         struct lp_build_context *base = &bld_base->base;
3073         struct tgsi_shader_info *info = &shader->selector->info;
3074         LLVMBuilderRef builder = base->gallivm->builder;
3075         LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
3076         int last_color_export = -1;
3077         int i;
3078
3079         /* Determine the last export. If MRTZ is present, it's always last.
3080          * Otherwise, find the last color export.
3081          */
3082         if (!info->writes_z && !info->writes_stencil && !info->writes_samplemask) {
3083                 unsigned spi_format = shader->key.ps.epilog.spi_shader_col_format;
3084
3085                 /* Don't export NULL and return if alpha-test is enabled. */
3086                 if (shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS &&
3087                     shader->key.ps.epilog.alpha_func != PIPE_FUNC_NEVER &&
3088                     (spi_format & 0xf) == 0)
3089                         spi_format |= V_028714_SPI_SHADER_32_AR;
3090
3091                 for (i = 0; i < info->num_outputs; i++) {
3092                         unsigned index = info->output_semantic_index[i];
3093
3094                         if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR)
3095                                 continue;
3096
3097                         /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
3098                         if (shader->key.ps.epilog.last_cbuf > 0) {
3099                                 /* Just set this if any of the colorbuffers are enabled. */
3100                                 if (spi_format &
3101                                     ((1llu << (4 * (shader->key.ps.epilog.last_cbuf + 1))) - 1))
3102                                         last_color_export = i;
3103                                 continue;
3104                         }
3105
                             /* spi_format holds one 4-bit field per color buffer,
                              * i.e. eight of them; shifting a 32-bit value by 32
                              * or more is undefined, so indices beyond that are
                              * treated as having no export format. */
3106                         if (index < 8 && ((spi_format >> (index * 4)) & 0xf))
3107                                 last_color_export = i;
3108                 }
3109
3110                 /* If there are no outputs, export NULL. */
3111                 if (last_color_export == -1) {
3112                         si_export_null(bld_base);
3113                         return;
3114                 }
3115         }
3116
3117         for (i = 0; i < info->num_outputs; i++) {
3118                 unsigned semantic_name = info->output_semantic_name[i];
3119                 unsigned semantic_index = info->output_semantic_index[i];
3120                 unsigned j;
3121                 LLVMValueRef color[4] = {};
3122
3123                 /* Select the correct target */
3124                 switch (semantic_name) {
3125                 case TGSI_SEMANTIC_POSITION:
3126                         depth = LLVMBuildLoad(builder,
3127                                               ctx->radeon_bld.soa.outputs[i][2], "");
3128                         break;
3129                 case TGSI_SEMANTIC_STENCIL:
3130                         stencil = LLVMBuildLoad(builder,
3131                                                 ctx->radeon_bld.soa.outputs[i][1], "");
3132                         break;
3133                 case TGSI_SEMANTIC_SAMPLEMASK:
3134                         samplemask = LLVMBuildLoad(builder,
3135                                                    ctx->radeon_bld.soa.outputs[i][0], "");
3136                         break;
3137                 case TGSI_SEMANTIC_COLOR:
3138                         for (j = 0; j < 4; j++)
3139                                 color[j] = LLVMBuildLoad(builder,
3140                                                          ctx->radeon_bld.soa.outputs[i][j], "");
3141
                             /* Only the output chosen above is flagged as last;
                              * with MRTZ present last_color_export stays -1. */
3142                         si_export_mrt_color(bld_base, color, semantic_index,
3143                                             SI_PARAM_SAMPLE_COVERAGE,
3144                                             last_color_export == i);
3145                         break;
3146                 default:
                             /* semantic_name is unsigned, hence %u. */
3147                         fprintf(stderr,
3148                                 "Warning: SI unhandled fs output type:%u\n",
3149                                 semantic_name);
                             break;
3150                 }
3151         }
3152
3153         if (depth || stencil || samplemask)
3154                 si_export_mrt_z(bld_base, depth, stencil, samplemask);
3155 }
3156
3157 /**
3158  * Return PS outputs in this order:
3159  *
3160  * v[0:3] = color0.xyzw
3161  * v[4:7] = color1.xyzw
3162  * ...
3163  * vN+0 = Depth
3164  * vN+1 = Stencil
3165  * vN+2 = SampleMask
3166  * vN+3 = SampleMaskIn (used for OpenGL smoothing)
3167  *
3168  * The alpha-ref SGPR is returned via its original location.
      *
      * Only colors, depth, stencil and sample mask that the shader actually
      * writes take up a slot; they are packed back to back starting right after
      * SI_SGPR_ALPHA_REF. The input sample mask is placed after them, but never
      * before first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC.
      *
      * A color output whose semantic index does not fit the local color table
      * is reported on stderr and ignored.
      *
      * \param bld_base  TGSI build context of the shader being compiled
3169  */
3170 static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base)
3171 {
3172         struct si_shader_context *ctx = si_shader_context(bld_base);
3173         struct si_shader *shader = ctx->shader;
3174         struct lp_build_context *base = &bld_base->base;
3175         struct tgsi_shader_info *info = &shader->selector->info;
3176         LLVMBuilderRef builder = base->gallivm->builder;
3177         unsigned i, j, first_vgpr, vgpr;
3178
3179         LLVMValueRef color[8][4] = {};
3180         LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
3181         LLVMValueRef ret;
3182
3183         /* Read the output values. */
3184         for (i = 0; i < info->num_outputs; i++) {
3185                 unsigned semantic_name = info->output_semantic_name[i];
3186                 unsigned semantic_index = info->output_semantic_index[i];
3187
3188                 switch (semantic_name) {
3189                 case TGSI_SEMANTIC_COLOR:
3190                         assert(semantic_index < 8);
                             /* The assert disappears in release builds; do not
                              * write past the end of color[] there. */
                             if (semantic_index >= ARRAY_SIZE(color)) {
                                     fprintf(stderr,
                                             "Warning: SI fs color output index out of range:%u\n",
                                             semantic_index);
                                     break;
                             }
3191                         for (j = 0; j < 4; j++) {
3192                                 LLVMValueRef ptr = ctx->radeon_bld.soa.outputs[i][j];
3193                                 LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
3194                                 color[semantic_index][j] = result;
3195                         }
3196                         break;
3197                 case TGSI_SEMANTIC_POSITION:
3198                         depth = LLVMBuildLoad(builder,
3199                                               ctx->radeon_bld.soa.outputs[i][2], "");
3200                         break;
3201                 case TGSI_SEMANTIC_STENCIL:
3202                         stencil = LLVMBuildLoad(builder,
3203                                                 ctx->radeon_bld.soa.outputs[i][1], "");
3204                         break;
3205                 case TGSI_SEMANTIC_SAMPLEMASK:
3206                         samplemask = LLVMBuildLoad(builder,
3207                                                    ctx->radeon_bld.soa.outputs[i][0], "");
3208                         break;
3209                 default:
                             /* semantic_name is unsigned, hence %u. */
3210                         fprintf(stderr, "Warning: SI unhandled fs output type:%u\n",
3211                                 semantic_name);
                             break;
3212                 }
3213         }
3214
3215         /* Fill the return structure. */
3216         ret = ctx->return_value;
3217
3218         /* Set SGPRs. */
3219         ret = LLVMBuildInsertValue(builder, ret,
3220                                    bitcast(bld_base, TGSI_TYPE_SIGNED,
3221                                            LLVMGetParam(ctx->radeon_bld.main_fn,
3222                                                         SI_PARAM_ALPHA_REF)),
3223                                    SI_SGPR_ALPHA_REF, "");
3224
3225         /* Set VGPRs */
3226         first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
3227         for (i = 0; i < ARRAY_SIZE(color); i++) {
                     /* A NULL first channel means this color was never written. */
3228                 if (!color[i][0])
3229                         continue;
3230
3231                 for (j = 0; j < 4; j++)
3232                         ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
3233         }
3234         if (depth)
3235                 ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
3236         if (stencil)
3237                 ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
3238         if (samplemask)
3239                 ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");
3240
3241         /* Add the input sample mask for smoothing at the end. */
3242         if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
3243                 vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
3244         ret = LLVMBuildInsertValue(builder, ret,
3245                                    LLVMGetParam(ctx->radeon_bld.main_fn,
3246                                                 SI_PARAM_SAMPLE_COVERAGE), vgpr++, "");
3247
3248         ctx->return_value = ret;
3249 }
3250
3251 /**
3252  * Extract the size of a buffer, in number of elements, from its v8i32
3253  * resource descriptor and return it as an i32.
      *
      * Element 6 of the descriptor is the size. On VI and later it is divided by
      * the 14-bit field found at bits [16:29] of element 5 (the stride).
      *
      * \param bld_base    TGSI build context of the shader being compiled
      * \param descriptor  the resource descriptor value
      * \return the emitted i32 size value
3254  */
3255 static LLVMValueRef get_buffer_size(
3256         struct lp_build_tgsi_context *bld_base,
3257         LLVMValueRef descriptor)
3258 {
3259         struct si_shader_context *ctx = si_shader_context(bld_base);
3260         struct gallivm_state *gallivm = bld_base->base.gallivm;
3261         LLVMBuilderRef builder = gallivm->builder;
3262         LLVMValueRef num_records, elem_stride;
3263
3264         num_records = LLVMBuildExtractElement(builder, descriptor,
3265                                               lp_build_const_int32(gallivm, 6), "");
3266
3267         /* Before VI, the value is returned as it is. */
3268         if (ctx->screen->b.chip_class < VI)
3269                 return num_records;
3270
3271         /* On VI, the descriptor contains the size in bytes,
3272          * but TXQ must return the size in elements.
3273          * The stride is always non-zero for resources using TXQ.
3274          */
3275         elem_stride = LLVMBuildExtractElement(builder, descriptor,
3276                                               lp_build_const_int32(gallivm, 5), "");
3277         elem_stride = LLVMBuildLShr(builder, elem_stride,
3278                                     lp_build_const_int32(gallivm, 16), "");
3279         elem_stride = LLVMBuildAnd(builder, elem_stride,
3280                                    lp_build_const_int32(gallivm, 0x3FFF), "");
3281
3282         return LLVMBuildUDiv(builder, num_records, elem_stride, "");
3283 }
3284
3285 /**
3286  * Write the textual name of the i32 or vNi32 \p type into \p buf, e.g. for
3287  * building intrinsic names: "i32" for a non-vector type, "v<N>i32" for a
      * vector of N elements.
      *
      * \param type     the LLVM type to name
      * \param buf      destination buffer
      * \param bufsize  size of \p buf in bytes; must be at least 6 (asserted)
3288  */
3289 static void build_int_type_name(
3290         LLVMTypeRef type,
3291         char *buf, unsigned bufsize)
3292 {
3293         assert(bufsize >= 6);
3294
3295         if (LLVMGetTypeKind(type) != LLVMVectorTypeKind) {
3296                 strcpy(buf, "i32");
3297                 return;
3298         }
3299         snprintf(buf, bufsize, "v%ui32", LLVMGetVectorSize(type));
3300 }
3301
/* Forward declaration only; the body is not part of this section of the
 * file. Being static, it has to be defined somewhere in this file. */
3302 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
3303                                 struct lp_build_tgsi_context *bld_base,
3304                                 struct lp_build_emit_data *emit_data);
3305
3306 /* Emit a call to empty inline assembly that is flagged as having side
3307  * effects, so that optimizations (at least of memory accesses) are not
3308  * carried out across the current point in the program.
3309  */
3310 static void emit_optimization_barrier(struct si_shader_context *ctx)
3311 {
3312         LLVMTypeRef void_fn = LLVMFunctionType(ctx->voidt, NULL, 0, false);
3313         LLVMValueRef empty_asm = LLVMConstInlineAsm(void_fn, "", "", true, false);
3314
3315         LLVMBuildCall(ctx->radeon_bld.gallivm.builder, empty_asm, NULL, 0, "");
3316 }
3317
/* Emit a call to the llvm.amdgcn.s.waitcnt intrinsic with the constant
 * argument 0xf70. */
3318 static void emit_waitcnt(struct si_shader_context *ctx)
3319 {
3320         struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3321         LLVMValueRef wait_mask = lp_build_const_int32(gallivm, 0xf70);
3322
3323         lp_build_intrinsic(gallivm->builder, "llvm.amdgcn.s.waitcnt",
3324                            ctx->voidt, &wait_mask, 1, LLVMNoUnwindAttribute);
3327 }
3328
/**
 * Emit callback that lowers the instruction to a single emit_waitcnt() call.
 *
 * \p action and \p emit_data are part of the callback signature but are not
 * used here.
 */
static void membar_emit(
                const struct lp_build_tgsi_action *action,
                struct lp_build_tgsi_context *bld_base,
                struct lp_build_emit_data *emit_data)
{
        emit_waitcnt(si_shader_context(bld_base));
}
3338
3339 static LLVMValueRef
3340 shader_buffer_fetch_rsrc(struct si_shader_context *ctx,
3341                          const struct tgsi_full_src_register *reg)
3342 {
3343         LLVMValueRef ind_index;
3344         LLVMValueRef rsrc_ptr;
3345
3346         if (!reg->Register.Indirect)
3347                 return ctx->shader_buffers[reg->Register.Index];
3348
3349         ind_index = get_bounded_indirect_index(ctx, &reg->Indirect,
3350                                                reg->Register.Index,
3351                                                SI_NUM_SHADER_BUFFERS);
3352
3353         rsrc_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_SHADER_BUFFERS);
3354         return build_indexed_load_const(ctx, rsrc_ptr, ind_index);
3355 }
3356
3357 static bool tgsi_is_array_sampler(unsigned target)
3358 {
3359         return target == TGSI_TEXTURE_1D_ARRAY ||
3360                target == TGSI_TEXTURE_SHADOW1D_ARRAY ||
3361                target == TGSI_TEXTURE_2D_ARRAY ||
3362                target == TGSI_TEXTURE_SHADOW2D_ARRAY ||
3363                target == TGSI_TEXTURE_CUBE_ARRAY ||
3364                target == TGSI_TEXTURE_SHADOWCUBE_ARRAY ||
3365                target == TGSI_TEXTURE_2D_ARRAY_MSAA;
3366 }
3367
3368 static bool tgsi_is_array_image(unsigned target)
3369 {
3370         return target == TGSI_TEXTURE_3D ||
3371                target == TGSI_TEXTURE_CUBE ||
3372                target == TGSI_TEXTURE_1D_ARRAY ||
3373                target == TGSI_TEXTURE_2D_ARRAY ||
3374                target == TGSI_TEXTURE_CUBE_ARRAY ||
3375                target == TGSI_TEXTURE_2D_ARRAY_MSAA;
3376 }
3377
3378 /**
3379  * Given a 256-bit resource descriptor, force the DCC enable bit to off.
3380  *
3381  * At least on Tonga, executing image stores on images with DCC enabled and
3382  * non-trivial can eventually lead to lockups. This can occur when an
3383  * application binds an image as read-only but then uses a shader that writes
3384  * to it. The OpenGL spec allows almost arbitrarily bad behavior (including
3385  * program termination) in this case, but it doesn't cost much to be a bit
3386  * nicer: disabling DCC in the shader still leads to undefined results but
3387  * avoids the lockup.
3388  */
3389 static LLVMValueRef force_dcc_off(struct si_shader_context *ctx,
3390                                   LLVMValueRef rsrc)
3391 {
3392         if (ctx->screen->b.chip_class <= CIK) {
3393                 return rsrc;
3394         } else {
3395                 LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
3396                 LLVMValueRef i32_6 = LLVMConstInt(ctx->i32, 6, 0);
3397                 LLVMValueRef i32_C = LLVMConstInt(ctx->i32, C_008F28_COMPRESSION_EN, 0);
3398                 LLVMValueRef tmp;
3399
3400                 tmp = LLVMBuildExtractElement(builder, rsrc, i32_6, "");
3401                 tmp = LLVMBuildAnd(builder, tmp, i32_C, "");
3402                 return LLVMBuildInsertElement(builder, rsrc, tmp, i32_6, "");
3403         }
3404 }
3405
3406 /**
3407  * Load the resource descriptor for \p image.
3408  */
3409 static void
3410 image_fetch_rsrc(
3411         struct lp_build_tgsi_context *bld_base,
3412         const struct tgsi_full_src_register *image,
3413         bool dcc_off,
3414         LLVMValueRef *rsrc)
3415 {
3416         struct si_shader_context *ctx = si_shader_context(bld_base);
3417
3418         assert(image->Register.File == TGSI_FILE_IMAGE);
3419
3420         if (!image->Register.Indirect) {
3421                 /* Fast path: use preloaded resources */
3422                 *rsrc = ctx->images[image->Register.Index];
3423         } else {
3424                 /* Indexing and manual load */
3425                 LLVMValueRef ind_index;
3426                 LLVMValueRef rsrc_ptr;
3427                 LLVMValueRef tmp;
3428
3429                 /* From the GL_ARB_shader_image_load_store extension spec:
3430                  *
3431                  *    If a shader performs an image load, store, or atomic
3432                  *    operation using an image variable declared as an array,
3433                  *    and if the index used to select an individual element is
3434                  *    negative or greater than or equal to the size of the
3435                  *    array, the results of the operation are undefined but may
3436                  *    not lead to termination.
3437                  */
3438                 ind_index = get_bounded_indirect_index(ctx, &image->Indirect,
3439                                                        image->Register.Index,
3440                                                        SI_NUM_IMAGES);
3441
3442                 rsrc_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_IMAGES);
3443                 tmp = build_indexed_load_const(ctx, rsrc_ptr, ind_index);
3444                 if (dcc_off)
3445                         tmp = force_dcc_off(ctx, tmp);
3446                 *rsrc = tmp;
3447         }
3448 }
3449
3450 static LLVMValueRef image_fetch_coords(
3451                 struct lp_build_tgsi_context *bld_base,
3452                 const struct tgsi_full_instruction *inst,
3453                 unsigned src)
3454 {
3455         struct gallivm_state *gallivm = bld_base->base.gallivm;
3456         LLVMBuilderRef builder = gallivm->builder;
3457         unsigned target = inst->Memory.Texture;
3458         unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
3459         LLVMValueRef coords[4];
3460         LLVMValueRef tmp;
3461         int chan;
3462
3463         for (chan = 0; chan < num_coords; ++chan) {
3464                 tmp = lp_build_emit_fetch(bld_base, inst, src, chan);
3465                 tmp = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
3466                 coords[chan] = tmp;
3467         }
3468
3469         if (num_coords == 1)
3470                 return coords[0];
3471
3472         if (num_coords == 3) {
3473                 /* LLVM has difficulties lowering 3-element vectors. */
3474                 coords[3] = bld_base->uint_bld.undef;
3475                 num_coords = 4;
3476         }
3477
3478         return lp_build_gather_values(gallivm, coords, num_coords);
3479 }
3480
3481 /**
3482  * Append the extra mode bits that are used by image load and store.
3483  */
3484 static void image_append_args(
3485                 struct si_shader_context *ctx,
3486                 struct lp_build_emit_data * emit_data,
3487                 unsigned target,
3488                 bool atomic)
3489 {
3490         const struct tgsi_full_instruction *inst = emit_data->inst;
3491         LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
3492         LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
3493
3494         emit_data->args[emit_data->arg_count++] = i1false; /* r128 */
3495         emit_data->args[emit_data->arg_count++] =
3496                 tgsi_is_array_image(target) ? i1true : i1false; /* da */
3497         if (!atomic) {
3498                 emit_data->args[emit_data->arg_count++] =
3499                         inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
3500                         i1true : i1false; /* glc */
3501         }
3502         emit_data->args[emit_data->arg_count++] = i1false; /* slc */
3503 }
3504
3505 /**
3506  * Given a 256 bit resource, extract the top half (which stores the buffer
3507  * resource in the case of textures and images).
3508  */
3509 static LLVMValueRef extract_rsrc_top_half(
3510                 struct si_shader_context *ctx,
3511                 LLVMValueRef rsrc)
3512 {
3513         struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3514         struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
3515         LLVMTypeRef v2i128 = LLVMVectorType(ctx->i128, 2);
3516
3517         rsrc = LLVMBuildBitCast(gallivm->builder, rsrc, v2i128, "");
3518         rsrc = LLVMBuildExtractElement(gallivm->builder, rsrc, bld_base->uint_bld.one, "");
3519         rsrc = LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v4i32, "");
3520
3521         return rsrc;
3522 }
3523
3524 /**
3525  * Append the resource and indexing arguments for buffer intrinsics.
3526  *
3527  * \param rsrc the v4i32 buffer resource
3528  * \param index index into the buffer (stride-based)
3529  * \param offset byte offset into the buffer
3530  */
3531 static void buffer_append_args(
3532                 struct si_shader_context *ctx,
3533                 struct lp_build_emit_data *emit_data,
3534                 LLVMValueRef rsrc,
3535                 LLVMValueRef index,
3536                 LLVMValueRef offset,
3537                 bool atomic)
3538 {
3539         const struct tgsi_full_instruction *inst = emit_data->inst;
3540         LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
3541         LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
3542
3543         emit_data->args[emit_data->arg_count++] = rsrc;
3544         emit_data->args[emit_data->arg_count++] = index; /* vindex */
3545         emit_data->args[emit_data->arg_count++] = offset; /* voffset */
3546         if (!atomic) {
3547                 emit_data->args[emit_data->arg_count++] =
3548                         inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
3549                         i1true : i1false; /* glc */
3550         }
3551         emit_data->args[emit_data->arg_count++] = i1false; /* slc */
3552 }
3553
3554 static void load_fetch_args(
3555                 struct lp_build_tgsi_context * bld_base,
3556                 struct lp_build_emit_data * emit_data)
3557 {
3558         struct si_shader_context *ctx = si_shader_context(bld_base);
3559         struct gallivm_state *gallivm = bld_base->base.gallivm;
3560         const struct tgsi_full_instruction * inst = emit_data->inst;
3561         unsigned target = inst->Memory.Texture;
3562         LLVMValueRef rsrc;
3563
3564         emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
3565
3566         if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
3567                 LLVMBuilderRef builder = gallivm->builder;
3568                 LLVMValueRef offset;
3569                 LLVMValueRef tmp;
3570
3571                 rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);
3572
3573                 tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
3574                 offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
3575
3576                 buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
3577                                    offset, false);
3578         } else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
3579                 LLVMValueRef coords;
3580
3581                 image_fetch_rsrc(bld_base, &inst->Src[0], false, &rsrc);
3582                 coords = image_fetch_coords(bld_base, inst, 1);
3583
3584                 if (target == TGSI_TEXTURE_BUFFER) {
3585                         rsrc = extract_rsrc_top_half(ctx, rsrc);
3586                         buffer_append_args(ctx, emit_data, rsrc, coords,
3587                                         bld_base->uint_bld.zero, false);
3588                 } else {
3589                         emit_data->args[0] = coords;
3590                         emit_data->args[1] = rsrc;
3591                         emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */
3592                         emit_data->arg_count = 3;
3593
3594                         image_append_args(ctx, emit_data, target, false);
3595                 }
3596         }
3597 }
3598
3599 static void load_emit_buffer(struct si_shader_context *ctx,
3600                              struct lp_build_emit_data *emit_data)
3601 {
3602         const struct tgsi_full_instruction *inst = emit_data->inst;
3603         struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3604         LLVMBuilderRef builder = gallivm->builder;
3605         uint writemask = inst->Dst[0].Register.WriteMask;
3606         uint count = util_last_bit(writemask);
3607         const char *intrinsic_name;
3608         LLVMTypeRef dst_type;
3609
3610         switch (count) {
3611         case 1:
3612                 intrinsic_name = "llvm.amdgcn.buffer.load.f32";
3613                 dst_type = ctx->f32;
3614                 break;
3615         case 2:
3616                 intrinsic_name = "llvm.amdgcn.buffer.load.v2f32";
3617                 dst_type = LLVMVectorType(ctx->f32, 2);
3618                 break;
3619         default: // 3 & 4
3620                 intrinsic_name = "llvm.amdgcn.buffer.load.v4f32";
3621                 dst_type = ctx->v4f32;
3622                 count = 4;
3623         }
3624
3625         emit_data->output[emit_data->chan] = lp_build_intrinsic(
3626                         builder, intrinsic_name, dst_type,
3627                         emit_data->args, emit_data->arg_count,
3628                         LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
3629 }
3630
3631 static LLVMValueRef get_memory_ptr(struct si_shader_context *ctx,
3632                                    const struct tgsi_full_instruction *inst,
3633                                    LLVMTypeRef type, int arg)
3634 {
3635         struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3636         LLVMBuilderRef builder = gallivm->builder;
3637         LLVMValueRef offset, ptr;
3638         int addr_space;
3639
3640         offset = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base, inst, arg, 0);
3641         offset = LLVMBuildBitCast(builder, offset, ctx->i32, "");
3642
3643         ptr = ctx->shared_memory;
3644         ptr = LLVMBuildGEP(builder, ptr, &offset, 1, "");
3645         addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
3646         ptr = LLVMBuildBitCast(builder, ptr, LLVMPointerType(type, addr_space), "");
3647
3648         return ptr;
3649 }
3650
3651 static void load_emit_memory(
3652                 struct si_shader_context *ctx,
3653                 struct lp_build_emit_data *emit_data)
3654 {
3655         const struct tgsi_full_instruction *inst = emit_data->inst;
3656         struct lp_build_context *base = &ctx->radeon_bld.soa.bld_base.base;
3657         struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3658         LLVMBuilderRef builder = gallivm->builder;
3659         unsigned writemask = inst->Dst[0].Register.WriteMask;
3660         LLVMValueRef channels[4], ptr, derived_ptr, index;
3661         int chan;
3662
3663         ptr = get_memory_ptr(ctx, inst, base->elem_type, 1);
3664
3665         for (chan = 0; chan < 4; ++chan) {
3666                 if (!(writemask & (1 << chan))) {
3667                         channels[chan] = LLVMGetUndef(base->elem_type);
3668                         continue;
3669                 }
3670
3671                 index = lp_build_const_int32(gallivm, chan);
3672                 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
3673                 channels[chan] = LLVMBuildLoad(builder, derived_ptr, "");
3674         }
3675         emit_data->output[emit_data->chan] = lp_build_gather_values(gallivm, channels, 4);
3676 }
3677
3678 static void load_emit(
3679                 const struct lp_build_tgsi_action *action,
3680                 struct lp_build_tgsi_context *bld_base,
3681                 struct lp_build_emit_data *emit_data)
3682 {
3683         struct si_shader_context *ctx = si_shader_context(bld_base);
3684         struct gallivm_state *gallivm = bld_base->base.gallivm;
3685         LLVMBuilderRef builder = gallivm->builder;
3686         const struct tgsi_full_instruction * inst = emit_data->inst;
3687         char intrinsic_name[32];
3688         char coords_type[8];
3689
3690         if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
3691                 load_emit_memory(ctx, emit_data);
3692                 return;
3693         }
3694
3695         if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
3696                 emit_waitcnt(ctx);
3697
3698         if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
3699                 load_emit_buffer(ctx, emit_data);
3700                 return;
3701         }
3702
3703         if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
3704                 emit_data->output[emit_data->chan] =
3705                         lp_build_intrinsic(
3706                                 builder, "llvm.amdgcn.buffer.load.format.v4f32", emit_data->dst_type,
3707                                 emit_data->args, emit_data->arg_count,
3708                                 LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
3709         } else {
3710                 build_int_type_name(LLVMTypeOf(emit_data->args[0]),
3711                                     coords_type, sizeof(coords_type));
3712
3713                 snprintf(intrinsic_name, sizeof(intrinsic_name),
3714                          "llvm.amdgcn.image.load.%s", coords_type);
3715
3716                 emit_data->output[emit_data->chan] =
3717                         lp_build_intrinsic(
3718                                 builder, intrinsic_name, emit_data->dst_type,
3719                                 emit_data->args, emit_data->arg_count,
3720                                 LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
3721         }
3722 }
3723
3724 static void store_fetch_args(
3725                 struct lp_build_tgsi_context * bld_base,
3726                 struct lp_build_emit_data * emit_data)
3727 {
3728         struct si_shader_context *ctx = si_shader_context(bld_base);
3729         struct gallivm_state *gallivm = bld_base->base.gallivm;
3730         LLVMBuilderRef builder = gallivm->builder;
3731         const struct tgsi_full_instruction * inst = emit_data->inst;
3732         struct tgsi_full_src_register memory;
3733         LLVMValueRef chans[4];
3734         LLVMValueRef data;
3735         LLVMValueRef rsrc;
3736         unsigned chan;
3737
3738         emit_data->dst_type = LLVMVoidTypeInContext(gallivm->context);
3739
3740         for (chan = 0; chan < 4; ++chan) {
3741                 chans[chan] = lp_build_emit_fetch(bld_base, inst, 1, chan);
3742         }
3743         data = lp_build_gather_values(gallivm, chans, 4);
3744
3745         emit_data->args[emit_data->arg_count++] = data;
3746
3747         memory = tgsi_full_src_register_from_dst(&inst->Dst[0]);
3748
3749         if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
3750                 LLVMValueRef offset;
3751                 LLVMValueRef tmp;
3752
3753                 rsrc = shader_buffer_fetch_rsrc(ctx, &memory);
3754
3755                 tmp = lp_build_emit_fetch(bld_base, inst, 0, 0);
3756                 offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
3757
3758                 buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
3759                                    offset, false);
3760         } else if (inst->Dst[0].Register.File == TGSI_FILE_IMAGE) {
3761                 unsigned target = inst->Memory.Texture;
3762                 LLVMValueRef coords;
3763
3764                 coords = image_fetch_coords(bld_base, inst, 0);
3765
3766                 if (target == TGSI_TEXTURE_BUFFER) {
3767                         image_fetch_rsrc(bld_base, &memory, false, &rsrc);
3768
3769                         rsrc = extract_rsrc_top_half(ctx, rsrc);
3770                         buffer_append_args(ctx, emit_data, rsrc, coords,
3771                                         bld_base->uint_bld.zero, false);
3772                 } else {
3773                         emit_data->args[1] = coords;
3774                         image_fetch_rsrc(bld_base, &memory, true, &emit_data->args[2]);
3775                         emit_data->args[3] = lp_build_const_int32(gallivm, 15); /* dmask */
3776                         emit_data->arg_count = 4;
3777
3778                         image_append_args(ctx, emit_data, target, false);
3779                 }
3780         }
3781 }
3782
3783 static void store_emit_buffer(
3784                 struct si_shader_context *ctx,
3785                 struct lp_build_emit_data *emit_data)
3786 {
3787         const struct tgsi_full_instruction *inst = emit_data->inst;
3788         struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3789         LLVMBuilderRef builder = gallivm->builder;
3790         struct lp_build_context *uint_bld = &ctx->radeon_bld.soa.bld_base.uint_bld;
3791         LLVMValueRef base_data = emit_data->args[0];
3792         LLVMValueRef base_offset = emit_data->args[3];
3793         unsigned writemask = inst->Dst[0].Register.WriteMask;
3794
3795         while (writemask) {
3796                 int start, count;
3797                 const char *intrinsic_name;
3798                 LLVMValueRef data;
3799                 LLVMValueRef offset;
3800                 LLVMValueRef tmp;
3801
3802                 u_bit_scan_consecutive_range(&writemask, &start, &count);
3803
3804                 /* Due to an LLVM limitation, split 3-element writes
3805                  * into a 2-element and a 1-element write. */
3806                 if (count == 3) {
3807                         writemask |= 1 << (start + 2);
3808                         count = 2;
3809                 }
3810
3811                 if (count == 4) {
3812                         data = base_data;
3813                         intrinsic_name = "llvm.amdgcn.buffer.store.v4f32";
3814                 } else if (count == 2) {
3815                         LLVMTypeRef v2f32 = LLVMVectorType(ctx->f32, 2);
3816
3817                         tmp = LLVMBuildExtractElement(
3818                                 builder, base_data,
3819                                 lp_build_const_int32(gallivm, start), "");
3820                         data = LLVMBuildInsertElement(
3821                                 builder, LLVMGetUndef(v2f32), tmp,
3822                                 uint_bld->zero, "");
3823
3824                         tmp = LLVMBuildExtractElement(
3825                                 builder, base_data,
3826                                 lp_build_const_int32(gallivm, start + 1), "");
3827                         data = LLVMBuildInsertElement(
3828                                 builder, data, tmp, uint_bld->one, "");
3829
3830                         intrinsic_name = "llvm.amdgcn.buffer.store.v2f32";
3831                 } else {
3832                         assert(count == 1);
3833                         data = LLVMBuildExtractElement(
3834                                 builder, base_data,
3835                                 lp_build_const_int32(gallivm, start), "");
3836                         intrinsic_name = "llvm.amdgcn.buffer.store.f32";
3837                 }
3838
3839                 offset = base_offset;
3840                 if (start != 0) {
3841                         offset = LLVMBuildAdd(
3842                                 builder, offset,
3843                                 lp_build_const_int32(gallivm, start * 4), "");
3844                 }
3845
3846                 emit_data->args[0] = data;
3847                 emit_data->args[3] = offset;
3848
3849                 lp_build_intrinsic(
3850                         builder, intrinsic_name, emit_data->dst_type,
3851                         emit_data->args, emit_data->arg_count,
3852                         LLVMNoUnwindAttribute);
3853         }
3854 }
3855
3856 static void store_emit_memory(
3857                 struct si_shader_context *ctx,
3858                 struct lp_build_emit_data *emit_data)
3859 {
3860         const struct tgsi_full_instruction *inst = emit_data->inst;
3861         struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3862         struct lp_build_context *base = &ctx->radeon_bld.soa.bld_base.base;
3863         LLVMBuilderRef builder = gallivm->builder;
3864         unsigned writemask = inst->Dst[0].Register.WriteMask;
3865         LLVMValueRef ptr, derived_ptr, data, index;
3866         int chan;
3867
3868         ptr = get_memory_ptr(ctx, inst, base->elem_type, 0);
3869
3870         for (chan = 0; chan < 4; ++chan) {
3871                 if (!(writemask & (1 << chan))) {
3872                         continue;
3873                 }
3874                 data = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base, inst, 1, chan);
3875                 index = lp_build_const_int32(gallivm, chan);
3876                 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
3877                 LLVMBuildStore(builder, data, derived_ptr);
3878         }
3879 }
3880
3881 static void store_emit(
3882                 const struct lp_build_tgsi_action *action,
3883                 struct lp_build_tgsi_context *bld_base,
3884                 struct lp_build_emit_data *emit_data)
3885 {
3886         struct si_shader_context *ctx = si_shader_context(bld_base);
3887         struct gallivm_state *gallivm = bld_base->base.gallivm;
3888         LLVMBuilderRef builder = gallivm->builder;
3889         const struct tgsi_full_instruction * inst = emit_data->inst;
3890         unsigned target = inst->Memory.Texture;
3891         char intrinsic_name[32];
3892         char coords_type[8];
3893
3894         if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY) {
3895                 store_emit_memory(ctx, emit_data);
3896                 return;
3897         }
3898
3899         if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
3900                 emit_waitcnt(ctx);
3901
3902         if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
3903                 store_emit_buffer(ctx, emit_data);
3904                 return;
3905         }
3906
3907         if (target == TGSI_TEXTURE_BUFFER) {
3908                 emit_data->output[emit_data->chan] = lp_build_intrinsic(
3909                         builder, "llvm.amdgcn.buffer.store.format.v4f32",
3910                         emit_data->dst_type, emit_data->args, emit_data->arg_count,
3911                         LLVMNoUnwindAttribute);
3912         } else {
3913                 build_int_type_name(LLVMTypeOf(emit_data->args[1]),
3914                                     coords_type, sizeof(coords_type));
3915                 snprintf(intrinsic_name, sizeof(intrinsic_name),
3916                          "llvm.amdgcn.image.store.%s", coords_type);
3917
3918                 emit_data->output[emit_data->chan] =
3919                         lp_build_intrinsic(
3920                                 builder, intrinsic_name, emit_data->dst_type,
3921                                 emit_data->args, emit_data->arg_count,
3922                                 LLVMNoUnwindAttribute);
3923         }
3924 }
3925
3926 static void atomic_fetch_args(
3927                 struct lp_build_tgsi_context * bld_base,
3928                 struct lp_build_emit_data * emit_data)
3929 {
3930         struct si_shader_context *ctx = si_shader_context(bld_base);
3931         struct gallivm_state *gallivm = bld_base->base.gallivm;
3932         LLVMBuilderRef builder = gallivm->builder;
3933         const struct tgsi_full_instruction * inst = emit_data->inst;
3934         LLVMValueRef data1, data2;
3935         LLVMValueRef rsrc;
3936         LLVMValueRef tmp;
3937
3938         emit_data->dst_type = bld_base->base.elem_type;
3939
3940         tmp = lp_build_emit_fetch(bld_base, inst, 2, 0);
3941         data1 = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
3942
3943         if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
3944                 tmp = lp_build_emit_fetch(bld_base, inst, 3, 0);
3945                 data2 = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
3946         }
3947
3948         /* llvm.amdgcn.image/buffer.atomic.cmpswap reflect the hardware order
3949          * of arguments, which is reversed relative to TGSI (and GLSL)
3950          */
3951         if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
3952                 emit_data->args[emit_data->arg_count++] = data2;
3953         emit_data->args[emit_data->arg_count++] = data1;
3954
3955         if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
3956                 LLVMValueRef offset;
3957
3958                 rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);
3959
3960                 tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
3961                 offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
3962
3963                 buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
3964                                    offset, true);
3965         } else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
3966                 unsigned target = inst->Memory.Texture;
3967                 LLVMValueRef coords;
3968
3969                 image_fetch_rsrc(bld_base, &inst->Src[0],
3970                                  target != TGSI_TEXTURE_BUFFER, &rsrc);
3971                 coords = image_fetch_coords(bld_base, inst, 1);
3972
3973                 if (target == TGSI_TEXTURE_BUFFER) {
3974                         rsrc = extract_rsrc_top_half(ctx, rsrc);
3975                         buffer_append_args(ctx, emit_data, rsrc, coords,
3976                                            bld_base->uint_bld.zero, true);
3977                 } else {
3978                         emit_data->args[emit_data->arg_count++] = coords;
3979                         emit_data->args[emit_data->arg_count++] = rsrc;
3980
3981                         image_append_args(ctx, emit_data, target, true);
3982                 }
3983         }
3984 }
3985
3986 static void atomic_emit_memory(struct si_shader_context *ctx,
3987                                struct lp_build_emit_data *emit_data) {
3988         struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3989         LLVMBuilderRef builder = gallivm->builder;
3990         const struct tgsi_full_instruction * inst = emit_data->inst;
3991         LLVMValueRef ptr, result, arg;
3992
3993         ptr = get_memory_ptr(ctx, inst, ctx->i32, 1);
3994
3995         arg = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base, inst, 2, 0);
3996         arg = LLVMBuildBitCast(builder, arg, ctx->i32, "");
3997
3998         if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
3999                 LLVMValueRef new_data;
4000                 new_data = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base,
4001                                                inst, 3, 0);
4002
4003                 new_data = LLVMBuildBitCast(builder, new_data, ctx->i32, "");
4004
4005 #if HAVE_LLVM >= 0x309
4006                 result = LLVMBuildAtomicCmpXchg(builder, ptr, arg, new_data,
4007                                        LLVMAtomicOrderingSequentiallyConsistent,
4008                                        LLVMAtomicOrderingSequentiallyConsistent,
4009                                        false);
4010 #endif
4011
4012                 result = LLVMBuildExtractValue(builder, result, 0, "");
4013         } else {
4014                 LLVMAtomicRMWBinOp op;
4015
4016                 switch(inst->Instruction.Opcode) {
4017                         case TGSI_OPCODE_ATOMUADD:
4018                                 op = LLVMAtomicRMWBinOpAdd;
4019                                 break;
4020                         case TGSI_OPCODE_ATOMXCHG:
4021                                 op = LLVMAtomicRMWBinOpXchg;
4022                                 break;
4023                         case TGSI_OPCODE_ATOMAND:
4024                                 op = LLVMAtomicRMWBinOpAnd;
4025                                 break;
4026                         case TGSI_OPCODE_ATOMOR:
4027                                 op = LLVMAtomicRMWBinOpOr;
4028                                 break;
4029                         case TGSI_OPCODE_ATOMXOR:
4030                                 op = LLVMAtomicRMWBinOpXor;
4031                                 break;
4032                         case TGSI_OPCODE_ATOMUMIN:
4033                                 op = LLVMAtomicRMWBinOpUMin;
4034                                 break;
4035                         case TGSI_OPCODE_ATOMUMAX:
4036                                 op = LLVMAtomicRMWBinOpUMax;
4037                                 break;
4038                         case TGSI_OPCODE_ATOMIMIN:
4039                                 op = LLVMAtomicRMWBinOpMin;
4040                                 break;
4041                         case TGSI_OPCODE_ATOMIMAX:
4042                                 op = LLVMAtomicRMWBinOpMax;
4043                                 break;
4044                         default:
4045                                 unreachable("unknown atomic opcode");
4046                 }
4047
4048                 result = LLVMBuildAtomicRMW(builder, op, ptr, arg,
4049                                        LLVMAtomicOrderingSequentiallyConsistent,
4050                                        false);
4051         }
4052         emit_data->output[emit_data->chan] = LLVMBuildBitCast(builder, result, emit_data->dst_type, "");
4053 }
4054
4055 static void atomic_emit(
4056                 const struct lp_build_tgsi_action *action,
4057                 struct lp_build_tgsi_context *bld_base,
4058                 struct lp_build_emit_data *emit_data)
4059 {
4060         struct si_shader_context *ctx = si_shader_context(bld_base);
4061         struct gallivm_state *gallivm = bld_base->base.gallivm;
4062         LLVMBuilderRef builder = gallivm->builder;
4063         const struct tgsi_full_instruction * inst = emit_data->inst;
4064         char intrinsic_name[40];
4065         LLVMValueRef tmp;
4066
4067         if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
4068                 atomic_emit_memory(ctx, emit_data);
4069                 return;
4070         }
4071
4072         if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
4073             inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
4074                 snprintf(intrinsic_name, sizeof(intrinsic_name),
4075                          "llvm.amdgcn.buffer.atomic.%s", action->intr_name);
4076         } else {
4077                 char coords_type[8];
4078
4079                 build_int_type_name(LLVMTypeOf(emit_data->args[1]),
4080                                     coords_type, sizeof(coords_type));
4081                 snprintf(intrinsic_name, sizeof(intrinsic_name),
4082                          "llvm.amdgcn.image.atomic.%s.%s",
4083                          action->intr_name, coords_type);
4084         }
4085
4086         tmp = lp_build_intrinsic(
4087                 builder, intrinsic_name, bld_base->uint_bld.elem_type,
4088                 emit_data->args, emit_data->arg_count,
4089                 LLVMNoUnwindAttribute);
4090         emit_data->output[emit_data->chan] =
4091                 LLVMBuildBitCast(builder, tmp, bld_base->base.elem_type, "");
4092 }
4093
4094 static void resq_fetch_args(
4095                 struct lp_build_tgsi_context * bld_base,
4096                 struct lp_build_emit_data * emit_data)
4097 {
4098         struct si_shader_context *ctx = si_shader_context(bld_base);
4099         struct gallivm_state *gallivm = bld_base->base.gallivm;
4100         const struct tgsi_full_instruction *inst = emit_data->inst;
4101         const struct tgsi_full_src_register *reg = &inst->Src[0];
4102
4103         emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
4104
4105         if (reg->Register.File == TGSI_FILE_BUFFER) {
4106                 emit_data->args[0] = shader_buffer_fetch_rsrc(ctx, reg);
4107                 emit_data->arg_count = 1;
4108         } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
4109                 image_fetch_rsrc(bld_base, reg, false, &emit_data->args[0]);
4110                 emit_data->arg_count = 1;
4111         } else {
4112                 emit_data->args[0] = bld_base->uint_bld.zero; /* mip level */
4113                 image_fetch_rsrc(bld_base, reg, false, &emit_data->args[1]);
4114                 emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */
4115                 emit_data->args[3] = bld_base->uint_bld.zero; /* unorm */
4116                 emit_data->args[4] = bld_base->uint_bld.zero; /* r128 */
4117                 emit_data->args[5] = tgsi_is_array_image(inst->Memory.Texture) ?
4118                         bld_base->uint_bld.one : bld_base->uint_bld.zero; /* da */
4119                 emit_data->args[6] = bld_base->uint_bld.zero; /* glc */
4120                 emit_data->args[7] = bld_base->uint_bld.zero; /* slc */
4121                 emit_data->args[8] = bld_base->uint_bld.zero; /* tfe */
4122                 emit_data->args[9] = bld_base->uint_bld.zero; /* lwe */
4123                 emit_data->arg_count = 10;
4124         }
4125 }
4126
4127 static void resq_emit(
4128                 const struct lp_build_tgsi_action *action,
4129                 struct lp_build_tgsi_context *bld_base,
4130                 struct lp_build_emit_data *emit_data)
4131 {
4132         struct gallivm_state *gallivm = bld_base->base.gallivm;
4133         LLVMBuilderRef builder = gallivm->builder;
4134         const struct tgsi_full_instruction *inst = emit_data->inst;
4135         LLVMValueRef out;
4136
4137         if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
4138                 out = LLVMBuildExtractElement(builder, emit_data->args[0],
4139                                               lp_build_const_int32(gallivm, 2), "");
4140         } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
4141                 out = get_buffer_size(bld_base, emit_data->args[0]);
4142         } else {
4143                 out = lp_build_intrinsic(
4144                         builder, "llvm.SI.getresinfo.i32", emit_data->dst_type,
4145                         emit_data->args, emit_data->arg_count,
4146                         LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
4147
4148                 /* Divide the number of layers by 6 to get the number of cubes. */
4149                 if (inst->Memory.Texture == TGSI_TEXTURE_CUBE_ARRAY) {
4150                         LLVMValueRef imm2 = lp_build_const_int32(gallivm, 2);
4151                         LLVMValueRef imm6 = lp_build_const_int32(gallivm, 6);
4152
4153                         LLVMValueRef z = LLVMBuildExtractElement(builder, out, imm2, "");
4154                         z = LLVMBuildBitCast(builder, z, bld_base->uint_bld.elem_type, "");
4155                         z = LLVMBuildSDiv(builder, z, imm6, "");
4156                         z = LLVMBuildBitCast(builder, z, bld_base->base.elem_type, "");
4157                         out = LLVMBuildInsertElement(builder, out, z, imm2, "");
4158                 }
4159         }
4160
4161         emit_data->output[emit_data->chan] = out;
4162 }
4163
4164 static void set_tex_fetch_args(struct si_shader_context *ctx,
4165                                struct lp_build_emit_data *emit_data,
4166                                unsigned opcode, unsigned target,
4167                                LLVMValueRef res_ptr, LLVMValueRef samp_ptr,
4168                                LLVMValueRef *param, unsigned count,
4169                                unsigned dmask)
4170 {
4171         struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
4172         unsigned num_args;
4173         unsigned is_rect = target == TGSI_TEXTURE_RECT;
4174
4175         /* Pad to power of two vector */
4176         while (count < util_next_power_of_two(count))
4177                 param[count++] = LLVMGetUndef(ctx->i32);
4178
4179         /* Texture coordinates. */
4180         if (count > 1)
4181                 emit_data->args[0] = lp_build_gather_values(gallivm, param, count);
4182         else
4183                 emit_data->args[0] = param[0];
4184
4185         /* Resource. */
4186         emit_data->args[1] = res_ptr;
4187         num_args = 2;
4188
4189         if (opcode == TGSI_OPCODE_TXF || opcode == TGSI_OPCODE_TXQ)
4190                 emit_data->dst_type = ctx->v4i32;
4191         else {
4192                 emit_data->dst_type = ctx->v4f32;
4193
4194                 emit_data->args[num_args++] = samp_ptr;
4195         }
4196
4197         emit_data->args[num_args++] = lp_build_const_int32(gallivm, dmask);
4198         emit_data->args[num_args++] = lp_build_const_int32(gallivm, is_rect); /* unorm */
4199         emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* r128 */
4200         emit_data->args[num_args++] = lp_build_const_int32(gallivm,
4201                                         tgsi_is_array_sampler(target)); /* da */
4202         emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* glc */
4203         emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* slc */
4204         emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* tfe */
4205         emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* lwe */
4206
4207         emit_data->arg_count = num_args;
4208 }
4209
4210 static const struct lp_build_tgsi_action tex_action;
4211
4212 enum desc_type {
4213         DESC_IMAGE,
4214         DESC_FMASK,
4215         DESC_SAMPLER
4216 };
4217
4218 static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements)
4219 {
4220         return LLVMPointerType(LLVMArrayType(elem_type, num_elements),
4221                                CONST_ADDR_SPACE);
4222 }
4223
4224 /**
4225  * Load an image view, fmask view. or sampler state descriptor.
4226  */
4227 static LLVMValueRef get_sampler_desc_custom(struct si_shader_context *ctx,
4228                                             LLVMValueRef list, LLVMValueRef index,
4229                                             enum desc_type type)
4230 {
4231         struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
4232         LLVMBuilderRef builder = gallivm->builder;
4233
4234         switch (type) {
4235         case DESC_IMAGE:
4236                 /* The image is at [0:7]. */
4237                 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
4238                 break;
4239         case DESC_FMASK:
4240                 /* The FMASK is at [8:15]. */
4241                 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
4242                 index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 1, 0), "");
4243                 break;
4244         case DESC_SAMPLER:
4245                 /* The sampler state is at [12:15]. */
4246                 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
4247                 index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 3, 0), "");
4248                 list = LLVMBuildPointerCast(builder, list,
4249                                             const_array(ctx->v4i32, 0), "");
4250                 break;
4251         }
4252
4253         return build_indexed_load_const(ctx, list, index);
4254 }
4255
4256 static LLVMValueRef get_sampler_desc(struct si_shader_context *ctx,
4257                                      LLVMValueRef index, enum desc_type type)
4258 {
4259         LLVMValueRef list = LLVMGetParam(ctx->radeon_bld.main_fn,
4260                                          SI_PARAM_SAMPLERS);
4261
4262         return get_sampler_desc_custom(ctx, list, index, type);
4263 }
4264
4265 /* Disable anisotropic filtering if BASE_LEVEL == LAST_LEVEL.
4266  *
4267  * SI-CI:
4268  *   If BASE_LEVEL == LAST_LEVEL, the shader must disable anisotropic
4269  *   filtering manually. The driver sets img7 to a mask clearing
4270  *   MAX_ANISO_RATIO if BASE_LEVEL == LAST_LEVEL. The shader must do:
4271  *     s_and_b32 samp0, samp0, img7
4272  *
4273  * VI:
4274  *   The ANISO_OVERRIDE sampler field enables this fix in TA.
4275  */
4276 static LLVMValueRef sici_fix_sampler_aniso(struct si_shader_context *ctx,
4277                                            LLVMValueRef res, LLVMValueRef samp)
4278 {
4279         LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
4280         LLVMValueRef img7, samp0;
4281
4282         if (ctx->screen->b.chip_class >= VI)
4283                 return samp;
4284
4285         img7 = LLVMBuildExtractElement(builder, res,
4286                                        LLVMConstInt(ctx->i32, 7, 0), "");
4287         samp0 = LLVMBuildExtractElement(builder, samp,
4288                                         LLVMConstInt(ctx->i32, 0, 0), "");
4289         samp0 = LLVMBuildAnd(builder, samp0, img7, "");
4290         return LLVMBuildInsertElement(builder, samp, samp0,
4291                                       LLVMConstInt(ctx->i32, 0, 0), "");
4292 }
4293
4294 static void tex_fetch_ptrs(
4295         struct lp_build_tgsi_context *bld_base,
4296         struct lp_build_emit_data *emit_data,
4297         LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr, LLVMValueRef *fmask_ptr)
4298 {
4299         struct si_shader_context *ctx = si_shader_context(bld_base);
4300         const struct tgsi_full_instruction *inst = emit_data->inst;
4301         unsigned target = inst->Texture.Texture;
4302         unsigned sampler_src;
4303         unsigned sampler_index;
4304
4305         sampler_src = emit_data->inst->Instruction.NumSrcRegs - 1;
4306         sampler_index = emit_data->inst->Src[sampler_src].Register.Index;
4307
4308         if (emit_data->inst->Src[sampler_src].Register.Indirect) {
4309                 const struct tgsi_full_src_register *reg = &emit_data->inst->Src[sampler_src];
4310                 LLVMValueRef ind_index;
4311
4312                 ind_index = get_bounded_indirect_index(ctx,
4313                                                        &reg->Indirect,
4314                                                        reg->Register.Index,
4315                                                        SI_NUM_SAMPLERS);
4316
4317                 *res_ptr = get_sampler_desc(ctx, ind_index, DESC_IMAGE);
4318
4319                 if (target == TGSI_TEXTURE_2D_MSAA ||
4320                     target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
4321                         if (samp_ptr)
4322                                 *samp_ptr = NULL;
4323                         if (fmask_ptr)
4324                                 *fmask_ptr = get_sampler_desc(ctx, ind_index, DESC_FMASK);
4325                 } else {
4326                         if (samp_ptr) {
4327                                 *samp_ptr = get_sampler_desc(ctx, ind_index, DESC_SAMPLER);
4328                                 *samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr);
4329                         }
4330                         if (fmask_ptr)
4331                                 *fmask_ptr = NULL;
4332                 }
4333         } else {
4334                 *res_ptr = ctx->sampler_views[sampler_index];
4335                 if (samp_ptr)
4336                         *samp_ptr = ctx->sampler_states[sampler_index];
4337                 if (fmask_ptr)
4338                         *fmask_ptr = ctx->fmasks[sampler_index];
4339         }
4340 }
4341
4342 static void txq_fetch_args(
4343         struct lp_build_tgsi_context *bld_base,
4344         struct lp_build_emit_data *emit_data)
4345 {
4346         struct si_shader_context *ctx = si_shader_context(bld_base);
4347         struct gallivm_state *gallivm = bld_base->base.gallivm;
4348         LLVMBuilderRef builder = gallivm->builder;
4349         const struct tgsi_full_instruction *inst = emit_data->inst;
4350         unsigned target = inst->Texture.Texture;
4351         LLVMValueRef res_ptr;
4352         LLVMValueRef address;
4353
4354         tex_fetch_ptrs(bld_base, emit_data, &res_ptr, NULL, NULL);
4355
4356         if (target == TGSI_TEXTURE_BUFFER) {
4357                 /* Read the size from the buffer descriptor directly. */
4358                 LLVMValueRef res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, "");
4359                 emit_data->args[0] = get_buffer_size(bld_base, res);
4360                 return;
4361         }
4362
4363         /* Textures - set the mip level. */
4364         address = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_X);
4365
4366         set_tex_fetch_args(ctx, emit_data, TGSI_OPCODE_TXQ, target, res_ptr,
4367                            NULL, &address, 1, 0xf);
4368 }
4369
4370 static void txq_emit(const struct lp_build_tgsi_action *action,
4371                      struct lp_build_tgsi_context *bld_base,
4372                      struct lp_build_emit_data *emit_data)
4373 {
4374         struct lp_build_context *base = &bld_base->base;
4375         unsigned target = emit_data->inst->Texture.Texture;
4376
4377         if (target == TGSI_TEXTURE_BUFFER) {
4378                 /* Just return the buffer size. */
4379                 emit_data->output[emit_data->chan] = emit_data->args[0];
4380                 return;
4381         }
4382
4383         emit_data->output[emit_data->chan] = lp_build_intrinsic(
4384                 base->gallivm->builder, "llvm.SI.getresinfo.i32",
4385                 emit_data->dst_type, emit_data->args, emit_data->arg_count,
4386                 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
4387
4388         /* Divide the number of layers by 6 to get the number of cubes. */
4389         if (target == TGSI_TEXTURE_CUBE_ARRAY ||
4390             target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
4391                 LLVMBuilderRef builder = bld_base->base.gallivm->builder;
4392                 LLVMValueRef two = lp_build_const_int32(bld_base->base.gallivm, 2);
4393                 LLVMValueRef six = lp_build_const_int32(bld_base->base.gallivm, 6);
4394
4395                 LLVMValueRef v4 = emit_data->output[emit_data->chan];
4396                 LLVMValueRef z = LLVMBuildExtractElement(builder, v4, two, "");
4397                 z = LLVMBuildSDiv(builder, z, six, "");
4398
4399                 emit_data->output[emit_data->chan] =
4400                         LLVMBuildInsertElement(builder, v4, z, two, "");
4401         }
4402 }
4403
4404 static void tex_fetch_args(
4405         struct lp_build_tgsi_context *bld_base,
4406         struct lp_build_emit_data *emit_data)
4407 {
4408         struct si_shader_context *ctx = si_shader_context(bld_base);
4409         struct gallivm_state *gallivm = bld_base->base.gallivm;
4410         const struct tgsi_full_instruction *inst = emit_data->inst;
4411         unsigned opcode = inst->Instruction.Opcode;
4412         unsigned target = inst->Texture.Texture;
4413         LLVMValueRef coords[5], derivs[6];
4414         LLVMValueRef address[16];
4415         unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
4416         int ref_pos = tgsi_util_get_shadow_ref_src_index(target);
4417         unsigned count = 0;
4418         unsigned chan;
4419         unsigned num_deriv_channels = 0;
4420         bool has_offset = inst->Texture.NumOffsets > 0;
4421         LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
4422         unsigned dmask = 0xf;
4423
4424         tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
4425
4426         if (target == TGSI_TEXTURE_BUFFER) {
4427                 LLVMTypeRef v2i128 = LLVMVectorType(ctx->i128, 2);
4428
4429                 /* Bitcast and truncate v8i32 to v16i8. */
4430                 LLVMValueRef res = res_ptr;
4431                 res = LLVMBuildBitCast(gallivm->builder, res, v2i128, "");
4432                 res = LLVMBuildExtractElement(gallivm->builder, res, bld_base->uint_bld.one, "");
4433                 res = LLVMBuildBitCast(gallivm->builder, res, ctx->v16i8, "");
4434
4435                 emit_data->dst_type = ctx->v4f32;
4436                 emit_data->args[0] = res;
4437                 emit_data->args[1] = bld_base->uint_bld.zero;
4438                 emit_data->args[2] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
4439                 emit_data->arg_count = 3;
4440                 return;
4441         }
4442
4443         /* Fetch and project texture coordinates */
4444         coords[3] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_W);
4445         for (chan = 0; chan < 3; chan++ ) {
4446                 coords[chan] = lp_build_emit_fetch(bld_base,
4447                                                    emit_data->inst, 0,
4448                                                    chan);
4449                 if (opcode == TGSI_OPCODE_TXP)
4450                         coords[chan] = lp_build_emit_llvm_binary(bld_base,
4451                                                                  TGSI_OPCODE_DIV,
4452                                                                  coords[chan],
4453                                                                  coords[3]);
4454         }
4455
4456         if (opcode == TGSI_OPCODE_TXP)
4457                 coords[3] = bld_base->base.one;
4458
4459         /* Pack offsets. */
4460         if (has_offset && opcode != TGSI_OPCODE_TXF) {
4461                 /* The offsets are six-bit signed integers packed like this:
4462                  *   X=[5:0], Y=[13:8], and Z=[21:16].
4463                  */
4464                 LLVMValueRef offset[3], pack;
4465
4466                 assert(inst->Texture.NumOffsets == 1);
4467
4468                 for (chan = 0; chan < 3; chan++) {
4469                         offset[chan] = lp_build_emit_fetch_texoffset(bld_base,
4470                                                                      emit_data->inst, 0, chan);
4471                         offset[chan] = LLVMBuildAnd(gallivm->builder, offset[chan],
4472                                                     lp_build_const_int32(gallivm, 0x3f), "");
4473                         if (chan)
4474                                 offset[chan] = LLVMBuildShl(gallivm->builder, offset[chan],
4475                                                             lp_build_const_int32(gallivm, chan*8), "");
4476                 }
4477
4478                 pack = LLVMBuildOr(gallivm->builder, offset[0], offset[1], "");
4479                 pack = LLVMBuildOr(gallivm->builder, pack, offset[2], "");
4480                 address[count++] = pack;
4481         }
4482
4483         /* Pack LOD bias value */
4484         if (opcode == TGSI_OPCODE_TXB)
4485                 address[count++] = coords[3];
4486         if (opcode == TGSI_OPCODE_TXB2)
4487                 address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4488
4489         /* Pack depth comparison value */
4490         if (tgsi_is_shadow_target(target) && opcode != TGSI_OPCODE_LODQ) {
4491                 if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
4492                         address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4493                 } else {
4494                         assert(ref_pos >= 0);
4495                         address[count++] = coords[ref_pos];
4496                 }
4497         }
4498
4499         /* Pack user derivatives */
4500         if (opcode == TGSI_OPCODE_TXD) {
4501                 int param, num_src_deriv_channels;
4502
4503                 switch (target) {
4504                 case TGSI_TEXTURE_3D:
4505                         num_src_deriv_channels = 3;
4506                         num_deriv_channels = 3;
4507                         break;
4508                 case TGSI_TEXTURE_2D:
4509                 case TGSI_TEXTURE_SHADOW2D:
4510                 case TGSI_TEXTURE_RECT:
4511                 case TGSI_TEXTURE_SHADOWRECT:
4512                 case TGSI_TEXTURE_2D_ARRAY:
4513                 case TGSI_TEXTURE_SHADOW2D_ARRAY:
4514                         num_src_deriv_channels = 2;
4515                         num_deriv_channels = 2;
4516                         break;
4517                 case TGSI_TEXTURE_CUBE:
4518                 case TGSI_TEXTURE_SHADOWCUBE:
4519                 case TGSI_TEXTURE_CUBE_ARRAY:
4520                 case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
4521                         /* Cube derivatives will be converted to 2D. */
4522                         num_src_deriv_channels = 3;
4523                         num_deriv_channels = 2;
4524                         break;
4525                 case TGSI_TEXTURE_1D:
4526                 case TGSI_TEXTURE_SHADOW1D:
4527                 case TGSI_TEXTURE_1D_ARRAY:
4528                 case TGSI_TEXTURE_SHADOW1D_ARRAY:
4529                         num_src_deriv_channels = 1;
4530                         num_deriv_channels = 1;
4531                         break;
4532                 default:
4533                         unreachable("invalid target");
4534                 }
4535
4536                 for (param = 0; param < 2; param++)
4537                         for (chan = 0; chan < num_src_deriv_channels; chan++)
4538                                 derivs[param * num_src_deriv_channels + chan] =
4539                                         lp_build_emit_fetch(bld_base, inst, param+1, chan);
4540         }
4541
4542         if (target == TGSI_TEXTURE_CUBE ||
4543             target == TGSI_TEXTURE_CUBE_ARRAY ||
4544             target == TGSI_TEXTURE_SHADOWCUBE ||
4545             target == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
4546                 radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords, derivs);
4547
4548         if (opcode == TGSI_OPCODE_TXD)
4549                 for (int i = 0; i < num_deriv_channels * 2; i++)
4550                         address[count++] = derivs[i];
4551
4552         /* Pack texture coordinates */
4553         address[count++] = coords[0];
4554         if (num_coords > 1)
4555                 address[count++] = coords[1];
4556         if (num_coords > 2)
4557                 address[count++] = coords[2];
4558
4559         /* Pack LOD or sample index */
4560         if (opcode == TGSI_OPCODE_TXL || opcode == TGSI_OPCODE_TXF)
4561                 address[count++] = coords[3];
4562         else if (opcode == TGSI_OPCODE_TXL2)
4563                 address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4564
4565         if (count > 16) {
4566                 assert(!"Cannot handle more than 16 texture address parameters");
4567                 count = 16;
4568         }
4569
4570         for (chan = 0; chan < count; chan++ ) {
4571                 address[chan] = LLVMBuildBitCast(gallivm->builder,
4572                                                  address[chan], ctx->i32, "");
4573         }
4574
4575         /* Adjust the sample index according to FMASK.
4576          *
4577          * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
4578          * which is the identity mapping. Each nibble says which physical sample
4579          * should be fetched to get that sample.
4580          *
4581          * For example, 0x11111100 means there are only 2 samples stored and
4582          * the second sample covers 3/4 of the pixel. When reading samples 0
4583          * and 1, return physical sample 0 (determined by the first two 0s
4584          * in FMASK), otherwise return physical sample 1.
4585          *
4586          * The sample index should be adjusted as follows:
4587          *   sample_index = (fmask >> (sample_index * 4)) & 0xF;
4588          */
4589         if (target == TGSI_TEXTURE_2D_MSAA ||
4590             target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
4591                 struct lp_build_context *uint_bld = &bld_base->uint_bld;
4592                 struct lp_build_emit_data txf_emit_data = *emit_data;
4593                 LLVMValueRef txf_address[4];
4594                 unsigned txf_count = count;
4595                 struct tgsi_full_instruction inst = {};
4596
4597                 memcpy(txf_address, address, sizeof(txf_address));
4598
4599                 if (target == TGSI_TEXTURE_2D_MSAA) {
4600                         txf_address[2] = bld_base->uint_bld.zero;
4601                 }
4602                 txf_address[3] = bld_base->uint_bld.zero;
4603
4604                 /* Read FMASK using TXF. */
4605                 inst.Instruction.Opcode = TGSI_OPCODE_TXF;
4606                 inst.Texture.Texture = target;
4607                 txf_emit_data.inst = &inst;
4608                 txf_emit_data.chan = 0;
4609                 set_tex_fetch_args(ctx, &txf_emit_data, TGSI_OPCODE_TXF,
4610                                    target, fmask_ptr, NULL,
4611                                    txf_address, txf_count, 0xf);
4612                 build_tex_intrinsic(&tex_action, bld_base, &txf_emit_data);
4613
4614                 /* Initialize some constants. */
4615                 LLVMValueRef four = LLVMConstInt(ctx->i32, 4, 0);
4616                 LLVMValueRef F = LLVMConstInt(ctx->i32, 0xF, 0);
4617
4618                 /* Apply the formula. */
4619                 LLVMValueRef fmask =
4620                         LLVMBuildExtractElement(gallivm->builder,
4621                                                 txf_emit_data.output[0],
4622                                                 uint_bld->zero, "");
4623
4624                 unsigned sample_chan = target == TGSI_TEXTURE_2D_MSAA ? 2 : 3;
4625
4626                 LLVMValueRef sample_index4 =
4627                         LLVMBuildMul(gallivm->builder, address[sample_chan], four, "");
4628
4629                 LLVMValueRef shifted_fmask =
4630                         LLVMBuildLShr(gallivm->builder, fmask, sample_index4, "");
4631
4632                 LLVMValueRef final_sample =
4633                         LLVMBuildAnd(gallivm->builder, shifted_fmask, F, "");
4634
4635                 /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
4636                  * resource descriptor is 0 (invalid),
4637                  */
4638                 LLVMValueRef fmask_desc =
4639                         LLVMBuildBitCast(gallivm->builder, fmask_ptr,
4640                                          ctx->v8i32, "");
4641
4642                 LLVMValueRef fmask_word1 =
4643                         LLVMBuildExtractElement(gallivm->builder, fmask_desc,
4644                                                 uint_bld->one, "");
4645
4646                 LLVMValueRef word1_is_nonzero =
4647                         LLVMBuildICmp(gallivm->builder, LLVMIntNE,
4648                                       fmask_word1, uint_bld->zero, "");
4649
4650                 /* Replace the MSAA sample index. */
4651                 address[sample_chan] =
4652                         LLVMBuildSelect(gallivm->builder, word1_is_nonzero,
4653                                         final_sample, address[sample_chan], "");
4654         }
4655
4656         if (opcode == TGSI_OPCODE_TXF) {
4657                 /* add tex offsets */
4658                 if (inst->Texture.NumOffsets) {
4659                         struct lp_build_context *uint_bld = &bld_base->uint_bld;
4660                         struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
4661                         const struct tgsi_texture_offset *off = inst->TexOffsets;
4662
4663                         assert(inst->Texture.NumOffsets == 1);
4664
4665                         switch (target) {
4666                         case TGSI_TEXTURE_3D:
4667                                 address[2] = lp_build_add(uint_bld, address[2],
4668                                                 bld->immediates[off->Index][off->SwizzleZ]);
4669                                 /* fall through */
4670                         case TGSI_TEXTURE_2D:
4671                         case TGSI_TEXTURE_SHADOW2D:
4672                         case TGSI_TEXTURE_RECT:
4673                         case TGSI_TEXTURE_SHADOWRECT:
4674                         case TGSI_TEXTURE_2D_ARRAY:
4675                         case TGSI_TEXTURE_SHADOW2D_ARRAY:
4676                                 address[1] =
4677                                         lp_build_add(uint_bld, address[1],
4678                                                 bld->immediates[off->Index][off->SwizzleY]);
4679                                 /* fall through */
4680                         case TGSI_TEXTURE_1D:
4681                         case TGSI_TEXTURE_SHADOW1D:
4682                         case TGSI_TEXTURE_1D_ARRAY:
4683                         case TGSI_TEXTURE_SHADOW1D_ARRAY:
4684                                 address[0] =
4685                                         lp_build_add(uint_bld, address[0],
4686                                                 bld->immediates[off->Index][off->SwizzleX]);
4687                                 break;
4688                                 /* texture offsets do not apply to other texture targets */
4689                         }
4690                 }
4691         }
4692
4693         if (opcode == TGSI_OPCODE_TG4) {
4694                 unsigned gather_comp = 0;
4695
4696                 /* DMASK was repurposed for GATHER4. 4 components are always
4697                  * returned and DMASK works like a swizzle - it selects
4698                  * the component to fetch. The only valid DMASK values are
4699                  * 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
4700                  * (red,red,red,red) etc.) The ISA document doesn't mention
4701                  * this.
4702                  */
4703
4704                 /* Get the component index from src1.x for Gather4. */
4705                 if (!tgsi_is_shadow_target(target)) {
4706                         LLVMValueRef (*imms)[4] = lp_soa_context(bld_base)->immediates;
4707                         LLVMValueRef comp_imm;
4708                         struct tgsi_src_register src1 = inst->Src[1].Register;
4709
4710                         assert(src1.File == TGSI_FILE_IMMEDIATE);
4711
4712                         comp_imm = imms[src1.Index][src1.SwizzleX];
4713                         gather_comp = LLVMConstIntGetZExtValue(comp_imm);
4714                         gather_comp = CLAMP(gather_comp, 0, 3);
4715                 }
4716
4717                 dmask = 1 << gather_comp;
4718         }
4719
4720         set_tex_fetch_args(ctx, emit_data, opcode, target, res_ptr,
4721                            samp_ptr, address, count, dmask);
4722 }
4723
4724 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
4725                                 struct lp_build_tgsi_context *bld_base,
4726                                 struct lp_build_emit_data *emit_data)
4727 {
4728         struct si_shader_context *ctx = si_shader_context(bld_base);
4729         struct lp_build_context *base = &bld_base->base;
4730         unsigned opcode = emit_data->inst->Instruction.Opcode;
4731         unsigned target = emit_data->inst->Texture.Texture;
4732         char intr_name[127];
4733         bool has_offset = emit_data->inst->Texture.NumOffsets > 0;
4734         bool is_shadow = tgsi_is_shadow_target(target);
4735         char type[64];
4736         const char *name = "llvm.SI.image.sample";
4737         const char *infix = "";
4738
4739         if (target == TGSI_TEXTURE_BUFFER) {
4740                 emit_data->output[emit_data->chan] = lp_build_intrinsic(
4741                         base->gallivm->builder,
4742                         "llvm.SI.vs.load.input", emit_data->dst_type,
4743                         emit_data->args, emit_data->arg_count,
4744                         LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
4745                 return;
4746         }
4747
4748         switch (opcode) {
4749         case TGSI_OPCODE_TXF:
4750                 name = target == TGSI_TEXTURE_2D_MSAA ||
4751                        target == TGSI_TEXTURE_2D_ARRAY_MSAA ?
4752                                "llvm.SI.image.load" :
4753                                "llvm.SI.image.load.mip";
4754                 is_shadow = false;
4755                 has_offset = false;
4756                 break;
4757         case TGSI_OPCODE_LODQ:
4758                 name = "llvm.SI.getlod";
4759                 is_shadow = false;
4760                 has_offset = false;
4761                 break;
4762         case TGSI_OPCODE_TEX:
4763         case TGSI_OPCODE_TEX2:
4764         case TGSI_OPCODE_TXP:
4765                 if (ctx->type != PIPE_SHADER_FRAGMENT)
4766                         infix = ".lz";
4767                 break;
4768         case TGSI_OPCODE_TXB:
4769         case TGSI_OPCODE_TXB2:
4770                 assert(ctx->type == PIPE_SHADER_FRAGMENT);
4771                 infix = ".b";
4772                 break;
4773         case TGSI_OPCODE_TXL:
4774         case TGSI_OPCODE_TXL2:
4775                 infix = ".l";
4776                 break;
4777         case TGSI_OPCODE_TXD:
4778                 infix = ".d";
4779                 break;
4780         case TGSI_OPCODE_TG4:
4781                 name = "llvm.SI.gather4";
4782                 infix = ".lz";
4783                 break;
4784         default:
4785                 assert(0);
4786                 return;
4787         }
4788
4789         /* Add the type and suffixes .c, .o if needed. */
4790         build_int_type_name(LLVMTypeOf(emit_data->args[0]), type, sizeof(type));
4791         sprintf(intr_name, "%s%s%s%s.%s",
4792                 name, is_shadow ? ".c" : "", infix,
4793                 has_offset ? ".o" : "", type);
4794
4795         emit_data->output[emit_data->chan] = lp_build_intrinsic(
4796                 base->gallivm->builder, intr_name, emit_data->dst_type,
4797                 emit_data->args, emit_data->arg_count,
4798                 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
4799 }
4800
4801 static void si_llvm_emit_txqs(
4802         const struct lp_build_tgsi_action *action,
4803         struct lp_build_tgsi_context *bld_base,
4804         struct lp_build_emit_data *emit_data)
4805 {
4806         struct si_shader_context *ctx = si_shader_context(bld_base);
4807         struct gallivm_state *gallivm = bld_base->base.gallivm;
4808         LLVMBuilderRef builder = gallivm->builder;
4809         LLVMValueRef res, samples;
4810         LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
4811
4812         tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
4813
4814
4815         /* Read the samples from the descriptor directly. */
4816         res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, "");
4817         samples = LLVMBuildExtractElement(
4818                 builder, res,
4819                 lp_build_const_int32(gallivm, 3), "");
4820         samples = LLVMBuildLShr(builder, samples,
4821                                 lp_build_const_int32(gallivm, 16), "");
4822         samples = LLVMBuildAnd(builder, samples,
4823                                lp_build_const_int32(gallivm, 0xf), "");
4824         samples = LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1),
4825                                samples, "");
4826
4827         emit_data->output[emit_data->chan] = samples;
4828 }
4829
4830 /*
 * SI implements derivatives using the local data store (LDS).
4832  * All writes to the LDS happen in all executing threads at
4833  * the same time. TID is the Thread ID for the current
4834  * thread and is a value between 0 and 63, representing
4835  * the thread's position in the wavefront.
4836  *
4837  * For the pixel shader threads are grouped into quads of four pixels.
4838  * The TIDs of the pixels of a quad are:
4839  *
4840  *  +------+------+
4841  *  |4n + 0|4n + 1|
4842  *  +------+------+
4843  *  |4n + 2|4n + 3|
4844  *  +------+------+
4845  *
4846  * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
4847  * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
4848  * the current pixel's column, and masking with 0xfffffffe yields the TID
4849  * of the left pixel of the current pixel's row.
4850  *
4851  * Adding 1 yields the TID of the pixel to the right of the left pixel, and
4852  * adding 2 yields the TID of the pixel below the top pixel.
4853  */
/* Thread-ID masks selecting a reference pixel inside the 2x2 quad. */
#define TID_MASK_TOP_LEFT 0xfffffffcu /* clears bits 0 and 1 */
#define TID_MASK_TOP      0xfffffffdu /* clears bit 1 */
#define TID_MASK_LEFT     0xfffffffeu /* clears bit 0 */
4858
4859 static void si_llvm_emit_ddxy(
4860         const struct lp_build_tgsi_action *action,
4861         struct lp_build_tgsi_context *bld_base,
4862         struct lp_build_emit_data *emit_data)
4863 {
4864         struct si_shader_context *ctx = si_shader_context(bld_base);
4865         struct gallivm_state *gallivm = bld_base->base.gallivm;
4866         const struct tgsi_full_instruction *inst = emit_data->inst;
4867         unsigned opcode = inst->Instruction.Opcode;
4868         LLVMValueRef indices[2];
4869         LLVMValueRef store_ptr, load_ptr0, load_ptr1;
4870         LLVMValueRef tl, trbl, result[4];
4871         LLVMValueRef tl_tid, trbl_tid;
4872         unsigned swizzle[4];
4873         unsigned c;
4874         int idx;
4875         unsigned mask;
4876
4877         indices[0] = bld_base->uint_bld.zero;
4878         indices[1] = get_thread_id(ctx);
4879         store_ptr = LLVMBuildGEP(gallivm->builder, ctx->lds,
4880                                  indices, 2, "");
4881
4882         if (opcode == TGSI_OPCODE_DDX_FINE)
4883                 mask = TID_MASK_LEFT;
4884         else if (opcode == TGSI_OPCODE_DDY_FINE)
4885                 mask = TID_MASK_TOP;
4886         else
4887                 mask = TID_MASK_TOP_LEFT;
4888
4889         tl_tid = LLVMBuildAnd(gallivm->builder, indices[1],
4890                                 lp_build_const_int32(gallivm, mask), "");
4891         indices[1] = tl_tid;
4892         load_ptr0 = LLVMBuildGEP(gallivm->builder, ctx->lds,
4893                                  indices, 2, "");
4894
4895         /* for DDX we want to next X pixel, DDY next Y pixel. */
4896         idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;
4897         trbl_tid = LLVMBuildAdd(gallivm->builder, indices[1],
4898                                   lp_build_const_int32(gallivm, idx), "");
4899         indices[1] = trbl_tid;
4900         load_ptr1 = LLVMBuildGEP(gallivm->builder, ctx->lds,
4901                                  indices, 2, "");
4902
4903         for (c = 0; c < 4; ++c) {
4904                 unsigned i;
4905                 LLVMValueRef val;
4906                 LLVMValueRef args[2];
4907
4908                 swizzle[c] = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], c);
4909                 for (i = 0; i < c; ++i) {
4910                         if (swizzle[i] == swizzle[c]) {
4911                                 result[c] = result[i];
4912                                 break;
4913                         }
4914                 }
4915                 if (i != c)
4916                         continue;
4917
4918                 val = LLVMBuildBitCast(gallivm->builder,
4919                                 lp_build_emit_fetch(bld_base, inst, 0, c),
4920                                                 ctx->i32, "");
4921
4922                 if ((HAVE_LLVM >= 0x0309) && ctx->screen->b.family >= CHIP_TONGA) {
4923
4924                         args[0] = LLVMBuildMul(gallivm->builder, tl_tid,
4925                                         lp_build_const_int32(gallivm, 4), "");
4926                         args[1] = val;
4927                         tl = lp_build_intrinsic(gallivm->builder,
4928                                         "llvm.amdgcn.ds.bpermute", ctx->i32,
4929                                         args, 2, LLVMReadNoneAttribute);
4930
4931                         args[0] = LLVMBuildMul(gallivm->builder, trbl_tid,
4932                                         lp_build_const_int32(gallivm, 4), "");
4933                         trbl = lp_build_intrinsic(gallivm->builder,
4934                                         "llvm.amdgcn.ds.bpermute", ctx->i32,
4935                                         args, 2, LLVMReadNoneAttribute);
4936                 } else {
4937                         LLVMBuildStore(gallivm->builder, val, store_ptr);
4938                         tl = LLVMBuildLoad(gallivm->builder, load_ptr0, "");
4939                         trbl = LLVMBuildLoad(gallivm->builder, load_ptr1, "");
4940                 }
4941                 tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");
4942                 trbl = LLVMBuildBitCast(gallivm->builder, trbl, ctx->f32, "");
4943                 result[c] = LLVMBuildFSub(gallivm->builder, trbl, tl, "");
4944         }
4945
4946         emit_data->output[0] = lp_build_gather_values(gallivm, result, 4);
4947 }
4948
4949 /*
4950  * this takes an I,J coordinate pair,
4951  * and works out the X and Y derivatives.
4952  * it returns DDX(I), DDX(J), DDY(I), DDY(J).
4953  */
4954 static LLVMValueRef si_llvm_emit_ddxy_interp(
4955         struct lp_build_tgsi_context *bld_base,
4956         LLVMValueRef interp_ij)
4957 {
4958         struct si_shader_context *ctx = si_shader_context(bld_base);
4959         struct gallivm_state *gallivm = bld_base->base.gallivm;
4960         LLVMValueRef indices[2];
4961         LLVMValueRef store_ptr, load_ptr_x, load_ptr_y, load_ptr_ddx, load_ptr_ddy, temp, temp2;
4962         LLVMValueRef tl, tr, bl, result[4];
4963         unsigned c;
4964
4965         indices[0] = bld_base->uint_bld.zero;
4966         indices[1] = get_thread_id(ctx);
4967         store_ptr = LLVMBuildGEP(gallivm->builder, ctx->lds,
4968                                  indices, 2, "");
4969
4970         temp = LLVMBuildAnd(gallivm->builder, indices[1],
4971                             lp_build_const_int32(gallivm, TID_MASK_LEFT), "");
4972
4973         temp2 = LLVMBuildAnd(gallivm->builder, indices[1],
4974                              lp_build_const_int32(gallivm, TID_MASK_TOP), "");
4975
4976         indices[1] = temp;
4977         load_ptr_x = LLVMBuildGEP(gallivm->builder, ctx->lds,
4978                                   indices, 2, "");
4979
4980         indices[1] = temp2;
4981         load_ptr_y = LLVMBuildGEP(gallivm->builder, ctx->lds,
4982                                   indices, 2, "");
4983
4984         indices[1] = LLVMBuildAdd(gallivm->builder, temp,
4985                                   lp_build_const_int32(gallivm, 1), "");
4986         load_ptr_ddx = LLVMBuildGEP(gallivm->builder, ctx->lds,
4987                                    indices, 2, "");
4988
4989         indices[1] = LLVMBuildAdd(gallivm->builder, temp2,
4990                                   lp_build_const_int32(gallivm, 2), "");
4991         load_ptr_ddy = LLVMBuildGEP(gallivm->builder, ctx->lds,
4992                                    indices, 2, "");
4993
4994         for (c = 0; c < 2; ++c) {
4995                 LLVMValueRef store_val;
4996                 LLVMValueRef c_ll = lp_build_const_int32(gallivm, c);
4997
4998                 store_val = LLVMBuildExtractElement(gallivm->builder,
4999                                                     interp_ij, c_ll, "");
5000                 LLVMBuildStore(gallivm->builder,
5001                                store_val,
5002                                store_ptr);
5003
5004                 tl = LLVMBuildLoad(gallivm->builder, load_ptr_x, "");
5005                 tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");
5006
5007                 tr = LLVMBuildLoad(gallivm->builder, load_ptr_ddx, "");
5008                 tr = LLVMBuildBitCast(gallivm->builder, tr, ctx->f32, "");
5009
5010                 result[c] = LLVMBuildFSub(gallivm->builder, tr, tl, "");
5011
5012                 tl = LLVMBuildLoad(gallivm->builder, load_ptr_y, "");
5013                 tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");
5014
5015                 bl = LLVMBuildLoad(gallivm->builder, load_ptr_ddy, "");
5016                 bl = LLVMBuildBitCast(gallivm->builder, bl, ctx->f32, "");
5017
5018                 result[c + 2] = LLVMBuildFSub(gallivm->builder, bl, tl, "");
5019         }
5020
5021         return lp_build_gather_values(gallivm, result, 4);
5022 }
5023
5024 static void interp_fetch_args(
5025         struct lp_build_tgsi_context *bld_base,
5026         struct lp_build_emit_data *emit_data)
5027 {
5028         struct si_shader_context *ctx = si_shader_context(bld_base);
5029         struct gallivm_state *gallivm = bld_base->base.gallivm;
5030         const struct tgsi_full_instruction *inst = emit_data->inst;
5031
5032         if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
5033                 /* offset is in second src, first two channels */
5034                 emit_data->args[0] = lp_build_emit_fetch(bld_base,
5035                                                          emit_data->inst, 1,
5036                                                          TGSI_CHAN_X);
5037                 emit_data->args[1] = lp_build_emit_fetch(bld_base,
5038                                                          emit_data->inst, 1,
5039                                                          TGSI_CHAN_Y);
5040                 emit_data->arg_count = 2;
5041         } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
5042                 LLVMValueRef sample_position;
5043                 LLVMValueRef sample_id;
5044                 LLVMValueRef halfval = lp_build_const_float(gallivm, 0.5f);
5045
5046                 /* fetch sample ID, then fetch its sample position,
5047                  * and place into first two channels.
5048                  */
5049                 sample_id = lp_build_emit_fetch(bld_base,
5050                                                 emit_data->inst, 1, TGSI_CHAN_X);
5051                 sample_id = LLVMBuildBitCast(gallivm->builder, sample_id,
5052                                              ctx->i32, "");
5053                 sample_position = load_sample_position(&ctx->radeon_bld, sample_id);
5054
5055                 emit_data->args[0] = LLVMBuildExtractElement(gallivm->builder,
5056                                                              sample_position,
5057                                                              lp_build_const_int32(gallivm, 0), "");
5058
5059                 emit_data->args[0] = LLVMBuildFSub(gallivm->builder, emit_data->args[0], halfval, "");
5060                 emit_data->args[1] = LLVMBuildExtractElement(gallivm->builder,
5061                                                              sample_position,
5062                                                              lp_build_const_int32(gallivm, 1), "");
5063                 emit_data->args[1] = LLVMBuildFSub(gallivm->builder, emit_data->args[1], halfval, "");
5064                 emit_data->arg_count = 2;
5065         }
5066 }
5067
5068 static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
5069                                 struct lp_build_tgsi_context *bld_base,
5070                                 struct lp_build_emit_data *emit_data)
5071 {
5072         struct si_shader_context *ctx = si_shader_context(bld_base);
5073         struct si_shader *shader = ctx->shader;
5074         struct gallivm_state *gallivm = bld_base->base.gallivm;
5075         LLVMValueRef interp_param;
5076         const struct tgsi_full_instruction *inst = emit_data->inst;
5077         const char *intr_name;
5078         int input_index = inst->Src[0].Register.Index;
5079         int chan;
5080         int i;
5081         LLVMValueRef attr_number;
5082         LLVMValueRef params = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_PRIM_MASK);
5083         int interp_param_idx;
5084         unsigned interp = shader->selector->info.input_interpolate[input_index];
5085         unsigned location;
5086
5087         assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);
5088
5089         if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
5090             inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE)
5091                 location = TGSI_INTERPOLATE_LOC_CENTER;
5092         else
5093                 location = TGSI_INTERPOLATE_LOC_CENTROID;
5094
5095         interp_param_idx = lookup_interp_param_index(interp, location);
5096         if (interp_param_idx == -1)
5097                 return;
5098         else if (interp_param_idx)
5099                 interp_param = get_interp_param(ctx, interp_param_idx);
5100         else
5101                 interp_param = NULL;
5102
5103         attr_number = lp_build_const_int32(gallivm, input_index);
5104
5105         if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
5106             inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
5107                 LLVMValueRef ij_out[2];
5108                 LLVMValueRef ddxy_out = si_llvm_emit_ddxy_interp(bld_base, interp_param);
5109
5110                 /*
5111                  * take the I then J parameters, and the DDX/Y for it, and
5112                  * calculate the IJ inputs for the interpolator.
5113                  * temp1 = ddx * offset/sample.x + I;
5114                  * interp_param.I = ddy * offset/sample.y + temp1;
5115                  * temp1 = ddx * offset/sample.x + J;
5116                  * interp_param.J = ddy * offset/sample.y + temp1;
5117                  */
5118                 for (i = 0; i < 2; i++) {
5119                         LLVMValueRef ix_ll = lp_build_const_int32(gallivm, i);
5120                         LLVMValueRef iy_ll = lp_build_const_int32(gallivm, i + 2);
5121                         LLVMValueRef ddx_el = LLVMBuildExtractElement(gallivm->builder,
5122                                                                       ddxy_out, ix_ll, "");
5123                         LLVMValueRef ddy_el = LLVMBuildExtractElement(gallivm->builder,
5124                                                                       ddxy_out, iy_ll, "");
5125                         LLVMValueRef interp_el = LLVMBuildExtractElement(gallivm->builder,
5126                                                                          interp_param, ix_ll, "");
5127                         LLVMValueRef temp1, temp2;
5128
5129                         interp_el = LLVMBuildBitCast(gallivm->builder, interp_el,
5130                                                      ctx->f32, "");
5131
5132                         temp1 = LLVMBuildFMul(gallivm->builder, ddx_el, emit_data->args[0], "");
5133
5134                         temp1 = LLVMBuildFAdd(gallivm->builder, temp1, interp_el, "");
5135
5136                         temp2 = LLVMBuildFMul(gallivm->builder, ddy_el, emit_data->args[1], "");
5137
5138                         temp2 = LLVMBuildFAdd(gallivm->builder, temp2, temp1, "");
5139
5140                         ij_out[i] = LLVMBuildBitCast(gallivm->builder,
5141                                                      temp2, ctx->i32, "");
5142                 }
5143                 interp_param = lp_build_gather_values(bld_base->base.gallivm, ij_out, 2);
5144         }
5145
5146         intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant";
5147         for (chan = 0; chan < 2; chan++) {
5148                 LLVMValueRef args[4];
5149                 LLVMValueRef llvm_chan;
5150                 unsigned schan;
5151
5152                 schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan);
5153                 llvm_chan = lp_build_const_int32(gallivm, schan);
5154
5155                 args[0] = llvm_chan;
5156                 args[1] = attr_number;
5157                 args[2] = params;
5158                 args[3] = interp_param;
5159
5160                 emit_data->output[chan] =
5161                         lp_build_intrinsic(gallivm->builder, intr_name,
5162                                            ctx->f32, args, args[3] ? 4 : 3,
5163                                            LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
5164         }
5165 }
5166
5167 static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base,
5168                                        struct lp_build_emit_data *emit_data)
5169 {
5170         LLVMValueRef (*imms)[4] = lp_soa_context(bld_base)->immediates;
5171         struct tgsi_src_register src0 = emit_data->inst->Src[0].Register;
5172         unsigned stream;
5173
5174         assert(src0.File == TGSI_FILE_IMMEDIATE);
5175
5176         stream = LLVMConstIntGetZExtValue(imms[src0.Index][src0.SwizzleX]) & 0x3;
5177         return stream;
5178 }
5179
5180 /* Emit one vertex from the geometry shader */
5181 static void si_llvm_emit_vertex(
5182         const struct lp_build_tgsi_action *action,
5183         struct lp_build_tgsi_context *bld_base,
5184         struct lp_build_emit_data *emit_data)
5185 {
5186         struct si_shader_context *ctx = si_shader_context(bld_base);
5187         struct lp_build_context *uint = &bld_base->uint_bld;
5188         struct si_shader *shader = ctx->shader;
5189         struct tgsi_shader_info *info = &shader->selector->info;
5190         struct gallivm_state *gallivm = bld_base->base.gallivm;
5191         LLVMValueRef soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
5192                                             SI_PARAM_GS2VS_OFFSET);
5193         LLVMValueRef gs_next_vertex;
5194         LLVMValueRef can_emit, kill;
5195         LLVMValueRef args[2];
5196         unsigned chan;
5197         int i;
5198         unsigned stream;
5199
5200         stream = si_llvm_get_stream(bld_base, emit_data);
5201
5202         /* Write vertex attribute values to GSVS ring */
5203         gs_next_vertex = LLVMBuildLoad(gallivm->builder,
5204                                        ctx->gs_next_vertex[stream],
5205                                        "");
5206
5207         /* If this thread has already emitted the declared maximum number of
5208          * vertices, kill it: excessive vertex emissions are not supposed to
5209          * have any effect, and GS threads have no externally observable
5210          * effects other than emitting vertices.
5211          */
5212         can_emit = LLVMBuildICmp(gallivm->builder, LLVMIntULE, gs_next_vertex,
5213                                  lp_build_const_int32(gallivm,
5214                                                       shader->selector->gs_max_out_vertices), "");
5215         kill = lp_build_select(&bld_base->base, can_emit,
5216                                lp_build_const_float(gallivm, 1.0f),
5217                                lp_build_const_float(gallivm, -1.0f));
5218
5219         lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
5220                            ctx->voidt, &kill, 1, 0);
5221
5222         for (i = 0; i < info->num_outputs; i++) {
5223                 LLVMValueRef *out_ptr =
5224                         ctx->radeon_bld.soa.outputs[i];
5225
5226                 for (chan = 0; chan < 4; chan++) {
5227                         LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
5228                         LLVMValueRef voffset =
5229                                 lp_build_const_int32(gallivm, (i * 4 + chan) *
5230                                                      shader->selector->gs_max_out_vertices);
5231
5232                         voffset = lp_build_add(uint, voffset, gs_next_vertex);
5233                         voffset = lp_build_mul_imm(uint, voffset, 4);
5234
5235                         out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");
5236
5237                         build_tbuffer_store(ctx,
5238                                             ctx->gsvs_ring[stream],
5239                                             out_val, 1,
5240                                             voffset, soffset, 0,
5241                                             V_008F0C_BUF_DATA_FORMAT_32,
5242                                             V_008F0C_BUF_NUM_FORMAT_UINT,
5243                                             1, 0, 1, 1, 0);
5244                 }
5245         }
5246         gs_next_vertex = lp_build_add(uint, gs_next_vertex,
5247                                       lp_build_const_int32(gallivm, 1));
5248
5249         LLVMBuildStore(gallivm->builder, gs_next_vertex, ctx->gs_next_vertex[stream]);
5250
5251         /* Signal vertex emission */
5252         args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_EMIT | SENDMSG_GS | (stream << 8));
5253         args[1] = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
5254         lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
5255                            ctx->voidt, args, 2, LLVMNoUnwindAttribute);
5256 }
5257
5258 /* Cut one primitive from the geometry shader */
5259 static void si_llvm_emit_primitive(
5260         const struct lp_build_tgsi_action *action,
5261         struct lp_build_tgsi_context *bld_base,
5262         struct lp_build_emit_data *emit_data)
5263 {
5264         struct si_shader_context *ctx = si_shader_context(bld_base);
5265         struct gallivm_state *gallivm = bld_base->base.gallivm;
5266         LLVMValueRef args[2];
5267         unsigned stream;
5268
5269         /* Signal primitive cut */
5270         stream = si_llvm_get_stream(bld_base, emit_data);
5271         args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_CUT | SENDMSG_GS | (stream << 8));
5272         args[1] = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
5273         lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
5274                            ctx->voidt, args, 2, LLVMNoUnwindAttribute);
5275 }
5276
5277 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
5278                                  struct lp_build_tgsi_context *bld_base,
5279                                  struct lp_build_emit_data *emit_data)
5280 {
5281         struct si_shader_context *ctx = si_shader_context(bld_base);
5282         struct gallivm_state *gallivm = bld_base->base.gallivm;
5283
5284         /* The real barrier instruction isn’t needed, because an entire patch
5285          * always fits into a single wave.
5286          */
5287         if (ctx->type == PIPE_SHADER_TESS_CTRL) {
5288                 emit_optimization_barrier(ctx);
5289                 return;
5290         }
5291
5292         lp_build_intrinsic(gallivm->builder,
5293                            HAVE_LLVM >= 0x0309 ? "llvm.amdgcn.s.barrier"
5294                                                : "llvm.AMDGPU.barrier.local",
5295                            ctx->voidt, NULL, 0, LLVMNoUnwindAttribute);
5296 }
5297
5298 static const struct lp_build_tgsi_action tex_action = {
5299         .fetch_args = tex_fetch_args,
5300         .emit = build_tex_intrinsic,
5301 };
5302
5303 static const struct lp_build_tgsi_action interp_action = {
5304         .fetch_args = interp_fetch_args,
5305         .emit = build_interp_intrinsic,
5306 };
5307
5308 static void si_create_function(struct si_shader_context *ctx,
5309                                LLVMTypeRef *returns, unsigned num_returns,
5310                                LLVMTypeRef *params, unsigned num_params,
5311                                int last_array_pointer, int last_sgpr)
5312 {
5313         int i;
5314
5315         radeon_llvm_create_func(&ctx->radeon_bld, returns, num_returns,
5316                                 params, num_params);
5317         radeon_llvm_shader_type(ctx->radeon_bld.main_fn, ctx->type);
5318         ctx->return_value = LLVMGetUndef(ctx->radeon_bld.return_type);
5319
5320         for (i = 0; i <= last_sgpr; ++i) {
5321                 LLVMValueRef P = LLVMGetParam(ctx->radeon_bld.main_fn, i);
5322
5323                 /* We tell llvm that array inputs are passed by value to allow Sinking pass
5324                  * to move load. Inputs are constant so this is fine. */
5325                 if (i <= last_array_pointer)
5326                         LLVMAddAttribute(P, LLVMByValAttribute);
5327                 else
5328                         LLVMAddAttribute(P, LLVMInRegAttribute);
5329         }
5330
5331         if (ctx->screen->b.debug_flags & DBG_UNSAFE_MATH) {
5332                 /* These were copied from some LLVM test. */
5333                 LLVMAddTargetDependentFunctionAttr(ctx->radeon_bld.main_fn,
5334                                                    "less-precise-fpmad",
5335                                                    "true");
5336                 LLVMAddTargetDependentFunctionAttr(ctx->radeon_bld.main_fn,
5337                                                    "no-infs-fp-math",
5338                                                    "true");
5339                 LLVMAddTargetDependentFunctionAttr(ctx->radeon_bld.main_fn,
5340                                                    "no-nans-fp-math",
5341                                                    "true");
5342                 LLVMAddTargetDependentFunctionAttr(ctx->radeon_bld.main_fn,
5343                                                    "unsafe-fp-math",
5344                                                    "true");
5345         }
5346 }
5347
5348 static void create_meta_data(struct si_shader_context *ctx)
5349 {
5350         struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
5351         LLVMValueRef args[3];
5352
5353         args[0] = LLVMMDStringInContext(gallivm->context, "const", 5);
5354         args[1] = 0;
5355         args[2] = lp_build_const_int32(gallivm, 1);
5356
5357         ctx->const_md = LLVMMDNodeInContext(gallivm->context, args, 3);
5358
5359         ctx->uniform_md_kind = LLVMGetMDKindIDInContext(gallivm->context,
5360                                                         "amdgpu.uniform", 14);
5361
5362         ctx->empty_md = LLVMMDNodeInContext(gallivm->context, NULL, 0);
5363 }
5364
5365 static void declare_streamout_params(struct si_shader_context *ctx,
5366                                      struct pipe_stream_output_info *so,
5367                                      LLVMTypeRef *params, LLVMTypeRef i32,
5368                                      unsigned *num_params)
5369 {
5370         int i;
5371
5372         /* Streamout SGPRs. */
5373         if (so->num_outputs) {
5374                 if (ctx->type != PIPE_SHADER_TESS_EVAL)
5375                         params[ctx->param_streamout_config = (*num_params)++] = i32;
5376                 else
5377                         ctx->param_streamout_config = ctx->param_tess_offchip;
5378
5379                 params[ctx->param_streamout_write_index = (*num_params)++] = i32;
5380         }
5381         /* A streamout buffer offset is loaded if the stride is non-zero. */
5382         for (i = 0; i < 4; i++) {
5383                 if (!so->stride[i])
5384                         continue;
5385
5386                 params[ctx->param_streamout_offset[i] = (*num_params)++] = i32;
5387         }
5388 }
5389
5390 static unsigned llvm_get_type_size(LLVMTypeRef type)
5391 {
5392         LLVMTypeKind kind = LLVMGetTypeKind(type);
5393
5394         switch (kind) {
5395         case LLVMIntegerTypeKind:
5396                 return LLVMGetIntTypeWidth(type) / 8;
5397         case LLVMFloatTypeKind:
5398                 return 4;
5399         case LLVMPointerTypeKind:
5400                 return 8;
5401         case LLVMVectorTypeKind:
5402                 return LLVMGetVectorSize(type) *
5403                        llvm_get_type_size(LLVMGetElementType(type));
5404         default:
5405                 assert(0);
5406                 return 0;
5407         }
5408 }
5409
5410 static void declare_tess_lds(struct si_shader_context *ctx)
5411 {
5412         struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
5413         LLVMTypeRef i32 = ctx->radeon_bld.soa.bld_base.uint_bld.elem_type;
5414         unsigned lds_size = ctx->screen->b.chip_class >= CIK ? 65536 : 32768;
5415
5416         /* The actual size is computed outside of the shader to reduce
5417          * the number of shader variants. */
5418         ctx->lds =
5419                 LLVMAddGlobalInAddressSpace(gallivm->module,
5420                                             LLVMArrayType(i32, lds_size / 4),
5421                                             "tess_lds",
5422                                             LOCAL_ADDR_SPACE);
5423 }
5424
5425 static void create_function(struct si_shader_context *ctx)
5426 {
5427         struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5428         struct gallivm_state *gallivm = bld_base->base.gallivm;
5429         struct si_shader *shader = ctx->shader;
5430         LLVMTypeRef params[SI_NUM_PARAMS + SI_NUM_VERTEX_BUFFERS], v3i32;
5431         LLVMTypeRef returns[16+32*4];
5432         unsigned i, last_array_pointer, last_sgpr, num_params, num_return_sgprs;
5433         unsigned num_returns = 0;
5434
5435         v3i32 = LLVMVectorType(ctx->i32, 3);
5436
5437         params[SI_PARAM_RW_BUFFERS] = const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
5438         params[SI_PARAM_CONST_BUFFERS] = const_array(ctx->v16i8, SI_NUM_CONST_BUFFERS);
5439         params[SI_PARAM_SAMPLERS] = const_array(ctx->v8i32, SI_NUM_SAMPLERS);
5440         params[SI_PARAM_IMAGES] = const_array(ctx->v8i32, SI_NUM_IMAGES);
5441         params[SI_PARAM_SHADER_BUFFERS] = const_array(ctx->v4i32, SI_NUM_SHADER_BUFFERS);
5442         last_array_pointer = SI_PARAM_SHADER_BUFFERS;
5443
5444         switch (ctx->type) {
5445         case PIPE_SHADER_VERTEX:
5446                 params[SI_PARAM_VERTEX_BUFFERS] = const_array(ctx->v16i8, SI_NUM_VERTEX_BUFFERS);
5447                 last_array_pointer = SI_PARAM_VERTEX_BUFFERS;
5448                 params[SI_PARAM_BASE_VERTEX] = ctx->i32;
5449                 params[SI_PARAM_START_INSTANCE] = ctx->i32;
5450                 num_params = SI_PARAM_START_INSTANCE+1;
5451
5452                 if (shader->key.vs.as_es) {
5453                         params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
5454                 } else if (shader->key.vs.as_ls) {
5455                         params[SI_PARAM_LS_OUT_LAYOUT] = ctx->i32;
5456                         num_params = SI_PARAM_LS_OUT_LAYOUT+1;
5457                 } else {
5458                         if (ctx->is_gs_copy_shader) {
5459                                 last_array_pointer = SI_PARAM_RW_BUFFERS;
5460                                 num_params = SI_PARAM_RW_BUFFERS+1;
5461                         } else {
5462                                 params[SI_PARAM_VS_STATE_BITS] = ctx->i32;
5463                                 num_params = SI_PARAM_VS_STATE_BITS+1;
5464                         }
5465
5466                         /* The locations of the other parameters are assigned dynamically. */
5467                         declare_streamout_params(ctx, &shader->selector->so,
5468                                                  params, ctx->i32, &num_params);
5469                 }
5470
5471                 last_sgpr = num_params-1;
5472
5473                 /* VGPRs */
5474                 params[ctx->param_vertex_id = num_params++] = ctx->i32;
5475                 params[ctx->param_rel_auto_id = num_params++] = ctx->i32;
5476                 params[ctx->param_vs_prim_id = num_params++] = ctx->i32;
5477                 params[ctx->param_instance_id = num_params++] = ctx->i32;
5478
5479                 if (!ctx->is_monolithic &&
5480                     !ctx->is_gs_copy_shader) {
5481                         /* Vertex load indices. */
5482                         ctx->param_vertex_index0 = num_params;
5483
5484                         for (i = 0; i < shader->selector->info.num_inputs; i++)
5485                                 params[num_params++] = ctx->i32;
5486
5487                         /* PrimitiveID output. */
5488                         if (!shader->key.vs.as_es && !shader->key.vs.as_ls)
5489                                 for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
5490                                         returns[num_returns++] = ctx->f32;
5491                 }
5492                 break;
5493
5494         case PIPE_SHADER_TESS_CTRL:
5495                 params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx->i32;
5496                 params[SI_PARAM_TCS_OUT_OFFSETS] = ctx->i32;
5497                 params[SI_PARAM_TCS_OUT_LAYOUT] = ctx->i32;
5498                 params[SI_PARAM_TCS_IN_LAYOUT] = ctx->i32;
5499                 params[ctx->param_oc_lds = SI_PARAM_TCS_OC_LDS] = ctx->i32;
5500                 params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx->i32;
5501                 last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET;
5502
5503                 /* VGPRs */
5504                 params[SI_PARAM_PATCH_ID] = ctx->i32;
5505                 params[SI_PARAM_REL_IDS] = ctx->i32;
5506                 num_params = SI_PARAM_REL_IDS+1;
5507
5508                 if (!ctx->is_monolithic) {
5509                         /* SI_PARAM_TCS_OC_LDS and PARAM_TESS_FACTOR_OFFSET are
5510                          * placed after the user SGPRs.
5511                          */
5512                         for (i = 0; i < SI_TCS_NUM_USER_SGPR + 2; i++)
5513                                 returns[num_returns++] = ctx->i32; /* SGPRs */
5514
5515                         for (i = 0; i < 3; i++)
5516                                 returns[num_returns++] = ctx->f32; /* VGPRs */
5517                 }
5518                 break;
5519
5520         case PIPE_SHADER_TESS_EVAL:
5521                 params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx->i32;
5522                 num_params = SI_PARAM_TCS_OFFCHIP_LAYOUT+1;
5523
5524                 if (shader->key.tes.as_es) {
5525                         params[ctx->param_oc_lds = num_params++] = ctx->i32;
5526                         params[ctx->param_tess_offchip = num_params++] = ctx->i32;
5527                         params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
5528                 } else {
5529                         params[ctx->param_tess_offchip = num_params++] = ctx->i32;
5530                         declare_streamout_params(ctx, &shader->selector->so,
5531                                                  params, ctx->i32, &num_params);
5532                         params[ctx->param_oc_lds = num_params++] = ctx->i32;
5533                 }
5534                 last_sgpr = num_params - 1;
5535
5536                 /* VGPRs */
5537                 params[ctx->param_tes_u = num_params++] = ctx->f32;
5538                 params[ctx->param_tes_v = num_params++] = ctx->f32;
5539                 params[ctx->param_tes_rel_patch_id = num_params++] = ctx->i32;
5540                 params[ctx->param_tes_patch_id = num_params++] = ctx->i32;
5541
5542                 /* PrimitiveID output. */
5543                 if (!ctx->is_monolithic && !shader->key.tes.as_es)
5544                         for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
5545                                 returns[num_returns++] = ctx->f32;
5546                 break;
5547
5548         case PIPE_SHADER_GEOMETRY:
5549                 params[SI_PARAM_GS2VS_OFFSET] = ctx->i32;
5550                 params[SI_PARAM_GS_WAVE_ID] = ctx->i32;
5551                 last_sgpr = SI_PARAM_GS_WAVE_ID;
5552
5553                 /* VGPRs */
5554                 params[SI_PARAM_VTX0_OFFSET] = ctx->i32;
5555                 params[SI_PARAM_VTX1_OFFSET] = ctx->i32;
5556                 params[SI_PARAM_PRIMITIVE_ID] = ctx->i32;
5557                 params[SI_PARAM_VTX2_OFFSET] = ctx->i32;
5558                 params[SI_PARAM_VTX3_OFFSET] = ctx->i32;
5559                 params[SI_PARAM_VTX4_OFFSET] = ctx->i32;
5560                 params[SI_PARAM_VTX5_OFFSET] = ctx->i32;
5561                 params[SI_PARAM_GS_INSTANCE_ID] = ctx->i32;
5562                 num_params = SI_PARAM_GS_INSTANCE_ID+1;
5563                 break;
5564
5565         case PIPE_SHADER_FRAGMENT:
5566                 params[SI_PARAM_ALPHA_REF] = ctx->f32;
5567                 params[SI_PARAM_PRIM_MASK] = ctx->i32;
5568                 last_sgpr = SI_PARAM_PRIM_MASK;
5569                 params[SI_PARAM_PERSP_SAMPLE] = ctx->v2i32;
5570                 params[SI_PARAM_PERSP_CENTER] = ctx->v2i32;
5571                 params[SI_PARAM_PERSP_CENTROID] = ctx->v2i32;
5572                 params[SI_PARAM_PERSP_PULL_MODEL] = v3i32;
5573                 params[SI_PARAM_LINEAR_SAMPLE] = ctx->v2i32;
5574                 params[SI_PARAM_LINEAR_CENTER] = ctx->v2i32;
5575                 params[SI_PARAM_LINEAR_CENTROID] = ctx->v2i32;
5576                 params[SI_PARAM_LINE_STIPPLE_TEX] = ctx->f32;
5577                 params[SI_PARAM_POS_X_FLOAT] = ctx->f32;
5578                 params[SI_PARAM_POS_Y_FLOAT] = ctx->f32;
5579                 params[SI_PARAM_POS_Z_FLOAT] = ctx->f32;
5580                 params[SI_PARAM_POS_W_FLOAT] = ctx->f32;
5581                 params[SI_PARAM_FRONT_FACE] = ctx->i32;
5582                 params[SI_PARAM_ANCILLARY] = ctx->i32;
5583                 params[SI_PARAM_SAMPLE_COVERAGE] = ctx->f32;
5584                 params[SI_PARAM_POS_FIXED_PT] = ctx->i32;
5585                 num_params = SI_PARAM_POS_FIXED_PT+1;
5586
5587                 if (!ctx->is_monolithic) {
5588                         /* Color inputs from the prolog. */
5589                         if (shader->selector->info.colors_read) {
5590                                 unsigned num_color_elements =
5591                                         util_bitcount(shader->selector->info.colors_read);
5592
5593                                 assert(num_params + num_color_elements <= ARRAY_SIZE(params));
5594                                 for (i = 0; i < num_color_elements; i++)
5595                                         params[num_params++] = ctx->f32;
5596                         }
5597
5598                         /* Outputs for the epilog. */
5599                         num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
5600                         num_returns =
5601                                 num_return_sgprs +
5602                                 util_bitcount(shader->selector->info.colors_written) * 4 +
5603                                 shader->selector->info.writes_z +
5604                                 shader->selector->info.writes_stencil +
5605                                 shader->selector->info.writes_samplemask +
5606                                 1 /* SampleMaskIn */;
5607
5608                         num_returns = MAX2(num_returns,
5609                                            num_return_sgprs +
5610                                            PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
5611
5612                         for (i = 0; i < num_return_sgprs; i++)
5613                                 returns[i] = ctx->i32;
5614                         for (; i < num_returns; i++)
5615                                 returns[i] = ctx->f32;
5616                 }
5617                 break;
5618
5619         case PIPE_SHADER_COMPUTE:
5620                 params[SI_PARAM_GRID_SIZE] = v3i32;
5621                 params[SI_PARAM_BLOCK_ID] = v3i32;
5622                 last_sgpr = SI_PARAM_BLOCK_ID;
5623
5624                 params[SI_PARAM_THREAD_ID] = v3i32;
5625                 num_params = SI_PARAM_THREAD_ID + 1;
5626                 break;
5627         default:
5628                 assert(0 && "unimplemented shader");
5629                 return;
5630         }
5631
5632         assert(num_params <= ARRAY_SIZE(params));
5633
5634         si_create_function(ctx, returns, num_returns, params,
5635                            num_params, last_array_pointer, last_sgpr);
5636
5637         /* Reserve register locations for VGPR inputs the PS prolog may need. */
5638         if (ctx->type == PIPE_SHADER_FRAGMENT &&
5639             !ctx->is_monolithic) {
5640                 radeon_llvm_add_attribute(ctx->radeon_bld.main_fn,
5641                                           "InitialPSInputAddr",
5642                                           S_0286D0_PERSP_SAMPLE_ENA(1) |
5643                                           S_0286D0_PERSP_CENTER_ENA(1) |
5644                                           S_0286D0_PERSP_CENTROID_ENA(1) |
5645                                           S_0286D0_LINEAR_SAMPLE_ENA(1) |
5646                                           S_0286D0_LINEAR_CENTER_ENA(1) |
5647                                           S_0286D0_LINEAR_CENTROID_ENA(1) |
5648                                           S_0286D0_FRONT_FACE_ENA(1) |
5649                                           S_0286D0_POS_FIXED_PT_ENA(1));
5650         } else if (ctx->type == PIPE_SHADER_COMPUTE) {
5651                 const unsigned *properties = shader->selector->info.properties;
5652                 unsigned max_work_group_size =
5653                                properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
5654                                properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
5655                                properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
5656
5657                 assert(max_work_group_size);
5658
5659                 radeon_llvm_add_attribute(ctx->radeon_bld.main_fn,
5660                                           "amdgpu-max-work-group-size",
5661                                           max_work_group_size);
5662         }
5663
5664         shader->info.num_input_sgprs = 0;
5665         shader->info.num_input_vgprs = 0;
5666
5667         for (i = 0; i <= last_sgpr; ++i)
5668                 shader->info.num_input_sgprs += llvm_get_type_size(params[i]) / 4;
5669
5670         /* Unused fragment shader inputs are eliminated by the compiler,
5671          * so we don't know yet how many there will be.
5672          */
5673         if (ctx->type != PIPE_SHADER_FRAGMENT)
5674                 for (; i < num_params; ++i)
5675                         shader->info.num_input_vgprs += llvm_get_type_size(params[i]) / 4;
5676
5677         if (bld_base->info &&
5678             (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 ||
5679              bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 ||
5680              bld_base->info->opcode_count[TGSI_OPCODE_DDX_FINE] > 0 ||
5681              bld_base->info->opcode_count[TGSI_OPCODE_DDY_FINE] > 0 ||
5682              bld_base->info->opcode_count[TGSI_OPCODE_INTERP_OFFSET] > 0 ||
5683              bld_base->info->opcode_count[TGSI_OPCODE_INTERP_SAMPLE] > 0))
5684                 ctx->lds =
5685                         LLVMAddGlobalInAddressSpace(gallivm->module,
5686                                                     LLVMArrayType(ctx->i32, 64),
5687                                                     "ddxy_lds",
5688                                                     LOCAL_ADDR_SPACE);
5689
5690         if ((ctx->type == PIPE_SHADER_VERTEX && shader->key.vs.as_ls) ||
5691             ctx->type == PIPE_SHADER_TESS_CTRL ||
5692             ctx->type == PIPE_SHADER_TESS_EVAL)
5693                 declare_tess_lds(ctx);
5694 }
5695
5696 static void preload_constants(struct si_shader_context *ctx)
5697 {
5698         struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5699         struct gallivm_state *gallivm = bld_base->base.gallivm;
5700         const struct tgsi_shader_info *info = bld_base->info;
5701         unsigned buf;
5702         LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
5703
5704         for (buf = 0; buf < SI_NUM_CONST_BUFFERS; buf++) {
5705                 unsigned i, num_const = info->const_file_max[buf] + 1;
5706
5707                 if (num_const == 0)
5708                         continue;
5709
5710                 /* Allocate space for the constant values */
5711                 ctx->constants[buf] = CALLOC(num_const * 4, sizeof(LLVMValueRef));
5712
5713                 /* Load the resource descriptor */
5714                 ctx->const_buffers[buf] =
5715                         build_indexed_load_const(ctx, ptr, lp_build_const_int32(gallivm, buf));
5716
5717                 /* Load the constants, we rely on the code sinking to do the rest */
5718                 for (i = 0; i < num_const * 4; ++i) {
5719                         ctx->constants[buf][i] =
5720                                 buffer_load_const(gallivm->builder,
5721                                         ctx->const_buffers[buf],
5722                                         lp_build_const_int32(gallivm, i * 4),
5723                                         ctx->f32);
5724                 }
5725         }
5726 }
5727
5728 static void preload_shader_buffers(struct si_shader_context *ctx)
5729 {
5730         struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
5731         LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_SHADER_BUFFERS);
5732         int buf, maxbuf;
5733
5734         maxbuf = MIN2(ctx->shader->selector->info.file_max[TGSI_FILE_BUFFER],
5735                       SI_NUM_SHADER_BUFFERS - 1);
5736         for (buf = 0; buf <= maxbuf; ++buf) {
5737                 ctx->shader_buffers[buf] =
5738                         build_indexed_load_const(
5739                                 ctx, ptr, lp_build_const_int32(gallivm, buf));
5740         }
5741 }
5742
5743 static void preload_samplers(struct si_shader_context *ctx)
5744 {
5745         struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5746         struct gallivm_state *gallivm = bld_base->base.gallivm;
5747         const struct tgsi_shader_info *info = bld_base->info;
5748         unsigned i, num_samplers = info->file_max[TGSI_FILE_SAMPLER] + 1;
5749         LLVMValueRef offset;
5750
5751         if (num_samplers == 0)
5752                 return;
5753
5754         /* Load the resources and samplers, we rely on the code sinking to do the rest */
5755         for (i = 0; i < num_samplers; ++i) {
5756                 /* Resource */
5757                 offset = lp_build_const_int32(gallivm, i);
5758                 ctx->sampler_views[i] =
5759                         get_sampler_desc(ctx, offset, DESC_IMAGE);
5760
5761                 /* FMASK resource */
5762                 if (info->is_msaa_sampler[i])
5763                         ctx->fmasks[i] =
5764                                 get_sampler_desc(ctx, offset, DESC_FMASK);
5765                 else {
5766                         ctx->sampler_states[i] =
5767                                 get_sampler_desc(ctx, offset, DESC_SAMPLER);
5768                         ctx->sampler_states[i] =
5769                                 sici_fix_sampler_aniso(ctx, ctx->sampler_views[i],
5770                                                        ctx->sampler_states[i]);
5771                 }
5772         }
5773 }
5774
5775 static void preload_images(struct si_shader_context *ctx)
5776 {
5777         struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5778         struct tgsi_shader_info *info = &ctx->shader->selector->info;
5779         struct gallivm_state *gallivm = bld_base->base.gallivm;
5780         unsigned num_images = bld_base->info->file_max[TGSI_FILE_IMAGE] + 1;
5781         LLVMValueRef res_ptr;
5782         unsigned i;
5783
5784         if (num_images == 0)
5785                 return;
5786
5787         res_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_IMAGES);
5788
5789         for (i = 0; i < num_images; ++i) {
5790                 /* Rely on LLVM to shrink the load for buffer resources. */
5791                 LLVMValueRef rsrc =
5792                         build_indexed_load_const(ctx, res_ptr,
5793                                                  lp_build_const_int32(gallivm, i));
5794
5795                 if (info->images_writemask & (1 << i) &&
5796                     !(info->images_buffers & (1 << i)))
5797                         rsrc = force_dcc_off(ctx, rsrc);
5798
5799                 ctx->images[i] = rsrc;
5800         }
5801 }
5802
5803 static void preload_streamout_buffers(struct si_shader_context *ctx)
5804 {
5805         struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5806         struct gallivm_state *gallivm = bld_base->base.gallivm;
5807         unsigned i;
5808
5809         /* Streamout can only be used if the shader is compiled as VS. */
5810         if (!ctx->shader->selector->so.num_outputs ||
5811             (ctx->type == PIPE_SHADER_VERTEX &&
5812              (ctx->shader->key.vs.as_es ||
5813               ctx->shader->key.vs.as_ls)) ||
5814             (ctx->type == PIPE_SHADER_TESS_EVAL &&
5815              ctx->shader->key.tes.as_es))
5816                 return;
5817
5818         LLVMValueRef buf_ptr = LLVMGetParam(ctx->radeon_bld.main_fn,
5819                                             SI_PARAM_RW_BUFFERS);
5820
5821         /* Load the resources, we rely on the code sinking to do the rest */
5822         for (i = 0; i < 4; ++i) {
5823                 if (ctx->shader->selector->so.stride[i]) {
5824                         LLVMValueRef offset = lp_build_const_int32(gallivm,
5825                                                                    SI_VS_STREAMOUT_BUF0 + i);
5826
5827                         ctx->so_buffers[i] = build_indexed_load_const(ctx, buf_ptr, offset);
5828                 }
5829         }
5830 }
5831
5832 /**
5833  * Load ESGS and GSVS ring buffer resource descriptors and save the variables
5834  * for later use.
5835  */
5836 static void preload_ring_buffers(struct si_shader_context *ctx)
5837 {
5838         struct gallivm_state *gallivm =
5839                 ctx->radeon_bld.soa.bld_base.base.gallivm;
5840
5841         LLVMValueRef buf_ptr = LLVMGetParam(ctx->radeon_bld.main_fn,
5842                                             SI_PARAM_RW_BUFFERS);
5843
5844         if ((ctx->type == PIPE_SHADER_VERTEX &&
5845              ctx->shader->key.vs.as_es) ||
5846             (ctx->type == PIPE_SHADER_TESS_EVAL &&
5847              ctx->shader->key.tes.as_es) ||
5848             ctx->type == PIPE_SHADER_GEOMETRY) {
5849                 unsigned ring =
5850                         ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS
5851                                                              : SI_ES_RING_ESGS;
5852                 LLVMValueRef offset = lp_build_const_int32(gallivm, ring);
5853
5854                 ctx->esgs_ring =
5855                         build_indexed_load_const(ctx, buf_ptr, offset);
5856         }
5857
5858         if (ctx->is_gs_copy_shader) {
5859                 LLVMValueRef offset = lp_build_const_int32(gallivm, SI_VS_RING_GSVS);
5860
5861                 ctx->gsvs_ring[0] =
5862                         build_indexed_load_const(ctx, buf_ptr, offset);
5863         }
5864         if (ctx->type == PIPE_SHADER_GEOMETRY) {
5865                 int i;
5866                 for (i = 0; i < 4; i++) {
5867                         LLVMValueRef offset = lp_build_const_int32(gallivm, SI_GS_RING_GSVS0 + i);
5868
5869                         ctx->gsvs_ring[i] =
5870                                 build_indexed_load_const(ctx, buf_ptr, offset);
5871                 }
5872         }
5873 }
5874
5875 static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
5876                                          LLVMValueRef param_rw_buffers,
5877                                          unsigned param_pos_fixed_pt)
5878 {
5879         struct lp_build_tgsi_context *bld_base =
5880                 &ctx->radeon_bld.soa.bld_base;
5881         struct gallivm_state *gallivm = bld_base->base.gallivm;
5882         LLVMBuilderRef builder = gallivm->builder;
5883         LLVMValueRef slot, desc, offset, row, bit, address[2];
5884
5885         /* Use the fixed-point gl_FragCoord input.
5886          * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
5887          * per coordinate to get the repeating effect.
5888          */
5889         address[0] = unpack_param(ctx, param_pos_fixed_pt, 0, 5);
5890         address[1] = unpack_param(ctx, param_pos_fixed_pt, 16, 5);
5891
5892         /* Load the buffer descriptor. */
5893         slot = lp_build_const_int32(gallivm, SI_PS_CONST_POLY_STIPPLE);
5894         desc = build_indexed_load_const(ctx, param_rw_buffers, slot);
5895
5896         /* The stipple pattern is 32x32, each row has 32 bits. */
5897         offset = LLVMBuildMul(builder, address[1],
5898                               LLVMConstInt(ctx->i32, 4, 0), "");
5899         row = buffer_load_const(builder, desc, offset, ctx->i32);
5900         bit = LLVMBuildLShr(builder, row, address[0], "");
5901         bit = LLVMBuildTrunc(builder, bit, ctx->i1, "");
5902
5903         /* The intrinsic kills the thread if arg < 0. */
5904         bit = LLVMBuildSelect(builder, bit, LLVMConstReal(ctx->f32, 0),
5905                               LLVMConstReal(ctx->f32, -1), "");
5906         lp_build_intrinsic(builder, "llvm.AMDGPU.kill", ctx->voidt, &bit, 1, 0);
5907 }
5908
5909 void si_shader_binary_read_config(struct radeon_shader_binary *binary,
5910                                   struct si_shader_config *conf,
5911                                   unsigned symbol_offset)
5912 {
5913         unsigned i;
5914         const unsigned char *config =
5915                 radeon_shader_binary_config_start(binary, symbol_offset);
5916         bool really_needs_scratch = false;
5917
5918         /* LLVM adds SGPR spills to the scratch size.
5919          * Find out if we really need the scratch buffer.
5920          */
5921         for (i = 0; i < binary->reloc_count; i++) {
5922                 const struct radeon_shader_reloc *reloc = &binary->relocs[i];
5923
5924                 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
5925                     !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
5926                         really_needs_scratch = true;
5927                         break;
5928                 }
5929         }
5930
5931         /* XXX: We may be able to emit some of these values directly rather than
5932          * extracting fields to be emitted later.
5933          */
5934
5935         for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
5936                 unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
5937                 unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
5938                 switch (reg) {
5939                 case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
5940                 case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
5941                 case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
5942                 case R_00B848_COMPUTE_PGM_RSRC1:
5943                         conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
5944                         conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
5945                         conf->float_mode =  G_00B028_FLOAT_MODE(value);
5946                         conf->rsrc1 = value;
5947                         break;
5948                 case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
5949                         conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
5950                         break;
5951                 case R_00B84C_COMPUTE_PGM_RSRC2:
5952                         conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
5953                         conf->rsrc2 = value;
5954                         break;
5955                 case R_0286CC_SPI_PS_INPUT_ENA:
5956                         conf->spi_ps_input_ena = value;
5957                         break;
5958                 case R_0286D0_SPI_PS_INPUT_ADDR:
5959                         conf->spi_ps_input_addr = value;
5960                         break;
5961                 case R_0286E8_SPI_TMPRING_SIZE:
5962                 case R_00B860_COMPUTE_TMPRING_SIZE:
5963                         /* WAVESIZE is in units of 256 dwords. */
5964                         if (really_needs_scratch)
5965                                 conf->scratch_bytes_per_wave =
5966                                         G_00B860_WAVESIZE(value) * 256 * 4;
5967                         break;
5968                 default:
5969                         {
5970                                 static bool printed;
5971
5972                                 if (!printed) {
5973                                         fprintf(stderr, "Warning: LLVM emitted unknown "
5974                                                 "config register: 0x%x\n", reg);
5975                                         printed = true;
5976                                 }
5977                         }
5978                         break;
5979                 }
5980
5981                 if (!conf->spi_ps_input_addr)
5982                         conf->spi_ps_input_addr = conf->spi_ps_input_ena;
5983         }
5984 }
5985
5986 void si_shader_apply_scratch_relocs(struct si_context *sctx,
5987                         struct si_shader *shader,
5988                         struct si_shader_config *config,
5989                         uint64_t scratch_va)
5990 {
5991         unsigned i;
5992         uint32_t scratch_rsrc_dword0 = scratch_va;
5993         uint32_t scratch_rsrc_dword1 =
5994                 S_008F04_BASE_ADDRESS_HI(scratch_va >> 32);
5995
5996         /* Enable scratch coalescing if LLVM sets ELEMENT_SIZE & INDEX_STRIDE
5997          * correctly.
5998          */
5999         if (HAVE_LLVM >= 0x0309)
6000                 scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1);
6001         else
6002                 scratch_rsrc_dword1 |=
6003                         S_008F04_STRIDE(config->scratch_bytes_per_wave / 64);
6004
6005         for (i = 0 ; i < shader->binary.reloc_count; i++) {
6006                 const struct radeon_shader_reloc *reloc =
6007                                         &shader->binary.relocs[i];
6008                 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
6009                         util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
6010                         &scratch_rsrc_dword0, 4);
6011                 } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
6012                         util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
6013                         &scratch_rsrc_dword1, 4);
6014                 }
6015         }
6016 }
6017
6018 static unsigned si_get_shader_binary_size(struct si_shader *shader)
6019 {
6020         unsigned size = shader->binary.code_size;
6021
6022         if (shader->prolog)
6023                 size += shader->prolog->binary.code_size;
6024         if (shader->epilog)
6025                 size += shader->epilog->binary.code_size;
6026         return size;
6027 }
6028
6029 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
6030 {
6031         const struct radeon_shader_binary *prolog =
6032                 shader->prolog ? &shader->prolog->binary : NULL;
6033         const struct radeon_shader_binary *epilog =
6034                 shader->epilog ? &shader->epilog->binary : NULL;
6035         const struct radeon_shader_binary *mainb = &shader->binary;
6036         unsigned bo_size = si_get_shader_binary_size(shader) +
6037                            (!epilog ? mainb->rodata_size : 0);
6038         unsigned char *ptr;
6039
6040         assert(!prolog || !prolog->rodata_size);
6041         assert((!prolog && !epilog) || !mainb->rodata_size);
6042         assert(!epilog || !epilog->rodata_size);
6043
6044         r600_resource_reference(&shader->bo, NULL);
6045         shader->bo = si_resource_create_custom(&sscreen->b.b,
6046                                                PIPE_USAGE_IMMUTABLE,
6047                                                bo_size);
6048         if (!shader->bo)
6049                 return -ENOMEM;
6050
6051         /* Upload. */
6052         ptr = sscreen->b.ws->buffer_map(shader->bo->buf, NULL,
6053                                         PIPE_TRANSFER_READ_WRITE);
6054
6055         if (prolog) {
6056                 util_memcpy_cpu_to_le32(ptr, prolog->code, prolog->code_size);
6057                 ptr += prolog->code_size;
6058         }
6059
6060         util_memcpy_cpu_to_le32(ptr, mainb->code, mainb->code_size);
6061         ptr += mainb->code_size;
6062
6063         if (epilog)
6064                 util_memcpy_cpu_to_le32(ptr, epilog->code, epilog->code_size);
6065         else if (mainb->rodata_size > 0)
6066                 util_memcpy_cpu_to_le32(ptr, mainb->rodata, mainb->rodata_size);
6067
6068         sscreen->b.ws->buffer_unmap(shader->bo->buf);
6069         return 0;
6070 }
6071
6072 static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary,
6073                                        struct pipe_debug_callback *debug,
6074                                        const char *name, FILE *file)
6075 {
6076         char *line, *p;
6077         unsigned i, count;
6078
6079         if (binary->disasm_string) {
6080                 fprintf(file, "Shader %s disassembly:\n", name);
6081                 fprintf(file, "%s", binary->disasm_string);
6082
6083                 if (debug && debug->debug_message) {
6084                         /* Very long debug messages are cut off, so send the
6085                          * disassembly one line at a time. This causes more
6086                          * overhead, but on the plus side it simplifies
6087                          * parsing of resulting logs.
6088                          */
6089                         pipe_debug_message(debug, SHADER_INFO,
6090                                            "Shader Disassembly Begin");
6091
6092                         line = binary->disasm_string;
6093                         while (*line) {
6094                                 p = util_strchrnul(line, '\n');
6095                                 count = p - line;
6096
6097                                 if (count) {
6098                                         pipe_debug_message(debug, SHADER_INFO,
6099                                                            "%.*s", count, line);
6100                                 }
6101
6102                                 if (!*p)
6103                                         break;
6104                                 line = p + 1;
6105                         }
6106
6107                         pipe_debug_message(debug, SHADER_INFO,
6108                                            "Shader Disassembly End");
6109                 }
6110         } else {
6111                 fprintf(file, "Shader %s binary:\n", name);
6112                 for (i = 0; i < binary->code_size; i += 4) {
6113                         fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i,
6114                                 binary->code[i + 3], binary->code[i + 2],
6115                                 binary->code[i + 1], binary->code[i]);
6116                 }
6117         }
6118 }
6119
6120 static void si_shader_dump_stats(struct si_screen *sscreen,
6121                                  struct si_shader_config *conf,
6122                                  unsigned num_inputs,
6123                                  unsigned code_size,
6124                                  struct pipe_debug_callback *debug,
6125                                  unsigned processor,
6126                                  FILE *file)
6127 {
6128         unsigned lds_increment = sscreen->b.chip_class >= CIK ? 512 : 256;
6129         unsigned lds_per_wave = 0;
6130         unsigned max_simd_waves = 10;
6131         /* Assuming SGPRs aren't spilled. */
6132         unsigned spilled_vgprs = conf->scratch_bytes_per_wave / 64 / 4;
6133
6134         /* Compute LDS usage for PS. */
6135         if (processor == PIPE_SHADER_FRAGMENT) {
6136                 /* The minimum usage per wave is (num_inputs * 48). The maximum
6137                  * usage is (num_inputs * 48 * 16).
6138                  * We can get anything in between and it varies between waves.
6139                  *
6140                  * The 48 bytes per input for a single primitive is equal to
6141                  * 4 bytes/component * 4 components/input * 3 points.
6142                  *
6143                  * Other stages don't know the size at compile time or don't
6144                  * allocate LDS per wave, but instead they do it per thread group.
6145                  */
6146                 lds_per_wave = conf->lds_size * lds_increment +
6147                                align(num_inputs * 48, lds_increment);
6148         }
6149
6150         /* Compute the per-SIMD wave counts. */
6151         if (conf->num_sgprs) {
6152                 if (sscreen->b.chip_class >= VI)
6153                         max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs);
6154                 else
6155                         max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs);
6156         }
6157
6158         if (conf->num_vgprs)
6159                 max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs);
6160
6161         /* LDS is 64KB per CU (4 SIMDs), divided into 16KB blocks per SIMD
6162          * that PS can use.
6163          */
6164         if (lds_per_wave)
6165                 max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);
6166
6167         if (file != stderr ||
6168             r600_can_dump_shader(&sscreen->b, processor)) {
6169                 if (processor == PIPE_SHADER_FRAGMENT) {
6170                         fprintf(file, "*** SHADER CONFIG ***\n"
6171                                 "SPI_PS_INPUT_ADDR = 0x%04x\n"
6172                                 "SPI_PS_INPUT_ENA  = 0x%04x\n",
6173                                 conf->spi_ps_input_addr, conf->spi_ps_input_ena);
6174                 }
6175
6176                 fprintf(file, "*** SHADER STATS ***\n"
6177                         "SGPRS: %d\n"
6178                         "VGPRS: %d\n"
6179                         "Spilled VGPRs: %d\n"
6180                         "Code Size: %d bytes\n"
6181                         "LDS: %d blocks\n"
6182                         "Scratch: %d bytes per wave\n"
6183                         "Max Waves: %d\n"
6184                         "********************\n",
6185                         conf->num_sgprs, conf->num_vgprs, spilled_vgprs, code_size,
6186                         conf->lds_size, conf->scratch_bytes_per_wave,
6187                         max_simd_waves);
6188         }
6189
6190         pipe_debug_message(debug, SHADER_INFO,
6191                            "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
6192                            "LDS: %d Scratch: %d Max Waves: %d Spilled VGPRs: %d",
6193                            conf->num_sgprs, conf->num_vgprs, code_size,
6194                            conf->lds_size, conf->scratch_bytes_per_wave,
6195                            max_simd_waves, spilled_vgprs);
6196 }
6197
6198 static const char *si_get_shader_name(struct si_shader *shader,
6199                                       unsigned processor)
6200 {
6201         switch (processor) {
6202         case PIPE_SHADER_VERTEX:
6203                 if (shader->key.vs.as_es)
6204                         return "Vertex Shader as ES";
6205                 else if (shader->key.vs.as_ls)
6206                         return "Vertex Shader as LS";
6207                 else
6208                         return "Vertex Shader as VS";
6209         case PIPE_SHADER_TESS_CTRL:
6210                 return "Tessellation Control Shader";
6211         case PIPE_SHADER_TESS_EVAL:
6212                 if (shader->key.tes.as_es)
6213                         return "Tessellation Evaluation Shader as ES";
6214                 else
6215                         return "Tessellation Evaluation Shader as VS";
6216         case PIPE_SHADER_GEOMETRY:
6217                 if (shader->gs_copy_shader == NULL)
6218                         return "GS Copy Shader as VS";
6219                 else
6220                         return "Geometry Shader";
6221         case PIPE_SHADER_FRAGMENT:
6222                 return "Pixel Shader";
6223         case PIPE_SHADER_COMPUTE:
6224                 return "Compute Shader";
6225         default:
6226                 return "Unknown Shader";
6227         }
6228 }
6229
6230 void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
6231                     struct pipe_debug_callback *debug, unsigned processor,
6232                     FILE *file)
6233 {
6234         if (file != stderr ||
6235             (r600_can_dump_shader(&sscreen->b, processor) &&
6236              !(sscreen->b.debug_flags & DBG_NO_ASM))) {
6237                 fprintf(file, "\n%s:\n", si_get_shader_name(shader, processor));
6238
6239                 if (shader->prolog)
6240                         si_shader_dump_disassembly(&shader->prolog->binary,
6241                                                    debug, "prolog", file);
6242
6243                 si_shader_dump_disassembly(&shader->binary, debug, "main", file);
6244
6245                 if (shader->epilog)
6246                         si_shader_dump_disassembly(&shader->epilog->binary,
6247                                                    debug, "epilog", file);
6248                 fprintf(file, "\n");
6249         }
6250
6251         si_shader_dump_stats(sscreen, &shader->config,
6252                              shader->selector ? shader->selector->info.num_inputs : 0,
6253                              si_get_shader_binary_size(shader), debug, processor,
6254                              file);
6255 }
6256
6257 int si_compile_llvm(struct si_screen *sscreen,
6258                     struct radeon_shader_binary *binary,
6259                     struct si_shader_config *conf,
6260                     LLVMTargetMachineRef tm,
6261                     LLVMModuleRef mod,
6262                     struct pipe_debug_callback *debug,
6263                     unsigned processor,
6264                     const char *name)
6265 {
6266         int r = 0;
6267         unsigned count = p_atomic_inc_return(&sscreen->b.num_compilations);
6268
6269         if (r600_can_dump_shader(&sscreen->b, processor)) {
6270                 fprintf(stderr, "radeonsi: Compiling shader %d\n", count);
6271
6272                 if (!(sscreen->b.debug_flags & (DBG_NO_IR | DBG_PREOPT_IR))) {
6273                         fprintf(stderr, "%s LLVM IR:\n\n", name);
6274                         LLVMDumpModule(mod);
6275                         fprintf(stderr, "\n");
6276                 }
6277         }
6278
6279         if (!si_replace_shader(count, binary)) {
6280                 r = radeon_llvm_compile(mod, binary, tm, debug);
6281                 if (r)
6282                         return r;
6283         }
6284
6285         si_shader_binary_read_config(binary, conf, 0);
6286
6287         /* Enable 64-bit and 16-bit denormals, because there is no performance
6288          * cost.
6289          *
6290          * If denormals are enabled, all floating-point output modifiers are
6291          * ignored.
6292          *
6293          * Don't enable denormals for 32-bit floats, because:
6294          * - Floating-point output modifiers would be ignored by the hw.
6295          * - Some opcodes don't support denormals, such as v_mad_f32. We would
6296          *   have to stop using those.
6297          * - SI & CI would be very slow.
6298          */
6299         conf->float_mode |= V_00B028_FP_64_DENORMS;
6300
6301         FREE(binary->config);
6302         FREE(binary->global_symbol_offsets);
6303         binary->config = NULL;
6304         binary->global_symbol_offsets = NULL;
6305
6306         /* Some shaders can't have rodata because their binaries can be
6307          * concatenated.
6308          */
6309         if (binary->rodata_size &&
6310             (processor == PIPE_SHADER_VERTEX ||
6311              processor == PIPE_SHADER_TESS_CTRL ||
6312              processor == PIPE_SHADER_TESS_EVAL ||
6313              processor == PIPE_SHADER_FRAGMENT)) {
6314                 fprintf(stderr, "radeonsi: The shader can't have rodata.");
6315                 return -EINVAL;
6316         }
6317
6318         return r;
6319 }
6320
6321 static void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret)
6322 {
6323         if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
6324                 LLVMBuildRetVoid(ctx->radeon_bld.gallivm.builder);
6325         else
6326                 LLVMBuildRet(ctx->radeon_bld.gallivm.builder, ret);
6327 }
6328
6329 /* Generate code for the hardware VS shader stage to go with a geometry shader */
6330 static int si_generate_gs_copy_shader(struct si_screen *sscreen,
6331                                       struct si_shader_context *ctx,
6332                                       struct si_shader *gs,
6333                                       struct pipe_debug_callback *debug)
6334 {
6335         struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
6336         struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
6337         struct lp_build_context *uint = &bld_base->uint_bld;
6338         struct si_shader_output_values *outputs;
6339         struct tgsi_shader_info *gsinfo = &gs->selector->info;
6340         LLVMValueRef args[9];
6341         int i, r;
6342
6343         outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0]));
6344
6345         si_init_shader_ctx(ctx, sscreen, ctx->shader, ctx->tm);
6346         ctx->type = PIPE_SHADER_VERTEX;
6347         ctx->is_gs_copy_shader = true;
6348
6349         create_meta_data(ctx);
6350         create_function(ctx);
6351         preload_streamout_buffers(ctx);
6352         preload_ring_buffers(ctx);
6353
6354         args[0] = ctx->gsvs_ring[0];
6355         args[1] = lp_build_mul_imm(uint,
6356                                    LLVMGetParam(ctx->radeon_bld.main_fn,
6357                                                 ctx->param_vertex_id),
6358                                    4);
6359         args[3] = uint->zero;
6360         args[4] = uint->one;  /* OFFEN */
6361         args[5] = uint->zero; /* IDXEN */
6362         args[6] = uint->one;  /* GLC */
6363         args[7] = uint->one;  /* SLC */
6364         args[8] = uint->zero; /* TFE */
6365
6366         /* Fetch vertex data from GSVS ring */
6367         for (i = 0; i < gsinfo->num_outputs; ++i) {
6368                 unsigned chan;
6369
6370                 outputs[i].name = gsinfo->output_semantic_name[i];
6371                 outputs[i].sid = gsinfo->output_semantic_index[i];
6372
6373                 for (chan = 0; chan < 4; chan++) {
6374                         args[2] = lp_build_const_int32(gallivm,
6375                                                        (i * 4 + chan) *
6376                                                        gs->selector->gs_max_out_vertices * 16 * 4);
6377
6378                         outputs[i].values[chan] =
6379                                 LLVMBuildBitCast(gallivm->builder,
6380                                                  lp_build_intrinsic(gallivm->builder,
6381                                                                  "llvm.SI.buffer.load.dword.i32.i32",
6382                                                                  ctx->i32, args, 9,
6383                                                                  LLVMReadOnlyAttribute | LLVMNoUnwindAttribute),
6384                                                  ctx->f32, "");
6385                 }
6386         }
6387
6388         si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs);
6389
6390         LLVMBuildRetVoid(gallivm->builder);
6391
6392         /* Dump LLVM IR before any optimization passes */
6393         if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
6394             r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
6395                 LLVMDumpModule(bld_base->base.gallivm->module);
6396
6397         radeon_llvm_finalize_module(&ctx->radeon_bld);
6398
6399         r = si_compile_llvm(sscreen, &ctx->shader->binary,
6400                             &ctx->shader->config, ctx->tm,
6401                             bld_base->base.gallivm->module,
6402                             debug, PIPE_SHADER_GEOMETRY,
6403                             "GS Copy Shader");
6404         if (!r) {
6405                 if (r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
6406                         fprintf(stderr, "GS Copy Shader:\n");
6407                 si_shader_dump(sscreen, ctx->shader, debug,
6408                                PIPE_SHADER_GEOMETRY, stderr);
6409                 r = si_shader_binary_upload(sscreen, ctx->shader);
6410         }
6411
6412         radeon_llvm_dispose(&ctx->radeon_bld);
6413
6414         FREE(outputs);
6415         return r;
6416 }
6417
6418 void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f)
6419 {
6420         int i;
6421
6422         fprintf(f, "SHADER KEY\n");
6423
6424         switch (shader) {
6425         case PIPE_SHADER_VERTEX:
6426                 fprintf(f, "  instance_divisors = {");
6427                 for (i = 0; i < ARRAY_SIZE(key->vs.prolog.instance_divisors); i++)
6428                         fprintf(f, !i ? "%u" : ", %u",
6429                                 key->vs.prolog.instance_divisors[i]);
6430                 fprintf(f, "}\n");
6431                 fprintf(f, "  as_es = %u\n", key->vs.as_es);
6432                 fprintf(f, "  as_ls = %u\n", key->vs.as_ls);
6433                 fprintf(f, "  export_prim_id = %u\n", key->vs.epilog.export_prim_id);
6434                 break;
6435
6436         case PIPE_SHADER_TESS_CTRL:
6437                 fprintf(f, "  prim_mode = %u\n", key->tcs.epilog.prim_mode);
6438                 break;
6439
6440         case PIPE_SHADER_TESS_EVAL:
6441                 fprintf(f, "  as_es = %u\n", key->tes.as_es);
6442                 fprintf(f, "  export_prim_id = %u\n", key->tes.epilog.export_prim_id);
6443                 break;
6444
6445         case PIPE_SHADER_GEOMETRY:
6446         case PIPE_SHADER_COMPUTE:
6447                 break;
6448
6449         case PIPE_SHADER_FRAGMENT:
6450                 fprintf(f, "  prolog.color_two_side = %u\n", key->ps.prolog.color_two_side);
6451                 fprintf(f, "  prolog.flatshade_colors = %u\n", key->ps.prolog.flatshade_colors);
6452                 fprintf(f, "  prolog.poly_stipple = %u\n", key->ps.prolog.poly_stipple);
6453                 fprintf(f, "  prolog.force_persp_sample_interp = %u\n", key->ps.prolog.force_persp_sample_interp);
6454                 fprintf(f, "  prolog.force_linear_sample_interp = %u\n", key->ps.prolog.force_linear_sample_interp);
6455                 fprintf(f, "  prolog.force_persp_center_interp = %u\n", key->ps.prolog.force_persp_center_interp);
6456                 fprintf(f, "  prolog.force_linear_center_interp = %u\n", key->ps.prolog.force_linear_center_interp);
6457                 fprintf(f, "  prolog.bc_optimize_for_persp = %u\n", key->ps.prolog.bc_optimize_for_persp);
6458                 fprintf(f, "  prolog.bc_optimize_for_linear = %u\n", key->ps.prolog.bc_optimize_for_linear);
6459                 fprintf(f, "  epilog.spi_shader_col_format = 0x%x\n", key->ps.epilog.spi_shader_col_format);
6460                 fprintf(f, "  epilog.color_is_int8 = 0x%X\n", key->ps.epilog.color_is_int8);
6461                 fprintf(f, "  epilog.last_cbuf = %u\n", key->ps.epilog.last_cbuf);
6462                 fprintf(f, "  epilog.alpha_func = %u\n", key->ps.epilog.alpha_func);
6463                 fprintf(f, "  epilog.alpha_to_one = %u\n", key->ps.epilog.alpha_to_one);
6464                 fprintf(f, "  epilog.poly_line_smoothing = %u\n", key->ps.epilog.poly_line_smoothing);
6465                 fprintf(f, "  epilog.clamp_color = %u\n", key->ps.epilog.clamp_color);
6466                 break;
6467
6468         default:
6469                 assert(0);
6470         }
6471 }
6472
6473 static void si_init_shader_ctx(struct si_shader_context *ctx,
6474                                struct si_screen *sscreen,
6475                                struct si_shader *shader,
6476                                LLVMTargetMachineRef tm)
6477 {
6478         struct lp_build_tgsi_context *bld_base;
6479         struct lp_build_tgsi_action tmpl = {};
6480
6481         memset(ctx, 0, sizeof(*ctx));
6482         radeon_llvm_context_init(&ctx->radeon_bld, "amdgcn--");
6483         ctx->tm = tm;
6484         ctx->screen = sscreen;
6485         if (shader && shader->selector)
6486                 ctx->type = shader->selector->info.processor;
6487         else
6488                 ctx->type = -1;
6489         ctx->shader = shader;
6490
6491         ctx->voidt = LLVMVoidTypeInContext(ctx->radeon_bld.gallivm.context);
6492         ctx->i1 = LLVMInt1TypeInContext(ctx->radeon_bld.gallivm.context);
6493         ctx->i8 = LLVMInt8TypeInContext(ctx->radeon_bld.gallivm.context);
6494         ctx->i32 = LLVMInt32TypeInContext(ctx->radeon_bld.gallivm.context);
6495         ctx->i64 = LLVMInt64TypeInContext(ctx->radeon_bld.gallivm.context);
6496         ctx->i128 = LLVMIntTypeInContext(ctx->radeon_bld.gallivm.context, 128);
6497         ctx->f32 = LLVMFloatTypeInContext(ctx->radeon_bld.gallivm.context);
6498         ctx->v16i8 = LLVMVectorType(ctx->i8, 16);
6499         ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
6500         ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
6501         ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
6502         ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
6503
6504         bld_base = &ctx->radeon_bld.soa.bld_base;
6505         if (shader && shader->selector)
6506                 bld_base->info = &shader->selector->info;
6507         bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;
6508
6509         bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action;
6510         bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE] = interp_action;
6511         bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET] = interp_action;
6512
6513         bld_base->op_actions[TGSI_OPCODE_TEX] = tex_action;
6514         bld_base->op_actions[TGSI_OPCODE_TEX2] = tex_action;
6515         bld_base->op_actions[TGSI_OPCODE_TXB] = tex_action;
6516         bld_base->op_actions[TGSI_OPCODE_TXB2] = tex_action;
6517         bld_base->op_actions[TGSI_OPCODE_TXD] = tex_action;
6518         bld_base->op_actions[TGSI_OPCODE_TXF] = tex_action;
6519         bld_base->op_actions[TGSI_OPCODE_TXL] = tex_action;
6520         bld_base->op_actions[TGSI_OPCODE_TXL2] = tex_action;
6521         bld_base->op_actions[TGSI_OPCODE_TXP] = tex_action;
6522         bld_base->op_actions[TGSI_OPCODE_TXQ].fetch_args = txq_fetch_args;
6523         bld_base->op_actions[TGSI_OPCODE_TXQ].emit = txq_emit;
6524         bld_base->op_actions[TGSI_OPCODE_TG4] = tex_action;
6525         bld_base->op_actions[TGSI_OPCODE_LODQ] = tex_action;
6526         bld_base->op_actions[TGSI_OPCODE_TXQS].emit = si_llvm_emit_txqs;
6527
6528         bld_base->op_actions[TGSI_OPCODE_LOAD].fetch_args = load_fetch_args;
6529         bld_base->op_actions[TGSI_OPCODE_LOAD].emit = load_emit;
6530         bld_base->op_actions[TGSI_OPCODE_STORE].fetch_args = store_fetch_args;
6531         bld_base->op_actions[TGSI_OPCODE_STORE].emit = store_emit;
6532         bld_base->op_actions[TGSI_OPCODE_RESQ].fetch_args = resq_fetch_args;
6533         bld_base->op_actions[TGSI_OPCODE_RESQ].emit = resq_emit;
6534
6535         tmpl.fetch_args = atomic_fetch_args;
6536         tmpl.emit = atomic_emit;
6537         bld_base->op_actions[TGSI_OPCODE_ATOMUADD] = tmpl;
6538         bld_base->op_actions[TGSI_OPCODE_ATOMUADD].intr_name = "add";
6539         bld_base->op_actions[TGSI_OPCODE_ATOMXCHG] = tmpl;
6540         bld_base->op_actions[TGSI_OPCODE_ATOMXCHG].intr_name = "swap";
6541         bld_base->op_actions[TGSI_OPCODE_ATOMCAS] = tmpl;
6542         bld_base->op_actions[TGSI_OPCODE_ATOMCAS].intr_name = "cmpswap";
6543         bld_base->op_actions[TGSI_OPCODE_ATOMAND] = tmpl;
6544         bld_base->op_actions[TGSI_OPCODE_ATOMAND].intr_name = "and";
6545         bld_base->op_actions[TGSI_OPCODE_ATOMOR] = tmpl;
6546         bld_base->op_actions[TGSI_OPCODE_ATOMOR].intr_name = "or";
6547         bld_base->op_actions[TGSI_OPCODE_ATOMXOR] = tmpl;
6548         bld_base->op_actions[TGSI_OPCODE_ATOMXOR].intr_name = "xor";
6549         bld_base->op_actions[TGSI_OPCODE_ATOMUMIN] = tmpl;
6550         bld_base->op_actions[TGSI_OPCODE_ATOMUMIN].intr_name = "umin";
6551         bld_base->op_actions[TGSI_OPCODE_ATOMUMAX] = tmpl;
6552         bld_base->op_actions[TGSI_OPCODE_ATOMUMAX].intr_name = "umax";
6553         bld_base->op_actions[TGSI_OPCODE_ATOMIMIN] = tmpl;
6554         bld_base->op_actions[TGSI_OPCODE_ATOMIMIN].intr_name = "smin";
6555         bld_base->op_actions[TGSI_OPCODE_ATOMIMAX] = tmpl;
6556         bld_base->op_actions[TGSI_OPCODE_ATOMIMAX].intr_name = "smax";
6557
6558         bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit;
6559
6560         bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy;
6561         bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy;
6562         bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy;
6563         bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy;
6564
6565         bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_llvm_emit_vertex;
6566         bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive;
6567         bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier;
6568
6569         bld_base->op_actions[TGSI_OPCODE_MAX].emit = build_tgsi_intrinsic_nomem;
6570         bld_base->op_actions[TGSI_OPCODE_MAX].intr_name = "llvm.maxnum.f32";
6571         bld_base->op_actions[TGSI_OPCODE_MIN].emit = build_tgsi_intrinsic_nomem;
6572         bld_base->op_actions[TGSI_OPCODE_MIN].intr_name = "llvm.minnum.f32";
6573 }
6574
6575 int si_compile_tgsi_shader(struct si_screen *sscreen,
6576                            LLVMTargetMachineRef tm,
6577                            struct si_shader *shader,
6578                            bool is_monolithic,
6579                            struct pipe_debug_callback *debug)
6580 {
6581         struct si_shader_selector *sel = shader->selector;
6582         struct si_shader_context ctx;
6583         struct lp_build_tgsi_context *bld_base;
6584         LLVMModuleRef mod;
6585         int r = 0;
6586
6587         /* Dump TGSI code before doing TGSI->LLVM conversion in case the
6588          * conversion fails. */
6589         if (r600_can_dump_shader(&sscreen->b, sel->info.processor) &&
6590             !(sscreen->b.debug_flags & DBG_NO_TGSI)) {
6591                 if (is_monolithic)
6592                         si_dump_shader_key(sel->type, &shader->key, stderr);
6593                 tgsi_dump(sel->tokens, 0);
6594                 si_dump_streamout(&sel->so);
6595         }
6596
6597         si_init_shader_ctx(&ctx, sscreen, shader, tm);
6598         ctx.is_monolithic = is_monolithic;
6599
6600         shader->info.uses_instanceid = sel->info.uses_instanceid;
6601
6602         bld_base = &ctx.radeon_bld.soa.bld_base;
6603         ctx.radeon_bld.load_system_value = declare_system_value;
6604
6605         switch (ctx.type) {
6606         case PIPE_SHADER_VERTEX:
6607                 ctx.radeon_bld.load_input = declare_input_vs;
6608                 if (shader->key.vs.as_ls)
6609                         bld_base->emit_epilogue = si_llvm_emit_ls_epilogue;
6610                 else if (shader->key.vs.as_es)
6611                         bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
6612                 else
6613                         bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
6614                 break;
6615         case PIPE_SHADER_TESS_CTRL:
6616                 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs;
6617                 bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = fetch_output_tcs;
6618                 bld_base->emit_store = store_output_tcs;
6619                 bld_base->emit_epilogue = si_llvm_emit_tcs_epilogue;
6620                 break;
6621         case PIPE_SHADER_TESS_EVAL:
6622                 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes;
6623                 if (shader->key.tes.as_es)
6624                         bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
6625                 else
6626                         bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
6627                 break;
6628         case PIPE_SHADER_GEOMETRY:
6629                 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs;
6630                 bld_base->emit_epilogue = si_llvm_emit_gs_epilogue;
6631                 break;
6632         case PIPE_SHADER_FRAGMENT:
6633                 ctx.radeon_bld.load_input = declare_input_fs;
6634                 if (is_monolithic)
6635                         bld_base->emit_epilogue = si_llvm_emit_fs_epilogue;
6636                 else
6637                         bld_base->emit_epilogue = si_llvm_return_fs_outputs;
6638                 break;
6639         case PIPE_SHADER_COMPUTE:
6640                 ctx.radeon_bld.declare_memory_region = declare_compute_memory;
6641                 break;
6642         default:
6643                 assert(!"Unsupported shader type");
6644                 return -1;
6645         }
6646
6647         create_meta_data(&ctx);
6648         create_function(&ctx);
6649         preload_constants(&ctx);
6650         preload_shader_buffers(&ctx);
6651         preload_samplers(&ctx);
6652         preload_images(&ctx);
6653         preload_streamout_buffers(&ctx);
6654         preload_ring_buffers(&ctx);
6655
6656         if (ctx.is_monolithic && sel->type == PIPE_SHADER_FRAGMENT &&
6657             shader->key.ps.prolog.poly_stipple) {
6658                 LLVMValueRef list = LLVMGetParam(ctx.radeon_bld.main_fn,
6659                                                  SI_PARAM_RW_BUFFERS);
6660                 si_llvm_emit_polygon_stipple(&ctx, list,
6661                                              SI_PARAM_POS_FIXED_PT);
6662         }
6663
6664         if (ctx.type == PIPE_SHADER_GEOMETRY) {
6665                 int i;
6666                 for (i = 0; i < 4; i++) {
6667                         ctx.gs_next_vertex[i] =
6668                                 lp_build_alloca(bld_base->base.gallivm,
6669                                                 ctx.i32, "");
6670                 }
6671         }
6672
6673         if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) {
6674                 fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
6675                 goto out;
6676         }
6677
6678         si_llvm_build_ret(&ctx, ctx.return_value);
6679         mod = bld_base->base.gallivm->module;
6680
6681         /* Dump LLVM IR before any optimization passes */
6682         if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
6683             r600_can_dump_shader(&sscreen->b, ctx.type))
6684                 LLVMDumpModule(mod);
6685
6686         radeon_llvm_finalize_module(&ctx.radeon_bld);
6687
6688         r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm,
6689                             mod, debug, ctx.type, "TGSI shader");
6690         if (r) {
6691                 fprintf(stderr, "LLVM failed to compile shader\n");
6692                 goto out;
6693         }
6694
6695         radeon_llvm_dispose(&ctx.radeon_bld);
6696
6697         /* Add the scratch offset to input SGPRs. */
6698         if (shader->config.scratch_bytes_per_wave)
6699                 shader->info.num_input_sgprs += 1; /* scratch byte offset */
6700
6701         /* Calculate the number of fragment input VGPRs. */
6702         if (ctx.type == PIPE_SHADER_FRAGMENT) {
6703                 shader->info.num_input_vgprs = 0;
6704                 shader->info.face_vgpr_index = -1;
6705
6706                 if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr))
6707                         shader->info.num_input_vgprs += 2;
6708                 if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr))
6709                         shader->info.num_input_vgprs += 2;
6710                 if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr))
6711                         shader->info.num_input_vgprs += 2;
6712                 if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr))
6713                         shader->info.num_input_vgprs += 3;
6714                 if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr))
6715                         shader->info.num_input_vgprs += 2;
6716                 if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr))
6717                         shader->info.num_input_vgprs += 2;
6718                 if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr))
6719                         shader->info.num_input_vgprs += 2;
6720                 if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr))
6721                         shader->info.num_input_vgprs += 1;
6722                 if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr))
6723                         shader->info.num_input_vgprs += 1;
6724                 if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr))
6725                         shader->info.num_input_vgprs += 1;
6726                 if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr))
6727                         shader->info.num_input_vgprs += 1;
6728                 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr))
6729                         shader->info.num_input_vgprs += 1;
6730                 if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) {
6731                         shader->info.face_vgpr_index = shader->info.num_input_vgprs;
6732                         shader->info.num_input_vgprs += 1;
6733                 }
6734                 if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr))
6735                         shader->info.num_input_vgprs += 1;
6736                 if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr))
6737                         shader->info.num_input_vgprs += 1;
6738                 if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr))
6739                         shader->info.num_input_vgprs += 1;
6740         }
6741
6742         if (ctx.type == PIPE_SHADER_GEOMETRY) {
6743                 shader->gs_copy_shader = CALLOC_STRUCT(si_shader);
6744                 shader->gs_copy_shader->selector = shader->selector;
6745                 ctx.shader = shader->gs_copy_shader;
6746                 if ((r = si_generate_gs_copy_shader(sscreen, &ctx,
6747                                                     shader, debug))) {
6748                         free(shader->gs_copy_shader);
6749                         shader->gs_copy_shader = NULL;
6750                         goto out;
6751                 }
6752         }
6753
6754 out:
6755         for (int i = 0; i < SI_NUM_CONST_BUFFERS; i++)
6756                 FREE(ctx.constants[i]);
6757         return r;
6758 }
6759
6760 /**
6761  * Create, compile and return a shader part (prolog or epilog).
6762  *
6763  * \param sscreen       screen
6764  * \param list          list of shader parts of the same category
6765  * \param key           shader part key
6766  * \param tm            LLVM target machine
6767  * \param debug         debug callback
6768  * \param compile       the callback responsible for compilation
6769  * \return              non-NULL on success
6770  */
6771 static struct si_shader_part *
6772 si_get_shader_part(struct si_screen *sscreen,
6773                    struct si_shader_part **list,
6774                    union si_shader_part_key *key,
6775                    LLVMTargetMachineRef tm,
6776                    struct pipe_debug_callback *debug,
6777                    bool (*compile)(struct si_screen *,
6778                                    LLVMTargetMachineRef,
6779                                    struct pipe_debug_callback *,
6780                                    struct si_shader_part *))
6781 {
6782         struct si_shader_part *result;
6783
6784         pipe_mutex_lock(sscreen->shader_parts_mutex);
6785
6786         /* Find existing. */
6787         for (result = *list; result; result = result->next) {
6788                 if (memcmp(&result->key, key, sizeof(*key)) == 0) {
6789                         pipe_mutex_unlock(sscreen->shader_parts_mutex);
6790                         return result;
6791                 }
6792         }
6793
6794         /* Compile a new one. */
6795         result = CALLOC_STRUCT(si_shader_part);
6796         result->key = *key;
6797         if (!compile(sscreen, tm, debug, result)) {
6798                 FREE(result);
6799                 pipe_mutex_unlock(sscreen->shader_parts_mutex);
6800                 return NULL;
6801         }
6802
6803         result->next = *list;
6804         *list = result;
6805         pipe_mutex_unlock(sscreen->shader_parts_mutex);
6806         return result;
6807 }
6808
6809 /**
6810  * Create a vertex shader prolog.
6811  *
6812  * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
6813  * All inputs are returned unmodified. The vertex load indices are
6814  * stored after them, which will used by the API VS for fetching inputs.
6815  *
6816  * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
6817  *   input_v0,
6818  *   input_v1,
6819  *   input_v2,
6820  *   input_v3,
6821  *   (VertexID + BaseVertex),
6822  *   (InstanceID + StartInstance),
6823  *   (InstanceID / 2 + StartInstance)
6824  */
6825 static bool si_compile_vs_prolog(struct si_screen *sscreen,
6826                                  LLVMTargetMachineRef tm,
6827                                  struct pipe_debug_callback *debug,
6828                                  struct si_shader_part *out)
6829 {
6830         union si_shader_part_key *key = &out->key;
6831         struct si_shader shader = {};
6832         struct si_shader_context ctx;
6833         struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
6834         LLVMTypeRef *params, *returns;
6835         LLVMValueRef ret, func;
6836         int last_sgpr, num_params, num_returns, i;
6837         bool status = true;
6838
6839         si_init_shader_ctx(&ctx, sscreen, &shader, tm);
6840         ctx.type = PIPE_SHADER_VERTEX;
6841         ctx.param_vertex_id = key->vs_prolog.num_input_sgprs;
6842         ctx.param_instance_id = key->vs_prolog.num_input_sgprs + 3;
6843
6844         /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
6845         params = alloca((key->vs_prolog.num_input_sgprs + 4) *
6846                         sizeof(LLVMTypeRef));
6847         returns = alloca((key->vs_prolog.num_input_sgprs + 4 +
6848                           key->vs_prolog.last_input + 1) *
6849                          sizeof(LLVMTypeRef));
6850         num_params = 0;
6851         num_returns = 0;
6852
6853         /* Declare input and output SGPRs. */
6854         num_params = 0;
6855         for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
6856                 params[num_params++] = ctx.i32;
6857                 returns[num_returns++] = ctx.i32;
6858         }
6859         last_sgpr = num_params - 1;
6860
6861         /* 4 preloaded VGPRs (outputs must be floats) */
6862         for (i = 0; i < 4; i++) {
6863                 params[num_params++] = ctx.i32;
6864                 returns[num_returns++] = ctx.f32;
6865         }
6866
6867         /* Vertex load indices. */
6868         for (i = 0; i <= key->vs_prolog.last_input; i++)
6869                 returns[num_returns++] = ctx.f32;
6870
6871         /* Create the function. */
6872         si_create_function(&ctx, returns, num_returns, params,
6873                            num_params, -1, last_sgpr);
6874         func = ctx.radeon_bld.main_fn;
6875
6876         /* Copy inputs to outputs. This should be no-op, as the registers match,
6877          * but it will prevent the compiler from overwriting them unintentionally.
6878          */
6879         ret = ctx.return_value;
6880         for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
6881                 LLVMValueRef p = LLVMGetParam(func, i);
6882                 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
6883         }
6884         for (i = num_params - 4; i < num_params; i++) {
6885                 LLVMValueRef p = LLVMGetParam(func, i);
6886                 p = LLVMBuildBitCast(gallivm->builder, p, ctx.f32, "");
6887                 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
6888         }
6889
6890         /* Compute vertex load indices from instance divisors. */
6891         for (i = 0; i <= key->vs_prolog.last_input; i++) {
6892                 unsigned divisor = key->vs_prolog.states.instance_divisors[i];
6893                 LLVMValueRef index;
6894
6895                 if (divisor) {
6896                         /* InstanceID / Divisor + StartInstance */
6897                         index = get_instance_index_for_fetch(&ctx.radeon_bld,
6898                                                              SI_SGPR_START_INSTANCE,
6899                                                              divisor);
6900                 } else {
6901                         /* VertexID + BaseVertex */
6902                         index = LLVMBuildAdd(gallivm->builder,
6903                                              LLVMGetParam(func, ctx.param_vertex_id),
6904                                              LLVMGetParam(func, SI_SGPR_BASE_VERTEX), "");
6905                 }
6906
6907                 index = LLVMBuildBitCast(gallivm->builder, index, ctx.f32, "");
6908                 ret = LLVMBuildInsertValue(gallivm->builder, ret, index,
6909                                            num_params++, "");
6910         }
6911
6912         /* Compile. */
6913         si_llvm_build_ret(&ctx, ret);
6914         radeon_llvm_finalize_module(&ctx.radeon_bld);
6915
6916         if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
6917                             gallivm->module, debug, ctx.type,
6918                             "Vertex Shader Prolog"))
6919                 status = false;
6920
6921         radeon_llvm_dispose(&ctx.radeon_bld);
6922         return status;
6923 }
6924
6925 /**
6926  * Compile the vertex shader epilog. This is also used by the tessellation
6927  * evaluation shader compiled as VS.
6928  *
6929  * The input is PrimitiveID.
6930  *
6931  * If PrimitiveID is required by the pixel shader, export it.
6932  * Otherwise, do nothing.
6933  */
6934 static bool si_compile_vs_epilog(struct si_screen *sscreen,
6935                                  LLVMTargetMachineRef tm,
6936                                  struct pipe_debug_callback *debug,
6937                                  struct si_shader_part *out)
6938 {
6939         union si_shader_part_key *key = &out->key;
6940         struct si_shader_context ctx;
6941         struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
6942         struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
6943         LLVMTypeRef params[5];
6944         int num_params, i;
6945         bool status = true;
6946
6947         si_init_shader_ctx(&ctx, sscreen, NULL, tm);
6948         ctx.type = PIPE_SHADER_VERTEX;
6949
6950         /* Declare input VGPRs. */
6951         num_params = key->vs_epilog.states.export_prim_id ?
6952                            (VS_EPILOG_PRIMID_LOC + 1) : 0;
6953         assert(num_params <= ARRAY_SIZE(params));
6954
6955         for (i = 0; i < num_params; i++)
6956                 params[i] = ctx.f32;
6957
6958         /* Create the function. */
6959         si_create_function(&ctx, NULL, 0, params, num_params,
6960                            -1, -1);
6961
6962         /* Emit exports. */
6963         if (key->vs_epilog.states.export_prim_id) {
6964                 struct lp_build_context *base = &bld_base->base;
6965                 struct lp_build_context *uint = &bld_base->uint_bld;
6966                 LLVMValueRef args[9];
6967
6968                 args[0] = lp_build_const_int32(base->gallivm, 0x0); /* enabled channels */
6969                 args[1] = uint->zero; /* whether the EXEC mask is valid */
6970                 args[2] = uint->zero; /* DONE bit */
6971                 args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_PARAM +
6972                                                key->vs_epilog.prim_id_param_offset);
6973                 args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */
6974                 args[5] = LLVMGetParam(ctx.radeon_bld.main_fn,
6975                                        VS_EPILOG_PRIMID_LOC); /* X */
6976                 args[6] = uint->undef; /* Y */
6977                 args[7] = uint->undef; /* Z */
6978                 args[8] = uint->undef; /* W */
6979
6980                 lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
6981                                    LLVMVoidTypeInContext(base->gallivm->context),
6982                                    args, 9, 0);
6983         }
6984
6985         /* Compile. */
6986         LLVMBuildRetVoid(gallivm->builder);
6987         radeon_llvm_finalize_module(&ctx.radeon_bld);
6988
6989         if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
6990                             gallivm->module, debug, ctx.type,
6991                             "Vertex Shader Epilog"))
6992                 status = false;
6993
6994         radeon_llvm_dispose(&ctx.radeon_bld);
6995         return status;
6996 }
6997
6998 /**
6999  * Create & compile a vertex shader epilog. This a helper used by VS and TES.
7000  */
7001 static bool si_get_vs_epilog(struct si_screen *sscreen,
7002                              LLVMTargetMachineRef tm,
7003                              struct si_shader *shader,
7004                              struct pipe_debug_callback *debug,
7005                              struct si_vs_epilog_bits *states)
7006 {
7007         union si_shader_part_key epilog_key;
7008
7009         memset(&epilog_key, 0, sizeof(epilog_key));
7010         epilog_key.vs_epilog.states = *states;
7011
7012         /* Set up the PrimitiveID output. */
7013         if (shader->key.vs.epilog.export_prim_id) {
7014                 unsigned index = shader->selector->info.num_outputs;
7015                 unsigned offset = shader->info.nr_param_exports++;
7016
7017                 epilog_key.vs_epilog.prim_id_param_offset = offset;
7018                 assert(index < ARRAY_SIZE(shader->info.vs_output_param_offset));
7019                 shader->info.vs_output_param_offset[index] = offset;
7020         }
7021
7022         shader->epilog = si_get_shader_part(sscreen, &sscreen->vs_epilogs,
7023                                             &epilog_key, tm, debug,
7024                                             si_compile_vs_epilog);
7025         return shader->epilog != NULL;
7026 }
7027
7028 /**
7029  * Select and compile (or reuse) vertex shader parts (prolog & epilog).
7030  */
7031 static bool si_shader_select_vs_parts(struct si_screen *sscreen,
7032                                       LLVMTargetMachineRef tm,
7033                                       struct si_shader *shader,
7034                                       struct pipe_debug_callback *debug)
7035 {
7036         struct tgsi_shader_info *info = &shader->selector->info;
7037         union si_shader_part_key prolog_key;
7038         unsigned i;
7039
7040         /* Get the prolog. */
7041         memset(&prolog_key, 0, sizeof(prolog_key));
7042         prolog_key.vs_prolog.states = shader->key.vs.prolog;
7043         prolog_key.vs_prolog.num_input_sgprs = shader->info.num_input_sgprs;
7044         prolog_key.vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
7045
7046         /* The prolog is a no-op if there are no inputs. */
7047         if (info->num_inputs) {
7048                 shader->prolog =
7049                         si_get_shader_part(sscreen, &sscreen->vs_prologs,
7050                                            &prolog_key, tm, debug,
7051                                            si_compile_vs_prolog);
7052                 if (!shader->prolog)
7053                         return false;
7054         }
7055
7056         /* Get the epilog. */
7057         if (!shader->key.vs.as_es && !shader->key.vs.as_ls &&
7058             !si_get_vs_epilog(sscreen, tm, shader, debug,
7059                               &shader->key.vs.epilog))
7060                 return false;
7061
7062         /* Set the instanceID flag. */
7063         for (i = 0; i < info->num_inputs; i++)
7064                 if (prolog_key.vs_prolog.states.instance_divisors[i])
7065                         shader->info.uses_instanceid = true;
7066
7067         return true;
7068 }
7069
7070 /**
7071  * Select and compile (or reuse) TES parts (epilog).
7072  */
7073 static bool si_shader_select_tes_parts(struct si_screen *sscreen,
7074                                        LLVMTargetMachineRef tm,
7075                                        struct si_shader *shader,
7076                                        struct pipe_debug_callback *debug)
7077 {
7078         if (shader->key.tes.as_es)
7079                 return true;
7080
7081         /* TES compiled as VS. */
7082         return si_get_vs_epilog(sscreen, tm, shader, debug,
7083                                 &shader->key.tes.epilog);
7084 }
7085
7086 /**
7087  * Compile the TCS epilog. This writes tesselation factors to memory based on
7088  * the output primitive type of the tesselator (determined by TES).
7089  */
7090 static bool si_compile_tcs_epilog(struct si_screen *sscreen,
7091                                   LLVMTargetMachineRef tm,
7092                                   struct pipe_debug_callback *debug,
7093                                   struct si_shader_part *out)
7094 {
7095         union si_shader_part_key *key = &out->key;
7096         struct si_shader shader = {};
7097         struct si_shader_context ctx;
7098         struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
7099         struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
7100         LLVMTypeRef params[16];
7101         LLVMValueRef func;
7102         int last_array_pointer, last_sgpr, num_params;
7103         bool status = true;
7104
7105         si_init_shader_ctx(&ctx, sscreen, &shader, tm);
7106         ctx.type = PIPE_SHADER_TESS_CTRL;
7107         shader.key.tcs.epilog = key->tcs_epilog.states;
7108
7109         /* Declare inputs. Only RW_BUFFERS and TESS_FACTOR_OFFSET are used. */
7110         params[SI_PARAM_RW_BUFFERS] = const_array(ctx.v16i8, SI_NUM_RW_BUFFERS);
7111         last_array_pointer = SI_PARAM_RW_BUFFERS;
7112         params[SI_PARAM_CONST_BUFFERS] = ctx.i64;
7113         params[SI_PARAM_SAMPLERS] = ctx.i64;
7114         params[SI_PARAM_IMAGES] = ctx.i64;
7115         params[SI_PARAM_SHADER_BUFFERS] = ctx.i64;
7116         params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx.i32;
7117         params[SI_PARAM_TCS_OUT_OFFSETS] = ctx.i32;
7118         params[SI_PARAM_TCS_OUT_LAYOUT] = ctx.i32;
7119         params[SI_PARAM_TCS_IN_LAYOUT] = ctx.i32;
7120         params[ctx.param_oc_lds = SI_PARAM_TCS_OC_LDS] = ctx.i32;
7121         params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx.i32;
7122         last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET;
7123         num_params = last_sgpr + 1;
7124
7125         params[num_params++] = ctx.i32; /* patch index within the wave (REL_PATCH_ID) */
7126         params[num_params++] = ctx.i32; /* invocation ID within the patch */
7127         params[num_params++] = ctx.i32; /* LDS offset where tess factors should be loaded from */
7128
7129         /* Create the function. */
7130         si_create_function(&ctx, NULL, 0, params, num_params,
7131                            last_array_pointer, last_sgpr);
7132         declare_tess_lds(&ctx);
7133         func = ctx.radeon_bld.main_fn;
7134
7135         si_write_tess_factors(bld_base,
7136                               LLVMGetParam(func, last_sgpr + 1),
7137                               LLVMGetParam(func, last_sgpr + 2),
7138                               LLVMGetParam(func, last_sgpr + 3));
7139
7140         /* Compile. */
7141         LLVMBuildRetVoid(gallivm->builder);
7142         radeon_llvm_finalize_module(&ctx.radeon_bld);
7143
7144         if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
7145                             gallivm->module, debug, ctx.type,
7146                             "Tessellation Control Shader Epilog"))
7147                 status = false;
7148
7149         radeon_llvm_dispose(&ctx.radeon_bld);
7150         return status;
7151 }
7152
7153 /**
7154  * Select and compile (or reuse) TCS parts (epilog).
7155  */
7156 static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
7157                                        LLVMTargetMachineRef tm,
7158                                        struct si_shader *shader,
7159                                        struct pipe_debug_callback *debug)
7160 {
7161         union si_shader_part_key epilog_key;
7162
7163         /* Get the epilog. */
7164         memset(&epilog_key, 0, sizeof(epilog_key));
7165         epilog_key.tcs_epilog.states = shader->key.tcs.epilog;
7166
7167         shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
7168                                             &epilog_key, tm, debug,
7169                                             si_compile_tcs_epilog);
7170         return shader->epilog != NULL;
7171 }
7172
7173 /**
7174  * Compile the pixel shader prolog. This handles:
7175  * - two-side color selection and interpolation
7176  * - overriding interpolation parameters for the API PS
7177  * - polygon stippling
7178  *
7179  * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
 * overridden by other states. (e.g. per-sample interpolation)
7181  * Interpolated colors are stored after the preloaded VGPRs.
7182  */
7183 static bool si_compile_ps_prolog(struct si_screen *sscreen,
7184                                  LLVMTargetMachineRef tm,
7185                                  struct pipe_debug_callback *debug,
7186                                  struct si_shader_part *out)
7187 {
7188         union si_shader_part_key *key = &out->key;
7189         struct si_shader shader = {};
7190         struct si_shader_context ctx;
7191         struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
7192         LLVMTypeRef *params;
7193         LLVMValueRef ret, func;
7194         int last_sgpr, num_params, num_returns, i, num_color_channels;
7195         bool status = true;
7196
7197         si_init_shader_ctx(&ctx, sscreen, &shader, tm);
7198         ctx.type = PIPE_SHADER_FRAGMENT;
7199         shader.key.ps.prolog = key->ps_prolog.states;
7200
7201         /* Number of inputs + 8 color elements. */
7202         params = alloca((key->ps_prolog.num_input_sgprs +
7203                          key->ps_prolog.num_input_vgprs + 8) *
7204                         sizeof(LLVMTypeRef));
7205
7206         /* Declare inputs. */
7207         num_params = 0;
7208         for (i = 0; i < key->ps_prolog.num_input_sgprs; i++)
7209                 params[num_params++] = ctx.i32;
7210         last_sgpr = num_params - 1;
7211
7212         for (i = 0; i < key->ps_prolog.num_input_vgprs; i++)
7213                 params[num_params++] = ctx.f32;
7214
7215         /* Declare outputs (same as inputs + add colors if needed) */
7216         num_returns = num_params;
7217         num_color_channels = util_bitcount(key->ps_prolog.colors_read);
7218         for (i = 0; i < num_color_channels; i++)
7219                 params[num_returns++] = ctx.f32;
7220
7221         /* Create the function. */
7222         si_create_function(&ctx, params, num_returns, params,
7223                            num_params, -1, last_sgpr);
7224         func = ctx.radeon_bld.main_fn;
7225
7226         /* Copy inputs to outputs. This should be no-op, as the registers match,
7227          * but it will prevent the compiler from overwriting them unintentionally.
7228          */
7229         ret = ctx.return_value;
7230         for (i = 0; i < num_params; i++) {
7231                 LLVMValueRef p = LLVMGetParam(func, i);
7232                 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
7233         }
7234
7235         /* Polygon stippling. */
7236         if (key->ps_prolog.states.poly_stipple) {
7237                 /* POS_FIXED_PT is always last. */
7238                 unsigned pos = key->ps_prolog.num_input_sgprs +
7239                                key->ps_prolog.num_input_vgprs - 1;
7240                 LLVMValueRef ptr[2], list;
7241
7242                 /* Get the pointer to rw buffers. */
7243                 ptr[0] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS);
7244                 ptr[1] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS_HI);
7245                 list = lp_build_gather_values(gallivm, ptr, 2);
7246                 list = LLVMBuildBitCast(gallivm->builder, list, ctx.i64, "");
7247                 list = LLVMBuildIntToPtr(gallivm->builder, list,
7248                                           const_array(ctx.v16i8, SI_NUM_RW_BUFFERS), "");
7249
7250                 si_llvm_emit_polygon_stipple(&ctx, list, pos);
7251         }
7252
7253         if (key->ps_prolog.states.bc_optimize_for_persp ||
7254             key->ps_prolog.states.bc_optimize_for_linear) {
7255                 unsigned i, base = key->ps_prolog.num_input_sgprs;
7256                 LLVMValueRef center[2], centroid[2], tmp, bc_optimize;
7257
7258                 /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
7259                  * The hw doesn't compute CENTROID if the whole wave only
7260                  * contains fully-covered quads.
7261                  *
7262                  * PRIM_MASK is after user SGPRs.
7263                  */
7264                 bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
7265                 bc_optimize = LLVMBuildLShr(gallivm->builder, bc_optimize,
7266                                             LLVMConstInt(ctx.i32, 31, 0), "");
7267                 bc_optimize = LLVMBuildTrunc(gallivm->builder, bc_optimize,
7268                                              ctx.i1, "");
7269
7270                 if (key->ps_prolog.states.bc_optimize_for_persp) {
7271                         /* Read PERSP_CENTER. */
7272                         for (i = 0; i < 2; i++)
7273                                 center[i] = LLVMGetParam(func, base + 2 + i);
7274                         /* Read PERSP_CENTROID. */
7275                         for (i = 0; i < 2; i++)
7276                                 centroid[i] = LLVMGetParam(func, base + 4 + i);
7277                         /* Select PERSP_CENTROID. */
7278                         for (i = 0; i < 2; i++) {
7279                                 tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
7280                                                       center[i], centroid[i], "");
7281                                 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7282                                                            tmp, base + 4 + i, "");
7283                         }
7284                 }
7285                 if (key->ps_prolog.states.bc_optimize_for_linear) {
7286                         /* Read LINEAR_CENTER. */
7287                         for (i = 0; i < 2; i++)
7288                                 center[i] = LLVMGetParam(func, base + 8 + i);
7289                         /* Read LINEAR_CENTROID. */
7290                         for (i = 0; i < 2; i++)
7291                                 centroid[i] = LLVMGetParam(func, base + 10 + i);
7292                         /* Select LINEAR_CENTROID. */
7293                         for (i = 0; i < 2; i++) {
7294                                 tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
7295                                                       center[i], centroid[i], "");
7296                                 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7297                                                            tmp, base + 10 + i, "");
7298                         }
7299                 }
7300         }
7301
7302         /* Interpolate colors. */
7303         for (i = 0; i < 2; i++) {
7304                 unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
7305                 unsigned face_vgpr = key->ps_prolog.num_input_sgprs +
7306                                      key->ps_prolog.face_vgpr_index;
7307                 LLVMValueRef interp[2], color[4];
7308                 LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL;
7309
7310                 if (!writemask)
7311                         continue;
7312
7313                 /* If the interpolation qualifier is not CONSTANT (-1). */
7314                 if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
7315                         unsigned interp_vgpr = key->ps_prolog.num_input_sgprs +
7316                                                key->ps_prolog.color_interp_vgpr_index[i];
7317
7318                         /* Get the (i,j) updated by bc_optimize handling. */
7319                         interp[0] = LLVMBuildExtractValue(gallivm->builder, ret,
7320                                                           interp_vgpr, "");
7321                         interp[1] = LLVMBuildExtractValue(gallivm->builder, ret,
7322                                                           interp_vgpr + 1, "");
7323                         interp_ij = lp_build_gather_values(gallivm, interp, 2);
7324                         interp_ij = LLVMBuildBitCast(gallivm->builder, interp_ij,
7325                                                      ctx.v2i32, "");
7326                 }
7327
7328                 /* Use the absolute location of the input. */
7329                 prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
7330
7331                 if (key->ps_prolog.states.color_two_side) {
7332                         face = LLVMGetParam(func, face_vgpr);
7333                         face = LLVMBuildBitCast(gallivm->builder, face, ctx.i32, "");
7334                 }
7335
7336                 interp_fs_input(&ctx,
7337                                 key->ps_prolog.color_attr_index[i],
7338                                 TGSI_SEMANTIC_COLOR, i,
7339                                 key->ps_prolog.num_interp_inputs,
7340                                 key->ps_prolog.colors_read, interp_ij,
7341                                 prim_mask, face, color);
7342
7343                 while (writemask) {
7344                         unsigned chan = u_bit_scan(&writemask);
7345                         ret = LLVMBuildInsertValue(gallivm->builder, ret, color[chan],
7346                                                    num_params++, "");
7347                 }
7348         }
7349
7350         /* Force per-sample interpolation. */
7351         if (key->ps_prolog.states.force_persp_sample_interp) {
7352                 unsigned i, base = key->ps_prolog.num_input_sgprs;
7353                 LLVMValueRef persp_sample[2];
7354
7355                 /* Read PERSP_SAMPLE. */
7356                 for (i = 0; i < 2; i++)
7357                         persp_sample[i] = LLVMGetParam(func, base + i);
7358                 /* Overwrite PERSP_CENTER. */
7359                 for (i = 0; i < 2; i++)
7360                         ret = LLVMBuildInsertValue(gallivm->builder, ret,
7361                                                    persp_sample[i], base + 2 + i, "");
7362                 /* Overwrite PERSP_CENTROID. */
7363                 for (i = 0; i < 2; i++)
7364                         ret = LLVMBuildInsertValue(gallivm->builder, ret,
7365                                                    persp_sample[i], base + 4 + i, "");
7366         }
7367         if (key->ps_prolog.states.force_linear_sample_interp) {
7368                 unsigned i, base = key->ps_prolog.num_input_sgprs;
7369                 LLVMValueRef linear_sample[2];
7370
7371                 /* Read LINEAR_SAMPLE. */
7372                 for (i = 0; i < 2; i++)
7373                         linear_sample[i] = LLVMGetParam(func, base + 6 + i);
7374                 /* Overwrite LINEAR_CENTER. */
7375                 for (i = 0; i < 2; i++)
7376                         ret = LLVMBuildInsertValue(gallivm->builder, ret,
7377                                                    linear_sample[i], base + 8 + i, "");
7378                 /* Overwrite LINEAR_CENTROID. */
7379                 for (i = 0; i < 2; i++)
7380                         ret = LLVMBuildInsertValue(gallivm->builder, ret,
7381                                                    linear_sample[i], base + 10 + i, "");
7382         }
7383
7384         /* Force center interpolation. */
7385         if (key->ps_prolog.states.force_persp_center_interp) {
7386                 unsigned i, base = key->ps_prolog.num_input_sgprs;
7387                 LLVMValueRef persp_center[2];
7388
7389                 /* Read PERSP_CENTER. */
7390                 for (i = 0; i < 2; i++)
7391                         persp_center[i] = LLVMGetParam(func, base + 2 + i);
7392                 /* Overwrite PERSP_SAMPLE. */
7393                 for (i = 0; i < 2; i++)
7394                         ret = LLVMBuildInsertValue(gallivm->builder, ret,
7395                                                    persp_center[i], base + i, "");
7396                 /* Overwrite PERSP_CENTROID. */
7397                 for (i = 0; i < 2; i++)
7398                         ret = LLVMBuildInsertValue(gallivm->builder, ret,
7399                                                    persp_center[i], base + 4 + i, "");
7400         }
7401         if (key->ps_prolog.states.force_linear_center_interp) {
7402                 unsigned i, base = key->ps_prolog.num_input_sgprs;
7403                 LLVMValueRef linear_center[2];
7404
7405                 /* Read LINEAR_CENTER. */
7406                 for (i = 0; i < 2; i++)
7407                         linear_center[i] = LLVMGetParam(func, base + 8 + i);
7408                 /* Overwrite LINEAR_SAMPLE. */
7409                 for (i = 0; i < 2; i++)
7410                         ret = LLVMBuildInsertValue(gallivm->builder, ret,
7411                                                    linear_center[i], base + 6 + i, "");
7412                 /* Overwrite LINEAR_CENTROID. */
7413                 for (i = 0; i < 2; i++)
7414                         ret = LLVMBuildInsertValue(gallivm->builder, ret,
7415                                                    linear_center[i], base + 10 + i, "");
7416         }
7417
7418         /* Tell LLVM to insert WQM instruction sequence when needed. */
7419         if (key->ps_prolog.wqm) {
7420                 LLVMAddTargetDependentFunctionAttr(func,
7421                                                    "amdgpu-ps-wqm-outputs", "");
7422         }
7423
7424         /* Compile. */
7425         si_llvm_build_ret(&ctx, ret);
7426         radeon_llvm_finalize_module(&ctx.radeon_bld);
7427
7428         if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
7429                             gallivm->module, debug, ctx.type,
7430                             "Fragment Shader Prolog"))
7431                 status = false;
7432
7433         radeon_llvm_dispose(&ctx.radeon_bld);
7434         return status;
7435 }
7436
7437 /**
7438  * Compile the pixel shader epilog. This handles everything that must be
7439  * emulated for pixel shader exports. (alpha-test, format conversions, etc)
7440  */
7441 static bool si_compile_ps_epilog(struct si_screen *sscreen,
7442                                  LLVMTargetMachineRef tm,
7443                                  struct pipe_debug_callback *debug,
7444                                  struct si_shader_part *out)
7445 {
7446         union si_shader_part_key *key = &out->key;
7447         struct si_shader shader = {};
7448         struct si_shader_context ctx;
7449         struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
7450         struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
7451         LLVMTypeRef params[16+8*4+3];
7452         LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
7453         int last_array_pointer, last_sgpr, num_params, i;
7454         bool status = true;
7455
7456         si_init_shader_ctx(&ctx, sscreen, &shader, tm);
7457         ctx.type = PIPE_SHADER_FRAGMENT;
7458         shader.key.ps.epilog = key->ps_epilog.states;
7459
7460         /* Declare input SGPRs. */
7461         params[SI_PARAM_RW_BUFFERS] = ctx.i64;
7462         params[SI_PARAM_CONST_BUFFERS] = ctx.i64;
7463         params[SI_PARAM_SAMPLERS] = ctx.i64;
7464         params[SI_PARAM_IMAGES] = ctx.i64;
7465         params[SI_PARAM_SHADER_BUFFERS] = ctx.i64;
7466         params[SI_PARAM_ALPHA_REF] = ctx.f32;
7467         last_array_pointer = -1;
7468         last_sgpr = SI_PARAM_ALPHA_REF;
7469
7470         /* Declare input VGPRs. */
7471         num_params = (last_sgpr + 1) +
7472                      util_bitcount(key->ps_epilog.colors_written) * 4 +
7473                      key->ps_epilog.writes_z +
7474                      key->ps_epilog.writes_stencil +
7475                      key->ps_epilog.writes_samplemask;
7476
7477         num_params = MAX2(num_params,
7478                           last_sgpr + 1 + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
7479
7480         assert(num_params <= ARRAY_SIZE(params));
7481
7482         for (i = last_sgpr + 1; i < num_params; i++)
7483                 params[i] = ctx.f32;
7484
7485         /* Create the function. */
7486         si_create_function(&ctx, NULL, 0, params, num_params,
7487                            last_array_pointer, last_sgpr);
7488         /* Disable elimination of unused inputs. */
7489         radeon_llvm_add_attribute(ctx.radeon_bld.main_fn,
7490                                   "InitialPSInputAddr", 0xffffff);
7491
7492         /* Process colors. */
7493         unsigned vgpr = last_sgpr + 1;
7494         unsigned colors_written = key->ps_epilog.colors_written;
7495         int last_color_export = -1;
7496
7497         /* Find the last color export. */
7498         if (!key->ps_epilog.writes_z &&
7499             !key->ps_epilog.writes_stencil &&
7500             !key->ps_epilog.writes_samplemask) {
7501                 unsigned spi_format = key->ps_epilog.states.spi_shader_col_format;
7502
7503                 /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
7504                 if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
7505                         /* Just set this if any of the colorbuffers are enabled. */
7506                         if (spi_format &
7507                             ((1llu << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
7508                                 last_color_export = 0;
7509                 } else {
7510                         for (i = 0; i < 8; i++)
7511                                 if (colors_written & (1 << i) &&
7512                                     (spi_format >> (i * 4)) & 0xf)
7513                                         last_color_export = i;
7514                 }
7515         }
7516
7517         while (colors_written) {
7518                 LLVMValueRef color[4];
7519                 int mrt = u_bit_scan(&colors_written);
7520
7521                 for (i = 0; i < 4; i++)
7522                         color[i] = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
7523
7524                 si_export_mrt_color(bld_base, color, mrt,
7525                                     num_params - 1,
7526                                     mrt == last_color_export);
7527         }
7528
7529         /* Process depth, stencil, samplemask. */
7530         if (key->ps_epilog.writes_z)
7531                 depth = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
7532         if (key->ps_epilog.writes_stencil)
7533                 stencil = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
7534         if (key->ps_epilog.writes_samplemask)
7535                 samplemask = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
7536
7537         if (depth || stencil || samplemask)
7538                 si_export_mrt_z(bld_base, depth, stencil, samplemask);
7539         else if (last_color_export == -1)
7540                 si_export_null(bld_base);
7541
7542         /* Compile. */
7543         LLVMBuildRetVoid(gallivm->builder);
7544         radeon_llvm_finalize_module(&ctx.radeon_bld);
7545
7546         if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
7547                             gallivm->module, debug, ctx.type,
7548                             "Fragment Shader Epilog"))
7549                 status = false;
7550
7551         radeon_llvm_dispose(&ctx.radeon_bld);
7552         return status;
7553 }
7554
7555 /**
7556  * Select and compile (or reuse) pixel shader parts (prolog & epilog).
7557  */
7558 static bool si_shader_select_ps_parts(struct si_screen *sscreen,
7559                                       LLVMTargetMachineRef tm,
7560                                       struct si_shader *shader,
7561                                       struct pipe_debug_callback *debug)
7562 {
7563         struct tgsi_shader_info *info = &shader->selector->info;
7564         union si_shader_part_key prolog_key;
7565         union si_shader_part_key epilog_key;
7566         unsigned i;
7567
7568         /* Get the prolog. */
7569         memset(&prolog_key, 0, sizeof(prolog_key));
7570         prolog_key.ps_prolog.states = shader->key.ps.prolog;
7571         prolog_key.ps_prolog.colors_read = info->colors_read;
7572         prolog_key.ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
7573         prolog_key.ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
7574         prolog_key.ps_prolog.wqm = info->uses_derivatives &&
7575                 (prolog_key.ps_prolog.colors_read ||
7576                  prolog_key.ps_prolog.states.force_persp_sample_interp ||
7577                  prolog_key.ps_prolog.states.force_linear_sample_interp ||
7578                  prolog_key.ps_prolog.states.force_persp_center_interp ||
7579                  prolog_key.ps_prolog.states.force_linear_center_interp ||
7580                  prolog_key.ps_prolog.states.bc_optimize_for_persp ||
7581                  prolog_key.ps_prolog.states.bc_optimize_for_linear);
7582
7583         if (info->colors_read) {
7584                 unsigned *color = shader->selector->color_attr_index;
7585
7586                 if (shader->key.ps.prolog.color_two_side) {
7587                         /* BCOLORs are stored after the last input. */
7588                         prolog_key.ps_prolog.num_interp_inputs = info->num_inputs;
7589                         prolog_key.ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
7590                         shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
7591                 }
7592
7593                 for (i = 0; i < 2; i++) {
7594                         unsigned interp = info->input_interpolate[color[i]];
7595                         unsigned location = info->input_interpolate_loc[color[i]];
7596
7597                         if (!(info->colors_read & (0xf << i*4)))
7598                                 continue;
7599
7600                         prolog_key.ps_prolog.color_attr_index[i] = color[i];
7601
7602                         if (shader->key.ps.prolog.flatshade_colors &&
7603                             interp == TGSI_INTERPOLATE_COLOR)
7604                                 interp = TGSI_INTERPOLATE_CONSTANT;
7605
7606                         switch (interp) {
7607                         case TGSI_INTERPOLATE_CONSTANT:
7608                                 prolog_key.ps_prolog.color_interp_vgpr_index[i] = -1;
7609                                 break;
7610                         case TGSI_INTERPOLATE_PERSPECTIVE:
7611                         case TGSI_INTERPOLATE_COLOR:
7612                                 /* Force the interpolation location for colors here. */
7613                                 if (shader->key.ps.prolog.force_persp_sample_interp)
7614                                         location = TGSI_INTERPOLATE_LOC_SAMPLE;
7615                                 if (shader->key.ps.prolog.force_persp_center_interp)
7616                                         location = TGSI_INTERPOLATE_LOC_CENTER;
7617
7618                                 switch (location) {
7619                                 case TGSI_INTERPOLATE_LOC_SAMPLE:
7620                                         prolog_key.ps_prolog.color_interp_vgpr_index[i] = 0;
7621                                         shader->config.spi_ps_input_ena |=
7622                                                 S_0286CC_PERSP_SAMPLE_ENA(1);
7623                                         break;
7624                                 case TGSI_INTERPOLATE_LOC_CENTER:
7625                                         prolog_key.ps_prolog.color_interp_vgpr_index[i] = 2;
7626                                         shader->config.spi_ps_input_ena |=
7627                                                 S_0286CC_PERSP_CENTER_ENA(1);
7628                                         break;
7629                                 case TGSI_INTERPOLATE_LOC_CENTROID:
7630                                         prolog_key.ps_prolog.color_interp_vgpr_index[i] = 4;
7631                                         shader->config.spi_ps_input_ena |=
7632                                                 S_0286CC_PERSP_CENTROID_ENA(1);
7633                                         break;
7634                                 default:
7635                                         assert(0);
7636                                 }
7637                                 break;
7638                         case TGSI_INTERPOLATE_LINEAR:
7639                                 /* Force the interpolation location for colors here. */
7640                                 if (shader->key.ps.prolog.force_linear_sample_interp)
7641                                         location = TGSI_INTERPOLATE_LOC_SAMPLE;
7642                                 if (shader->key.ps.prolog.force_linear_center_interp)
7643                                         location = TGSI_INTERPOLATE_LOC_CENTER;
7644
7645                                 switch (location) {
7646                                 case TGSI_INTERPOLATE_LOC_SAMPLE:
7647                                         prolog_key.ps_prolog.color_interp_vgpr_index[i] = 6;
7648                                         shader->config.spi_ps_input_ena |=
7649                                                 S_0286CC_LINEAR_SAMPLE_ENA(1);
7650                                         break;
7651                                 case TGSI_INTERPOLATE_LOC_CENTER:
7652                                         prolog_key.ps_prolog.color_interp_vgpr_index[i] = 8;
7653                                         shader->config.spi_ps_input_ena |=
7654                                                 S_0286CC_LINEAR_CENTER_ENA(1);
7655                                         break;
7656                                 case TGSI_INTERPOLATE_LOC_CENTROID:
7657                                         prolog_key.ps_prolog.color_interp_vgpr_index[i] = 10;
7658                                         shader->config.spi_ps_input_ena |=
7659                                                 S_0286CC_LINEAR_CENTROID_ENA(1);
7660                                         break;
7661                                 default:
7662                                         assert(0);
7663                                 }
7664                                 break;
7665                         default:
7666                                 assert(0);
7667                         }
7668                 }
7669         }
7670
7671         /* The prolog is a no-op if these aren't set. */
7672         if (prolog_key.ps_prolog.colors_read ||
7673             prolog_key.ps_prolog.states.force_persp_sample_interp ||
7674             prolog_key.ps_prolog.states.force_linear_sample_interp ||
7675             prolog_key.ps_prolog.states.force_persp_center_interp ||
7676             prolog_key.ps_prolog.states.force_linear_center_interp ||
7677             prolog_key.ps_prolog.states.bc_optimize_for_persp ||
7678             prolog_key.ps_prolog.states.bc_optimize_for_linear ||
7679             prolog_key.ps_prolog.states.poly_stipple) {
7680                 shader->prolog =
7681                         si_get_shader_part(sscreen, &sscreen->ps_prologs,
7682                                            &prolog_key, tm, debug,
7683                                            si_compile_ps_prolog);
7684                 if (!shader->prolog)
7685                         return false;
7686         }
7687
7688         /* Get the epilog. */
7689         memset(&epilog_key, 0, sizeof(epilog_key));
7690         epilog_key.ps_epilog.colors_written = info->colors_written;
7691         epilog_key.ps_epilog.writes_z = info->writes_z;
7692         epilog_key.ps_epilog.writes_stencil = info->writes_stencil;
7693         epilog_key.ps_epilog.writes_samplemask = info->writes_samplemask;
7694         epilog_key.ps_epilog.states = shader->key.ps.epilog;
7695
7696         shader->epilog =
7697                 si_get_shader_part(sscreen, &sscreen->ps_epilogs,
7698                                    &epilog_key, tm, debug,
7699                                    si_compile_ps_epilog);
7700         if (!shader->epilog)
7701                 return false;
7702
7703         /* Enable POS_FIXED_PT if polygon stippling is enabled. */
7704         if (shader->key.ps.prolog.poly_stipple) {
7705                 shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
7706                 assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
7707         }
7708
7709         /* Set up the enable bits for per-sample shading if needed. */
7710         if (shader->key.ps.prolog.force_persp_sample_interp &&
7711             (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
7712              G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7713                 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
7714                 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
7715                 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
7716         }
7717         if (shader->key.ps.prolog.force_linear_sample_interp &&
7718             (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
7719              G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7720                 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
7721                 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
7722                 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
7723         }
7724         if (shader->key.ps.prolog.force_persp_center_interp &&
7725             (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
7726              G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7727                 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
7728                 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
7729                 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
7730         }
7731         if (shader->key.ps.prolog.force_linear_center_interp &&
7732             (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
7733              G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7734                 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
7735                 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
7736                 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
7737         }
7738
7739         /* POW_W_FLOAT requires that one of the perspective weights is enabled. */
7740         if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
7741             !(shader->config.spi_ps_input_ena & 0xf)) {
7742                 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
7743                 assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
7744         }
7745
7746         /* At least one pair of interpolation weights must be enabled. */
7747         if (!(shader->config.spi_ps_input_ena & 0x7f)) {
7748                 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
7749                 assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
7750         }
7751
7752         /* The sample mask input is always enabled, because the API shader always
7753          * passes it through to the epilog. Disable it here if it's unused.
7754          */
7755         if (!shader->key.ps.epilog.poly_line_smoothing &&
7756             !shader->selector->info.reads_samplemask)
7757                 shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;
7758
7759         return true;
7760 }
7761
7762 static void si_fix_num_sgprs(struct si_shader *shader)
7763 {
7764         unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
7765
7766         shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
7767 }
7768
7769 int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
7770                      struct si_shader *shader,
7771                      struct pipe_debug_callback *debug)
7772 {
7773         struct si_shader *mainp = shader->selector->main_shader_part;
7774         int r;
7775
7776         /* LS, ES, VS are compiled on demand if the main part hasn't been
7777          * compiled for that stage.
7778          */
7779         if (!mainp ||
7780             (shader->selector->type == PIPE_SHADER_VERTEX &&
7781              (shader->key.vs.as_es != mainp->key.vs.as_es ||
7782               shader->key.vs.as_ls != mainp->key.vs.as_ls)) ||
7783             (shader->selector->type == PIPE_SHADER_TESS_EVAL &&
7784              shader->key.tes.as_es != mainp->key.tes.as_es) ||
7785             (shader->selector->type == PIPE_SHADER_TESS_CTRL &&
7786              shader->key.tcs.epilog.inputs_to_copy) ||
7787             shader->selector->type == PIPE_SHADER_COMPUTE) {
7788                 /* Monolithic shader (compiled as a whole, has many variants,
7789                  * may take a long time to compile).
7790                  */
7791                 r = si_compile_tgsi_shader(sscreen, tm, shader, true, debug);
7792                 if (r)
7793                         return r;
7794         } else {
7795                 /* The shader consists of 2-3 parts:
7796                  *
7797                  * - the middle part is the user shader, it has 1 variant only
7798                  *   and it was compiled during the creation of the shader
7799                  *   selector
7800                  * - the prolog part is inserted at the beginning
7801                  * - the epilog part is inserted at the end
7802                  *
7803                  * The prolog and epilog have many (but simple) variants.
7804                  */
7805
7806                 /* Copy the compiled TGSI shader data over. */
7807                 shader->is_binary_shared = true;
7808                 shader->binary = mainp->binary;
7809                 shader->config = mainp->config;
7810                 shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
7811                 shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
7812                 shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
7813                 memcpy(shader->info.vs_output_param_offset,
7814                        mainp->info.vs_output_param_offset,
7815                        sizeof(mainp->info.vs_output_param_offset));
7816                 shader->info.uses_instanceid = mainp->info.uses_instanceid;
7817                 shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
7818                 shader->info.nr_param_exports = mainp->info.nr_param_exports;
7819
7820                 /* Select prologs and/or epilogs. */
7821                 switch (shader->selector->type) {
7822                 case PIPE_SHADER_VERTEX:
7823                         if (!si_shader_select_vs_parts(sscreen, tm, shader, debug))
7824                                 return -1;
7825                         break;
7826                 case PIPE_SHADER_TESS_CTRL:
7827                         if (!si_shader_select_tcs_parts(sscreen, tm, shader, debug))
7828                                 return -1;
7829                         break;
7830                 case PIPE_SHADER_TESS_EVAL:
7831                         if (!si_shader_select_tes_parts(sscreen, tm, shader, debug))
7832                                 return -1;
7833                         break;
7834                 case PIPE_SHADER_FRAGMENT:
7835                         if (!si_shader_select_ps_parts(sscreen, tm, shader, debug))
7836                                 return -1;
7837
7838                         /* Make sure we have at least as many VGPRs as there
7839                          * are allocated inputs.
7840                          */
7841                         shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
7842                                                         shader->info.num_input_vgprs);
7843                         break;
7844                 }
7845
7846                 /* Update SGPR and VGPR counts. */
7847                 if (shader->prolog) {
7848                         shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
7849                                                         shader->prolog->config.num_sgprs);
7850                         shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
7851                                                         shader->prolog->config.num_vgprs);
7852                 }
7853                 if (shader->epilog) {
7854                         shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
7855                                                         shader->epilog->config.num_sgprs);
7856                         shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
7857                                                         shader->epilog->config.num_vgprs);
7858                 }
7859         }
7860
7861         si_fix_num_sgprs(shader);
7862         si_shader_dump(sscreen, shader, debug, shader->selector->info.processor,
7863                        stderr);
7864
7865         /* Upload. */
7866         r = si_shader_binary_upload(sscreen, shader);
7867         if (r) {
7868                 fprintf(stderr, "LLVM failed to upload shader\n");
7869                 return r;
7870         }
7871
7872         return 0;
7873 }
7874
7875 void si_shader_destroy(struct si_shader *shader)
7876 {
7877         if (shader->gs_copy_shader) {
7878                 si_shader_destroy(shader->gs_copy_shader);
7879                 FREE(shader->gs_copy_shader);
7880         }
7881
7882         if (shader->scratch_bo)
7883                 r600_resource_reference(&shader->scratch_bo, NULL);
7884
7885         r600_resource_reference(&shader->bo, NULL);
7886
7887         if (!shader->is_binary_shared)
7888                 radeon_shader_binary_clean(&shader->binary);
7889 }