src/gallium/drivers/radeonsi/si_shader.c

   1 /*
   2  * Copyright 2012 Advanced Micro Devices, Inc.
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * on the rights to use, copy, modify, merge, publish, distribute, sub
   8  * license, and/or sell copies of the Software, and to permit persons to whom
   9  * the Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  18  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
  19  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  20  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  21  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  22  *
  23  * Authors:
  24  *      Tom Stellard <thomas.stellard@amd.com>
  25  *      Michel Dänzer <michel.daenzer@amd.com>
  26  *      Christian König <christian.koenig@amd.com>
  27  */
  28
  29 #include "gallivm/lp_bld_const.h"
  30 #include "gallivm/lp_bld_gather.h"
  31 #include "gallivm/lp_bld_intr.h"
  32 #include "gallivm/lp_bld_logic.h"
  33 #include "gallivm/lp_bld_arit.h"
  34 #include "gallivm/lp_bld_bitarit.h"
  35 #include "gallivm/lp_bld_flow.h"
  36 #include "radeon/r600_cs.h"
  37 #include "radeon/radeon_llvm.h"
  38 #include "radeon/radeon_elf_util.h"
  39 #include "radeon/radeon_llvm_emit.h"
  40 #include "util/u_memory.h"
  41 #include "util/u_pstipple.h"
  42 #include "tgsi/tgsi_parse.h"
  43 #include "tgsi/tgsi_util.h"
  44 #include "tgsi/tgsi_dump.h"
  45
  46 #include "si_pipe.h"
  47 #include "si_shader.h"
  48 #include "sid.h"
  49
  50 #include <errno.h>
  51
  52 static const char *scratch_rsrc_dword0_symbol =
  53         "SCRATCH_RSRC_DWORD0";
  54
  55 static const char *scratch_rsrc_dword1_symbol =
  56         "SCRATCH_RSRC_DWORD1";
  57
  /* Up to four LLVM values of one shader output plus two identifying integers.
   * NOTE(review): no user of this struct is visible in this chunk; `name` and
   * `sid` are presumably the TGSI semantic name and semantic index, and
   * `values` presumably holds one value per channel -- confirm at the users.
   */
  58 struct si_shader_output_values
  59 {
  60         LLVMValueRef values[4];
  61         unsigned name;
  62         unsigned sid;
  63 };
  64
  /* State used while building the LLVM IR of one shader.
   *
   * radeon_bld is the first member: si_shader_context() obtains a pointer to
   * this struct by casting a struct lp_build_tgsi_context pointer.
   */
  65 struct si_shader_context
  66 {
  67         struct radeon_llvm_context radeon_bld;
  68         struct si_shader *shader;
  69         struct si_screen *screen;
  70         unsigned type; /* TGSI_PROCESSOR_* specifies the type of shader. */
       /* Indices of parameters of radeon_bld.main_fn. The ones used in this
        * chunk (vertex_id, instance_id, tes_rel_patch_id, tes_patch_id) are
        * passed to LLVMGetParam(); the others are presumably used the same
        * way -- confirm at their users. */
  71         int param_streamout_config;
  72         int param_streamout_write_index;
  73         int param_streamout_offset[4];
  74         int param_vertex_id;
  75         int param_rel_auto_id;
  76         int param_instance_id;
  77         int param_tes_u;
  78         int param_tes_v;
  79         int param_tes_rel_patch_id;
  80         int param_tes_patch_id;
  81         int param_es2gs_offset;
  82         LLVMTargetMachineRef tm;
       /* Metadata node attached to loads by build_indexed_load_const(). */
  83         LLVMValueRef const_md;
  84         LLVMValueRef const_resource[SI_NUM_CONST_BUFFERS];
       /* Base pointer indexed (in dwords) by lds_load() and lds_store(). */
  85         LLVMValueRef lds;
  86         LLVMValueRef *constants[SI_NUM_CONST_BUFFERS];
  87         LLVMValueRef resources[SI_NUM_SAMPLER_VIEWS];
  88         LLVMValueRef samplers[SI_NUM_SAMPLER_STATES];
  89         LLVMValueRef so_buffers[4];
  90         LLVMValueRef esgs_ring;
  91         LLVMValueRef gsvs_ring[4];
  92         LLVMValueRef gs_next_vertex[4];
  93 };
  94
  95 static struct si_shader_context * si_shader_context(
  96         struct lp_build_tgsi_context * bld_base)
  97 {
  98         return (struct si_shader_context *)bld_base;
  99 }
 100
 101
 /* NOTE(review): the users of the five constants below are outside this
  * chunk; they presumably index interpolation inputs (base + offset) --
  * confirm. CENTROID_OFSET is misspelled (sic); the name is left unchanged
  * because code outside this chunk may reference it. */
 102 #define PERSPECTIVE_BASE 0
 103 #define LINEAR_BASE 9
 104
 105 #define SAMPLE_OFFSET 0
 106 #define CENTER_OFFSET 2
 107 #define CENTROID_OFSET 4
 108
 /* NOTE(review): presumably LLVM address-space numbers of the AMDGPU backend
  * and a name-suffix length limit; no use is visible in this chunk. */
 109 #define USE_SGPR_MAX_SUFFIX_LEN 5
 110 #define CONST_ADDR_SPACE 2
 111 #define LOCAL_ADDR_SPACE 3
 112 #define USER_SGPR_ADDR_SPACE 8
 113
 114
 /* Message identifiers. NOTE(review): presumably for the sendmsg instruction
  * used by geometry shaders -- confirm at the users. */
 115 #define SENDMSG_GS 2
 116 #define SENDMSG_GS_DONE 3
 117
 /* GS operation codes, stored starting at bit 4 so they can be combined with
  * a message identifier above. */
 118 #define SENDMSG_GS_OP_NOP      (0 << 4)
 119 #define SENDMSG_GS_OP_CUT      (1 << 4)
 120 #define SENDMSG_GS_OP_EMIT     (2 << 4)
 121 #define SENDMSG_GS_OP_EMIT_CUT (3 << 4)
 122
 123 /**
 124  * Returns a unique index for a semantic name and index. The index must be
 125  * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
 126  * calculated.
 127  */
 128 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
 129 {
 130         switch (semantic_name) {
 131         case TGSI_SEMANTIC_POSITION:
 132                 return 0;
 133         case TGSI_SEMANTIC_PSIZE:
 134                 return 1;
 135         case TGSI_SEMANTIC_CLIPDIST:
 136                 assert(index <= 1);
 137                 return 2 + index;
 138         case TGSI_SEMANTIC_GENERIC:
 139                 assert(index <= 63-4);
 140                 return 4 + index;
 141
 142         /* patch indices are completely separate and thus start from 0 */
 143         case TGSI_SEMANTIC_TESSOUTER:
 144                 return 0;
 145         case TGSI_SEMANTIC_TESSINNER:
 146                 return 1;
 147         case TGSI_SEMANTIC_PATCH:
 148                 return 2 + index;
 149
 150         default:
 151                 /* Don't fail here. The result of this function is only used
 152                  * for LS, TCS, TES, and GS, where legacy GL semantics can't
 153                  * occur, but this function is called for all vertex shaders
 154                  * before it's known whether LS will be compiled or not.
 155                  */
 156                 return 0;
 157         }
 158 }
 159
 160 /**
 161  * Given a semantic name and index of a parameter and a mask of used parameters
 162  * (inputs or outputs), return the index of the parameter in the list of all
 163  * used parameters.
 164  *
 165  * For example, assume this list of parameters:
 166  *   POSITION, PSIZE, GENERIC0, GENERIC2
 167  * which has the mask:
 168  *   11000000000101
 169  * Then:
 170  *   querying POSITION returns 0,
 171  *   querying PSIZE returns 1,
 172  *   querying GENERIC0 returns 2,
 173  *   querying GENERIC2 returns 3.
 174  *
 175  * Which can be used as an offset to a parameter buffer in units of vec4s.
 176  */
 177 static int get_param_index(unsigned semantic_name, unsigned index,
 178                            uint64_t mask)
 179 {
 180         unsigned unique_index = si_shader_io_get_unique_index(semantic_name, index);
 181         int i, param_index = 0;
 182
 183         /* If not present... */
 184         if (!((1llu << unique_index) & mask))
 185                 return -1;
 186
 187         for (i = 0; mask; i++) {
 188                 uint64_t bit = 1llu << i;
 189
 190                 if (bit & mask) {
 191                         if (i == unique_index)
 192                                 return param_index;
 193
 194                         mask &= ~bit;
 195                         param_index++;
 196                 }
 197         }
 198
 199         assert(!"unreachable");
 200         return -1;
 201 }
 202
 203 /**
 204  * Get the value of a shader input parameter and extract a bitfield.
 205  */
 206 static LLVMValueRef unpack_param(struct si_shader_context *si_shader_ctx,
 207                                  unsigned param, unsigned rshift,
 208                                  unsigned bitwidth)
 209 {
 210         struct gallivm_state *gallivm = &si_shader_ctx->radeon_bld.gallivm;
 211         LLVMValueRef value = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
 212                                           param);
 213
 214         if (rshift)
 215                 value = LLVMBuildLShr(gallivm->builder, value,
 216                                       lp_build_const_int32(gallivm, rshift), "");
 217
 218         if (rshift + bitwidth < 32) {
 219                 unsigned mask = (1 << bitwidth) - 1;
 220                 value = LLVMBuildAnd(gallivm->builder, value,
 221                                      lp_build_const_int32(gallivm, mask), "");
 222         }
 223
 224         return value;
 225 }
 226
 227 static LLVMValueRef get_rel_patch_id(struct si_shader_context *si_shader_ctx)
 228 {
 229         switch (si_shader_ctx->type) {
 230         case TGSI_PROCESSOR_TESS_CTRL:
 231                 return unpack_param(si_shader_ctx, SI_PARAM_REL_IDS, 0, 8);
 232
 233         case TGSI_PROCESSOR_TESS_EVAL:
 234                 return LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
 235                                     si_shader_ctx->param_tes_rel_patch_id);
 236
 237         default:
 238                 assert(0);
 239                 return NULL;
 240         }
 241 }
 242
 243 /* Tessellation shaders pass outputs to the next shader using LDS.
 244  *
 245  * LS outputs = TCS inputs
 246  * TCS outputs = TES inputs
 247  *
 248  * The LDS layout is:
 249  * - TCS inputs for patch 0
 250  * - TCS inputs for patch 1
 251  * - TCS inputs for patch 2             = get_tcs_in_current_patch_offset (if RelPatchID==2)
 252  * - ...
 253  * - TCS outputs for patch 0            = get_tcs_out_patch0_offset
 254  * - Per-patch TCS outputs for patch 0  = get_tcs_out_patch0_patch_data_offset
 255  * - TCS outputs for patch 1
 256  * - Per-patch TCS outputs for patch 1
 257  * - TCS outputs for patch 2            = get_tcs_out_current_patch_offset (if RelPatchID==2)
 258  * - Per-patch TCS outputs for patch 2  = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
 259  * - ...
 260  *
 261  * All three shaders VS(LS), TCS, TES share the same LDS space.
 262  */
 263
 264 static LLVMValueRef
 265 get_tcs_in_patch_stride(struct si_shader_context *si_shader_ctx)
 266 {
 267         if (si_shader_ctx->type == TGSI_PROCESSOR_VERTEX)
 268                 return unpack_param(si_shader_ctx, SI_PARAM_LS_OUT_LAYOUT, 0, 13);
 269         else if (si_shader_ctx->type == TGSI_PROCESSOR_TESS_CTRL)
 270                 return unpack_param(si_shader_ctx, SI_PARAM_TCS_IN_LAYOUT, 0, 13);
 271         else {
 272                 assert(0);
 273                 return NULL;
 274         }
 275 }
 276
 277 static LLVMValueRef
 278 get_tcs_out_patch_stride(struct si_shader_context *si_shader_ctx)
 279 {
 280         return unpack_param(si_shader_ctx, SI_PARAM_TCS_OUT_LAYOUT, 0, 13);
 281 }
 282
 283 static LLVMValueRef
 284 get_tcs_out_patch0_offset(struct si_shader_context *si_shader_ctx)
 285 {
 286         return lp_build_mul_imm(&si_shader_ctx->radeon_bld.soa.bld_base.uint_bld,
 287                                 unpack_param(si_shader_ctx,
 288                                              SI_PARAM_TCS_OUT_OFFSETS,
 289                                              0, 16),
 290                                 4);
 291 }
 292
 293 static LLVMValueRef
 294 get_tcs_out_patch0_patch_data_offset(struct si_shader_context *si_shader_ctx)
 295 {
 296         return lp_build_mul_imm(&si_shader_ctx->radeon_bld.soa.bld_base.uint_bld,
 297                                 unpack_param(si_shader_ctx,
 298                                              SI_PARAM_TCS_OUT_OFFSETS,
 299                                              16, 16),
 300                                 4);
 301 }
 302
 303 static LLVMValueRef
 304 get_tcs_in_current_patch_offset(struct si_shader_context *si_shader_ctx)
 305 {
 306         struct gallivm_state *gallivm = &si_shader_ctx->radeon_bld.gallivm;
 307         LLVMValueRef patch_stride = get_tcs_in_patch_stride(si_shader_ctx);
 308         LLVMValueRef rel_patch_id = get_rel_patch_id(si_shader_ctx);
 309
 310         return LLVMBuildMul(gallivm->builder, patch_stride, rel_patch_id, "");
 311 }
 312
 313 static LLVMValueRef
 314 get_tcs_out_current_patch_offset(struct si_shader_context *si_shader_ctx)
 315 {
 316         struct gallivm_state *gallivm = &si_shader_ctx->radeon_bld.gallivm;
 317         LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(si_shader_ctx);
 318         LLVMValueRef patch_stride = get_tcs_out_patch_stride(si_shader_ctx);
 319         LLVMValueRef rel_patch_id = get_rel_patch_id(si_shader_ctx);
 320
 321         return LLVMBuildAdd(gallivm->builder, patch0_offset,
 322                             LLVMBuildMul(gallivm->builder, patch_stride,
 323                                          rel_patch_id, ""),
 324                             "");
 325 }
 326
 327 static LLVMValueRef
 328 get_tcs_out_current_patch_data_offset(struct si_shader_context *si_shader_ctx)
 329 {
 330         struct gallivm_state *gallivm = &si_shader_ctx->radeon_bld.gallivm;
 331         LLVMValueRef patch0_patch_data_offset =
 332                 get_tcs_out_patch0_patch_data_offset(si_shader_ctx);
 333         LLVMValueRef patch_stride = get_tcs_out_patch_stride(si_shader_ctx);
 334         LLVMValueRef rel_patch_id = get_rel_patch_id(si_shader_ctx);
 335
 336         return LLVMBuildAdd(gallivm->builder, patch0_patch_data_offset,
 337                             LLVMBuildMul(gallivm->builder, patch_stride,
 338                                          rel_patch_id, ""),
 339                             "");
 340 }
 341
 342 static void build_indexed_store(struct si_shader_context *si_shader_ctx,
 343                                 LLVMValueRef base_ptr, LLVMValueRef index,
 344                                 LLVMValueRef value)
 345 {
 346         struct lp_build_tgsi_context *bld_base = &si_shader_ctx->radeon_bld.soa.bld_base;
 347         struct gallivm_state *gallivm = bld_base->base.gallivm;
 348         LLVMValueRef indices[2], pointer;
 349
 350         indices[0] = bld_base->uint_bld.zero;
 351         indices[1] = index;
 352
 353         pointer = LLVMBuildGEP(gallivm->builder, base_ptr, indices, 2, "");
 354         LLVMBuildStore(gallivm->builder, value, pointer);
 355 }
 356
 357 /**
 358  * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad.
 359  * It's equivalent to doing a load from &base_ptr[index].
 360  *
 361  * \param base_ptr  Where the array starts.
 362  * \param index     The element index into the array.
 363  */
 364 static LLVMValueRef build_indexed_load(struct si_shader_context *si_shader_ctx,
 365                                        LLVMValueRef base_ptr, LLVMValueRef index)
 366 {
 367         struct lp_build_tgsi_context *bld_base = &si_shader_ctx->radeon_bld.soa.bld_base;
 368         struct gallivm_state *gallivm = bld_base->base.gallivm;
 369         LLVMValueRef indices[2], pointer;
 370
 371         indices[0] = bld_base->uint_bld.zero;
 372         indices[1] = index;
 373
 374         pointer = LLVMBuildGEP(gallivm->builder, base_ptr, indices, 2, "");
 375         return LLVMBuildLoad(gallivm->builder, pointer, "");
 376 }
 377
 378 /**
 379  * Do a load from &base_ptr[index], but also add a flag that it's loading
 380  * a constant.
 381  */
 382 static LLVMValueRef build_indexed_load_const(
 383         struct si_shader_context * si_shader_ctx,
 384         LLVMValueRef base_ptr, LLVMValueRef index)
 385 {
 386         LLVMValueRef result = build_indexed_load(si_shader_ctx, base_ptr, index);
 387         LLVMSetMetadata(result, 1, si_shader_ctx->const_md);
 388         return result;
 389 }
 390
 391 static LLVMValueRef get_instance_index_for_fetch(
 392         struct radeon_llvm_context * radeon_bld,
 393         unsigned divisor)
 394 {
 395         struct si_shader_context *si_shader_ctx =
 396                 si_shader_context(&radeon_bld->soa.bld_base);
 397         struct gallivm_state * gallivm = radeon_bld->soa.bld_base.base.gallivm;
 398
 399         LLVMValueRef result = LLVMGetParam(radeon_bld->main_fn,
 400                                            si_shader_ctx->param_instance_id);
 401
 402         /* The division must be done before START_INSTANCE is added. */
 403         if (divisor > 1)
 404                 result = LLVMBuildUDiv(gallivm->builder, result,
 405                                 lp_build_const_int32(gallivm, divisor), "");
 406
 407         return LLVMBuildAdd(gallivm->builder, result, LLVMGetParam(
 408                         radeon_bld->main_fn, SI_PARAM_START_INSTANCE), "");
 409 }
 410
 411 static void declare_input_vs(
 412         struct radeon_llvm_context *radeon_bld,
 413         unsigned input_index,
 414         const struct tgsi_full_declaration *decl)
 415 {
 416         struct lp_build_context *base = &radeon_bld->soa.bld_base.base;
 417         struct gallivm_state *gallivm = base->gallivm;
 418         struct si_shader_context *si_shader_ctx =
 419                 si_shader_context(&radeon_bld->soa.bld_base);
 420         unsigned divisor = si_shader_ctx->shader->key.vs.instance_divisors[input_index];
 421
 422         unsigned chan;
 423
 424         LLVMValueRef t_list_ptr;
 425         LLVMValueRef t_offset;
 426         LLVMValueRef t_list;
 427         LLVMValueRef attribute_offset;
 428         LLVMValueRef buffer_index;
 429         LLVMValueRef args[3];
 430         LLVMTypeRef vec4_type;
 431         LLVMValueRef input;
 432
 433         /* Load the T list */
 434         t_list_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_VERTEX_BUFFER);
 435
 436         t_offset = lp_build_const_int32(gallivm, input_index);
 437
 438         t_list = build_indexed_load_const(si_shader_ctx, t_list_ptr, t_offset);
 439
 440         /* Build the attribute offset */
 441         attribute_offset = lp_build_const_int32(gallivm, 0);
 442
 443         if (divisor) {
 444                 /* Build index from instance ID, start instance and divisor */
 445                 si_shader_ctx->shader->uses_instanceid = true;
 446                 buffer_index = get_instance_index_for_fetch(&si_shader_ctx->radeon_bld, divisor);
 447         } else {
 448                 /* Load the buffer index for vertices. */
 449                 LLVMValueRef vertex_id = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
 450                                                       si_shader_ctx->param_vertex_id);
 451                 LLVMValueRef base_vertex = LLVMGetParam(radeon_bld->main_fn,
 452                                                         SI_PARAM_BASE_VERTEX);
 453                 buffer_index = LLVMBuildAdd(gallivm->builder, base_vertex, vertex_id, "");
 454         }
 455
 456         vec4_type = LLVMVectorType(base->elem_type, 4);
 457         args[0] = t_list;
 458         args[1] = attribute_offset;
 459         args[2] = buffer_index;
 460         input = lp_build_intrinsic(gallivm->builder,
 461                 "llvm.SI.vs.load.input", vec4_type, args, 3,
 462                 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
 463
 464         /* Break up the vec4 into individual components */
 465         for (chan = 0; chan < 4; chan++) {
 466                 LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
 467                 /* XXX: Use a helper function for this.  There is one in
 468                  * tgsi_llvm.c. */
 469                 si_shader_ctx->radeon_bld.inputs[radeon_llvm_reg_index_soa(input_index, chan)] =
 470                                 LLVMBuildExtractElement(gallivm->builder,
 471                                 input, llvm_chan, "");
 472         }
 473 }
 474
 475 static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base,
 476                                      unsigned swizzle)
 477 {
 478         struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
 479
 480         if (swizzle > 0)
 481                 return bld_base->uint_bld.zero;
 482
 483         switch (si_shader_ctx->type) {
 484         case TGSI_PROCESSOR_TESS_CTRL:
 485                 return LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
 486                                     SI_PARAM_PATCH_ID);
 487         case TGSI_PROCESSOR_TESS_EVAL:
 488                 return LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
 489                                     si_shader_ctx->param_tes_patch_id);
 490         case TGSI_PROCESSOR_GEOMETRY:
 491                 return LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
 492                                     SI_PARAM_PRIMITIVE_ID);
 493         default:
 494                 assert(0);
 495                 return bld_base->uint_bld.zero;
 496         }
 497 }
 498
 499 /**
 500  * Return the value of tgsi_ind_register for indexing.
 501  * This is the indirect index with the constant offset added to it.
 502  */
 503 static LLVMValueRef get_indirect_index(struct si_shader_context *si_shader_ctx,
 504                                        const struct tgsi_ind_register *ind,
 505                                        int rel_index)
 506 {
 507         struct gallivm_state *gallivm = si_shader_ctx->radeon_bld.soa.bld_base.base.gallivm;
 508         LLVMValueRef result;
 509
 510         result = si_shader_ctx->radeon_bld.soa.addr[ind->Index][ind->Swizzle];
 511         result = LLVMBuildLoad(gallivm->builder, result, "");
 512         result = LLVMBuildAdd(gallivm->builder, result,
 513                               lp_build_const_int32(gallivm, rel_index), "");
 514         return result;
 515 }
 516
 517 /**
 518  * Calculate a dword address given an input or output register and a stride.
 519  */
 520 static LLVMValueRef get_dw_address(struct si_shader_context *si_shader_ctx,
 521                                    const struct tgsi_full_dst_register *dst,
 522                                    const struct tgsi_full_src_register *src,
 523                                    LLVMValueRef vertex_dw_stride,
 524                                    LLVMValueRef base_addr)
 525 {
 526         struct gallivm_state *gallivm = si_shader_ctx->radeon_bld.soa.bld_base.base.gallivm;
 527         struct tgsi_shader_info *info = &si_shader_ctx->shader->selector->info;
 528         ubyte *name, *index, *array_first;
 529         int first, param;
 530         struct tgsi_full_dst_register reg;
 531
 532         /* Set the register description. The address computation is the same
 533          * for sources and destinations. */
 534         if (src) {
 535                 reg.Register.File = src->Register.File;
 536                 reg.Register.Index = src->Register.Index;
 537                 reg.Register.Indirect = src->Register.Indirect;
 538                 reg.Register.Dimension = src->Register.Dimension;
 539                 reg.Indirect = src->Indirect;
 540                 reg.Dimension = src->Dimension;
 541                 reg.DimIndirect = src->DimIndirect;
 542         } else
 543                 reg = *dst;
 544
 545         /* If the register is 2-dimensional (e.g. an array of vertices
 546          * in a primitive), calculate the base address of the vertex. */
 547         if (reg.Register.Dimension) {
 548                 LLVMValueRef index;
 549
 550                 if (reg.Dimension.Indirect)
 551                         index = get_indirect_index(si_shader_ctx, &reg.DimIndirect,
 552                                                    reg.Dimension.Index);
 553                 else
 554                         index = lp_build_const_int32(gallivm, reg.Dimension.Index);
 555
 556                 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
 557                                          LLVMBuildMul(gallivm->builder, index,
 558                                                       vertex_dw_stride, ""), "");
 559         }
 560
 561         /* Get information about the register. */
 562         if (reg.Register.File == TGSI_FILE_INPUT) {
 563                 name = info->input_semantic_name;
 564                 index = info->input_semantic_index;
 565                 array_first = info->input_array_first;
 566         } else if (reg.Register.File == TGSI_FILE_OUTPUT) {
 567                 name = info->output_semantic_name;
 568                 index = info->output_semantic_index;
 569                 array_first = info->output_array_first;
 570         } else {
 571                 assert(0);
 572                 return NULL;
 573         }
 574
 575         if (reg.Register.Indirect) {
 576                 /* Add the relative address of the element. */
 577                 LLVMValueRef ind_index;
 578
 579                 if (reg.Indirect.ArrayID)
 580                         first = array_first[reg.Indirect.ArrayID];
 581                 else
 582                         first = reg.Register.Index;
 583
 584                 ind_index = get_indirect_index(si_shader_ctx, &reg.Indirect,
 585                                            reg.Register.Index - first);
 586
 587                 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
 588                                     LLVMBuildMul(gallivm->builder, ind_index,
 589                                                  lp_build_const_int32(gallivm, 4), ""), "");
 590
 591                 param = si_shader_io_get_unique_index(name[first], index[first]);
 592         } else {
 593                 param = si_shader_io_get_unique_index(name[reg.Register.Index],
 594                                                       index[reg.Register.Index]);
 595         }
 596
 597         /* Add the base address of the element. */
 598         return LLVMBuildAdd(gallivm->builder, base_addr,
 599                             lp_build_const_int32(gallivm, param * 4), "");
 600 }
 601
 602 /**
 603  * Load from LDS.
 604  *
 605  * \param type          output value type
 606  * \param swizzle       offset (typically 0..3); it can be ~0, which loads a vec4
 607  * \param dw_addr       address in dwords
 608  */
 609 static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
 610                              enum tgsi_opcode_type type, unsigned swizzle,
 611                              LLVMValueRef dw_addr)
 612 {
 613         struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
 614         struct gallivm_state *gallivm = bld_base->base.gallivm;
 615         LLVMValueRef value;
 616
 617         if (swizzle == ~0) {
 618                 LLVMValueRef values[TGSI_NUM_CHANNELS];
 619
 620                 for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++)
 621                         values[chan] = lds_load(bld_base, type, chan, dw_addr);
 622
 623                 return lp_build_gather_values(bld_base->base.gallivm, values,
 624                                               TGSI_NUM_CHANNELS);
 625         }
 626
 627         dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
 628                             lp_build_const_int32(gallivm, swizzle));
 629
 630         value = build_indexed_load(si_shader_ctx, si_shader_ctx->lds, dw_addr);
 631         return LLVMBuildBitCast(gallivm->builder, value,
 632                                 tgsi2llvmtype(bld_base, type), "");
 633 }
 634
 635 /**
 636  * Store to LDS.
 637  *
 638  * \param swizzle       offset (typically 0..3)
 639  * \param dw_addr       address in dwords
 640  * \param value         value to store
 641  */
 642 static void lds_store(struct lp_build_tgsi_context * bld_base,
 643                       unsigned swizzle, LLVMValueRef dw_addr,
 644                       LLVMValueRef value)
 645 {
 646         struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
 647         struct gallivm_state *gallivm = bld_base->base.gallivm;
 648
 649         dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
 650                             lp_build_const_int32(gallivm, swizzle));
 651
 652         value = LLVMBuildBitCast(gallivm->builder, value,
 653                                  LLVMInt32TypeInContext(gallivm->context), "");
 654         build_indexed_store(si_shader_ctx, si_shader_ctx->lds,
 655                             dw_addr, value);
 656 }
 657
 658 static LLVMValueRef fetch_input_tcs(
 659         struct lp_build_tgsi_context *bld_base,
 660         const struct tgsi_full_src_register *reg,
 661         enum tgsi_opcode_type type, unsigned swizzle)
 662 {
 663         struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
 664         LLVMValueRef dw_addr, stride;
 665
 666         stride = unpack_param(si_shader_ctx, SI_PARAM_TCS_IN_LAYOUT, 13, 8);
 667         dw_addr = get_tcs_in_current_patch_offset(si_shader_ctx);
 668         dw_addr = get_dw_address(si_shader_ctx, NULL, reg, stride, dw_addr);
 669
 670         return lds_load(bld_base, type, swizzle, dw_addr);
 671 }
 672
 673 static LLVMValueRef fetch_output_tcs(
 674                 struct lp_build_tgsi_context *bld_base,
 675                 const struct tgsi_full_src_register *reg,
 676                 enum tgsi_opcode_type type, unsigned swizzle)
 677 {
 678         struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
 679         struct si_shader *shader = si_shader_ctx->shader;
 680         struct tgsi_shader_info *info = &shader->selector->info;
 681         unsigned name = info->output_semantic_name[reg->Register.Index];
 682         LLVMValueRef dw_addr, stride;
 683
 684         /* Just read the local temp "output" register to get TESSOUTER/INNER. */
 685         if (!reg->Register.Indirect &&
 686             (name == TGSI_SEMANTIC_TESSOUTER ||
 687              name == TGSI_SEMANTIC_TESSINNER)) {
 688                 return radeon_llvm_emit_fetch(bld_base, reg, type, swizzle);
 689         }
 690
 691         if (reg->Register.Dimension) {
 692                 stride = unpack_param(si_shader_ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
 693                 dw_addr = get_tcs_out_current_patch_offset(si_shader_ctx);
 694                 dw_addr = get_dw_address(si_shader_ctx, NULL, reg, stride, dw_addr);
 695         } else {
 696                 dw_addr = get_tcs_out_current_patch_data_offset(si_shader_ctx);
 697                 dw_addr = get_dw_address(si_shader_ctx, NULL, reg, NULL, dw_addr);
 698         }
 699
 700         return lds_load(bld_base, type, swizzle, dw_addr);
 701 }
 702
 703 static LLVMValueRef fetch_input_tes(
 704         struct lp_build_tgsi_context *bld_base,
 705         const struct tgsi_full_src_register *reg,
 706         enum tgsi_opcode_type type, unsigned swizzle)
 707 {
 708         struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
 709         LLVMValueRef dw_addr, stride;
 710
 711         if (reg->Register.Dimension) {
 712                 stride = unpack_param(si_shader_ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
 713                 dw_addr = get_tcs_out_current_patch_offset(si_shader_ctx);
 714                 dw_addr = get_dw_address(si_shader_ctx, NULL, reg, stride, dw_addr);
 715         } else {
 716                 dw_addr = get_tcs_out_current_patch_data_offset(si_shader_ctx);
 717                 dw_addr = get_dw_address(si_shader_ctx, NULL, reg, NULL, dw_addr);
 718         }
 719
 720         return lds_load(bld_base, type, swizzle, dw_addr);
 721 }
 722
 723 static void store_output_tcs(struct lp_build_tgsi_context * bld_base,
 724                              const struct tgsi_full_instruction * inst,
 725                              const struct tgsi_opcode_info * info,
 726                              LLVMValueRef dst[4])
 727 {
 728         struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
 729         struct si_shader *shader = si_shader_ctx->shader;
 730         struct tgsi_shader_info *sinfo = &shader->selector->info;
 731         const struct tgsi_full_dst_register *reg = &inst->Dst[0];
 732         unsigned chan_index;
 733         LLVMValueRef dw_addr, stride;
 734
 735         /* Only handle per-patch and per-vertex outputs here.
 736          * Vectors will be lowered to scalars and this function will be called again.
 737          */
 738         if (reg->Register.File != TGSI_FILE_OUTPUT ||
 739             (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) {
 740                 radeon_llvm_emit_store(bld_base, inst, info, dst);
 741                 return;
 742         }
 743
 744         /* Write tessellation levels to "output" temp registers.
 745          * Also write them to LDS as per-patch outputs (below).
 746          */
 747         if (!reg->Register.Indirect &&
 748             (sinfo->output_semantic_name[reg->Register.Index] == TGSI_SEMANTIC_TESSINNER ||
 749              sinfo->output_semantic_name[reg->Register.Index] == TGSI_SEMANTIC_TESSOUTER))
 750                 radeon_llvm_emit_store(bld_base, inst, info, dst);
 751
 752         if (reg->Register.Dimension) {
 753                 stride = unpack_param(si_shader_ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
 754                 dw_addr = get_tcs_out_current_patch_offset(si_shader_ctx);
 755                 dw_addr = get_dw_address(si_shader_ctx, reg, NULL, stride, dw_addr);
 756         } else {
 757                 dw_addr = get_tcs_out_current_patch_data_offset(si_shader_ctx);
 758                 dw_addr = get_dw_address(si_shader_ctx, reg, NULL, NULL, dw_addr);
 759         }
 760
 761         TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan_index) {
 762                 LLVMValueRef value = dst[chan_index];
 763
 764                 if (inst->Instruction.Saturate)
 765                         value = radeon_llvm_saturate(bld_base, value);
 766
 767                 lds_store(bld_base, chan_index, dw_addr, value);
 768         }
 769 }
 770
 771 static LLVMValueRef fetch_input_gs(
 772         struct lp_build_tgsi_context *bld_base,
 773         const struct tgsi_full_src_register *reg,
 774         enum tgsi_opcode_type type,
 775         unsigned swizzle)
 776 {
 777         struct lp_build_context *base = &bld_base->base;
 778         struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
 779         struct si_shader *shader = si_shader_ctx->shader;
 780         struct lp_build_context *uint = &si_shader_ctx->radeon_bld.soa.bld_base.uint_bld;
 781         struct gallivm_state *gallivm = base->gallivm;
 782         LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context);
 783         LLVMValueRef vtx_offset;
 784         LLVMValueRef args[9];
 785         unsigned vtx_offset_param;
 786         struct tgsi_shader_info *info = &shader->selector->info;
 787         unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
 788         unsigned semantic_index = info->input_semantic_index[reg->Register.Index];
 789
 790         if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
 791                 return get_primitive_id(bld_base, swizzle);
 792
 793         if (!reg->Register.Dimension)
 794                 return NULL;
 795
 796         if (swizzle == ~0) {
 797                 LLVMValueRef values[TGSI_NUM_CHANNELS];
 798                 unsigned chan;
 799                 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
 800                         values[chan] = fetch_input_gs(bld_base, reg, type, chan);
 801                 }
 802                 return lp_build_gather_values(bld_base->base.gallivm, values,
 803                                               TGSI_NUM_CHANNELS);
 804         }
 805
 806         /* Get the vertex offset parameter */
 807         vtx_offset_param = reg->Dimension.Index;
 808         if (vtx_offset_param < 2) {
 809                 vtx_offset_param += SI_PARAM_VTX0_OFFSET;
 810         } else {
 811                 assert(vtx_offset_param < 6);
 812                 vtx_offset_param += SI_PARAM_VTX2_OFFSET - 2;
 813         }
 814         vtx_offset = lp_build_mul_imm(uint,
 815                                       LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
 816                                                    vtx_offset_param),
 817                                       4);
 818
 819         args[0] = si_shader_ctx->esgs_ring;
 820         args[1] = vtx_offset;
 821         args[2] = lp_build_const_int32(gallivm,
 822                                        (get_param_index(semantic_name, semantic_index,
 823                                                         shader->selector->inputs_read) * 4 +
 824                                         swizzle) * 256);
 825         args[3] = uint->zero;
 826         args[4] = uint->one;  /* OFFEN */
 827         args[5] = uint->zero; /* IDXEN */
 828         args[6] = uint->one;  /* GLC */
 829         args[7] = uint->zero; /* SLC */
 830         args[8] = uint->zero; /* TFE */
 831
 832         return LLVMBuildBitCast(gallivm->builder,
 833                                 lp_build_intrinsic(gallivm->builder,
 834                                                 "llvm.SI.buffer.load.dword.i32.i32",
 835                                                 i32, args, 9,
 836                                                 LLVMReadOnlyAttribute | LLVMNoUnwindAttribute),
 837                                 tgsi2llvmtype(bld_base, type), "");
 838 }
 839
 840 static int lookup_interp_param_index(unsigned interpolate, unsigned location)
 841 {
 842         switch (interpolate) {
 843         case TGSI_INTERPOLATE_CONSTANT:
 844                 return 0;
 845
 846         case TGSI_INTERPOLATE_LINEAR:
 847                 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
 848                         return SI_PARAM_LINEAR_SAMPLE;
 849                 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
 850                         return SI_PARAM_LINEAR_CENTROID;
 851                 else
 852                         return SI_PARAM_LINEAR_CENTER;
 853                 break;
 854         case TGSI_INTERPOLATE_COLOR:
 855         case TGSI_INTERPOLATE_PERSPECTIVE:
 856                 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
 857                         return SI_PARAM_PERSP_SAMPLE;
 858                 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
 859                         return SI_PARAM_PERSP_CENTROID;
 860                 else
 861                         return SI_PARAM_PERSP_CENTER;
 862                 break;
 863         default:
 864                 fprintf(stderr, "Warning: Unhandled interpolation mode.\n");
 865                 return -1;
 866         }
 867 }
 868
/**
 * Declare one fragment shader input and fill the four channel slots of
 * radeon_bld->inputs[] that belong to it.
 *
 * - POSITION is taken from the SI_PARAM_POS_X_FLOAT.. parameters, with the
 *   W channel replaced by its reciprocal.
 * - FACE is (SI_PARAM_FRONT_FACE, 0, 0, 1).
 * - Every other input takes the next parameter slot (shader->nparam) and is
 *   fetched with llvm.SI.fs.interp, or with llvm.SI.fs.constant when the
 *   interpolation mode is constant.
 *
 * \param radeon_bld   radeon LLVM build context
 * \param input_index  index of the input register being declared
 * \param decl         TGSI declaration supplying semantic and interpolation
 */
static void declare_input_fs(
        struct radeon_llvm_context *radeon_bld,
        unsigned input_index,
        const struct tgsi_full_declaration *decl)
{
        struct lp_build_context *base = &radeon_bld->soa.bld_base.base;
        struct si_shader_context *si_shader_ctx =
                si_shader_context(&radeon_bld->soa.bld_base);
        struct si_shader *shader = si_shader_ctx->shader;
        struct lp_build_context *uint = &radeon_bld->soa.bld_base.uint_bld;
        struct gallivm_state *gallivm = base->gallivm;
        LLVMTypeRef input_type = LLVMFloatTypeInContext(gallivm->context);
        LLVMValueRef main_fn = radeon_bld->main_fn;

        LLVMValueRef interp_param = NULL;
        int interp_param_idx;
        const char * intr_name;

        /* This value is:
         * [15:0] NewPrimMask (Bit mask for each quad.  It is set if the
         *                     quad begins a new primitive.  Bit 0 always needs
         *                     to be unset)
         * [31:16] ParamOffset
         *
         */
        LLVMValueRef params = LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK);
        LLVMValueRef attr_number;

        unsigned chan;

        /* The position comes directly from shader parameters, no
         * interpolation intrinsic is needed. */
        if (decl->Semantic.Name == TGSI_SEMANTIC_POSITION) {
                for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
                        unsigned soa_index =
                                radeon_llvm_reg_index_soa(input_index, chan);
                        radeon_bld->inputs[soa_index] =
                                LLVMGetParam(main_fn, SI_PARAM_POS_X_FLOAT + chan);

                        if (chan == 3)
                                /* RCP for fragcoord.w */
                                radeon_bld->inputs[soa_index] =
                                        LLVMBuildFDiv(gallivm->builder,
                                                      lp_build_const_float(gallivm, 1.0f),
                                                      radeon_bld->inputs[soa_index],
                                                      "");
                }
                return;
        }

        /* The face input is (front_face, 0, 0, 1). */
        if (decl->Semantic.Name == TGSI_SEMANTIC_FACE) {
                radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)] =
                        LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE);
                radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 1)] =
                radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 2)] =
                        lp_build_const_float(gallivm, 0.0f);
                radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 3)] =
                        lp_build_const_float(gallivm, 1.0f);

                return;
        }

        /* Assign the next free parameter slot to this input. */
        shader->ps_input_param_offset[input_index] = shader->nparam++;
        attr_number = lp_build_const_int32(gallivm,
                                           shader->ps_input_param_offset[input_index]);

        shader->ps_input_interpolate[input_index] = decl->Interp.Interpolate;
        interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate,
                                                     decl->Interp.Location);
        /* -1: unhandled interpolation mode, the input is left undeclared.
         *  0: constant interpolation, interp_param stays NULL.
         * >0: index of the parameter holding the interpolation coefficients.
         */
        if (interp_param_idx == -1)
                return;
        else if (interp_param_idx)
                interp_param = LLVMGetParam(main_fn, interp_param_idx);

        /* fs.constant returns the param from the middle vertex, so it's not
         * really useful for flat shading. It's meant to be used for custom
         * interpolation (but the intrinsic can't fetch from the other two
         * vertices).
         *
         * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
         * to do the right thing. The only reason we use fs.constant is that
         * fs.interp cannot be used on integers, because they can be equal
         * to NaN.
         */
        intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant";

        if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR &&
            si_shader_ctx->shader->key.ps.color_two_side) {
                /* Two-sided color: fetch the front color from this slot and
                 * the back color from the following slot, then select per
                 * channel depending on whether the face value is > 0. */
                LLVMValueRef args[4];
                LLVMValueRef face, is_face_positive;
                LLVMValueRef back_attr_number =
                        lp_build_const_int32(gallivm,
                                             shader->ps_input_param_offset[input_index] + 1);

                face = LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE);

                is_face_positive = LLVMBuildFCmp(gallivm->builder,
                                                 LLVMRealOGT, face,
                                                 lp_build_const_float(gallivm, 0.0f),
                                                 "");

                args[2] = params;
                args[3] = interp_param;
                for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
                        LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
                        unsigned soa_index = radeon_llvm_reg_index_soa(input_index, chan);
                        LLVMValueRef front, back;

                        /* Only 3 arguments are passed when interp_param is
                         * NULL, i.e. when fs.constant is used. */
                        args[0] = llvm_chan;
                        args[1] = attr_number;
                        front = lp_build_intrinsic(gallivm->builder, intr_name,
                                                input_type, args, args[3] ? 4 : 3,
                                                LLVMReadNoneAttribute | LLVMNoUnwindAttribute);

                        args[1] = back_attr_number;
                        back = lp_build_intrinsic(gallivm->builder, intr_name,
                                               input_type, args, args[3] ? 4 : 3,
                                               LLVMReadNoneAttribute | LLVMNoUnwindAttribute);

                        radeon_bld->inputs[soa_index] =
                                LLVMBuildSelect(gallivm->builder,
                                                is_face_positive,
                                                front,
                                                back,
                                                "");
                }

                /* The back color consumed one more parameter slot. */
                shader->nparam++;
        } else if (decl->Semantic.Name == TGSI_SEMANTIC_FOG) {
                /* Fog: only the X channel is fetched, the rest is (0, 0, 1). */
                LLVMValueRef args[4];

                args[0] = uint->zero;
                args[1] = attr_number;
                args[2] = params;
                args[3] = interp_param;
                radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)] =
                        lp_build_intrinsic(gallivm->builder, intr_name,
                                        input_type, args, args[3] ? 4 : 3,
                                        LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
                radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 1)] =
                radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 2)] =
                        lp_build_const_float(gallivm, 0.0f);
                radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 3)] =
                        lp_build_const_float(gallivm, 1.0f);
        } else {
                /* Generic input: fetch all four channels. */
                for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
                        LLVMValueRef args[4];
                        LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
                        unsigned soa_index = radeon_llvm_reg_index_soa(input_index, chan);
                        args[0] = llvm_chan;
                        args[1] = attr_number;
                        args[2] = params;
                        args[3] = interp_param;
                        radeon_bld->inputs[soa_index] =
                                lp_build_intrinsic(gallivm->builder, intr_name,
                                                input_type, args, args[3] ? 4 : 3,
                                                LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
                }
        }
}
1027
1028 static LLVMValueRef get_sample_id(struct radeon_llvm_context *radeon_bld)
1029 {
1030         return unpack_param(si_shader_context(&radeon_bld->soa.bld_base),
1031                             SI_PARAM_ANCILLARY, 8, 4);
1032 }
1033
1034 /**
1035  * Load a dword from a constant buffer.
1036  */
1037 static LLVMValueRef buffer_load_const(LLVMBuilderRef builder, LLVMValueRef resource,
1038                                       LLVMValueRef offset, LLVMTypeRef return_type)
1039 {
1040         LLVMValueRef args[2] = {resource, offset};
1041
1042         return lp_build_intrinsic(builder, "llvm.SI.load.const", return_type, args, 2,
1043                                LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
1044 }
1045
1046 static LLVMValueRef load_sample_position(struct radeon_llvm_context *radeon_bld, LLVMValueRef sample_id)
1047 {
1048         struct si_shader_context *si_shader_ctx =
1049                 si_shader_context(&radeon_bld->soa.bld_base);
1050         struct lp_build_context *uint_bld = &radeon_bld->soa.bld_base.uint_bld;
1051         struct gallivm_state *gallivm = &radeon_bld->gallivm;
1052         LLVMBuilderRef builder = gallivm->builder;
1053         LLVMValueRef desc = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST);
1054         LLVMValueRef buf_index = lp_build_const_int32(gallivm, SI_DRIVER_STATE_CONST_BUF);
1055         LLVMValueRef resource = build_indexed_load_const(si_shader_ctx, desc, buf_index);
1056
1057         /* offset = sample_id * 8  (8 = 2 floats containing samplepos.xy) */
1058         LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, sample_id, 8);
1059         LLVMValueRef offset1 = LLVMBuildAdd(builder, offset0, lp_build_const_int32(gallivm, 4), "");
1060
1061         LLVMValueRef pos[4] = {
1062                 buffer_load_const(builder, resource, offset0, radeon_bld->soa.bld_base.base.elem_type),
1063                 buffer_load_const(builder, resource, offset1, radeon_bld->soa.bld_base.base.elem_type),
1064                 lp_build_const_float(gallivm, 0),
1065                 lp_build_const_float(gallivm, 0)
1066         };
1067
1068         return lp_build_gather_values(gallivm, pos, 4);
1069 }
1070
1071 static void declare_system_value(
1072         struct radeon_llvm_context * radeon_bld,
1073         unsigned index,
1074         const struct tgsi_full_declaration *decl)
1075 {
1076         struct si_shader_context *si_shader_ctx =
1077                 si_shader_context(&radeon_bld->soa.bld_base);
1078         struct lp_build_context *bld = &radeon_bld->soa.bld_base.base;
1079         struct lp_build_context *uint_bld = &radeon_bld->soa.bld_base.uint_bld;
1080         struct gallivm_state *gallivm = &radeon_bld->gallivm;
1081         LLVMValueRef value = 0;
1082
1083         switch (decl->Semantic.Name) {
1084         case TGSI_SEMANTIC_INSTANCEID:
1085                 value = LLVMGetParam(radeon_bld->main_fn,
1086                                      si_shader_ctx->param_instance_id);
1087                 break;
1088
1089         case TGSI_SEMANTIC_VERTEXID:
1090                 value = LLVMBuildAdd(gallivm->builder,
1091                                      LLVMGetParam(radeon_bld->main_fn,
1092                                                   si_shader_ctx->param_vertex_id),
1093                                      LLVMGetParam(radeon_bld->main_fn,
1094                                                   SI_PARAM_BASE_VERTEX), "");
1095                 break;
1096
1097         case TGSI_SEMANTIC_VERTEXID_NOBASE:
1098                 value = LLVMGetParam(radeon_bld->main_fn,
1099                                      si_shader_ctx->param_vertex_id);
1100                 break;
1101
1102         case TGSI_SEMANTIC_BASEVERTEX:
1103                 value = LLVMGetParam(radeon_bld->main_fn,
1104                                      SI_PARAM_BASE_VERTEX);
1105                 break;
1106
1107         case TGSI_SEMANTIC_INVOCATIONID:
1108                 if (si_shader_ctx->type == TGSI_PROCESSOR_TESS_CTRL)
1109                         value = unpack_param(si_shader_ctx, SI_PARAM_REL_IDS, 8, 5);
1110                 else if (si_shader_ctx->type == TGSI_PROCESSOR_GEOMETRY)
1111                         value = LLVMGetParam(radeon_bld->main_fn,
1112                                              SI_PARAM_GS_INSTANCE_ID);
1113                 else
1114                         assert(!"INVOCATIONID not implemented");
1115                 break;
1116
1117         case TGSI_SEMANTIC_SAMPLEID:
1118                 value = get_sample_id(radeon_bld);
1119                 break;
1120
1121         case TGSI_SEMANTIC_SAMPLEPOS:
1122                 value = load_sample_position(radeon_bld, get_sample_id(radeon_bld));
1123                 break;
1124
1125         case TGSI_SEMANTIC_SAMPLEMASK:
1126                 /* Smoothing isn't MSAA in GL, but it's MSAA in hardware.
1127                  * Therefore, force gl_SampleMaskIn to 1 for GL. */
1128                 if (si_shader_ctx->shader->key.ps.poly_line_smoothing)
1129                         value = uint_bld->one;
1130                 else
1131                         value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_SAMPLE_COVERAGE);
1132                 break;
1133
1134         case TGSI_SEMANTIC_TESSCOORD:
1135         {
1136                 LLVMValueRef coord[4] = {
1137                         LLVMGetParam(radeon_bld->main_fn, si_shader_ctx->param_tes_u),
1138                         LLVMGetParam(radeon_bld->main_fn, si_shader_ctx->param_tes_v),
1139                         bld->zero,
1140                         bld->zero
1141                 };
1142
1143                 /* For triangles, the vector should be (u, v, 1-u-v). */
1144                 if (si_shader_ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
1145                     PIPE_PRIM_TRIANGLES)
1146                         coord[2] = lp_build_sub(bld, bld->one,
1147                                                 lp_build_add(bld, coord[0], coord[1]));
1148
1149                 value = lp_build_gather_values(gallivm, coord, 4);
1150                 break;
1151         }
1152
1153         case TGSI_SEMANTIC_VERTICESIN:
1154                 value = unpack_param(si_shader_ctx, SI_PARAM_TCS_OUT_LAYOUT, 26, 6);
1155                 break;
1156
1157         case TGSI_SEMANTIC_TESSINNER:
1158         case TGSI_SEMANTIC_TESSOUTER:
1159         {
1160                 LLVMValueRef dw_addr;
1161                 int param = si_shader_io_get_unique_index(decl->Semantic.Name, 0);
1162
1163                 dw_addr = get_tcs_out_current_patch_data_offset(si_shader_ctx);
1164                 dw_addr = LLVMBuildAdd(gallivm->builder, dw_addr,
1165                                        lp_build_const_int32(gallivm, param * 4), "");
1166
1167                 value = lds_load(&radeon_bld->soa.bld_base, TGSI_TYPE_FLOAT,
1168                                  ~0, dw_addr);
1169                 break;
1170         }
1171
1172         case TGSI_SEMANTIC_PRIMID:
1173                 value = get_primitive_id(&radeon_bld->soa.bld_base, 0);
1174                 break;
1175
1176         default:
1177                 assert(!"unknown system value");
1178                 return;
1179         }
1180
1181         radeon_bld->system_values[index] = value;
1182 }
1183
1184 static LLVMValueRef fetch_constant(
1185         struct lp_build_tgsi_context * bld_base,
1186         const struct tgsi_full_src_register *reg,
1187         enum tgsi_opcode_type type,
1188         unsigned swizzle)
1189 {
1190         struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
1191         struct lp_build_context * base = &bld_base->base;
1192         const struct tgsi_ind_register *ireg = &reg->Indirect;
1193         unsigned buf, idx;
1194
1195         LLVMValueRef addr, bufp;
1196         LLVMValueRef result;
1197
1198         if (swizzle == LP_CHAN_ALL) {
1199                 unsigned chan;
1200                 LLVMValueRef values[4];
1201                 for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
1202                         values[chan] = fetch_constant(bld_base, reg, type, chan);
1203
1204                 return lp_build_gather_values(bld_base->base.gallivm, values, 4);
1205         }
1206
1207         buf = reg->Register.Dimension ? reg->Dimension.Index : 0;
1208         idx = reg->Register.Index * 4 + swizzle;
1209
1210         if (!reg->Register.Indirect && !reg->Dimension.Indirect) {
1211                 if (type != TGSI_TYPE_DOUBLE)
1212                         return bitcast(bld_base, type, si_shader_ctx->constants[buf][idx]);
1213                 else {
1214                         return radeon_llvm_emit_fetch_double(bld_base,
1215                                                              si_shader_ctx->constants[buf][idx],
1216                                                              si_shader_ctx->constants[buf][idx + 1]);
1217                 }
1218         }
1219
1220         if (reg->Register.Dimension && reg->Dimension.Indirect) {
1221                 LLVMValueRef ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST);
1222                 LLVMValueRef index;
1223                 index = get_indirect_index(si_shader_ctx, &reg->DimIndirect,
1224                                                    reg->Dimension.Index);
1225                 bufp = build_indexed_load_const(si_shader_ctx, ptr, index);
1226         } else
1227                 bufp = si_shader_ctx->const_resource[buf];
1228
1229         addr = si_shader_ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle];
1230         addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg");
1231         addr = lp_build_mul_imm(&bld_base->uint_bld, addr, 16);
1232         addr = lp_build_add(&bld_base->uint_bld, addr,
1233                             lp_build_const_int32(base->gallivm, idx * 4));
1234
1235         result = buffer_load_const(base->gallivm->builder, bufp,
1236                                    addr, bld_base->base.elem_type);
1237
1238         if (type != TGSI_TYPE_DOUBLE)
1239                 result = bitcast(bld_base, type, result);
1240         else {
1241                 LLVMValueRef addr2, result2;
1242                 addr2 = si_shader_ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle + 1];
1243                 addr2 = LLVMBuildLoad(base->gallivm->builder, addr2, "load addr reg2");
1244                 addr2 = lp_build_mul_imm(&bld_base->uint_bld, addr2, 16);
1245                 addr2 = lp_build_add(&bld_base->uint_bld, addr2,
1246                                      lp_build_const_int32(base->gallivm, idx * 4));
1247
1248                 result2 = buffer_load_const(base->gallivm->builder, si_shader_ctx->const_resource[buf],
1249                                    addr2, bld_base->base.elem_type);
1250
1251                 result = radeon_llvm_emit_fetch_double(bld_base,
1252                                                        result, result2);
1253         }
1254         return result;
1255 }
1256
1257 /* Initialize arguments for the shader export intrinsic */
1258 static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
1259                                      LLVMValueRef *values,
1260                                      unsigned target,
1261                                      LLVMValueRef *args)
1262 {
1263         struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
1264         struct lp_build_context *uint =
1265                                 &si_shader_ctx->radeon_bld.soa.bld_base.uint_bld;
1266         struct lp_build_context *base = &bld_base->base;
1267         unsigned compressed = 0;
1268         unsigned chan;
1269
1270         if (si_shader_ctx->type == TGSI_PROCESSOR_FRAGMENT) {
1271                 int cbuf = target - V_008DFC_SQ_EXP_MRT;
1272
1273                 if (cbuf >= 0 && cbuf < 8) {
1274                         compressed = (si_shader_ctx->shader->key.ps.export_16bpc >> cbuf) & 0x1;
1275
1276                         if (compressed)
1277                                 si_shader_ctx->shader->spi_shader_col_format |=
1278                                         V_028714_SPI_SHADER_FP16_ABGR << (4 * cbuf);
1279                         else
1280                                 si_shader_ctx->shader->spi_shader_col_format |=
1281                                         V_028714_SPI_SHADER_32_ABGR << (4 * cbuf);
1282
1283                         si_shader_ctx->shader->cb_shader_mask |= 0xf << (4 * cbuf);
1284                 }
1285         }
1286
1287         if (compressed) {
1288                 /* Pixel shader needs to pack output values before export */
1289                 for (chan = 0; chan < 2; chan++ ) {
1290                         args[0] = values[2 * chan];
1291                         args[1] = values[2 * chan + 1];
1292                         args[chan + 5] =
1293                                 lp_build_intrinsic(base->gallivm->builder,
1294                                                 "llvm.SI.packf16",
1295                                                 LLVMInt32TypeInContext(base->gallivm->context),
1296                                                 args, 2,
1297                                                 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
1298                         args[chan + 7] = args[chan + 5] =
1299                                 LLVMBuildBitCast(base->gallivm->builder,
1300                                                  args[chan + 5],
1301                                                  LLVMFloatTypeInContext(base->gallivm->context),
1302                                                  "");
1303                 }
1304
1305                 /* Set COMPR flag */
1306                 args[4] = uint->one;
1307         } else {
1308                 for (chan = 0; chan < 4; chan++ )
1309                         /* +5 because the first output value will be
1310                          * the 6th argument to the intrinsic. */
1311                         args[chan + 5] = values[chan];
1312
1313                 /* Clear COMPR flag */
1314                 args[4] = uint->zero;
1315         }
1316
1317         /* XXX: This controls which components of the output
1318          * registers actually get exported. (e.g bit 0 means export
1319          * X component, bit 1 means export Y component, etc.)  I'm
1320          * hard coding this to 0xf for now.  In the future, we might
1321          * want to do something else. */
1322         args[0] = lp_build_const_int32(base->gallivm, 0xf);
1323
1324         /* Specify whether the EXEC mask represents the valid mask */
1325         args[1] = uint->zero;
1326
1327         /* Specify whether this is the last export */
1328         args[2] = uint->zero;
1329
1330         /* Specify the target we are exporting */
1331         args[3] = lp_build_const_int32(base->gallivm, target);
1332
1333         /* XXX: We probably need to keep track of the output
1334          * values, so we know what we are passing to the next
1335          * stage. */
1336 }
1337
1338 /* Load from output pointers and initialize arguments for the shader export intrinsic */
1339 static void si_llvm_init_export_args_load(struct lp_build_tgsi_context *bld_base,
1340                                           LLVMValueRef *out_ptr,
1341                                           unsigned target,
1342                                           LLVMValueRef *args)
1343 {
1344         struct gallivm_state *gallivm = bld_base->base.gallivm;
1345         LLVMValueRef values[4];
1346         int i;
1347
1348         for (i = 0; i < 4; i++)
1349                 values[i] = LLVMBuildLoad(gallivm->builder, out_ptr[i], "");
1350
1351         si_llvm_init_export_args(bld_base, values, target, args);
1352 }
1353
1354 static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
1355                           LLVMValueRef alpha_ptr)
1356 {
1357         struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
1358         struct gallivm_state *gallivm = bld_base->base.gallivm;
1359
1360         if (si_shader_ctx->shader->key.ps.alpha_func != PIPE_FUNC_NEVER) {
1361                 LLVMValueRef alpha_ref = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
1362                                 SI_PARAM_ALPHA_REF);
1363
1364                 LLVMValueRef alpha_pass =
1365                         lp_build_cmp(&bld_base->base,
1366                                      si_shader_ctx->shader->key.ps.alpha_func,
1367                                      LLVMBuildLoad(gallivm->builder, alpha_ptr, ""),
1368                                      alpha_ref);
1369                 LLVMValueRef arg =
1370                         lp_build_select(&bld_base->base,
1371                                         alpha_pass,
1372                                         lp_build_const_float(gallivm, 1.0f),
1373                                         lp_build_const_float(gallivm, -1.0f));
1374
1375                 lp_build_intrinsic(gallivm->builder,
1376                                 "llvm.AMDGPU.kill",
1377                                 LLVMVoidTypeInContext(gallivm->context),
1378                                 &arg, 1, 0);
1379         } else {
1380                 lp_build_intrinsic(gallivm->builder,
1381                                 "llvm.AMDGPU.kilp",
1382                                 LLVMVoidTypeInContext(gallivm->context),
1383                                 NULL, 0, 0);
1384         }
1385
1386         si_shader_ctx->shader->db_shader_control |= S_02880C_KILL_ENABLE(1);
1387 }
1388
1389 static void si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
1390                                           LLVMValueRef alpha_ptr)
1391 {
1392         struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
1393         struct gallivm_state *gallivm = bld_base->base.gallivm;
1394         LLVMValueRef coverage, alpha;
1395
1396         /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
1397         coverage = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
1398                                 SI_PARAM_SAMPLE_COVERAGE);
1399         coverage = bitcast(bld_base, TGSI_TYPE_SIGNED, coverage);
1400
1401         coverage = lp_build_intrinsic(gallivm->builder, "llvm.ctpop.i32",
1402                                    bld_base->int_bld.elem_type,
1403                                    &coverage, 1, LLVMReadNoneAttribute);
1404
1405         coverage = LLVMBuildUIToFP(gallivm->builder, coverage,
1406                                    bld_base->base.elem_type, "");
1407
1408         coverage = LLVMBuildFMul(gallivm->builder, coverage,
1409                                  lp_build_const_float(gallivm,
1410                                         1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
1411
1412         alpha = LLVMBuildLoad(gallivm->builder, alpha_ptr, "");
1413         alpha = LLVMBuildFMul(gallivm->builder, alpha, coverage, "");
1414         LLVMBuildStore(gallivm->builder, alpha, alpha_ptr);
1415 }
1416
1417 static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context * bld_base,
1418                                     LLVMValueRef (*pos)[9], LLVMValueRef *out_elts)
1419 {
1420         struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
1421         struct lp_build_context *base = &bld_base->base;
1422         struct lp_build_context *uint = &si_shader_ctx->radeon_bld.soa.bld_base.uint_bld;
1423         unsigned reg_index;
1424         unsigned chan;
1425         unsigned const_chan;
1426         LLVMValueRef base_elt;
1427         LLVMValueRef ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST);
1428         LLVMValueRef constbuf_index = lp_build_const_int32(base->gallivm, SI_DRIVER_STATE_CONST_BUF);
1429         LLVMValueRef const_resource = build_indexed_load_const(si_shader_ctx, ptr, constbuf_index);
1430
1431         for (reg_index = 0; reg_index < 2; reg_index ++) {
1432                 LLVMValueRef *args = pos[2 + reg_index];
1433
1434                 args[5] =
1435                 args[6] =
1436                 args[7] =
1437                 args[8] = lp_build_const_float(base->gallivm, 0.0f);
1438
1439                 /* Compute dot products of position and user clip plane vectors */
1440                 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1441                         for (const_chan = 0; const_chan < TGSI_NUM_CHANNELS; const_chan++) {
1442                                 args[1] = lp_build_const_int32(base->gallivm,
1443                                                                ((reg_index * 4 + chan) * 4 +
1444                                                                 const_chan) * 4);
1445                                 base_elt = buffer_load_const(base->gallivm->builder, const_resource,
1446                                                       args[1], base->elem_type);
1447                                 args[5 + chan] =
1448                                         lp_build_add(base, args[5 + chan],
1449                                                      lp_build_mul(base, base_elt,
1450                                                                   out_elts[const_chan]));
1451                         }
1452                 }
1453
1454                 args[0] = lp_build_const_int32(base->gallivm, 0xf);
1455                 args[1] = uint->zero;
1456                 args[2] = uint->zero;
1457                 args[3] = lp_build_const_int32(base->gallivm,
1458                                                V_008DFC_SQ_EXP_POS + 2 + reg_index);
1459                 args[4] = uint->zero;
1460         }
1461 }
1462
1463 static void si_dump_streamout(struct pipe_stream_output_info *so)
1464 {
1465         unsigned i;
1466
1467         if (so->num_outputs)
1468                 fprintf(stderr, "STREAMOUT\n");
1469
1470         for (i = 0; i < so->num_outputs; i++) {
1471                 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
1472                                 so->output[i].start_component;
1473                 fprintf(stderr, "  %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
1474                         i, so->output[i].output_buffer,
1475                         so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
1476                         so->output[i].register_index,
1477                         mask & 1 ? "x" : "",
1478                         mask & 2 ? "y" : "",
1479                         mask & 4 ? "z" : "",
1480                         mask & 8 ? "w" : "");
1481         }
1482 }
1483
1484 /* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4.
1485  * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2),
1486  * or v4i32 (num_channels=3,4). */
1487 static void build_tbuffer_store(struct si_shader_context *shader,
1488                                 LLVMValueRef rsrc,
1489                                 LLVMValueRef vdata,
1490                                 unsigned num_channels,
1491                                 LLVMValueRef vaddr,
1492                                 LLVMValueRef soffset,
1493                                 unsigned inst_offset,
1494                                 unsigned dfmt,
1495                                 unsigned nfmt,
1496                                 unsigned offen,
1497                                 unsigned idxen,
1498                                 unsigned glc,
1499                                 unsigned slc,
1500                                 unsigned tfe)
1501 {
1502         struct gallivm_state *gallivm = &shader->radeon_bld.gallivm;
1503         LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context);
1504         LLVMValueRef args[] = {
1505                 rsrc,
1506                 vdata,
1507                 LLVMConstInt(i32, num_channels, 0),
1508                 vaddr,
1509                 soffset,
1510                 LLVMConstInt(i32, inst_offset, 0),
1511                 LLVMConstInt(i32, dfmt, 0),
1512                 LLVMConstInt(i32, nfmt, 0),
1513                 LLVMConstInt(i32, offen, 0),
1514                 LLVMConstInt(i32, idxen, 0),
1515                 LLVMConstInt(i32, glc, 0),
1516                 LLVMConstInt(i32, slc, 0),
1517                 LLVMConstInt(i32, tfe, 0)
1518         };
1519
1520         /* The instruction offset field has 12 bits */
1521         assert(offen || inst_offset < (1 << 12));
1522
1523         /* The intrinsic is overloaded, we need to add a type suffix for overloading to work. */
1524         unsigned func = CLAMP(num_channels, 1, 3) - 1;
1525         const char *types[] = {"i32", "v2i32", "v4i32"};
1526         char name[256];
1527         snprintf(name, sizeof(name), "llvm.SI.tbuffer.store.%s", types[func]);
1528
1529         lp_build_intrinsic(gallivm->builder, name,
1530                            LLVMVoidTypeInContext(gallivm->context),
1531                            args, Elements(args), 0);
1532 }
1533
1534 static void build_tbuffer_store_dwords(struct si_shader_context *shader,
1535                                      LLVMValueRef rsrc,
1536                                      LLVMValueRef vdata,
1537                                      unsigned num_channels,
1538                                      LLVMValueRef vaddr,
1539                                      LLVMValueRef soffset,
1540                                      unsigned inst_offset)
1541 {
1542         static unsigned dfmt[] = {
1543                 V_008F0C_BUF_DATA_FORMAT_32,
1544                 V_008F0C_BUF_DATA_FORMAT_32_32,
1545                 V_008F0C_BUF_DATA_FORMAT_32_32_32,
1546                 V_008F0C_BUF_DATA_FORMAT_32_32_32_32
1547         };
1548         assert(num_channels >= 1 && num_channels <= 4);
1549
1550         build_tbuffer_store(shader, rsrc, vdata, num_channels, vaddr, soffset,
1551                             inst_offset, dfmt[num_channels-1],
1552                             V_008F0C_BUF_NUM_FORMAT_UINT, 1, 0, 1, 1, 0);
1553 }
1554
1555 /* On SI, the vertex shader is responsible for writing streamout data
1556  * to buffers. */
1557 static void si_llvm_emit_streamout(struct si_shader_context *shader,
1558                                    struct si_shader_output_values *outputs,
1559                                    unsigned noutput)
1560 {
1561         struct pipe_stream_output_info *so = &shader->shader->selector->so;
1562         struct gallivm_state *gallivm = &shader->radeon_bld.gallivm;
1563         LLVMBuilderRef builder = gallivm->builder;
1564         int i, j;
1565         struct lp_build_if_state if_ctx;
1566
1567         LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context);
1568
1569         /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
1570         LLVMValueRef so_vtx_count =
1571                 unpack_param(shader, shader->param_streamout_config, 16, 7);
1572
1573         LLVMValueRef tid = lp_build_intrinsic(builder, "llvm.SI.tid", i32,
1574                                            NULL, 0, LLVMReadNoneAttribute);
1575
1576         /* can_emit = tid < so_vtx_count; */
1577         LLVMValueRef can_emit =
1578                 LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");
1579
1580         LLVMValueRef stream_id =
1581                 unpack_param(shader, shader->param_streamout_config, 24, 2);
1582
1583         /* Emit the streamout code conditionally. This actually avoids
1584          * out-of-bounds buffer access. The hw tells us via the SGPR
1585          * (so_vtx_count) which threads are allowed to emit streamout data. */
1586         lp_build_if(&if_ctx, gallivm, can_emit);
1587         {
1588                 /* The buffer offset is computed as follows:
1589                  *   ByteOffset = streamout_offset[buffer_id]*4 +
1590                  *                (streamout_write_index + thread_id)*stride[buffer_id] +
1591                  *                attrib_offset
1592                  */
1593
1594                 LLVMValueRef so_write_index =
1595                         LLVMGetParam(shader->radeon_bld.main_fn,
1596                                      shader->param_streamout_write_index);
1597
1598                 /* Compute (streamout_write_index + thread_id). */
1599                 so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");
1600
1601                 /* Compute the write offset for each enabled buffer. */
1602                 LLVMValueRef so_write_offset[4] = {};
1603                 for (i = 0; i < 4; i++) {
1604                         if (!so->stride[i])
1605                                 continue;
1606
1607                         LLVMValueRef so_offset = LLVMGetParam(shader->radeon_bld.main_fn,
1608                                                               shader->param_streamout_offset[i]);
1609                         so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(i32, 4, 0), "");
1610
1611                         so_write_offset[i] = LLVMBuildMul(builder, so_write_index,
1612                                                           LLVMConstInt(i32, so->stride[i]*4, 0), "");
1613                         so_write_offset[i] = LLVMBuildAdd(builder, so_write_offset[i], so_offset, "");
1614                 }
1615
1616                 /* Write streamout data. */
1617                 for (i = 0; i < so->num_outputs; i++) {
1618                         unsigned buf_idx = so->output[i].output_buffer;
1619                         unsigned reg = so->output[i].register_index;
1620                         unsigned start = so->output[i].start_component;
1621                         unsigned num_comps = so->output[i].num_components;
1622                         unsigned stream = so->output[i].stream;
1623                         LLVMValueRef out[4];
1624                         struct lp_build_if_state if_ctx_stream;
1625
1626                         assert(num_comps && num_comps <= 4);
1627                         if (!num_comps || num_comps > 4)
1628                                 continue;
1629
1630                         if (reg >= noutput)
1631                                 continue;
1632
1633                         /* Load the output as int. */
1634                         for (j = 0; j < num_comps; j++) {
1635                                 out[j] = LLVMBuildBitCast(builder,
1636                                                           outputs[reg].values[start+j],
1637                                                 i32, "");
1638                         }
1639
1640                         /* Pack the output. */
1641                         LLVMValueRef vdata = NULL;
1642
1643                         switch (num_comps) {
1644                         case 1: /* as i32 */
1645                                 vdata = out[0];
1646                                 break;
1647                         case 2: /* as v2i32 */
1648                         case 3: /* as v4i32 (aligned to 4) */
1649                         case 4: /* as v4i32 */
1650                                 vdata = LLVMGetUndef(LLVMVectorType(i32, util_next_power_of_two(num_comps)));
1651                                 for (j = 0; j < num_comps; j++) {
1652                                         vdata = LLVMBuildInsertElement(builder, vdata, out[j],
1653                                                                        LLVMConstInt(i32, j, 0), "");
1654                                 }
1655                                 break;
1656                         }
1657
1658                         LLVMValueRef can_emit_stream =
1659                                 LLVMBuildICmp(builder, LLVMIntEQ,
1660                                               stream_id,
1661                                               lp_build_const_int32(gallivm, stream), "");
1662
1663                         lp_build_if(&if_ctx_stream, gallivm, can_emit_stream);
1664                         build_tbuffer_store_dwords(shader, shader->so_buffers[buf_idx],
1665                                                    vdata, num_comps,
1666                                                    so_write_offset[buf_idx],
1667                                                    LLVMConstInt(i32, 0, 0),
1668                                                    so->output[i].dst_offset*4);
1669                         lp_build_endif(&if_ctx_stream);
1670                 }
1671         }
1672         lp_build_endif(&if_ctx);
1673 }
1674
1675
1676 /* Generate export instructions for hardware VS shader stage */
1677 static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base,
1678                               struct si_shader_output_values *outputs,
1679                               unsigned noutput)
1680 {
1681         struct si_shader_context * si_shader_ctx = si_shader_context(bld_base);
1682         struct si_shader * shader = si_shader_ctx->shader;
1683         struct lp_build_context * base = &bld_base->base;
1684         struct lp_build_context * uint =
1685                                 &si_shader_ctx->radeon_bld.soa.bld_base.uint_bld;
1686         LLVMValueRef args[9];
1687         LLVMValueRef pos_args[4][9] = { { 0 } };
1688         LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
1689         unsigned semantic_name, semantic_index;
1690         unsigned target;
1691         unsigned param_count = 0;
1692         unsigned pos_idx;
1693         int i;
1694
1695         if (outputs && si_shader_ctx->shader->selector->so.num_outputs) {
1696                 si_llvm_emit_streamout(si_shader_ctx, outputs, noutput);
1697         }
1698
1699         for (i = 0; i < noutput; i++) {
1700                 semantic_name = outputs[i].name;
1701                 semantic_index = outputs[i].sid;
1702
1703 handle_semantic:
1704                 /* Select the correct target */
1705                 switch(semantic_name) {
1706                 case TGSI_SEMANTIC_PSIZE:
1707                         psize_value = outputs[i].values[0];
1708                         continue;
1709                 case TGSI_SEMANTIC_EDGEFLAG:
1710                         edgeflag_value = outputs[i].values[0];
1711                         continue;
1712                 case TGSI_SEMANTIC_LAYER:
1713                         layer_value = outputs[i].values[0];
1714                         semantic_name = TGSI_SEMANTIC_GENERIC;
1715                         goto handle_semantic;
1716                 case TGSI_SEMANTIC_VIEWPORT_INDEX:
1717                         viewport_index_value = outputs[i].values[0];
1718                         semantic_name = TGSI_SEMANTIC_GENERIC;
1719                         goto handle_semantic;
1720                 case TGSI_SEMANTIC_POSITION:
1721                         target = V_008DFC_SQ_EXP_POS;
1722                         break;
1723                 case TGSI_SEMANTIC_COLOR:
1724                 case TGSI_SEMANTIC_BCOLOR:
1725                         target = V_008DFC_SQ_EXP_PARAM + param_count;
1726                         shader->vs_output_param_offset[i] = param_count;
1727                         param_count++;
1728                         break;
1729                 case TGSI_SEMANTIC_CLIPDIST:
1730                         target = V_008DFC_SQ_EXP_POS + 2 + semantic_index;
1731                         break;
1732                 case TGSI_SEMANTIC_CLIPVERTEX:
1733                         si_llvm_emit_clipvertex(bld_base, pos_args, outputs[i].values);
1734                         continue;
1735                 case TGSI_SEMANTIC_PRIMID:
1736                 case TGSI_SEMANTIC_FOG:
1737                 case TGSI_SEMANTIC_TEXCOORD:
1738                 case TGSI_SEMANTIC_GENERIC:
1739                         target = V_008DFC_SQ_EXP_PARAM + param_count;
1740                         shader->vs_output_param_offset[i] = param_count;
1741                         param_count++;
1742                         break;
1743                 default:
1744                         target = 0;
1745                         fprintf(stderr,
1746                                 "Warning: SI unhandled vs output type:%d\n",
1747                                 semantic_name);
1748                 }
1749
1750                 si_llvm_init_export_args(bld_base, outputs[i].values, target, args);
1751
1752                 if (target >= V_008DFC_SQ_EXP_POS &&
1753                     target <= (V_008DFC_SQ_EXP_POS + 3)) {
1754                         memcpy(pos_args[target - V_008DFC_SQ_EXP_POS],
1755                                args, sizeof(args));
1756                 } else {
1757                         lp_build_intrinsic(base->gallivm->builder,
1758                                            "llvm.SI.export",
1759                                            LLVMVoidTypeInContext(base->gallivm->context),
1760                                            args, 9, 0);
1761                 }
1762
1763                 if (semantic_name == TGSI_SEMANTIC_CLIPDIST) {
1764                         semantic_name = TGSI_SEMANTIC_GENERIC;
1765                         goto handle_semantic;
1766                 }
1767         }
1768
1769         shader->nr_param_exports = param_count;
1770
1771         /* We need to add the position output manually if it's missing. */
1772         if (!pos_args[0][0]) {
1773                 pos_args[0][0] = lp_build_const_int32(base->gallivm, 0xf); /* writemask */
1774                 pos_args[0][1] = uint->zero; /* EXEC mask */
1775                 pos_args[0][2] = uint->zero; /* last export? */
1776                 pos_args[0][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS);
1777                 pos_args[0][4] = uint->zero; /* COMPR flag */
1778                 pos_args[0][5] = base->zero; /* X */
1779                 pos_args[0][6] = base->zero; /* Y */
1780                 pos_args[0][7] = base->zero; /* Z */
1781                 pos_args[0][8] = base->one;  /* W */
1782         }
1783
1784         /* Write the misc vector (point size, edgeflag, layer, viewport). */
1785         if (shader->selector->info.writes_psize ||
1786             shader->selector->info.writes_edgeflag ||
1787             shader->selector->info.writes_viewport_index ||
1788             shader->selector->info.writes_layer) {
1789                 pos_args[1][0] = lp_build_const_int32(base->gallivm, /* writemask */
1790                                                       shader->selector->info.writes_psize |
1791                                                       (shader->selector->info.writes_edgeflag << 1) |
1792                                                       (shader->selector->info.writes_layer << 2) |
1793                                                       (shader->selector->info.writes_viewport_index << 3));
1794                 pos_args[1][1] = uint->zero; /* EXEC mask */
1795                 pos_args[1][2] = uint->zero; /* last export? */
1796                 pos_args[1][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS + 1);
1797                 pos_args[1][4] = uint->zero; /* COMPR flag */
1798                 pos_args[1][5] = base->zero; /* X */
1799                 pos_args[1][6] = base->zero; /* Y */
1800                 pos_args[1][7] = base->zero; /* Z */
1801                 pos_args[1][8] = base->zero; /* W */
1802
1803                 if (shader->selector->info.writes_psize)
1804                         pos_args[1][5] = psize_value;
1805
1806                 if (shader->selector->info.writes_edgeflag) {
1807                         /* The output is a float, but the hw expects an integer
1808                          * with the first bit containing the edge flag. */
1809                         edgeflag_value = LLVMBuildFPToUI(base->gallivm->builder,
1810                                                          edgeflag_value,
1811                                                          bld_base->uint_bld.elem_type, "");
1812                         edgeflag_value = lp_build_min(&bld_base->int_bld,
1813                                                       edgeflag_value,
1814                                                       bld_base->int_bld.one);
1815
1816                         /* The LLVM intrinsic expects a float. */
1817                         pos_args[1][6] = LLVMBuildBitCast(base->gallivm->builder,
1818                                                           edgeflag_value,
1819                                                           base->elem_type, "");
1820                 }
1821
1822                 if (shader->selector->info.writes_layer)
1823                         pos_args[1][7] = layer_value;
1824
1825                 if (shader->selector->info.writes_viewport_index)
1826                         pos_args[1][8] = viewport_index_value;
1827         }
1828
1829         for (i = 0; i < 4; i++)
1830                 if (pos_args[i][0])
1831                         shader->nr_pos_exports++;
1832
1833         pos_idx = 0;
1834         for (i = 0; i < 4; i++) {
1835                 if (!pos_args[i][0])
1836                         continue;
1837
1838                 /* Specify the target we are exporting */
1839                 pos_args[i][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS + pos_idx++);
1840
1841                 if (pos_idx == shader->nr_pos_exports)
1842                         /* Specify that this is the last export */
1843                         pos_args[i][2] = uint->one;
1844
1845                 lp_build_intrinsic(base->gallivm->builder,
1846                                    "llvm.SI.export",
1847                                    LLVMVoidTypeInContext(base->gallivm->context),
1848                                    pos_args[i], 9, 0);
1849         }
1850 }
1851
1852 static void si_write_tess_factors(struct si_shader_context *si_shader_ctx,
1853                                   unsigned name, LLVMValueRef *out_ptr)
1854 {
1855         struct si_shader *shader = si_shader_ctx->shader;
1856         struct lp_build_tgsi_context *bld_base = &si_shader_ctx->radeon_bld.soa.bld_base;
1857         struct gallivm_state *gallivm = bld_base->base.gallivm;
1858         LLVMValueRef tf_base, rel_patch_id, byteoffset, buffer, rw_buffers;
1859         LLVMValueRef output, out[4];
1860         unsigned stride, outer_comps, inner_comps, i;
1861
1862         if (name != TGSI_SEMANTIC_TESSOUTER &&
1863             name != TGSI_SEMANTIC_TESSINNER) {
1864                 assert(0);
1865                 return;
1866         }
1867
1868         switch (shader->key.tcs.prim_mode) {
1869         case PIPE_PRIM_LINES:
1870                 stride = 2;
1871                 outer_comps = 2;
1872                 inner_comps = 0;
1873                 break;
1874         case PIPE_PRIM_TRIANGLES:
1875                 stride = 4;
1876                 outer_comps = 3;
1877                 inner_comps = 1;
1878                 break;
1879         case PIPE_PRIM_QUADS:
1880                 stride = 6;
1881                 outer_comps = 4;
1882                 inner_comps = 2;
1883                 break;
1884         default:
1885                 assert(0);
1886         }
1887
1888         /* Load the outputs as i32. */
1889         for (i = 0; i < 4; i++)
1890                 out[i] = LLVMBuildBitCast(gallivm->builder,
1891                                 LLVMBuildLoad(gallivm->builder, out_ptr[i], ""),
1892                                 bld_base->uint_bld.elem_type, "");
1893
1894         /* Convert the outputs to vectors. */
1895         if (name == TGSI_SEMANTIC_TESSOUTER)
1896                 output = lp_build_gather_values(gallivm, out,
1897                                                 util_next_power_of_two(outer_comps));
1898         else if (inner_comps > 1)
1899                 output = lp_build_gather_values(gallivm, out, inner_comps);
1900         else if (inner_comps == 1)
1901                 output = out[0];
1902         else
1903                 return;
1904
1905         /* Get the buffer. */
1906         rw_buffers = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
1907                                   SI_PARAM_RW_BUFFERS);
1908         buffer = build_indexed_load_const(si_shader_ctx, rw_buffers,
1909                         lp_build_const_int32(gallivm, SI_RING_TESS_FACTOR));
1910
1911         /* Get offsets. */
1912         tf_base = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
1913                                SI_PARAM_TESS_FACTOR_OFFSET);
1914         rel_patch_id = get_rel_patch_id(si_shader_ctx);
1915         byteoffset = LLVMBuildMul(gallivm->builder, rel_patch_id,
1916                                   lp_build_const_int32(gallivm, 4 * stride), "");
1917
1918         /* Store the output. */
1919         if (name == TGSI_SEMANTIC_TESSOUTER) {
1920                 build_tbuffer_store_dwords(si_shader_ctx, buffer, output,
1921                                            outer_comps, byteoffset, tf_base, 0);
1922         } else if (inner_comps) {
1923                 build_tbuffer_store_dwords(si_shader_ctx, buffer, output,
1924                                            inner_comps, byteoffset, tf_base,
1925                                            outer_comps * 4);
1926         }
1927 }
1928
1929 static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context * bld_base)
1930 {
1931         struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
1932         struct si_shader *shader = si_shader_ctx->shader;
1933         struct tgsi_shader_info *info = &shader->selector->info;
1934         struct gallivm_state *gallivm = bld_base->base.gallivm;
1935         unsigned i, chan;
1936         LLVMValueRef vertex_id = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
1937                                               si_shader_ctx->param_rel_auto_id);
1938         LLVMValueRef vertex_dw_stride =
1939                 unpack_param(si_shader_ctx, SI_PARAM_LS_OUT_LAYOUT, 13, 8);
1940         LLVMValueRef base_dw_addr = LLVMBuildMul(gallivm->builder, vertex_id,
1941                                                  vertex_dw_stride, "");
1942
1943         /* Write outputs to LDS. The next shader (TCS aka HS) will read
1944          * its inputs from it. */
1945         for (i = 0; i < info->num_outputs; i++) {
1946                 LLVMValueRef *out_ptr = si_shader_ctx->radeon_bld.soa.outputs[i];
1947                 unsigned name = info->output_semantic_name[i];
1948                 unsigned index = info->output_semantic_index[i];
1949                 int param = si_shader_io_get_unique_index(name, index);
1950                 LLVMValueRef dw_addr = LLVMBuildAdd(gallivm->builder, base_dw_addr,
1951                                         lp_build_const_int32(gallivm, param * 4), "");
1952
1953                 for (chan = 0; chan < 4; chan++) {
1954                         lds_store(bld_base, chan, dw_addr,
1955                                   LLVMBuildLoad(gallivm->builder, out_ptr[chan], ""));
1956                 }
1957         }
1958 }
1959
1960 static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context * bld_base)
1961 {
1962         struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
1963         struct si_shader *shader = si_shader_ctx->shader;
1964         struct tgsi_shader_info *info = &shader->selector->info;
1965         unsigned i;
1966
1967         /* Only write tessellation factors. Other outputs have already been
1968          * written to LDS by instructions. */
1969         for (i = 0; i < info->num_outputs; i++) {
1970                 LLVMValueRef *out_ptr = si_shader_ctx->radeon_bld.soa.outputs[i];
1971                 unsigned name = info->output_semantic_name[i];
1972
1973                 if (name == TGSI_SEMANTIC_TESSINNER ||
1974                     name == TGSI_SEMANTIC_TESSOUTER) {
1975                         si_write_tess_factors(si_shader_ctx, name, out_ptr);
1976                 }
1977         }
1978 }
1979
1980 static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context * bld_base)
1981 {
1982         struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
1983         struct gallivm_state *gallivm = bld_base->base.gallivm;
1984         struct si_shader *es = si_shader_ctx->shader;
1985         struct tgsi_shader_info *info = &es->selector->info;
1986         LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context);
1987         LLVMValueRef soffset = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
1988                                             si_shader_ctx->param_es2gs_offset);
1989         uint64_t enabled_outputs = si_shader_ctx->type == TGSI_PROCESSOR_TESS_EVAL ?
1990                                            es->key.tes.es_enabled_outputs :
1991                                            es->key.vs.es_enabled_outputs;
1992         unsigned chan;
1993         int i;
1994
1995         for (i = 0; i < info->num_outputs; i++) {
1996                 LLVMValueRef *out_ptr =
1997                         si_shader_ctx->radeon_bld.soa.outputs[i];
1998                 int param_index;
1999
2000                 if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
2001                     info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
2002                         continue;
2003
2004                 param_index = get_param_index(info->output_semantic_name[i],
2005                                               info->output_semantic_index[i],
2006                                               enabled_outputs);
2007                 if (param_index < 0)
2008                         continue;
2009
2010                 for (chan = 0; chan < 4; chan++) {
2011                         LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
2012                         out_val = LLVMBuildBitCast(gallivm->builder, out_val, i32, "");
2013
2014                         build_tbuffer_store(si_shader_ctx,
2015                                             si_shader_ctx->esgs_ring,
2016                                             out_val, 1,
2017                                             LLVMGetUndef(i32), soffset,
2018                                             (4 * param_index + chan) * 4,
2019                                             V_008F0C_BUF_DATA_FORMAT_32,
2020                                             V_008F0C_BUF_NUM_FORMAT_UINT,
2021                                             0, 0, 1, 1, 0);
2022                 }
2023         }
2024 }
2025
2026 static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
2027 {
2028         struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
2029         struct gallivm_state *gallivm = bld_base->base.gallivm;
2030         LLVMValueRef args[2];
2031
2032         args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_NOP | SENDMSG_GS_DONE);
2033         args[1] = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
2034         lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
2035                         LLVMVoidTypeInContext(gallivm->context), args, 2,
2036                         LLVMNoUnwindAttribute);
2037 }
2038
2039 static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context * bld_base)
2040 {
2041         struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
2042         struct gallivm_state *gallivm = bld_base->base.gallivm;
2043         struct tgsi_shader_info *info = &si_shader_ctx->shader->selector->info;
2044         struct si_shader_output_values *outputs = NULL;
2045         int i,j;
2046
2047         outputs = MALLOC(info->num_outputs * sizeof(outputs[0]));
2048
2049         for (i = 0; i < info->num_outputs; i++) {
2050                 outputs[i].name = info->output_semantic_name[i];
2051                 outputs[i].sid = info->output_semantic_index[i];
2052
2053                 for (j = 0; j < 4; j++)
2054                         outputs[i].values[j] =
2055                                 LLVMBuildLoad(gallivm->builder,
2056                                               si_shader_ctx->radeon_bld.soa.outputs[i][j],
2057                                               "");
2058         }
2059
2060         si_llvm_export_vs(bld_base, outputs, info->num_outputs);
2061         FREE(outputs);
2062 }
2063
2064 static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base)
2065 {
2066         struct si_shader_context * si_shader_ctx = si_shader_context(bld_base);
2067         struct si_shader * shader = si_shader_ctx->shader;
2068         struct lp_build_context * base = &bld_base->base;
2069         struct lp_build_context * uint = &bld_base->uint_bld;
2070         struct tgsi_shader_info *info = &shader->selector->info;
2071         LLVMValueRef args[9];
2072         LLVMValueRef last_args[9] = { 0 };
2073         int depth_index = -1, stencil_index = -1, samplemask_index = -1;
2074         int i;
2075
2076         for (i = 0; i < info->num_outputs; i++) {
2077                 unsigned semantic_name = info->output_semantic_name[i];
2078                 unsigned semantic_index = info->output_semantic_index[i];
2079                 unsigned target;
2080                 LLVMValueRef alpha_ptr;
2081
2082                 /* Select the correct target */
2083                 switch (semantic_name) {
2084                 case TGSI_SEMANTIC_POSITION:
2085                         depth_index = i;
2086                         continue;
2087                 case TGSI_SEMANTIC_STENCIL:
2088                         stencil_index = i;
2089                         continue;
2090                 case TGSI_SEMANTIC_SAMPLEMASK:
2091                         samplemask_index = i;
2092                         continue;
2093                 case TGSI_SEMANTIC_COLOR:
2094                         target = V_008DFC_SQ_EXP_MRT + semantic_index;
2095                         alpha_ptr = si_shader_ctx->radeon_bld.soa.outputs[i][3];
2096
2097                         if (si_shader_ctx->shader->key.ps.alpha_to_one)
2098                                 LLVMBuildStore(base->gallivm->builder,
2099                                                base->one, alpha_ptr);
2100
2101                         if (semantic_index == 0 &&
2102                             si_shader_ctx->shader->key.ps.alpha_func != PIPE_FUNC_ALWAYS)
2103                                 si_alpha_test(bld_base, alpha_ptr);
2104
2105                         if (si_shader_ctx->shader->key.ps.poly_line_smoothing)
2106                                 si_scale_alpha_by_sample_mask(bld_base, alpha_ptr);
2107                         break;
2108                 default:
2109                         target = 0;
2110                         fprintf(stderr,
2111                                 "Warning: SI unhandled fs output type:%d\n",
2112                                 semantic_name);
2113                 }
2114
2115                 si_llvm_init_export_args_load(bld_base,
2116                                               si_shader_ctx->radeon_bld.soa.outputs[i],
2117                                               target, args);
2118
2119                 if (semantic_name == TGSI_SEMANTIC_COLOR) {
2120                         /* If there is an export instruction waiting to be emitted, do so now. */
2121                         if (last_args[0]) {
2122                                 lp_build_intrinsic(base->gallivm->builder,
2123                                                    "llvm.SI.export",
2124                                                    LLVMVoidTypeInContext(base->gallivm->context),
2125                                                    last_args, 9, 0);
2126                         }
2127
2128                         /* This instruction will be emitted at the end of the shader. */
2129                         memcpy(last_args, args, sizeof(args));
2130
2131                         /* Handle FS_COLOR0_WRITES_ALL_CBUFS. */
2132                         if (shader->selector->info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] &&
2133                             semantic_index == 0 &&
2134                             si_shader_ctx->shader->key.ps.last_cbuf > 0) {
2135                                 for (int c = 1; c <= si_shader_ctx->shader->key.ps.last_cbuf; c++) {
2136                                         si_llvm_init_export_args_load(bld_base,
2137                                                                       si_shader_ctx->radeon_bld.soa.outputs[i],
2138                                                                       V_008DFC_SQ_EXP_MRT + c, args);
2139                                         lp_build_intrinsic(base->gallivm->builder,
2140                                                            "llvm.SI.export",
2141                                                            LLVMVoidTypeInContext(base->gallivm->context),
2142                                                            args, 9, 0);
2143                                 }
2144                         }
2145                 } else {
2146                         lp_build_intrinsic(base->gallivm->builder,
2147                                            "llvm.SI.export",
2148                                            LLVMVoidTypeInContext(base->gallivm->context),
2149                                            args, 9, 0);
2150                 }
2151         }
2152
2153         if (depth_index >= 0 || stencil_index >= 0 || samplemask_index >= 0) {
2154                 LLVMValueRef out_ptr;
2155                 unsigned mask = 0;
2156
2157                 /* Specify the target we are exporting */
2158                 args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_MRTZ);
2159
2160                 args[5] = base->zero; /* R, depth */
2161                 args[6] = base->zero; /* G, stencil test value[0:7], stencil op value[8:15] */
2162                 args[7] = base->zero; /* B, sample mask */
2163                 args[8] = base->zero; /* A, alpha to mask */
2164
2165                 if (depth_index >= 0) {
2166                         out_ptr = si_shader_ctx->radeon_bld.soa.outputs[depth_index][2];
2167                         args[5] = LLVMBuildLoad(base->gallivm->builder, out_ptr, "");
2168                         mask |= 0x1;
2169                         si_shader_ctx->shader->db_shader_control |= S_02880C_Z_EXPORT_ENABLE(1);
2170                 }
2171
2172                 if (stencil_index >= 0) {
2173                         out_ptr = si_shader_ctx->radeon_bld.soa.outputs[stencil_index][1];
2174                         args[6] = LLVMBuildLoad(base->gallivm->builder, out_ptr, "");
2175                         mask |= 0x2;
2176                         si_shader_ctx->shader->db_shader_control |=
2177                                 S_02880C_STENCIL_TEST_VAL_EXPORT_ENABLE(1);
2178                 }
2179
2180                 if (samplemask_index >= 0) {
2181                         out_ptr = si_shader_ctx->radeon_bld.soa.outputs[samplemask_index][0];
2182                         args[7] = LLVMBuildLoad(base->gallivm->builder, out_ptr, "");
2183                         mask |= 0x4;
2184                         si_shader_ctx->shader->db_shader_control |= S_02880C_MASK_EXPORT_ENABLE(1);
2185                 }
2186
2187                 /* SI (except OLAND) has a bug that it only looks
2188                  * at the X writemask component. */
2189                 if (si_shader_ctx->screen->b.chip_class == SI &&
2190                     si_shader_ctx->screen->b.family != CHIP_OLAND)
2191                         mask |= 0x1;
2192
2193                 if (samplemask_index >= 0)
2194                         si_shader_ctx->shader->spi_shader_z_format = V_028710_SPI_SHADER_32_ABGR;
2195                 else if (stencil_index >= 0)
2196                         si_shader_ctx->shader->spi_shader_z_format = V_028710_SPI_SHADER_32_GR;
2197                 else
2198                         si_shader_ctx->shader->spi_shader_z_format = V_028710_SPI_SHADER_32_R;
2199
2200                 /* Specify which components to enable */
2201                 args[0] = lp_build_const_int32(base->gallivm, mask);
2202
2203                 args[1] =
2204                 args[2] =
2205                 args[4] = uint->zero;
2206
2207                 if (last_args[0])
2208                         lp_build_intrinsic(base->gallivm->builder,
2209                                            "llvm.SI.export",
2210                                            LLVMVoidTypeInContext(base->gallivm->context),
2211                                            args, 9, 0);
2212                 else
2213                         memcpy(last_args, args, sizeof(args));
2214         }
2215
2216         if (!last_args[0]) {
2217                 /* Specify which components to enable */
2218                 last_args[0] = lp_build_const_int32(base->gallivm, 0x0);
2219
2220                 /* Specify the target we are exporting */
2221                 last_args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_MRT);
2222
2223                 /* Set COMPR flag to zero to export data as 32-bit */
2224                 last_args[4] = uint->zero;
2225
2226                 /* dummy bits */
2227                 last_args[5]= uint->zero;
2228                 last_args[6]= uint->zero;
2229                 last_args[7]= uint->zero;
2230                 last_args[8]= uint->zero;
2231         }
2232
2233         /* Specify whether the EXEC mask represents the valid mask */
2234         last_args[1] = uint->one;
2235
2236         /* Specify that this is the last export */
2237         last_args[2] = lp_build_const_int32(base->gallivm, 1);
2238
2239         lp_build_intrinsic(base->gallivm->builder,
2240                            "llvm.SI.export",
2241                            LLVMVoidTypeInContext(base->gallivm->context),
2242                            last_args, 9, 0);
2243 }
2244
2245 static void build_tex_intrinsic(const struct lp_build_tgsi_action * action,
2246                                 struct lp_build_tgsi_context * bld_base,
2247                                 struct lp_build_emit_data * emit_data);
2248
2249 static bool tgsi_is_shadow_sampler(unsigned target)
2250 {
2251         return target == TGSI_TEXTURE_SHADOW1D ||
2252                target == TGSI_TEXTURE_SHADOW1D_ARRAY ||
2253                target == TGSI_TEXTURE_SHADOW2D ||
2254                target == TGSI_TEXTURE_SHADOW2D_ARRAY ||
2255                target == TGSI_TEXTURE_SHADOWCUBE ||
2256                target == TGSI_TEXTURE_SHADOWCUBE_ARRAY ||
2257                target == TGSI_TEXTURE_SHADOWRECT;
2258 }
2259
2260 static const struct lp_build_tgsi_action tex_action;
2261
2262 static void tex_fetch_args(
2263         struct lp_build_tgsi_context * bld_base,
2264         struct lp_build_emit_data * emit_data)
2265 {
2266         struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
2267         struct gallivm_state *gallivm = bld_base->base.gallivm;
2268         const struct tgsi_full_instruction * inst = emit_data->inst;
2269         unsigned opcode = inst->Instruction.Opcode;
2270         unsigned target = inst->Texture.Texture;
2271         LLVMValueRef coords[5], derivs[6];
2272         LLVMValueRef address[16];
2273         int ref_pos;
2274         unsigned num_coords = tgsi_util_get_texture_coord_dim(target, &ref_pos);
2275         unsigned count = 0;
2276         unsigned chan;
2277         unsigned sampler_src;
2278         unsigned sampler_index;
2279         unsigned num_deriv_channels = 0;
2280         bool has_offset = HAVE_LLVM >= 0x0305 ? inst->Texture.NumOffsets > 0 : false;
2281         LLVMValueRef res_ptr, samp_ptr;
2282
2283         sampler_src = emit_data->inst->Instruction.NumSrcRegs - 1;
2284         sampler_index = emit_data->inst->Src[sampler_src].Register.Index;
2285
2286         if (emit_data->inst->Src[sampler_src].Register.Indirect) {
2287                 const struct tgsi_full_src_register *reg = &emit_data->inst->Src[sampler_src];
2288                 LLVMValueRef ind_index;
2289
2290                 ind_index = get_indirect_index(si_shader_ctx, &reg->Indirect, reg->Register.Index);
2291
2292                 res_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_RESOURCE);
2293                 res_ptr = build_indexed_load_const(si_shader_ctx, res_ptr, ind_index);
2294
2295                 samp_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER);
2296                 samp_ptr = build_indexed_load_const(si_shader_ctx, samp_ptr, ind_index);
2297         } else {
2298                 res_ptr = si_shader_ctx->resources[sampler_index];
2299                 samp_ptr = si_shader_ctx->samplers[sampler_index];
2300         }
2301
2302         if (target == TGSI_TEXTURE_BUFFER) {
2303                 LLVMTypeRef i128 = LLVMIntTypeInContext(gallivm->context, 128);
2304                 LLVMTypeRef v2i128 = LLVMVectorType(i128, 2);
2305                 LLVMTypeRef i8 = LLVMInt8TypeInContext(gallivm->context);
2306                 LLVMTypeRef v16i8 = LLVMVectorType(i8, 16);
2307
2308                 /* Bitcast and truncate v8i32 to v16i8. */
2309                 LLVMValueRef res = res_ptr;
2310                 res = LLVMBuildBitCast(gallivm->builder, res, v2i128, "");
2311                 res = LLVMBuildExtractElement(gallivm->builder, res, bld_base->uint_bld.one, "");
2312                 res = LLVMBuildBitCast(gallivm->builder, res, v16i8, "");
2313
2314                 emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
2315                 emit_data->args[0] = res;
2316                 emit_data->args[1] = bld_base->uint_bld.zero;
2317                 emit_data->args[2] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, 0);
2318                 emit_data->arg_count = 3;
2319                 return;
2320         }
2321
2322         /* Fetch and project texture coordinates */
2323         coords[3] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_W);
2324         for (chan = 0; chan < 3; chan++ ) {
2325                 coords[chan] = lp_build_emit_fetch(bld_base,
2326                                                    emit_data->inst, 0,
2327                                                    chan);
2328                 if (opcode == TGSI_OPCODE_TXP)
2329                         coords[chan] = lp_build_emit_llvm_binary(bld_base,
2330                                                                  TGSI_OPCODE_DIV,
2331                                                                  coords[chan],
2332                                                                  coords[3]);
2333         }
2334
2335         if (opcode == TGSI_OPCODE_TXP)
2336                 coords[3] = bld_base->base.one;
2337
2338         /* Pack offsets. */
2339         if (has_offset && opcode != TGSI_OPCODE_TXF) {
2340                 /* The offsets are six-bit signed integers packed like this:
2341                  *   X=[5:0], Y=[13:8], and Z=[21:16].
2342                  */
2343                 LLVMValueRef offset[3], pack;
2344
2345                 assert(inst->Texture.NumOffsets == 1);
2346
2347                 for (chan = 0; chan < 3; chan++) {
2348                         offset[chan] = lp_build_emit_fetch_texoffset(bld_base,
2349                                                                      emit_data->inst, 0, chan);
2350                         offset[chan] = LLVMBuildAnd(gallivm->builder, offset[chan],
2351                                                     lp_build_const_int32(gallivm, 0x3f), "");
2352                         if (chan)
2353                                 offset[chan] = LLVMBuildShl(gallivm->builder, offset[chan],
2354                                                             lp_build_const_int32(gallivm, chan*8), "");
2355                 }
2356
2357                 pack = LLVMBuildOr(gallivm->builder, offset[0], offset[1], "");
2358                 pack = LLVMBuildOr(gallivm->builder, pack, offset[2], "");
2359                 address[count++] = pack;
2360         }
2361
2362         /* Pack LOD bias value */
2363         if (opcode == TGSI_OPCODE_TXB)
2364                 address[count++] = coords[3];
2365         if (opcode == TGSI_OPCODE_TXB2)
2366                 address[count++] = lp_build_emit_fetch(bld_base, inst, 1, 0);
2367
2368         /* Pack depth comparison value */
2369         if (tgsi_is_shadow_sampler(target) && opcode != TGSI_OPCODE_LODQ) {
2370                 if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
2371                         address[count++] = lp_build_emit_fetch(bld_base, inst, 1, 0);
2372                 } else {
2373                         assert(ref_pos >= 0);
2374                         address[count++] = coords[ref_pos];
2375                 }
2376         }
2377
2378         /* Pack user derivatives */
2379         if (opcode == TGSI_OPCODE_TXD) {
2380                 int param, num_src_deriv_channels;
2381
2382                 switch (target) {
2383                 case TGSI_TEXTURE_3D:
2384                         num_src_deriv_channels = 3;
2385                         num_deriv_channels = 3;
2386                         break;
2387                 case TGSI_TEXTURE_2D:
2388                 case TGSI_TEXTURE_SHADOW2D:
2389                 case TGSI_TEXTURE_RECT:
2390                 case TGSI_TEXTURE_SHADOWRECT:
2391                 case TGSI_TEXTURE_2D_ARRAY:
2392                 case TGSI_TEXTURE_SHADOW2D_ARRAY:
2393                         num_src_deriv_channels = 2;
2394                         num_deriv_channels = 2;
2395                         break;
2396                 case TGSI_TEXTURE_CUBE:
2397                 case TGSI_TEXTURE_SHADOWCUBE:
2398                 case TGSI_TEXTURE_CUBE_ARRAY:
2399                 case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
2400                         /* Cube derivatives will be converted to 2D. */
2401                         num_src_deriv_channels = 3;
2402                         num_deriv_channels = 2;
2403                         break;
2404                 case TGSI_TEXTURE_1D:
2405                 case TGSI_TEXTURE_SHADOW1D:
2406                 case TGSI_TEXTURE_1D_ARRAY:
2407                 case TGSI_TEXTURE_SHADOW1D_ARRAY:
2408                         num_src_deriv_channels = 1;
2409                         num_deriv_channels = 1;
2410                         break;
2411                 default:
2412                         assert(0); /* no other targets are valid here */
2413                 }
2414
2415                 for (param = 0; param < 2; param++)
2416                         for (chan = 0; chan < num_src_deriv_channels; chan++)
2417                                 derivs[param * num_src_deriv_channels + chan] =
2418                                         lp_build_emit_fetch(bld_base, inst, param+1, chan);
2419         }
2420
2421         if (target == TGSI_TEXTURE_CUBE ||
2422             target == TGSI_TEXTURE_CUBE_ARRAY ||
2423             target == TGSI_TEXTURE_SHADOWCUBE ||
2424             target == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
2425                 radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords, derivs);
2426
2427         if (opcode == TGSI_OPCODE_TXD)
2428                 for (int i = 0; i < num_deriv_channels * 2; i++)
2429                         address[count++] = derivs[i];
2430
2431         /* Pack texture coordinates */
2432         address[count++] = coords[0];
2433         if (num_coords > 1)
2434                 address[count++] = coords[1];
2435         if (num_coords > 2)
2436                 address[count++] = coords[2];
2437
2438         /* Pack LOD or sample index */
2439         if (opcode == TGSI_OPCODE_TXL || opcode == TGSI_OPCODE_TXF)
2440                 address[count++] = coords[3];
2441         else if (opcode == TGSI_OPCODE_TXL2)
2442                 address[count++] = lp_build_emit_fetch(bld_base, inst, 1, 0);
2443
2444         if (count > 16) {
2445                 assert(!"Cannot handle more than 16 texture address parameters");
2446                 count = 16;
2447         }
2448
2449         for (chan = 0; chan < count; chan++ ) {
2450                 address[chan] = LLVMBuildBitCast(gallivm->builder,
2451                                                  address[chan],
2452                                                  LLVMInt32TypeInContext(gallivm->context),
2453                                                  "");
2454         }
2455
2456         /* Adjust the sample index according to FMASK.
2457          *
2458          * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
2459          * which is the identity mapping. Each nibble says which physical sample
2460          * should be fetched to get that sample.
2461          *
2462          * For example, 0x11111100 means there are only 2 samples stored and
2463          * the second sample covers 3/4 of the pixel. When reading samples 0
2464          * and 1, return physical sample 0 (determined by the first two 0s
2465          * in FMASK), otherwise return physical sample 1.
2466          *
2467          * The sample index should be adjusted as follows:
2468          *   sample_index = (fmask >> (sample_index * 4)) & 0xF;
2469          */
2470         if (target == TGSI_TEXTURE_2D_MSAA ||
2471             target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
2472                 struct lp_build_context *uint_bld = &bld_base->uint_bld;
2473                 struct lp_build_emit_data txf_emit_data = *emit_data;
2474                 LLVMValueRef txf_address[4];
2475                 unsigned txf_count = count;
2476                 struct tgsi_full_instruction inst = {};
2477
2478                 memcpy(txf_address, address, sizeof(txf_address));
2479
2480                 if (target == TGSI_TEXTURE_2D_MSAA) {
2481                         txf_address[2] = bld_base->uint_bld.zero;
2482                 }
2483                 txf_address[3] = bld_base->uint_bld.zero;
2484
2485                 /* Pad to a power-of-two size. */
2486                 while (txf_count < util_next_power_of_two(txf_count))
2487                         txf_address[txf_count++] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
2488
2489                 /* Read FMASK using TXF. */
2490                 inst.Instruction.Opcode = TGSI_OPCODE_TXF;
2491                 inst.Texture.Texture = target == TGSI_TEXTURE_2D_MSAA ? TGSI_TEXTURE_2D : TGSI_TEXTURE_2D_ARRAY;
2492                 txf_emit_data.inst = &inst;
2493                 txf_emit_data.chan = 0;
2494                 txf_emit_data.dst_type = LLVMVectorType(
2495                         LLVMInt32TypeInContext(gallivm->context), 4);
2496                 txf_emit_data.args[0] = lp_build_gather_values(gallivm, txf_address, txf_count);
2497                 txf_emit_data.args[1] = si_shader_ctx->resources[SI_FMASK_TEX_OFFSET + sampler_index];
2498                 txf_emit_data.args[2] = lp_build_const_int32(gallivm, inst.Texture.Texture);
2499                 txf_emit_data.arg_count = 3;
2500
2501                 build_tex_intrinsic(&tex_action, bld_base, &txf_emit_data);
2502
2503                 /* Initialize some constants. */
2504                 LLVMValueRef four = LLVMConstInt(uint_bld->elem_type, 4, 0);
2505                 LLVMValueRef F = LLVMConstInt(uint_bld->elem_type, 0xF, 0);
2506
2507                 /* Apply the formula. */
2508                 LLVMValueRef fmask =
2509                         LLVMBuildExtractElement(gallivm->builder,
2510                                                 txf_emit_data.output[0],
2511                                                 uint_bld->zero, "");
2512
2513                 unsigned sample_chan = target == TGSI_TEXTURE_2D_MSAA ? 2 : 3;
2514
2515                 LLVMValueRef sample_index4 =
2516                         LLVMBuildMul(gallivm->builder, address[sample_chan], four, "");
2517
2518                 LLVMValueRef shifted_fmask =
2519                         LLVMBuildLShr(gallivm->builder, fmask, sample_index4, "");
2520
2521                 LLVMValueRef final_sample =
2522                         LLVMBuildAnd(gallivm->builder, shifted_fmask, F, "");
2523
2524                 /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
2525                  * resource descriptor is 0 (invalid),
2526                  */
2527                 LLVMValueRef fmask_desc =
2528                         LLVMBuildBitCast(gallivm->builder,
2529                                          si_shader_ctx->resources[SI_FMASK_TEX_OFFSET + sampler_index],
2530                                          LLVMVectorType(uint_bld->elem_type, 8), "");
2531
2532                 LLVMValueRef fmask_word1 =
2533                         LLVMBuildExtractElement(gallivm->builder, fmask_desc,
2534                                                 uint_bld->one, "");
2535
2536                 LLVMValueRef word1_is_nonzero =
2537                         LLVMBuildICmp(gallivm->builder, LLVMIntNE,
2538                                       fmask_word1, uint_bld->zero, "");
2539
2540                 /* Replace the MSAA sample index. */
2541                 address[sample_chan] =
2542                         LLVMBuildSelect(gallivm->builder, word1_is_nonzero,
2543                                         final_sample, address[sample_chan], "");
2544         }
2545
2546         /* Resource */
2547         emit_data->args[1] = res_ptr;
2548
2549         if (opcode == TGSI_OPCODE_TXF) {
2550                 /* add tex offsets */
2551                 if (inst->Texture.NumOffsets) {
2552                         struct lp_build_context *uint_bld = &bld_base->uint_bld;
2553                         struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
2554                         const struct tgsi_texture_offset * off = inst->TexOffsets;
2555
2556                         assert(inst->Texture.NumOffsets == 1);
2557
2558                         switch (target) {
2559                         case TGSI_TEXTURE_3D:
2560                                 address[2] = lp_build_add(uint_bld, address[2],
2561                                                 bld->immediates[off->Index][off->SwizzleZ]);
2562                                 /* fall through */
2563                         case TGSI_TEXTURE_2D:
2564                         case TGSI_TEXTURE_SHADOW2D:
2565                         case TGSI_TEXTURE_RECT:
2566                         case TGSI_TEXTURE_SHADOWRECT:
2567                         case TGSI_TEXTURE_2D_ARRAY:
2568                         case TGSI_TEXTURE_SHADOW2D_ARRAY:
2569                                 address[1] =
2570                                         lp_build_add(uint_bld, address[1],
2571                                                 bld->immediates[off->Index][off->SwizzleY]);
2572                                 /* fall through */
2573                         case TGSI_TEXTURE_1D:
2574                         case TGSI_TEXTURE_SHADOW1D:
2575                         case TGSI_TEXTURE_1D_ARRAY:
2576                         case TGSI_TEXTURE_SHADOW1D_ARRAY:
2577                                 address[0] =
2578                                         lp_build_add(uint_bld, address[0],
2579                                                 bld->immediates[off->Index][off->SwizzleX]);
2580                                 break;
2581                                 /* texture offsets do not apply to other texture targets */
2582                         }
2583                 }
2584
2585                 emit_data->args[2] = lp_build_const_int32(gallivm, target);
2586                 emit_data->arg_count = 3;
2587
2588                 emit_data->dst_type = LLVMVectorType(
2589                         LLVMInt32TypeInContext(gallivm->context),
2590                         4);
2591         } else if (opcode == TGSI_OPCODE_TG4 ||
2592                    opcode == TGSI_OPCODE_LODQ ||
2593                    has_offset) {
2594                 unsigned is_array = target == TGSI_TEXTURE_1D_ARRAY ||
2595                                     target == TGSI_TEXTURE_SHADOW1D_ARRAY ||
2596                                     target == TGSI_TEXTURE_2D_ARRAY ||
2597                                     target == TGSI_TEXTURE_SHADOW2D_ARRAY ||
2598                                     target == TGSI_TEXTURE_CUBE_ARRAY ||
2599                                     target == TGSI_TEXTURE_SHADOWCUBE_ARRAY;
2600                 unsigned is_rect = target == TGSI_TEXTURE_RECT;
2601                 unsigned dmask = 0xf;
2602
2603                 if (opcode == TGSI_OPCODE_TG4) {
2604                         unsigned gather_comp = 0;
2605
2606                         /* DMASK was repurposed for GATHER4. 4 components are always
2607                          * returned and DMASK works like a swizzle - it selects
2608                          * the component to fetch. The only valid DMASK values are
2609                          * 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
2610                          * (red,red,red,red) etc.) The ISA document doesn't mention
2611                          * this.
2612                          */
2613
2614                         /* Get the component index from src1.x for Gather4. */
2615                         if (!tgsi_is_shadow_sampler(target)) {
2616                                 LLVMValueRef (*imms)[4] = lp_soa_context(bld_base)->immediates;
2617                                 LLVMValueRef comp_imm;
2618                                 struct tgsi_src_register src1 = inst->Src[1].Register;
2619
2620                                 assert(src1.File == TGSI_FILE_IMMEDIATE);
2621
2622                                 comp_imm = imms[src1.Index][src1.SwizzleX];
2623                                 gather_comp = LLVMConstIntGetZExtValue(comp_imm);
2624                                 gather_comp = CLAMP(gather_comp, 0, 3);
2625                         }
2626
2627                         dmask = 1 << gather_comp;
2628                 }
2629
2630                 emit_data->args[2] = samp_ptr;
2631                 emit_data->args[3] = lp_build_const_int32(gallivm, dmask);
2632                 emit_data->args[4] = lp_build_const_int32(gallivm, is_rect); /* unorm */
2633                 emit_data->args[5] = lp_build_const_int32(gallivm, 0); /* r128 */
2634                 emit_data->args[6] = lp_build_const_int32(gallivm, is_array); /* da */
2635                 emit_data->args[7] = lp_build_const_int32(gallivm, 0); /* glc */
2636                 emit_data->args[8] = lp_build_const_int32(gallivm, 0); /* slc */
2637                 emit_data->args[9] = lp_build_const_int32(gallivm, 0); /* tfe */
2638                 emit_data->args[10] = lp_build_const_int32(gallivm, 0); /* lwe */
2639
2640                 emit_data->arg_count = 11;
2641
2642                 emit_data->dst_type = LLVMVectorType(
2643                         LLVMFloatTypeInContext(gallivm->context),
2644                         4);
2645         } else {
2646                 emit_data->args[2] = samp_ptr;
2647                 emit_data->args[3] = lp_build_const_int32(gallivm, target);
2648                 emit_data->arg_count = 4;
2649
2650                 emit_data->dst_type = LLVMVectorType(
2651                         LLVMFloatTypeInContext(gallivm->context),
2652                         4);
2653         }
2654
2655         /* The fetch opcode has been converted to a 2D array fetch.
2656          * This simplifies the LLVM backend. */
2657         if (target == TGSI_TEXTURE_CUBE_ARRAY)
2658                 target = TGSI_TEXTURE_2D_ARRAY;
2659         else if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
2660                 target = TGSI_TEXTURE_SHADOW2D_ARRAY;
2661
2662         /* Pad to power of two vector */
2663         while (count < util_next_power_of_two(count))
2664                 address[count++] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
2665
2666         emit_data->args[0] = lp_build_gather_values(gallivm, address, count);
2667 }
2668
2669 static void build_tex_intrinsic(const struct lp_build_tgsi_action * action,
2670                                 struct lp_build_tgsi_context * bld_base,
2671                                 struct lp_build_emit_data * emit_data)
2672 {
2673         struct lp_build_context * base = &bld_base->base;
2674         unsigned opcode = emit_data->inst->Instruction.Opcode;
2675         unsigned target = emit_data->inst->Texture.Texture;
2676         char intr_name[127];
2677         bool has_offset = HAVE_LLVM >= 0x0305 ?
2678                                 emit_data->inst->Texture.NumOffsets > 0 : false;
2679
2680         if (target == TGSI_TEXTURE_BUFFER) {
2681                 emit_data->output[emit_data->chan] = lp_build_intrinsic(
2682                         base->gallivm->builder,
2683                         "llvm.SI.vs.load.input", emit_data->dst_type,
2684                         emit_data->args, emit_data->arg_count,
2685                         LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
2686                 return;
2687         }
2688
2689         if (opcode == TGSI_OPCODE_TG4 ||
2690             opcode == TGSI_OPCODE_LODQ ||
2691             (opcode != TGSI_OPCODE_TXF && has_offset)) {
2692                 bool is_shadow = tgsi_is_shadow_sampler(target);
2693                 const char *name = "llvm.SI.image.sample";
2694                 const char *infix = "";
2695
2696                 switch (opcode) {
2697                 case TGSI_OPCODE_TEX:
2698                 case TGSI_OPCODE_TEX2:
2699                 case TGSI_OPCODE_TXP:
2700                         break;
2701                 case TGSI_OPCODE_TXB:
2702                 case TGSI_OPCODE_TXB2:
2703                         infix = ".b";
2704                         break;
2705                 case TGSI_OPCODE_TXL:
2706                 case TGSI_OPCODE_TXL2:
2707                         infix = ".l";
2708                         break;
2709                 case TGSI_OPCODE_TXD:
2710                         infix = ".d";
2711                         break;
2712                 case TGSI_OPCODE_TG4:
2713                         name = "llvm.SI.gather4";
2714                         break;
2715                 case TGSI_OPCODE_LODQ:
2716                         name = "llvm.SI.getlod";
2717                         is_shadow = false;
2718                         has_offset = false;
2719                         break;
2720                 default:
2721                         assert(0);
2722                         return;
2723                 }
2724
2725                 /* Add the type and suffixes .c, .o if needed. */
2726                 sprintf(intr_name, "%s%s%s%s.v%ui32", name,
2727                         is_shadow ? ".c" : "", infix, has_offset ? ".o" : "",
2728                         LLVMGetVectorSize(LLVMTypeOf(emit_data->args[0])));
2729
2730                 emit_data->output[emit_data->chan] = lp_build_intrinsic(
2731                         base->gallivm->builder, intr_name, emit_data->dst_type,
2732                         emit_data->args, emit_data->arg_count,
2733                         LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
2734         } else {
2735                 LLVMTypeRef i8, v16i8, v32i8;
2736                 const char *name;
2737
2738                 switch (opcode) {
2739                 case TGSI_OPCODE_TEX:
2740                 case TGSI_OPCODE_TEX2:
2741                 case TGSI_OPCODE_TXP:
2742                         name = "llvm.SI.sample";
2743                         break;
2744                 case TGSI_OPCODE_TXB:
2745                 case TGSI_OPCODE_TXB2:
2746                         name = "llvm.SI.sampleb";
2747                         break;
2748                 case TGSI_OPCODE_TXD:
2749                         name = "llvm.SI.sampled";
2750                         break;
2751                 case TGSI_OPCODE_TXF:
2752                         name = "llvm.SI.imageload";
2753                         break;
2754                 case TGSI_OPCODE_TXL:
2755                 case TGSI_OPCODE_TXL2:
2756                         name = "llvm.SI.samplel";
2757                         break;
2758                 default:
2759                         assert(0);
2760                         return;
2761                 }
2762
2763                 i8 = LLVMInt8TypeInContext(base->gallivm->context);
2764                 v16i8 = LLVMVectorType(i8, 16);
2765                 v32i8 = LLVMVectorType(i8, 32);
2766
2767                 emit_data->args[1] = LLVMBuildBitCast(base->gallivm->builder,
2768                                                 emit_data->args[1], v32i8, "");
2769                 if (opcode != TGSI_OPCODE_TXF) {
2770                         emit_data->args[2] = LLVMBuildBitCast(base->gallivm->builder,
2771                                                 emit_data->args[2], v16i8, "");
2772                 }
2773
2774                 sprintf(intr_name, "%s.v%ui32", name,
2775                         LLVMGetVectorSize(LLVMTypeOf(emit_data->args[0])));
2776
2777                 emit_data->output[emit_data->chan] = lp_build_intrinsic(
2778                         base->gallivm->builder, intr_name, emit_data->dst_type,
2779                         emit_data->args, emit_data->arg_count,
2780                         LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
2781         }
2782 }
2783
2784 static void txq_fetch_args(
2785         struct lp_build_tgsi_context * bld_base,
2786         struct lp_build_emit_data * emit_data)
2787 {
2788         struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
2789         const struct tgsi_full_instruction *inst = emit_data->inst;
2790         struct gallivm_state *gallivm = bld_base->base.gallivm;
2791         unsigned target = inst->Texture.Texture;
2792         LLVMValueRef res_ptr;
2793
2794         if (inst->Src[1].Register.Indirect) {
2795                 const struct tgsi_full_src_register *reg = &inst->Src[1];
2796                 LLVMValueRef ind_index;
2797
2798                 ind_index = get_indirect_index(si_shader_ctx, &reg->Indirect, reg->Register.Index);
2799
2800                 res_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_RESOURCE);
2801                 res_ptr = build_indexed_load_const(si_shader_ctx, res_ptr,
2802                                                    ind_index);
2803         } else
2804                 res_ptr = si_shader_ctx->resources[inst->Src[1].Register.Index];
2805
2806         if (target == TGSI_TEXTURE_BUFFER) {
2807                 LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context);
2808                 LLVMTypeRef v8i32 = LLVMVectorType(i32, 8);
2809
2810                 /* Read the size from the buffer descriptor directly. */
2811                 LLVMValueRef size = res_ptr;
2812                 size = LLVMBuildBitCast(gallivm->builder, size, v8i32, "");
2813                 size = LLVMBuildExtractElement(gallivm->builder, size,
2814                                               lp_build_const_int32(gallivm, 6), "");
2815                 emit_data->args[0] = size;
2816                 return;
2817         }
2818
2819         /* Mip level */
2820         emit_data->args[0] = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_X);
2821
2822         /* Resource */
2823         emit_data->args[1] = res_ptr;
2824
2825         /* Texture target */
2826         if (target == TGSI_TEXTURE_CUBE_ARRAY ||
2827             target == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
2828                 target = TGSI_TEXTURE_2D_ARRAY;
2829
2830         emit_data->args[2] = lp_build_const_int32(bld_base->base.gallivm,
2831                                                   target);
2832
2833         emit_data->arg_count = 3;
2834
2835         emit_data->dst_type = LLVMVectorType(
2836                 LLVMInt32TypeInContext(bld_base->base.gallivm->context),
2837                 4);
2838 }
2839
2840 static void build_txq_intrinsic(const struct lp_build_tgsi_action * action,
2841                                 struct lp_build_tgsi_context * bld_base,
2842                                 struct lp_build_emit_data * emit_data)
2843 {
2844         unsigned target = emit_data->inst->Texture.Texture;
2845
2846         if (target == TGSI_TEXTURE_BUFFER) {
2847                 /* Just return the buffer size. */
2848                 emit_data->output[emit_data->chan] = emit_data->args[0];
2849                 return;
2850         }
2851
2852         build_tgsi_intrinsic_nomem(action, bld_base, emit_data);
2853
2854         /* Divide the number of layers by 6 to get the number of cubes. */
2855         if (target == TGSI_TEXTURE_CUBE_ARRAY ||
2856             target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
2857                 LLVMBuilderRef builder = bld_base->base.gallivm->builder;
2858                 LLVMValueRef two = lp_build_const_int32(bld_base->base.gallivm, 2);
2859                 LLVMValueRef six = lp_build_const_int32(bld_base->base.gallivm, 6);
2860
2861                 LLVMValueRef v4 = emit_data->output[emit_data->chan];
2862                 LLVMValueRef z = LLVMBuildExtractElement(builder, v4, two, "");
2863                 z = LLVMBuildSDiv(builder, z, six, "");
2864
2865                 emit_data->output[emit_data->chan] =
2866                         LLVMBuildInsertElement(builder, v4, z, two, "");
2867         }
2868 }
2869
2870 /*
2871  * SI implements derivatives using the local data store (LDS)
2872  * All writes to the LDS happen in all executing threads at
2873  * the same time. TID is the Thread ID for the current
2874  * thread and is a value between 0 and 63, representing
2875  * the thread's position in the wavefront.
2876  *
2877  * For the pixel shader threads are grouped into quads of four pixels.
2878  * The TIDs of the pixels of a quad are:
2879  *
2880  *  +------+------+
2881  *  |4n + 0|4n + 1|
2882  *  +------+------+
2883  *  |4n + 2|4n + 3|
2884  *  +------+------+
2885  *
2886  * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
2887  * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
2888  * the current pixel's column, and masking with 0xfffffffe yields the TID
2889  * of the left pixel of the current pixel's row.
2890  *
2891  * Adding 1 yields the TID of the pixel to the right of the left pixel, and
2892  * adding 2 yields the TID of the pixel below the top pixel.
2893  */
/* Bit masks applied to the thread ID to find a reference pixel of its quad. */
#define TID_MASK_TOP_LEFT (0xfffffffcu) /* clears bits 0 and 1 */
#define TID_MASK_TOP      (0xfffffffdu) /* clears bit 1 */
#define TID_MASK_LEFT     (0xfffffffeu) /* clears bit 0 */
2898
2899 static void si_llvm_emit_ddxy(
2900         const struct lp_build_tgsi_action * action,
2901         struct lp_build_tgsi_context * bld_base,
2902         struct lp_build_emit_data * emit_data)
2903 {
2904         struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
2905         struct gallivm_state *gallivm = bld_base->base.gallivm;
2906         struct lp_build_context * base = &bld_base->base;
2907         const struct tgsi_full_instruction *inst = emit_data->inst;
2908         unsigned opcode = inst->Instruction.Opcode;
2909         LLVMValueRef indices[2];
2910         LLVMValueRef store_ptr, load_ptr0, load_ptr1;
2911         LLVMValueRef tl, trbl, result[4];
2912         LLVMTypeRef i32;
2913         unsigned swizzle[4];
2914         unsigned c;
2915         int idx;
2916         unsigned mask;
2917
2918         i32 = LLVMInt32TypeInContext(gallivm->context);
2919
2920         indices[0] = bld_base->uint_bld.zero;
2921         indices[1] = lp_build_intrinsic(gallivm->builder, "llvm.SI.tid", i32,
2922                                      NULL, 0, LLVMReadNoneAttribute);
2923         store_ptr = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds,
2924                                  indices, 2, "");
2925
2926         if (opcode == TGSI_OPCODE_DDX_FINE)
2927                 mask = TID_MASK_LEFT;
2928         else if (opcode == TGSI_OPCODE_DDY_FINE)
2929                 mask = TID_MASK_TOP;
2930         else
2931                 mask = TID_MASK_TOP_LEFT;
2932
2933         indices[1] = LLVMBuildAnd(gallivm->builder, indices[1],
2934                                   lp_build_const_int32(gallivm, mask), "");
2935         load_ptr0 = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds,
2936                                  indices, 2, "");
2937
2938         /* for DDX we want to next X pixel, DDY next Y pixel. */
2939         idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;
2940         indices[1] = LLVMBuildAdd(gallivm->builder, indices[1],
2941                                   lp_build_const_int32(gallivm, idx), "");
2942         load_ptr1 = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds,
2943                                  indices, 2, "");
2944
2945         for (c = 0; c < 4; ++c) {
2946                 unsigned i;
2947
2948                 swizzle[c] = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], c);
2949                 for (i = 0; i < c; ++i) {
2950                         if (swizzle[i] == swizzle[c]) {
2951                                 result[c] = result[i];
2952                                 break;
2953                         }
2954                 }
2955                 if (i != c)
2956                         continue;
2957
2958                 LLVMBuildStore(gallivm->builder,
2959                                LLVMBuildBitCast(gallivm->builder,
2960                                                 lp_build_emit_fetch(bld_base, inst, 0, c),
2961                                                 i32, ""),
2962                                store_ptr);
2963
2964                 tl = LLVMBuildLoad(gallivm->builder, load_ptr0, "");
2965                 tl = LLVMBuildBitCast(gallivm->builder, tl, base->elem_type, "");
2966
2967                 trbl = LLVMBuildLoad(gallivm->builder, load_ptr1, "");
2968                 trbl = LLVMBuildBitCast(gallivm->builder, trbl, base->elem_type, "");
2969
2970                 result[c] = LLVMBuildFSub(gallivm->builder, trbl, tl, "");
2971         }
2972
2973         emit_data->output[0] = lp_build_gather_values(gallivm, result, 4);
2974 }
2975
2976 /*
2977  * this takes an I,J coordinate pair,
2978  * and works out the X and Y derivatives.
2979  * it returns DDX(I), DDX(J), DDY(I), DDY(J).
2980  */
2981 static LLVMValueRef si_llvm_emit_ddxy_interp(
2982         struct lp_build_tgsi_context *bld_base,
2983         LLVMValueRef interp_ij)
2984 {
2985         struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
2986         struct gallivm_state *gallivm = bld_base->base.gallivm;
2987         struct lp_build_context *base = &bld_base->base;
2988         LLVMValueRef indices[2];
2989         LLVMValueRef store_ptr, load_ptr_x, load_ptr_y, load_ptr_ddx, load_ptr_ddy, temp, temp2;
2990         LLVMValueRef tl, tr, bl, result[4];
2991         LLVMTypeRef i32;
2992         unsigned c;
2993
2994         i32 = LLVMInt32TypeInContext(gallivm->context);
2995
2996         indices[0] = bld_base->uint_bld.zero;
2997         indices[1] = lp_build_intrinsic(gallivm->builder, "llvm.SI.tid", i32,
2998                                         NULL, 0, LLVMReadNoneAttribute);
2999         store_ptr = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds,
3000                                  indices, 2, "");
3001
3002         temp = LLVMBuildAnd(gallivm->builder, indices[1],
3003                             lp_build_const_int32(gallivm, TID_MASK_LEFT), "");
3004
3005         temp2 = LLVMBuildAnd(gallivm->builder, indices[1],
3006                              lp_build_const_int32(gallivm, TID_MASK_TOP), "");
3007
3008         indices[1] = temp;
3009         load_ptr_x = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds,
3010                                   indices, 2, "");
3011
3012         indices[1] = temp2;
3013         load_ptr_y = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds,
3014                                   indices, 2, "");
3015
3016         indices[1] = LLVMBuildAdd(gallivm->builder, temp,
3017                                   lp_build_const_int32(gallivm, 1), "");
3018         load_ptr_ddx = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds,
3019                                    indices, 2, "");
3020
3021         indices[1] = LLVMBuildAdd(gallivm->builder, temp2,
3022                                   lp_build_const_int32(gallivm, 2), "");
3023         load_ptr_ddy = LLVMBuildGEP(gallivm->builder, si_shader_ctx->lds,
3024                                    indices, 2, "");
3025
3026         for (c = 0; c < 2; ++c) {
3027                 LLVMValueRef store_val;
3028                 LLVMValueRef c_ll = lp_build_const_int32(gallivm, c);
3029
3030                 store_val = LLVMBuildExtractElement(gallivm->builder,
3031                                                     interp_ij, c_ll, "");
3032                 LLVMBuildStore(gallivm->builder,
3033                                store_val,
3034                                store_ptr);
3035
3036                 tl = LLVMBuildLoad(gallivm->builder, load_ptr_x, "");
3037                 tl = LLVMBuildBitCast(gallivm->builder, tl, base->elem_type, "");
3038
3039                 tr = LLVMBuildLoad(gallivm->builder, load_ptr_ddx, "");
3040                 tr = LLVMBuildBitCast(gallivm->builder, tr, base->elem_type, "");
3041
3042                 result[c] = LLVMBuildFSub(gallivm->builder, tr, tl, "");
3043
3044                 tl = LLVMBuildLoad(gallivm->builder, load_ptr_y, "");
3045                 tl = LLVMBuildBitCast(gallivm->builder, tl, base->elem_type, "");
3046
3047                 bl = LLVMBuildLoad(gallivm->builder, load_ptr_ddy, "");
3048                 bl = LLVMBuildBitCast(gallivm->builder, bl, base->elem_type, "");
3049
3050                 result[c + 2] = LLVMBuildFSub(gallivm->builder, bl, tl, "");
3051         }
3052
3053         return lp_build_gather_values(gallivm, result, 4);
3054 }
3055
3056 static void interp_fetch_args(
3057         struct lp_build_tgsi_context *bld_base,
3058         struct lp_build_emit_data *emit_data)
3059 {
3060         struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
3061         struct gallivm_state *gallivm = bld_base->base.gallivm;
3062         const struct tgsi_full_instruction *inst = emit_data->inst;
3063
3064         if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
3065                 /* offset is in second src, first two channels */
3066                 emit_data->args[0] = lp_build_emit_fetch(bld_base,
3067                                                          emit_data->inst, 1,
3068                                                          0);
3069                 emit_data->args[1] = lp_build_emit_fetch(bld_base,
3070                                                          emit_data->inst, 1,
3071                                                          1);
3072                 emit_data->arg_count = 2;
3073         } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
3074                 LLVMValueRef sample_position;
3075                 LLVMValueRef sample_id;
3076                 LLVMValueRef halfval = lp_build_const_float(gallivm, 0.5f);
3077
3078                 /* fetch sample ID, then fetch its sample position,
3079                  * and place into first two channels.
3080                  */
3081                 sample_id = lp_build_emit_fetch(bld_base,
3082                                                 emit_data->inst, 1, 0);
3083                 sample_id = LLVMBuildBitCast(gallivm->builder, sample_id,
3084                                              LLVMInt32TypeInContext(gallivm->context),
3085                                              "");
3086                 sample_position = load_sample_position(&si_shader_ctx->radeon_bld, sample_id);
3087
3088                 emit_data->args[0] = LLVMBuildExtractElement(gallivm->builder,
3089                                                              sample_position,
3090                                                              lp_build_const_int32(gallivm, 0), "");
3091
3092                 emit_data->args[0] = LLVMBuildFSub(gallivm->builder, emit_data->args[0], halfval, "");
3093                 emit_data->args[1] = LLVMBuildExtractElement(gallivm->builder,
3094                                                              sample_position,
3095                                                              lp_build_const_int32(gallivm, 1), "");
3096                 emit_data->args[1] = LLVMBuildFSub(gallivm->builder, emit_data->args[1], halfval, "");
3097                 emit_data->arg_count = 2;
3098         }
3099 }
3100
3101 static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
3102                                 struct lp_build_tgsi_context *bld_base,
3103                                 struct lp_build_emit_data *emit_data)
3104 {
3105         struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
3106         struct si_shader *shader = si_shader_ctx->shader;
3107         struct gallivm_state *gallivm = bld_base->base.gallivm;
3108         LLVMValueRef interp_param;
3109         const struct tgsi_full_instruction *inst = emit_data->inst;
3110         const char *intr_name;
3111         int input_index;
3112         int chan;
3113         int i;
3114         LLVMValueRef attr_number;
3115         LLVMTypeRef input_type = LLVMFloatTypeInContext(gallivm->context);
3116         LLVMValueRef params = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_PRIM_MASK);
3117         int interp_param_idx;
3118         unsigned location;
3119
3120         assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);
3121         input_index = inst->Src[0].Register.Index;
3122
3123         if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
3124             inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE)
3125                 location = TGSI_INTERPOLATE_LOC_CENTER;
3126         else
3127                 location = TGSI_INTERPOLATE_LOC_CENTROID;
3128
3129         interp_param_idx = lookup_interp_param_index(shader->ps_input_interpolate[input_index],
3130                                                      location);
3131         if (interp_param_idx == -1)
3132                 return;
3133         else if (interp_param_idx)
3134                 interp_param = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, interp_param_idx);
3135         else
3136                 interp_param = NULL;
3137
3138         attr_number = lp_build_const_int32(gallivm,
3139                                            shader->ps_input_param_offset[input_index]);
3140
3141         if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
3142             inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
3143                 LLVMValueRef ij_out[2];
3144                 LLVMValueRef ddxy_out = si_llvm_emit_ddxy_interp(bld_base, interp_param);
3145
3146                 /*
3147                  * take the I then J parameters, and the DDX/Y for it, and
3148                  * calculate the IJ inputs for the interpolator.
3149                  * temp1 = ddx * offset/sample.x + I;
3150                  * interp_param.I = ddy * offset/sample.y + temp1;
3151                  * temp1 = ddx * offset/sample.x + J;
3152                  * interp_param.J = ddy * offset/sample.y + temp1;
3153                  */
3154                 for (i = 0; i < 2; i++) {
3155                         LLVMValueRef ix_ll = lp_build_const_int32(gallivm, i);
3156                         LLVMValueRef iy_ll = lp_build_const_int32(gallivm, i + 2);
3157                         LLVMValueRef ddx_el = LLVMBuildExtractElement(gallivm->builder,
3158                                                                       ddxy_out, ix_ll, "");
3159                         LLVMValueRef ddy_el = LLVMBuildExtractElement(gallivm->builder,
3160                                                                       ddxy_out, iy_ll, "");
3161                         LLVMValueRef interp_el = LLVMBuildExtractElement(gallivm->builder,
3162                                                                          interp_param, ix_ll, "");
3163                         LLVMValueRef temp1, temp2;
3164
3165                         interp_el = LLVMBuildBitCast(gallivm->builder, interp_el,
3166                                                      LLVMFloatTypeInContext(gallivm->context), "");
3167
3168                         temp1 = LLVMBuildFMul(gallivm->builder, ddx_el, emit_data->args[0], "");
3169
3170                         temp1 = LLVMBuildFAdd(gallivm->builder, temp1, interp_el, "");
3171
3172                         temp2 = LLVMBuildFMul(gallivm->builder, ddy_el, emit_data->args[1], "");
3173
3174                         temp2 = LLVMBuildFAdd(gallivm->builder, temp2, temp1, "");
3175
3176                         ij_out[i] = LLVMBuildBitCast(gallivm->builder,
3177                                                      temp2,
3178                                                      LLVMIntTypeInContext(gallivm->context, 32), "");
3179                 }
3180                 interp_param = lp_build_gather_values(bld_base->base.gallivm, ij_out, 2);
3181         }
3182
3183         intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant";
3184         for (chan = 0; chan < 2; chan++) {
3185                 LLVMValueRef args[4];
3186                 LLVMValueRef llvm_chan;
3187                 unsigned schan;
3188
3189                 schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan);
3190                 llvm_chan = lp_build_const_int32(gallivm, schan);
3191
3192                 args[0] = llvm_chan;
3193                 args[1] = attr_number;
3194                 args[2] = params;
3195                 args[3] = interp_param;
3196
3197                 emit_data->output[chan] =
3198                         lp_build_intrinsic(gallivm->builder, intr_name,
3199                                            input_type, args, args[3] ? 4 : 3,
3200                                            LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
3201         }
3202 }
3203
3204 static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base,
3205                                        struct lp_build_emit_data *emit_data)
3206 {
3207         LLVMValueRef (*imms)[4] = lp_soa_context(bld_base)->immediates;
3208         struct tgsi_src_register src0 = emit_data->inst->Src[0].Register;
3209         unsigned stream;
3210
3211         assert(src0.File == TGSI_FILE_IMMEDIATE);
3212
3213         stream = LLVMConstIntGetZExtValue(imms[src0.Index][src0.SwizzleX]) & 0x3;
3214         return stream;
3215 }
3216
3217 /* Emit one vertex from the geometry shader */
3218 static void si_llvm_emit_vertex(
3219         const struct lp_build_tgsi_action *action,
3220         struct lp_build_tgsi_context *bld_base,
3221         struct lp_build_emit_data *emit_data)
3222 {
3223         struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
3224         struct lp_build_context *uint = &bld_base->uint_bld;
3225         struct si_shader *shader = si_shader_ctx->shader;
3226         struct tgsi_shader_info *info = &shader->selector->info;
3227         struct gallivm_state *gallivm = bld_base->base.gallivm;
3228         LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context);
3229         LLVMValueRef soffset = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
3230                                             SI_PARAM_GS2VS_OFFSET);
3231         LLVMValueRef gs_next_vertex;
3232         LLVMValueRef can_emit, kill;
3233         LLVMValueRef args[2];
3234         unsigned chan;
3235         int i;
3236         unsigned stream;
3237
3238         stream = si_llvm_get_stream(bld_base, emit_data);
3239
3240         /* Write vertex attribute values to GSVS ring */
3241         gs_next_vertex = LLVMBuildLoad(gallivm->builder,
3242                                        si_shader_ctx->gs_next_vertex[stream],
3243                                        "");
3244
3245         /* If this thread has already emitted the declared maximum number of
3246          * vertices, kill it: excessive vertex emissions are not supposed to
3247          * have any effect, and GS threads have no externally observable
3248          * effects other than emitting vertices.
3249          */
3250         can_emit = LLVMBuildICmp(gallivm->builder, LLVMIntULE, gs_next_vertex,
3251                                  lp_build_const_int32(gallivm,
3252                                                       shader->selector->gs_max_out_vertices), "");
3253         kill = lp_build_select(&bld_base->base, can_emit,
3254                                lp_build_const_float(gallivm, 1.0f),
3255                                lp_build_const_float(gallivm, -1.0f));
3256
3257         lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
3258                            LLVMVoidTypeInContext(gallivm->context), &kill, 1, 0);
3259
3260         for (i = 0; i < info->num_outputs; i++) {
3261                 LLVMValueRef *out_ptr =
3262                         si_shader_ctx->radeon_bld.soa.outputs[i];
3263
3264                 for (chan = 0; chan < 4; chan++) {
3265                         LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
3266                         LLVMValueRef voffset =
3267                                 lp_build_const_int32(gallivm, (i * 4 + chan) *
3268                                                      shader->selector->gs_max_out_vertices);
3269
3270                         voffset = lp_build_add(uint, voffset, gs_next_vertex);
3271                         voffset = lp_build_mul_imm(uint, voffset, 4);
3272
3273                         out_val = LLVMBuildBitCast(gallivm->builder, out_val, i32, "");
3274
3275                         build_tbuffer_store(si_shader_ctx,
3276                                             si_shader_ctx->gsvs_ring[stream],
3277                                             out_val, 1,
3278                                             voffset, soffset, 0,
3279                                             V_008F0C_BUF_DATA_FORMAT_32,
3280                                             V_008F0C_BUF_NUM_FORMAT_UINT,
3281                                             1, 0, 1, 1, 0);
3282                 }
3283         }
3284         gs_next_vertex = lp_build_add(uint, gs_next_vertex,
3285                                       lp_build_const_int32(gallivm, 1));
3286
3287         LLVMBuildStore(gallivm->builder, gs_next_vertex, si_shader_ctx->gs_next_vertex[stream]);
3288
3289         /* Signal vertex emission */
3290         args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_EMIT | SENDMSG_GS | (stream << 8));
3291         args[1] = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
3292         lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
3293                         LLVMVoidTypeInContext(gallivm->context), args, 2,
3294                         LLVMNoUnwindAttribute);
3295 }
3296
3297 /* Cut one primitive from the geometry shader */
3298 static void si_llvm_emit_primitive(
3299         const struct lp_build_tgsi_action *action,
3300         struct lp_build_tgsi_context *bld_base,
3301         struct lp_build_emit_data *emit_data)
3302 {
3303         struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
3304         struct gallivm_state *gallivm = bld_base->base.gallivm;
3305         LLVMValueRef args[2];
3306         unsigned stream;
3307
3308         /* Signal primitive cut */
3309         stream = si_llvm_get_stream(bld_base, emit_data);
3310         args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_CUT | SENDMSG_GS | (stream << 8));
3311         args[1] = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
3312         lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
3313                         LLVMVoidTypeInContext(gallivm->context), args, 2,
3314                         LLVMNoUnwindAttribute);
3315 }
3316
3317 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
3318                                  struct lp_build_tgsi_context *bld_base,
3319                                  struct lp_build_emit_data *emit_data)
3320 {
3321         struct gallivm_state *gallivm = bld_base->base.gallivm;
3322
3323         lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.barrier.local",
3324                         LLVMVoidTypeInContext(gallivm->context), NULL, 0,
3325                         LLVMNoUnwindAttribute);
3326 }
3327
3328 static const struct lp_build_tgsi_action tex_action = {
3329         .fetch_args = tex_fetch_args,
3330         .emit = build_tex_intrinsic,
3331 };
3332
3333 static const struct lp_build_tgsi_action txq_action = {
3334         .fetch_args = txq_fetch_args,
3335         .emit = build_txq_intrinsic,
3336         .intr_name = "llvm.SI.resinfo"
3337 };
3338
3339 static const struct lp_build_tgsi_action interp_action = {
3340         .fetch_args = interp_fetch_args,
3341         .emit = build_interp_intrinsic,
3342 };
3343
3344 static void create_meta_data(struct si_shader_context *si_shader_ctx)
3345 {
3346         struct gallivm_state *gallivm = si_shader_ctx->radeon_bld.soa.bld_base.base.gallivm;
3347         LLVMValueRef args[3];
3348
3349         args[0] = LLVMMDStringInContext(gallivm->context, "const", 5);
3350         args[1] = 0;
3351         args[2] = lp_build_const_int32(gallivm, 1);
3352
3353         si_shader_ctx->const_md = LLVMMDNodeInContext(gallivm->context, args, 3);
3354 }
3355
3356 static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements)
3357 {
3358         return LLVMPointerType(LLVMArrayType(elem_type, num_elements),
3359                                CONST_ADDR_SPACE);
3360 }
3361
3362 static void declare_streamout_params(struct si_shader_context *si_shader_ctx,
3363                                      struct pipe_stream_output_info *so,
3364                                      LLVMTypeRef *params, LLVMTypeRef i32,
3365                                      unsigned *num_params)
3366 {
3367         int i;
3368
3369         /* Streamout SGPRs. */
3370         if (so->num_outputs) {
3371                 params[si_shader_ctx->param_streamout_config = (*num_params)++] = i32;
3372                 params[si_shader_ctx->param_streamout_write_index = (*num_params)++] = i32;
3373         }
3374         /* A streamout buffer offset is loaded if the stride is non-zero. */
3375         for (i = 0; i < 4; i++) {
3376                 if (!so->stride[i])
3377                         continue;
3378
3379                 params[si_shader_ctx->param_streamout_offset[i] = (*num_params)++] = i32;
3380         }
3381 }
3382
3383 static void create_function(struct si_shader_context *si_shader_ctx)
3384 {
3385         struct lp_build_tgsi_context *bld_base = &si_shader_ctx->radeon_bld.soa.bld_base;
3386         struct gallivm_state *gallivm = bld_base->base.gallivm;
3387         struct si_shader *shader = si_shader_ctx->shader;
3388         LLVMTypeRef params[SI_NUM_PARAMS], f32, i8, i32, v2i32, v3i32, v16i8, v4i32, v8i32;
3389         unsigned i, last_array_pointer, last_sgpr, num_params;
3390
3391         i8 = LLVMInt8TypeInContext(gallivm->context);
3392         i32 = LLVMInt32TypeInContext(gallivm->context);
3393         f32 = LLVMFloatTypeInContext(gallivm->context);
3394         v2i32 = LLVMVectorType(i32, 2);
3395         v3i32 = LLVMVectorType(i32, 3);
3396         v4i32 = LLVMVectorType(i32, 4);
3397         v8i32 = LLVMVectorType(i32, 8);
3398         v16i8 = LLVMVectorType(i8, 16);
3399
3400         params[SI_PARAM_RW_BUFFERS] = const_array(v16i8, SI_NUM_RW_BUFFERS);
3401         params[SI_PARAM_CONST] = const_array(v16i8, SI_NUM_CONST_BUFFERS);
3402         params[SI_PARAM_SAMPLER] = const_array(v4i32, SI_NUM_SAMPLER_STATES);
3403         params[SI_PARAM_RESOURCE] = const_array(v8i32, SI_NUM_SAMPLER_VIEWS);
3404         last_array_pointer = SI_PARAM_RESOURCE;
3405
3406         switch (si_shader_ctx->type) {
3407         case TGSI_PROCESSOR_VERTEX:
3408                 params[SI_PARAM_VERTEX_BUFFER] = const_array(v16i8, SI_NUM_VERTEX_BUFFERS);
3409                 last_array_pointer = SI_PARAM_VERTEX_BUFFER;
3410                 params[SI_PARAM_BASE_VERTEX] = i32;
3411                 params[SI_PARAM_START_INSTANCE] = i32;
3412                 num_params = SI_PARAM_START_INSTANCE+1;
3413
3414                 if (shader->key.vs.as_es) {
3415                         params[si_shader_ctx->param_es2gs_offset = num_params++] = i32;
3416                 } else if (shader->key.vs.as_ls) {
3417                         params[SI_PARAM_LS_OUT_LAYOUT] = i32;
3418                         num_params = SI_PARAM_LS_OUT_LAYOUT+1;
3419                 } else {
3420                         if (shader->is_gs_copy_shader) {
3421                                 last_array_pointer = SI_PARAM_CONST;
3422                                 num_params = SI_PARAM_CONST+1;
3423                         }
3424
3425                         /* The locations of the other parameters are assigned dynamically. */
3426                         declare_streamout_params(si_shader_ctx, &shader->selector->so,
3427                                                  params, i32, &num_params);
3428                 }
3429
3430                 last_sgpr = num_params-1;
3431
3432                 /* VGPRs */
3433                 params[si_shader_ctx->param_vertex_id = num_params++] = i32;
3434                 params[si_shader_ctx->param_rel_auto_id = num_params++] = i32;
3435                 params[num_params++] = i32; /* unused */
3436                 params[si_shader_ctx->param_instance_id = num_params++] = i32;
3437                 break;
3438
3439         case TGSI_PROCESSOR_TESS_CTRL:
3440                 params[SI_PARAM_TCS_OUT_OFFSETS] = i32;
3441                 params[SI_PARAM_TCS_OUT_LAYOUT] = i32;
3442                 params[SI_PARAM_TCS_IN_LAYOUT] = i32;
3443                 params[SI_PARAM_TESS_FACTOR_OFFSET] = i32;
3444                 last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET;
3445
3446                 /* VGPRs */
3447                 params[SI_PARAM_PATCH_ID] = i32;
3448                 params[SI_PARAM_REL_IDS] = i32;
3449                 num_params = SI_PARAM_REL_IDS+1;
3450                 break;
3451
3452         case TGSI_PROCESSOR_TESS_EVAL:
3453                 params[SI_PARAM_TCS_OUT_OFFSETS] = i32;
3454                 params[SI_PARAM_TCS_OUT_LAYOUT] = i32;
3455                 num_params = SI_PARAM_TCS_OUT_LAYOUT+1;
3456
3457                 if (shader->key.tes.as_es) {
3458                         params[si_shader_ctx->param_es2gs_offset = num_params++] = i32;
3459                 } else {
3460                         declare_streamout_params(si_shader_ctx, &shader->selector->so,
3461                                                  params, i32, &num_params);
3462                 }
3463                 last_sgpr = num_params - 1;
3464
3465                 /* VGPRs */
3466                 params[si_shader_ctx->param_tes_u = num_params++] = f32;
3467                 params[si_shader_ctx->param_tes_v = num_params++] = f32;
3468                 params[si_shader_ctx->param_tes_rel_patch_id = num_params++] = i32;
3469                 params[si_shader_ctx->param_tes_patch_id = num_params++] = i32;
3470                 break;
3471
3472         case TGSI_PROCESSOR_GEOMETRY:
3473                 params[SI_PARAM_GS2VS_OFFSET] = i32;
3474                 params[SI_PARAM_GS_WAVE_ID] = i32;
3475                 last_sgpr = SI_PARAM_GS_WAVE_ID;
3476
3477                 /* VGPRs */
3478                 params[SI_PARAM_VTX0_OFFSET] = i32;
3479                 params[SI_PARAM_VTX1_OFFSET] = i32;
3480                 params[SI_PARAM_PRIMITIVE_ID] = i32;
3481                 params[SI_PARAM_VTX2_OFFSET] = i32;
3482                 params[SI_PARAM_VTX3_OFFSET] = i32;
3483                 params[SI_PARAM_VTX4_OFFSET] = i32;
3484                 params[SI_PARAM_VTX5_OFFSET] = i32;
3485                 params[SI_PARAM_GS_INSTANCE_ID] = i32;
3486                 num_params = SI_PARAM_GS_INSTANCE_ID+1;
3487                 break;
3488
3489         case TGSI_PROCESSOR_FRAGMENT:
3490                 params[SI_PARAM_ALPHA_REF] = f32;
3491                 params[SI_PARAM_PRIM_MASK] = i32;
3492                 last_sgpr = SI_PARAM_PRIM_MASK;
3493                 params[SI_PARAM_PERSP_SAMPLE] = v2i32;
3494                 params[SI_PARAM_PERSP_CENTER] = v2i32;
3495                 params[SI_PARAM_PERSP_CENTROID] = v2i32;
3496                 params[SI_PARAM_PERSP_PULL_MODEL] = v3i32;
3497                 params[SI_PARAM_LINEAR_SAMPLE] = v2i32;
3498                 params[SI_PARAM_LINEAR_CENTER] = v2i32;
3499                 params[SI_PARAM_LINEAR_CENTROID] = v2i32;
3500                 params[SI_PARAM_LINE_STIPPLE_TEX] = f32;
3501                 params[SI_PARAM_POS_X_FLOAT] = f32;
3502                 params[SI_PARAM_POS_Y_FLOAT] = f32;
3503                 params[SI_PARAM_POS_Z_FLOAT] = f32;
3504                 params[SI_PARAM_POS_W_FLOAT] = f32;
3505                 params[SI_PARAM_FRONT_FACE] = f32;
3506                 params[SI_PARAM_ANCILLARY] = i32;
3507                 params[SI_PARAM_SAMPLE_COVERAGE] = f32;
3508                 params[SI_PARAM_POS_FIXED_PT] = f32;
3509                 num_params = SI_PARAM_POS_FIXED_PT+1;
3510                 break;
3511
3512         default:
3513                 assert(0 && "unimplemented shader");
3514                 return;
3515         }
3516
3517         assert(num_params <= Elements(params));
3518         radeon_llvm_create_func(&si_shader_ctx->radeon_bld, params, num_params);
3519         radeon_llvm_shader_type(si_shader_ctx->radeon_bld.main_fn, si_shader_ctx->type);
3520
3521         if (shader->dx10_clamp_mode)
3522                 LLVMAddTargetDependentFunctionAttr(si_shader_ctx->radeon_bld.main_fn,
3523                                                    "enable-no-nans-fp-math", "true");
3524
3525         for (i = 0; i <= last_sgpr; ++i) {
3526                 LLVMValueRef P = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, i);
3527
3528                 /* We tell llvm that array inputs are passed by value to allow Sinking pass
3529                  * to move load. Inputs are constant so this is fine. */
3530                 if (i <= last_array_pointer)
3531                         LLVMAddAttribute(P, LLVMByValAttribute);
3532                 else
3533                         LLVMAddAttribute(P, LLVMInRegAttribute);
3534         }
3535
3536         if (bld_base->info &&
3537             (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 ||
3538              bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 ||
3539              bld_base->info->opcode_count[TGSI_OPCODE_DDX_FINE] > 0 ||
3540              bld_base->info->opcode_count[TGSI_OPCODE_DDY_FINE] > 0 ||
3541              bld_base->info->opcode_count[TGSI_OPCODE_INTERP_OFFSET] > 0 ||
3542              bld_base->info->opcode_count[TGSI_OPCODE_INTERP_SAMPLE] > 0))
3543                 si_shader_ctx->lds =
3544                         LLVMAddGlobalInAddressSpace(gallivm->module,
3545                                                     LLVMArrayType(i32, 64),
3546                                                     "ddxy_lds",
3547                                                     LOCAL_ADDR_SPACE);
3548
3549         if ((si_shader_ctx->type == TGSI_PROCESSOR_VERTEX && shader->key.vs.as_ls) ||
3550             si_shader_ctx->type == TGSI_PROCESSOR_TESS_CTRL ||
3551             si_shader_ctx->type == TGSI_PROCESSOR_TESS_EVAL) {
3552                 /* This is the upper bound, maximum is 32 inputs times 32 vertices */
3553                 unsigned vertex_data_dw_size = 32*32*4;
3554                 unsigned patch_data_dw_size = 32*4;
3555                 /* The formula is: TCS inputs + TCS outputs + TCS patch outputs. */
3556                 unsigned patch_dw_size = vertex_data_dw_size*2 + patch_data_dw_size;
3557                 unsigned lds_dwords = patch_dw_size;
3558
3559                 /* The actual size is computed outside of the shader to reduce
3560                  * the number of shader variants. */
3561                 si_shader_ctx->lds =
3562                         LLVMAddGlobalInAddressSpace(gallivm->module,
3563                                                     LLVMArrayType(i32, lds_dwords),
3564                                                     "tess_lds",
3565                                                     LOCAL_ADDR_SPACE);
3566         }
3567 }
3568
3569 static void preload_constants(struct si_shader_context *si_shader_ctx)
3570 {
3571         struct lp_build_tgsi_context * bld_base = &si_shader_ctx->radeon_bld.soa.bld_base;
3572         struct gallivm_state * gallivm = bld_base->base.gallivm;
3573         const struct tgsi_shader_info * info = bld_base->info;
3574         unsigned buf;
3575         LLVMValueRef ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST);
3576
3577         for (buf = 0; buf < SI_NUM_CONST_BUFFERS; buf++) {
3578                 unsigned i, num_const = info->const_file_max[buf] + 1;
3579
3580                 if (num_const == 0)
3581                         continue;
3582
3583                 /* Allocate space for the constant values */
3584                 si_shader_ctx->constants[buf] = CALLOC(num_const * 4, sizeof(LLVMValueRef));
3585
3586                 /* Load the resource descriptor */
3587                 si_shader_ctx->const_resource[buf] =
3588                         build_indexed_load_const(si_shader_ctx, ptr, lp_build_const_int32(gallivm, buf));
3589
3590                 /* Load the constants, we rely on the code sinking to do the rest */
3591                 for (i = 0; i < num_const * 4; ++i) {
3592                         si_shader_ctx->constants[buf][i] =
3593                                 buffer_load_const(gallivm->builder,
3594                                         si_shader_ctx->const_resource[buf],
3595                                         lp_build_const_int32(gallivm, i * 4),
3596                                         bld_base->base.elem_type);
3597                 }
3598         }
3599 }
3600
3601 static void preload_samplers(struct si_shader_context *si_shader_ctx)
3602 {
3603         struct lp_build_tgsi_context * bld_base = &si_shader_ctx->radeon_bld.soa.bld_base;
3604         struct gallivm_state * gallivm = bld_base->base.gallivm;
3605         const struct tgsi_shader_info * info = bld_base->info;
3606
3607         unsigned i, num_samplers = info->file_max[TGSI_FILE_SAMPLER] + 1;
3608
3609         LLVMValueRef res_ptr, samp_ptr;
3610         LLVMValueRef offset;
3611
3612         if (num_samplers == 0)
3613                 return;
3614
3615         res_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_RESOURCE);
3616         samp_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER);
3617
3618         /* Load the resources and samplers, we rely on the code sinking to do the rest */
3619         for (i = 0; i < num_samplers; ++i) {
3620                 /* Resource */
3621                 offset = lp_build_const_int32(gallivm, i);
3622                 si_shader_ctx->resources[i] = build_indexed_load_const(si_shader_ctx, res_ptr, offset);
3623
3624                 /* Sampler */
3625                 offset = lp_build_const_int32(gallivm, i);
3626                 si_shader_ctx->samplers[i] = build_indexed_load_const(si_shader_ctx, samp_ptr, offset);
3627
3628                 /* FMASK resource */
3629                 if (info->is_msaa_sampler[i]) {
3630                         offset = lp_build_const_int32(gallivm, SI_FMASK_TEX_OFFSET + i);
3631                         si_shader_ctx->resources[SI_FMASK_TEX_OFFSET + i] =
3632                                 build_indexed_load_const(si_shader_ctx, res_ptr, offset);
3633                 }
3634         }
3635 }
3636
3637 static void preload_streamout_buffers(struct si_shader_context *si_shader_ctx)
3638 {
3639         struct lp_build_tgsi_context * bld_base = &si_shader_ctx->radeon_bld.soa.bld_base;
3640         struct gallivm_state * gallivm = bld_base->base.gallivm;
3641         unsigned i;
3642
3643         /* Streamout can only be used if the shader is compiled as VS. */
3644         if (!si_shader_ctx->shader->selector->so.num_outputs ||
3645             (si_shader_ctx->type == TGSI_PROCESSOR_VERTEX &&
3646              (si_shader_ctx->shader->key.vs.as_es ||
3647               si_shader_ctx->shader->key.vs.as_ls)) ||
3648             (si_shader_ctx->type == TGSI_PROCESSOR_TESS_EVAL &&
3649              si_shader_ctx->shader->key.tes.as_es))
3650                 return;
3651
3652         LLVMValueRef buf_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
3653                                             SI_PARAM_RW_BUFFERS);
3654
3655         /* Load the resources, we rely on the code sinking to do the rest */
3656         for (i = 0; i < 4; ++i) {
3657                 if (si_shader_ctx->shader->selector->so.stride[i]) {
3658                         LLVMValueRef offset = lp_build_const_int32(gallivm,
3659                                                                    SI_SO_BUF_OFFSET + i);
3660
3661                         si_shader_ctx->so_buffers[i] = build_indexed_load_const(si_shader_ctx, buf_ptr, offset);
3662                 }
3663         }
3664 }
3665
3666 /**
3667  * Load ESGS and GSVS ring buffer resource descriptors and save the variables
3668  * for later use.
3669  */
3670 static void preload_ring_buffers(struct si_shader_context *si_shader_ctx)
3671 {
3672         struct gallivm_state *gallivm =
3673                 si_shader_ctx->radeon_bld.soa.bld_base.base.gallivm;
3674
3675         LLVMValueRef buf_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
3676                                             SI_PARAM_RW_BUFFERS);
3677
3678         if ((si_shader_ctx->type == TGSI_PROCESSOR_VERTEX &&
3679              si_shader_ctx->shader->key.vs.as_es) ||
3680             (si_shader_ctx->type == TGSI_PROCESSOR_TESS_EVAL &&
3681              si_shader_ctx->shader->key.tes.as_es) ||
3682             si_shader_ctx->type == TGSI_PROCESSOR_GEOMETRY) {
3683                 LLVMValueRef offset = lp_build_const_int32(gallivm, SI_RING_ESGS);
3684
3685                 si_shader_ctx->esgs_ring =
3686                         build_indexed_load_const(si_shader_ctx, buf_ptr, offset);
3687         }
3688
3689         if (si_shader_ctx->shader->is_gs_copy_shader) {
3690                 LLVMValueRef offset = lp_build_const_int32(gallivm, SI_RING_GSVS);
3691
3692                 si_shader_ctx->gsvs_ring[0] =
3693                         build_indexed_load_const(si_shader_ctx, buf_ptr, offset);
3694         }
3695         if (si_shader_ctx->type == TGSI_PROCESSOR_GEOMETRY) {
3696                 int i;
3697                 for (i = 0; i < 4; i++) {
3698                         LLVMValueRef offset = lp_build_const_int32(gallivm, SI_RING_GSVS + i);
3699
3700                         si_shader_ctx->gsvs_ring[i] =
3701                                 build_indexed_load_const(si_shader_ctx, buf_ptr, offset);
3702                 }
3703         }
3704 }
3705
3706 void si_shader_binary_read_config(const struct si_screen *sscreen,
3707                                 struct si_shader *shader,
3708                                 unsigned symbol_offset)
3709 {
3710         unsigned i;
3711         const unsigned char *config =
3712                 radeon_shader_binary_config_start(&shader->binary,
3713                                                 symbol_offset);
3714
3715         /* XXX: We may be able to emit some of these values directly rather than
3716          * extracting fields to be emitted later.
3717          */
3718
3719         for (i = 0; i < shader->binary.config_size_per_symbol; i+= 8) {
3720                 unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
3721                 unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
3722                 switch (reg) {
3723                 case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
3724                 case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
3725                 case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
3726                 case R_00B848_COMPUTE_PGM_RSRC1:
3727                         shader->num_sgprs = MAX2(shader->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
3728                         shader->num_vgprs = MAX2(shader->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
3729                         shader->float_mode =  G_00B028_FLOAT_MODE(value);
3730                         break;
3731                 case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
3732                         shader->lds_size = MAX2(shader->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
3733                         break;
3734                 case R_00B84C_COMPUTE_PGM_RSRC2:
3735                         shader->lds_size = MAX2(shader->lds_size, G_00B84C_LDS_SIZE(value));
3736                         break;
3737                 case R_0286CC_SPI_PS_INPUT_ENA:
3738                         shader->spi_ps_input_ena = value;
3739                         break;
3740                 case R_0286E8_SPI_TMPRING_SIZE:
3741                 case R_00B860_COMPUTE_TMPRING_SIZE:
3742                         /* WAVESIZE is in units of 256 dwords. */
3743                         shader->scratch_bytes_per_wave =
3744                                 G_00B860_WAVESIZE(value) * 256 * 4 * 1;
3745                         break;
3746                 default:
3747                         fprintf(stderr, "Warning: Compiler emitted unknown "
3748                                 "config register: 0x%x\n", reg);
3749                         break;
3750                 }
3751         }
3752 }
3753
3754 void si_shader_apply_scratch_relocs(struct si_context *sctx,
3755                         struct si_shader *shader,
3756                         uint64_t scratch_va)
3757 {
3758         unsigned i;
3759         uint32_t scratch_rsrc_dword0 = scratch_va & 0xffffffff;
3760         uint32_t scratch_rsrc_dword1 =
3761                 S_008F04_BASE_ADDRESS_HI(scratch_va >> 32)
3762                 |  S_008F04_STRIDE(shader->scratch_bytes_per_wave / 64);
3763
3764         for (i = 0 ; i < shader->binary.reloc_count; i++) {
3765                 const struct radeon_shader_reloc *reloc =
3766                                         &shader->binary.relocs[i];
3767                 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
3768                         util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
3769                         &scratch_rsrc_dword0, 4);
3770                 } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
3771                         util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
3772                         &scratch_rsrc_dword1, 4);
3773                 }
3774         }
3775 }
3776
3777 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
3778 {
3779         const struct radeon_shader_binary *binary = &shader->binary;
3780         unsigned code_size = binary->code_size + binary->rodata_size;
3781         unsigned char *ptr;
3782
3783         r600_resource_reference(&shader->bo, NULL);
3784         shader->bo = si_resource_create_custom(&sscreen->b.b,
3785                                                PIPE_USAGE_IMMUTABLE,
3786                                                code_size);
3787         if (!shader->bo)
3788                 return -ENOMEM;
3789
3790         ptr = sscreen->b.ws->buffer_map(shader->bo->cs_buf, NULL,
3791                                         PIPE_TRANSFER_READ_WRITE);
3792         util_memcpy_cpu_to_le32(ptr, binary->code, binary->code_size);
3793         if (binary->rodata_size > 0) {
3794                 ptr += binary->code_size;
3795                 util_memcpy_cpu_to_le32(ptr, binary->rodata,
3796                                         binary->rodata_size);
3797         }
3798
3799         sscreen->b.ws->buffer_unmap(shader->bo->cs_buf);
3800         return 0;
3801 }
3802
3803 int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader)
3804 {
3805         const struct radeon_shader_binary *binary = &shader->binary;
3806         unsigned i;
3807         bool dump  = r600_can_dump_shader(&sscreen->b,
3808                 shader->selector ? shader->selector->tokens : NULL);
3809
3810         si_shader_binary_read_config(sscreen, shader, 0);
3811         si_shader_binary_upload(sscreen, shader);
3812
3813         if (dump) {
3814                 if (binary->disasm_string) {
3815                         fprintf(stderr, "\nShader Disassembly:\n\n");
3816                         fprintf(stderr, "%s\n", binary->disasm_string);
3817                 } else {
3818                         fprintf(stderr, "SI CODE:\n");
3819                         for (i = 0; i < binary->code_size; i+=4 ) {
3820                                 fprintf(stderr, "@0x%x: %02x%02x%02x%02x\n", i, binary->code[i + 3],
3821                                 binary->code[i + 2], binary->code[i + 1],
3822                                 binary->code[i]);
3823                         }
3824                 }
3825
3826                 fprintf(stderr, "*** SHADER STATS ***\n"
3827                         "SGPRS: %d\nVGPRS: %d\nCode Size: %d bytes\nLDS: %d blocks\n"
3828                         "Scratch: %d bytes per wave\n********************\n",
3829                         shader->num_sgprs, shader->num_vgprs, binary->code_size,
3830                         shader->lds_size, shader->scratch_bytes_per_wave);
3831         }
3832         return 0;
3833 }
3834
3835 int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
3836                     LLVMTargetMachineRef tm, LLVMModuleRef mod)
3837 {
3838         int r = 0;
3839         bool dump = r600_can_dump_shader(&sscreen->b,
3840                         shader->selector ? shader->selector->tokens : NULL);
3841         r = radeon_llvm_compile(mod, &shader->binary,
3842                 r600_get_llvm_processor_name(sscreen->b.family), dump, tm);
3843
3844         if (r) {
3845                 return r;
3846         }
3847         r = si_shader_binary_read(sscreen, shader);
3848
3849         FREE(shader->binary.config);
3850         FREE(shader->binary.rodata);
3851         FREE(shader->binary.global_symbol_offsets);
3852         if (shader->scratch_bytes_per_wave == 0) {
3853                 FREE(shader->binary.code);
3854                 FREE(shader->binary.relocs);
3855                 memset(&shader->binary, 0,
3856                        offsetof(struct radeon_shader_binary, disasm_string));
3857         }
3858         return r;
3859 }
3860
3861 /* Generate code for the hardware VS shader stage to go with a geometry shader */
3862 static int si_generate_gs_copy_shader(struct si_screen *sscreen,
3863                                       struct si_shader_context *si_shader_ctx,
3864                                       struct si_shader *gs, bool dump)
3865 {
3866         struct gallivm_state *gallivm = &si_shader_ctx->radeon_bld.gallivm;
3867         struct lp_build_tgsi_context *bld_base = &si_shader_ctx->radeon_bld.soa.bld_base;
3868         struct lp_build_context *base = &bld_base->base;
3869         struct lp_build_context *uint = &bld_base->uint_bld;
3870         struct si_shader *shader = si_shader_ctx->shader;
3871         struct si_shader_output_values *outputs;
3872         struct tgsi_shader_info *gsinfo = &gs->selector->info;
3873         LLVMValueRef args[9];
3874         int i, r;
3875
3876         outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0]));
3877
3878         si_shader_ctx->type = TGSI_PROCESSOR_VERTEX;
3879         shader->is_gs_copy_shader = true;
3880
3881         radeon_llvm_context_init(&si_shader_ctx->radeon_bld);
3882
3883         create_meta_data(si_shader_ctx);
3884         create_function(si_shader_ctx);
3885         preload_streamout_buffers(si_shader_ctx);
3886         preload_ring_buffers(si_shader_ctx);
3887
3888         args[0] = si_shader_ctx->gsvs_ring[0];
3889         args[1] = lp_build_mul_imm(uint,
3890                                    LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
3891                                                 si_shader_ctx->param_vertex_id),
3892                                    4);
3893         args[3] = uint->zero;
3894         args[4] = uint->one;  /* OFFEN */
3895         args[5] = uint->zero; /* IDXEN */
3896         args[6] = uint->one;  /* GLC */
3897         args[7] = uint->one;  /* SLC */
3898         args[8] = uint->zero; /* TFE */
3899
3900         /* Fetch vertex data from GSVS ring */
3901         for (i = 0; i < gsinfo->num_outputs; ++i) {
3902                 unsigned chan;
3903
3904                 outputs[i].name = gsinfo->output_semantic_name[i];
3905                 outputs[i].sid = gsinfo->output_semantic_index[i];
3906
3907                 for (chan = 0; chan < 4; chan++) {
3908                         args[2] = lp_build_const_int32(gallivm,
3909                                                        (i * 4 + chan) *
3910                                                        gs->selector->gs_max_out_vertices * 16 * 4);
3911
3912                         outputs[i].values[chan] =
3913                                 LLVMBuildBitCast(gallivm->builder,
3914                                                  lp_build_intrinsic(gallivm->builder,
3915                                                                  "llvm.SI.buffer.load.dword.i32.i32",
3916                                                                  LLVMInt32TypeInContext(gallivm->context),
3917                                                                  args, 9,
3918                                                                  LLVMReadOnlyAttribute | LLVMNoUnwindAttribute),
3919                                                  base->elem_type, "");
3920                 }
3921         }
3922
3923         si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs);
3924
3925         radeon_llvm_finalize_module(&si_shader_ctx->radeon_bld);
3926
3927         if (dump)
3928                 fprintf(stderr, "Copy Vertex Shader for Geometry Shader:\n\n");
3929
3930         r = si_compile_llvm(sscreen, si_shader_ctx->shader,
3931                             si_shader_ctx->tm, bld_base->base.gallivm->module);
3932
3933         radeon_llvm_dispose(&si_shader_ctx->radeon_bld);
3934
3935         FREE(outputs);
3936         return r;
3937 }
3938
3939 static void si_dump_key(unsigned shader, union si_shader_key *key)
3940 {
3941         int i;
3942
3943         fprintf(stderr, "SHADER KEY\n");
3944
3945         switch (shader) {
3946         case PIPE_SHADER_VERTEX:
3947                 fprintf(stderr, "  instance_divisors = {");
3948                 for (i = 0; i < Elements(key->vs.instance_divisors); i++)
3949                         fprintf(stderr, !i ? "%u" : ", %u",
3950                                 key->vs.instance_divisors[i]);
3951                 fprintf(stderr, "}\n");
3952
3953                 if (key->vs.as_es)
3954                         fprintf(stderr, "  es_enabled_outputs = 0x%"PRIx64"\n",
3955                                 key->vs.es_enabled_outputs);
3956                 fprintf(stderr, "  as_es = %u\n", key->vs.as_es);
3957                 fprintf(stderr, "  as_es = %u\n", key->vs.as_ls);
3958                 break;
3959
3960         case PIPE_SHADER_TESS_CTRL:
3961                 fprintf(stderr, "  prim_mode = %u\n", key->tcs.prim_mode);
3962                 break;
3963
3964         case PIPE_SHADER_TESS_EVAL:
3965                 if (key->tes.as_es)
3966                         fprintf(stderr, "  es_enabled_outputs = 0x%"PRIx64"\n",
3967                                 key->tes.es_enabled_outputs);
3968                 fprintf(stderr, "  as_es = %u\n", key->tes.as_es);
3969                 break;
3970
3971         case PIPE_SHADER_GEOMETRY:
3972                 break;
3973
3974         case PIPE_SHADER_FRAGMENT:
3975                 fprintf(stderr, "  export_16bpc = 0x%X\n", key->ps.export_16bpc);
3976                 fprintf(stderr, "  last_cbuf = %u\n", key->ps.last_cbuf);
3977                 fprintf(stderr, "  color_two_side = %u\n", key->ps.color_two_side);
3978                 fprintf(stderr, "  alpha_func = %u\n", key->ps.alpha_func);
3979                 fprintf(stderr, "  alpha_to_one = %u\n", key->ps.alpha_to_one);
3980                 fprintf(stderr, "  poly_stipple = %u\n", key->ps.poly_stipple);
3981                 break;
3982
3983         default:
3984                 assert(0);
3985         }
3986 }
3987
3988 int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
3989                      struct si_shader *shader)
3990 {
3991         struct si_shader_selector *sel = shader->selector;
3992         struct tgsi_token *tokens = sel->tokens;
3993         struct si_shader_context si_shader_ctx;
3994         struct lp_build_tgsi_context * bld_base;
3995         struct tgsi_shader_info stipple_shader_info;
3996         LLVMModuleRef mod;
3997         int r = 0;
3998         bool poly_stipple = sel->type == PIPE_SHADER_FRAGMENT &&
3999                             shader->key.ps.poly_stipple;
4000         bool dump = r600_can_dump_shader(&sscreen->b, sel->tokens);
4001
4002         if (poly_stipple) {
4003                 tokens = util_pstipple_create_fragment_shader(tokens, NULL,
4004                                                 SI_POLY_STIPPLE_SAMPLER);
4005                 tgsi_scan_shader(tokens, &stipple_shader_info);
4006         }
4007
4008         /* Dump TGSI code before doing TGSI->LLVM conversion in case the
4009          * conversion fails. */
4010         if (dump) {
4011                 si_dump_key(sel->type, &shader->key);
4012                 tgsi_dump(tokens, 0);
4013                 si_dump_streamout(&sel->so);
4014         }
4015
4016         assert(shader->nparam == 0);
4017
4018         memset(&si_shader_ctx, 0, sizeof(si_shader_ctx));
4019         radeon_llvm_context_init(&si_shader_ctx.radeon_bld);
4020         bld_base = &si_shader_ctx.radeon_bld.soa.bld_base;
4021
4022         if (sel->type != PIPE_SHADER_COMPUTE)
4023                 shader->dx10_clamp_mode = true;
4024
4025         if (sel->info.uses_kill)
4026                 shader->db_shader_control |= S_02880C_KILL_ENABLE(1);
4027
4028         shader->uses_instanceid = sel->info.uses_instanceid;
4029         bld_base->info = poly_stipple ? &stipple_shader_info : &sel->info;
4030         bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;
4031
4032         bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action;
4033         bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE] = interp_action;
4034         bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET] = interp_action;
4035
4036         bld_base->op_actions[TGSI_OPCODE_TEX] = tex_action;
4037         bld_base->op_actions[TGSI_OPCODE_TEX2] = tex_action;
4038         bld_base->op_actions[TGSI_OPCODE_TXB] = tex_action;
4039         bld_base->op_actions[TGSI_OPCODE_TXB2] = tex_action;
4040         bld_base->op_actions[TGSI_OPCODE_TXD] = tex_action;
4041         bld_base->op_actions[TGSI_OPCODE_TXF] = tex_action;
4042         bld_base->op_actions[TGSI_OPCODE_TXL] = tex_action;
4043         bld_base->op_actions[TGSI_OPCODE_TXL2] = tex_action;
4044         bld_base->op_actions[TGSI_OPCODE_TXP] = tex_action;
4045         bld_base->op_actions[TGSI_OPCODE_TXQ] = txq_action;
4046         bld_base->op_actions[TGSI_OPCODE_TG4] = tex_action;
4047         bld_base->op_actions[TGSI_OPCODE_LODQ] = tex_action;
4048
4049         bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy;
4050         bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy;
4051         bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy;
4052         bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy;
4053
4054         bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_llvm_emit_vertex;
4055         bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive;
4056         bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier;
4057
4058         if (HAVE_LLVM >= 0x0306) {
4059                 bld_base->op_actions[TGSI_OPCODE_MAX].emit = build_tgsi_intrinsic_nomem;
4060                 bld_base->op_actions[TGSI_OPCODE_MAX].intr_name = "llvm.maxnum.f32";
4061                 bld_base->op_actions[TGSI_OPCODE_MIN].emit = build_tgsi_intrinsic_nomem;
4062                 bld_base->op_actions[TGSI_OPCODE_MIN].intr_name = "llvm.minnum.f32";
4063         }
4064
4065         si_shader_ctx.radeon_bld.load_system_value = declare_system_value;
4066         si_shader_ctx.shader = shader;
4067         si_shader_ctx.type = tgsi_get_processor_type(tokens);
4068         si_shader_ctx.screen = sscreen;
4069         si_shader_ctx.tm = tm;
4070
4071         switch (si_shader_ctx.type) {
4072         case TGSI_PROCESSOR_VERTEX:
4073                 si_shader_ctx.radeon_bld.load_input = declare_input_vs;
4074                 if (shader->key.vs.as_ls)
4075                         bld_base->emit_epilogue = si_llvm_emit_ls_epilogue;
4076                 else if (shader->key.vs.as_es)
4077                         bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
4078                 else
4079                         bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
4080                 break;
4081         case TGSI_PROCESSOR_TESS_CTRL:
4082                 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs;
4083                 bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = fetch_output_tcs;
4084                 bld_base->emit_store = store_output_tcs;
4085                 bld_base->emit_epilogue = si_llvm_emit_tcs_epilogue;
4086                 break;
4087         case TGSI_PROCESSOR_TESS_EVAL:
4088                 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes;
4089                 if (shader->key.tes.as_es)
4090                         bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
4091                 else
4092                         bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
4093                 break;
4094         case TGSI_PROCESSOR_GEOMETRY:
4095                 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs;
4096                 bld_base->emit_epilogue = si_llvm_emit_gs_epilogue;
4097                 break;
4098         case TGSI_PROCESSOR_FRAGMENT:
4099                 si_shader_ctx.radeon_bld.load_input = declare_input_fs;
4100                 bld_base->emit_epilogue = si_llvm_emit_fs_epilogue;
4101
4102                 switch (sel->info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT]) {
4103                 case TGSI_FS_DEPTH_LAYOUT_GREATER:
4104                         shader->db_shader_control |=
4105                                 S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_GREATER_THAN_Z);
4106                         break;
4107                 case TGSI_FS_DEPTH_LAYOUT_LESS:
4108                         shader->db_shader_control |=
4109                                 S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_LESS_THAN_Z);
4110                         break;
4111                 }
4112                 break;
4113         default:
4114                 assert(!"Unsupported shader type");
4115                 return -1;
4116         }
4117
4118         create_meta_data(&si_shader_ctx);
4119         create_function(&si_shader_ctx);
4120         preload_constants(&si_shader_ctx);
4121         preload_samplers(&si_shader_ctx);
4122         preload_streamout_buffers(&si_shader_ctx);
4123         preload_ring_buffers(&si_shader_ctx);
4124
4125         if (si_shader_ctx.type == TGSI_PROCESSOR_GEOMETRY) {
4126                 int i;
4127                 for (i = 0; i < 4; i++) {
4128                         si_shader_ctx.gs_next_vertex[i] =
4129                                 lp_build_alloca(bld_base->base.gallivm,
4130                                                 bld_base->uint_bld.elem_type, "");
4131                 }
4132         }
4133
4134         if (!lp_build_tgsi_llvm(bld_base, tokens)) {
4135                 fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
4136                 goto out;
4137         }
4138
4139         radeon_llvm_finalize_module(&si_shader_ctx.radeon_bld);
4140
4141         mod = bld_base->base.gallivm->module;
4142         r = si_compile_llvm(sscreen, shader, tm, mod);
4143         if (r) {
4144                 fprintf(stderr, "LLVM failed to compile shader\n");
4145                 goto out;
4146         }
4147
4148         radeon_llvm_dispose(&si_shader_ctx.radeon_bld);
4149
4150         if (si_shader_ctx.type == TGSI_PROCESSOR_GEOMETRY) {
4151                 shader->gs_copy_shader = CALLOC_STRUCT(si_shader);
4152                 shader->gs_copy_shader->selector = shader->selector;
4153                 shader->gs_copy_shader->key = shader->key;
4154                 si_shader_ctx.shader = shader->gs_copy_shader;
4155                 if ((r = si_generate_gs_copy_shader(sscreen, &si_shader_ctx,
4156                                                     shader, dump))) {
4157                         free(shader->gs_copy_shader);
4158                         shader->gs_copy_shader = NULL;
4159                         goto out;
4160                 }
4161         }
4162
4163 out:
4164         for (int i = 0; i < SI_NUM_CONST_BUFFERS; i++)
4165                 FREE(si_shader_ctx.constants[i]);
4166         if (poly_stipple)
4167                 tgsi_free_tokens(tokens);
4168         return r;
4169 }
4170
4171 void si_shader_destroy(struct pipe_context *ctx, struct si_shader *shader)
4172 {
4173         if (shader->gs_copy_shader)
4174                 si_shader_destroy(ctx, shader->gs_copy_shader);
4175
4176         if (shader->scratch_bo)
4177                 r600_resource_reference(&shader->scratch_bo, NULL);
4178
4179         r600_resource_reference(&shader->bo, NULL);
4180
4181         FREE(shader->binary.code);
4182         FREE(shader->binary.relocs);
4183         FREE(shader->binary.disasm_string);
4184 }