radeonsi: ARB_gpu_shader_fp64 + ARB_vertex_attrib_64bit support.
src/gallium/drivers/radeonsi/si_shader.c
/*
 * Copyright 2012 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *	Tom Stellard <thomas.stellard@amd.com>
 *	Michel Dänzer <michel.daenzer@amd.com>
 *	Christian König <christian.koenig@amd.com>
 */

#include "gallivm/lp_bld_const.h"
#include "gallivm/lp_bld_gather.h"
#include "gallivm/lp_bld_intr.h"
#include "gallivm/lp_bld_logic.h"
#include "gallivm/lp_bld_arit.h"
#include "gallivm/lp_bld_flow.h"
#include "radeon/r600_cs.h"
#include "radeon/radeon_llvm.h"
#include "radeon/radeon_elf_util.h"
#include "radeon/radeon_llvm_emit.h"
#include "util/u_memory.h"
#include "util/u_pstipple.h"
#include "tgsi/tgsi_parse.h"
#include "tgsi/tgsi_util.h"
#include "tgsi/tgsi_dump.h"

#include "si_pipe.h"
#include "si_shader.h"
#include "sid.h"

#include <errno.h>

static const char *scratch_rsrc_dword0_symbol =
	"SCRATCH_RSRC_DWORD0";

static const char *scratch_rsrc_dword1_symbol =
	"SCRATCH_RSRC_DWORD1";

struct si_shader_output_values
{
	LLVMValueRef values[4];
	unsigned name;
	unsigned sid;
};

struct si_shader_context
{
	struct radeon_llvm_context radeon_bld;
	struct si_shader *shader;
	struct si_screen *screen;
	unsigned type; /* TGSI_PROCESSOR_* specifies the type of shader. */
	int param_streamout_config;
	int param_streamout_write_index;
	int param_streamout_offset[4];
	int param_vertex_id;
	int param_instance_id;
	LLVMTargetMachineRef tm;
	LLVMValueRef const_md;
	LLVMValueRef const_resource[SI_NUM_CONST_BUFFERS];
	LLVMValueRef ddxy_lds;
	LLVMValueRef *constants[SI_NUM_CONST_BUFFERS];
	LLVMValueRef resources[SI_NUM_SAMPLER_VIEWS];
	LLVMValueRef samplers[SI_NUM_SAMPLER_STATES];
	LLVMValueRef so_buffers[4];
	LLVMValueRef esgs_ring;
	LLVMValueRef gsvs_ring;
	LLVMValueRef gs_next_vertex;
};

static struct si_shader_context * si_shader_context(
	struct lp_build_tgsi_context * bld_base)
{
	return (struct si_shader_context *)bld_base;
}


#define PERSPECTIVE_BASE 0
#define LINEAR_BASE 9

#define SAMPLE_OFFSET 0
#define CENTER_OFFSET 2
#define CENTROID_OFFSET 4

#define USE_SGPR_MAX_SUFFIX_LEN 5
#define CONST_ADDR_SPACE 2
#define LOCAL_ADDR_SPACE 3
#define USER_SGPR_ADDR_SPACE 8


#define SENDMSG_GS 2
#define SENDMSG_GS_DONE 3

#define SENDMSG_GS_OP_NOP      (0 << 4)
#define SENDMSG_GS_OP_CUT      (1 << 4)
#define SENDMSG_GS_OP_EMIT     (2 << 4)
#define SENDMSG_GS_OP_EMIT_CUT (3 << 4)

/**
 * Returns a unique index for a semantic name and index. The index must be
 * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
 * calculated.
 */
unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
{
	switch (semantic_name) {
	case TGSI_SEMANTIC_POSITION:
		return 0;
	case TGSI_SEMANTIC_PSIZE:
		return 1;
	case TGSI_SEMANTIC_CLIPDIST:
		assert(index <= 1);
		return 2 + index;
	case TGSI_SEMANTIC_GENERIC:
		assert(index <= 63-4);
		return 4 + index;

	default:
		/* Don't fail here. The result of this function is only used
		 * for LS, TCS, TES, and GS, where legacy GL semantics can't
		 * occur, but this function is called for all vertex shaders
		 * before it's known whether LS will be compiled or not.
		 */
		return 0;
	}
}
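
/* For illustration (not part of the original logic): with the mapping above,
 * the parameter list POSITION, PSIZE, CLIPDIST0, GENERIC3 gets the unique
 * indices 0, 1, 2 and 7, so the 64-bit usage mask would be
 * (1 << 0) | (1 << 1) | (1 << 2) | (1 << 7) = 0x87.
 */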

/**
 * Given a semantic name and index of a parameter and a mask of used parameters
 * (inputs or outputs), return the index of the parameter in the list of all
 * used parameters.
 *
 * For example, assume this list of parameters:
 *   POSITION, PSIZE, GENERIC0, GENERIC2
 * which has the mask (unique indices 0, 1, 4 and 6):
 *   1010011
 * Then:
 *   querying POSITION returns 0,
 *   querying PSIZE returns 1,
 *   querying GENERIC0 returns 2,
 *   querying GENERIC2 returns 3.
 *
 * Which can be used as an offset to a parameter buffer in units of vec4s.
 */
static int get_param_index(unsigned semantic_name, unsigned index,
			   uint64_t mask)
{
	unsigned unique_index = si_shader_io_get_unique_index(semantic_name, index);
	int i, param_index = 0;

	/* If not present... */
	if (!((1llu << unique_index) & mask))
		return -1;

	for (i = 0; mask; i++) {
		uint64_t bit = 1llu << i;

		if (bit & mask) {
			if (i == unique_index)
				return param_index;

			mask &= ~bit;
			param_index++;
		}
	}

	assert(!"unreachable");
	return -1;
}

/**
 * Get the value of a shader input parameter and extract a bitfield.
 */
static LLVMValueRef unpack_param(struct si_shader_context *si_shader_ctx,
				 unsigned param, unsigned rshift,
				 unsigned bitwidth)
{
	struct gallivm_state *gallivm = &si_shader_ctx->radeon_bld.gallivm;
	LLVMValueRef value = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
					  param);

	if (rshift)
		value = LLVMBuildLShr(gallivm->builder, value,
				      lp_build_const_int32(gallivm, rshift), "");

	if (rshift + bitwidth < 32) {
		unsigned mask = (1 << bitwidth) - 1;
		value = LLVMBuildAnd(gallivm->builder, value,
				     lp_build_const_int32(gallivm, mask), "");
	}

	return value;
}
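
/* unpack_param(ctx, param, rshift, bitwidth) builds IR equivalent to the C
 * expression (param >> rshift) & ((1 << bitwidth) - 1). For example,
 * get_sample_id() below calls it with (SI_PARAM_ANCILLARY, 8, 4) to read the
 * 4-bit sample ID from bits [11:8] of the ancillary register.
 */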

/**
 * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad.
 * It's equivalent to doing a load from &base_ptr[index].
 *
 * \param base_ptr  Where the array starts.
 * \param index     The element index into the array.
 */
static LLVMValueRef build_indexed_load(struct si_shader_context *si_shader_ctx,
				       LLVMValueRef base_ptr, LLVMValueRef index)
{
	struct lp_build_tgsi_context *bld_base = &si_shader_ctx->radeon_bld.soa.bld_base;
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef indices[2], pointer;

	indices[0] = bld_base->uint_bld.zero;
	indices[1] = index;

	pointer = LLVMBuildGEP(gallivm->builder, base_ptr, indices, 2, "");
	return LLVMBuildLoad(gallivm->builder, pointer, "");
}

/**
 * Do a load from &base_ptr[index], but also add a flag that it's loading
 * a constant.
 */
static LLVMValueRef build_indexed_load_const(
	struct si_shader_context * si_shader_ctx,
	LLVMValueRef base_ptr, LLVMValueRef index)
{
	LLVMValueRef result = build_indexed_load(si_shader_ctx, base_ptr, index);
	LLVMSetMetadata(result, 1, si_shader_ctx->const_md);
	return result;
}

static LLVMValueRef get_instance_index_for_fetch(
	struct radeon_llvm_context * radeon_bld,
	unsigned divisor)
{
	struct si_shader_context *si_shader_ctx =
		si_shader_context(&radeon_bld->soa.bld_base);
	struct gallivm_state * gallivm = radeon_bld->soa.bld_base.base.gallivm;

	LLVMValueRef result = LLVMGetParam(radeon_bld->main_fn,
					   si_shader_ctx->param_instance_id);

	/* The division must be done before START_INSTANCE is added. */
	if (divisor > 1)
		result = LLVMBuildUDiv(gallivm->builder, result,
				       lp_build_const_int32(gallivm, divisor), "");

	return LLVMBuildAdd(gallivm->builder, result, LLVMGetParam(
			radeon_bld->main_fn, SI_PARAM_START_INSTANCE), "");
}
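
/* In other words, a sketch of the computation above (not new logic):
 *
 *	index = instance_id / divisor + start_instance
 *
 * The divide happens first because START_INSTANCE must not be scaled by
 * the instance divisor.
 */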

static void declare_input_vs(
	struct radeon_llvm_context *radeon_bld,
	unsigned input_index,
	const struct tgsi_full_declaration *decl)
{
	struct lp_build_context *base = &radeon_bld->soa.bld_base.base;
	struct gallivm_state *gallivm = base->gallivm;
	struct si_shader_context *si_shader_ctx =
		si_shader_context(&radeon_bld->soa.bld_base);
	unsigned divisor = si_shader_ctx->shader->key.vs.instance_divisors[input_index];

	unsigned chan;

	LLVMValueRef t_list_ptr;
	LLVMValueRef t_offset;
	LLVMValueRef t_list;
	LLVMValueRef attribute_offset;
	LLVMValueRef buffer_index;
	LLVMValueRef args[3];
	LLVMTypeRef vec4_type;
	LLVMValueRef input;

	/* Load the T list */
	t_list_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_VERTEX_BUFFER);

	t_offset = lp_build_const_int32(gallivm, input_index);

	t_list = build_indexed_load_const(si_shader_ctx, t_list_ptr, t_offset);

	/* Build the attribute offset */
	attribute_offset = lp_build_const_int32(gallivm, 0);

	if (divisor) {
		/* Build index from instance ID, start instance and divisor */
		si_shader_ctx->shader->uses_instanceid = true;
		buffer_index = get_instance_index_for_fetch(&si_shader_ctx->radeon_bld, divisor);
	} else {
		/* Load the buffer index for vertices. */
		LLVMValueRef vertex_id = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
						      si_shader_ctx->param_vertex_id);
		LLVMValueRef base_vertex = LLVMGetParam(radeon_bld->main_fn,
							SI_PARAM_BASE_VERTEX);
		buffer_index = LLVMBuildAdd(gallivm->builder, base_vertex, vertex_id, "");
	}

	vec4_type = LLVMVectorType(base->elem_type, 4);
	args[0] = t_list;
	args[1] = attribute_offset;
	args[2] = buffer_index;
	input = build_intrinsic(gallivm->builder,
				"llvm.SI.vs.load.input", vec4_type, args, 3,
				LLVMReadNoneAttribute | LLVMNoUnwindAttribute);

	/* Break up the vec4 into individual components */
	for (chan = 0; chan < 4; chan++) {
		LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
		/* XXX: Use a helper function for this. There is one in
		 * tgsi_llvm.c. */
		si_shader_ctx->radeon_bld.inputs[radeon_llvm_reg_index_soa(input_index, chan)] =
			LLVMBuildExtractElement(gallivm->builder,
						input, llvm_chan, "");
	}
}

static LLVMValueRef fetch_input_gs(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type,
	unsigned swizzle)
{
	struct lp_build_context *base = &bld_base->base;
	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
	struct si_shader *shader = si_shader_ctx->shader;
	struct lp_build_context *uint = &si_shader_ctx->radeon_bld.soa.bld_base.uint_bld;
	struct gallivm_state *gallivm = base->gallivm;
	LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context);
	LLVMValueRef vtx_offset;
	LLVMValueRef args[9];
	unsigned vtx_offset_param;
	struct tgsi_shader_info *info = &shader->selector->info;
	unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
	unsigned semantic_index = info->input_semantic_index[reg->Register.Index];

	if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID) {
		if (swizzle == 0)
			return LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
					    SI_PARAM_PRIMITIVE_ID);
		else
			return uint->zero;
	}

	if (!reg->Register.Dimension)
		return NULL;

	if (swizzle == ~0) {
		LLVMValueRef values[TGSI_NUM_CHANNELS];
		unsigned chan;
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			values[chan] = fetch_input_gs(bld_base, reg, type, chan);
		}
		return lp_build_gather_values(bld_base->base.gallivm, values,
					      TGSI_NUM_CHANNELS);
	}

	/* Get the vertex offset parameter */
	vtx_offset_param = reg->Dimension.Index;
	if (vtx_offset_param < 2) {
		vtx_offset_param += SI_PARAM_VTX0_OFFSET;
	} else {
		assert(vtx_offset_param < 6);
		vtx_offset_param += SI_PARAM_VTX2_OFFSET - 2;
	}
	vtx_offset = lp_build_mul_imm(uint,
				      LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
						   vtx_offset_param),
				      4);

	args[0] = si_shader_ctx->esgs_ring;
	args[1] = vtx_offset;
	args[2] = lp_build_const_int32(gallivm,
				       (get_param_index(semantic_name, semantic_index,
							shader->selector->gs_used_inputs) * 4 +
					swizzle) * 256);
	args[3] = uint->zero;
	args[4] = uint->one;  /* OFFEN */
	args[5] = uint->zero; /* IDXEN */
	args[6] = uint->one;  /* GLC */
	args[7] = uint->zero; /* SLC */
	args[8] = uint->zero; /* TFE */

	return LLVMBuildBitCast(gallivm->builder,
				build_intrinsic(gallivm->builder,
						"llvm.SI.buffer.load.dword.i32.i32",
						i32, args, 9,
						LLVMReadOnlyAttribute | LLVMNoUnwindAttribute),
				tgsi2llvmtype(bld_base, type), "");
}
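
/* A note on the constant offset above: each input dword is placed at
 * (param_index * 4 + swizzle) * 256 bytes because the ESGS ring is laid out
 * per wave; 256 = 64 lanes * 4 bytes, so consecutive dwords belonging to one
 * lane are 256 bytes apart. This interpretation is inferred from the layout,
 * not stated in the code.
 */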

static void declare_input_fs(
	struct radeon_llvm_context *radeon_bld,
	unsigned input_index,
	const struct tgsi_full_declaration *decl)
{
	struct lp_build_context *base = &radeon_bld->soa.bld_base.base;
	struct si_shader_context *si_shader_ctx =
		si_shader_context(&radeon_bld->soa.bld_base);
	struct si_shader *shader = si_shader_ctx->shader;
	struct lp_build_context *uint = &radeon_bld->soa.bld_base.uint_bld;
	struct gallivm_state *gallivm = base->gallivm;
	LLVMTypeRef input_type = LLVMFloatTypeInContext(gallivm->context);
	LLVMValueRef main_fn = radeon_bld->main_fn;

	LLVMValueRef interp_param;
	const char * intr_name;

	/* This value is:
	 * [15:0]  NewPrimMask (Bit mask for each quad. It is set if the
	 *                      quad begins a new primitive. Bit 0 always needs
	 *                      to be unset)
	 * [31:16] ParamOffset
	 *
	 */
	LLVMValueRef params = LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK);
	LLVMValueRef attr_number;

	unsigned chan;

	if (decl->Semantic.Name == TGSI_SEMANTIC_POSITION) {
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			unsigned soa_index =
				radeon_llvm_reg_index_soa(input_index, chan);
			radeon_bld->inputs[soa_index] =
				LLVMGetParam(main_fn, SI_PARAM_POS_X_FLOAT + chan);

			if (chan == 3)
				/* RCP for fragcoord.w */
				radeon_bld->inputs[soa_index] =
					LLVMBuildFDiv(gallivm->builder,
						      lp_build_const_float(gallivm, 1.0f),
						      radeon_bld->inputs[soa_index],
						      "");
		}
		return;
	}

	if (decl->Semantic.Name == TGSI_SEMANTIC_FACE) {
		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)] =
			LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE);
		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 1)] =
		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 2)] =
			lp_build_const_float(gallivm, 0.0f);
		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 3)] =
			lp_build_const_float(gallivm, 1.0f);

		return;
	}

	shader->ps_input_param_offset[input_index] = shader->nparam++;
	attr_number = lp_build_const_int32(gallivm,
					   shader->ps_input_param_offset[input_index]);

	switch (decl->Interp.Interpolate) {
	case TGSI_INTERPOLATE_CONSTANT:
		interp_param = 0;
		break;
	case TGSI_INTERPOLATE_LINEAR:
		if (decl->Interp.Location == TGSI_INTERPOLATE_LOC_SAMPLE)
			interp_param = LLVMGetParam(main_fn, SI_PARAM_LINEAR_SAMPLE);
		else if (decl->Interp.Location == TGSI_INTERPOLATE_LOC_CENTROID)
			interp_param = LLVMGetParam(main_fn, SI_PARAM_LINEAR_CENTROID);
		else
			interp_param = LLVMGetParam(main_fn, SI_PARAM_LINEAR_CENTER);
		break;
	case TGSI_INTERPOLATE_COLOR:
	case TGSI_INTERPOLATE_PERSPECTIVE:
		if (decl->Interp.Location == TGSI_INTERPOLATE_LOC_SAMPLE)
			interp_param = LLVMGetParam(main_fn, SI_PARAM_PERSP_SAMPLE);
		else if (decl->Interp.Location == TGSI_INTERPOLATE_LOC_CENTROID)
			interp_param = LLVMGetParam(main_fn, SI_PARAM_PERSP_CENTROID);
		else
			interp_param = LLVMGetParam(main_fn, SI_PARAM_PERSP_CENTER);
		break;
	default:
		fprintf(stderr, "Warning: Unhandled interpolation mode.\n");
		return;
	}

	/* fs.constant returns the param from the middle vertex, so it's not
	 * really useful for flat shading. It's meant to be used for custom
	 * interpolation (but the intrinsic can't fetch from the other two
	 * vertices).
	 *
	 * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
	 * to do the right thing. The only reason we use fs.constant is that
	 * fs.interp cannot be used on integers, because they can be equal
	 * to NaN.
	 */
	intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant";

	if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR &&
	    si_shader_ctx->shader->key.ps.color_two_side) {
		LLVMValueRef args[4];
		LLVMValueRef face, is_face_positive;
		LLVMValueRef back_attr_number =
			lp_build_const_int32(gallivm,
					     shader->ps_input_param_offset[input_index] + 1);

		face = LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE);

		is_face_positive = LLVMBuildFCmp(gallivm->builder,
						 LLVMRealOGT, face,
						 lp_build_const_float(gallivm, 0.0f),
						 "");

		args[2] = params;
		args[3] = interp_param;
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
			unsigned soa_index = radeon_llvm_reg_index_soa(input_index, chan);
			LLVMValueRef front, back;

			args[0] = llvm_chan;
			args[1] = attr_number;
			front = build_intrinsic(gallivm->builder, intr_name,
						input_type, args, args[3] ? 4 : 3,
						LLVMReadNoneAttribute | LLVMNoUnwindAttribute);

			args[1] = back_attr_number;
			back = build_intrinsic(gallivm->builder, intr_name,
					       input_type, args, args[3] ? 4 : 3,
					       LLVMReadNoneAttribute | LLVMNoUnwindAttribute);

			radeon_bld->inputs[soa_index] =
				LLVMBuildSelect(gallivm->builder,
						is_face_positive,
						front,
						back,
						"");
		}

		shader->nparam++;
	} else if (decl->Semantic.Name == TGSI_SEMANTIC_FOG) {
		LLVMValueRef args[4];

		args[0] = uint->zero;
		args[1] = attr_number;
		args[2] = params;
		args[3] = interp_param;
		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)] =
			build_intrinsic(gallivm->builder, intr_name,
					input_type, args, args[3] ? 4 : 3,
					LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 1)] =
		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 2)] =
			lp_build_const_float(gallivm, 0.0f);
		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 3)] =
			lp_build_const_float(gallivm, 1.0f);
	} else {
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			LLVMValueRef args[4];
			LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
			unsigned soa_index = radeon_llvm_reg_index_soa(input_index, chan);
			args[0] = llvm_chan;
			args[1] = attr_number;
			args[2] = params;
			args[3] = interp_param;
			radeon_bld->inputs[soa_index] =
				build_intrinsic(gallivm->builder, intr_name,
						input_type, args, args[3] ? 4 : 3,
						LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
		}
	}
}

static LLVMValueRef get_sample_id(struct radeon_llvm_context *radeon_bld)
{
	return unpack_param(si_shader_context(&radeon_bld->soa.bld_base),
			    SI_PARAM_ANCILLARY, 8, 4);
}

/**
 * Load a dword from a constant buffer.
 */
static LLVMValueRef buffer_load_const(LLVMBuilderRef builder, LLVMValueRef resource,
				      LLVMValueRef offset, LLVMTypeRef return_type)
{
	LLVMValueRef args[2] = {resource, offset};

	return build_intrinsic(builder, "llvm.SI.load.const", return_type, args, 2,
			       LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
}

static void declare_system_value(
	struct radeon_llvm_context * radeon_bld,
	unsigned index,
	const struct tgsi_full_declaration *decl)
{
	struct si_shader_context *si_shader_ctx =
		si_shader_context(&radeon_bld->soa.bld_base);
	struct lp_build_context *uint_bld = &radeon_bld->soa.bld_base.uint_bld;
	struct gallivm_state *gallivm = &radeon_bld->gallivm;
	LLVMValueRef value = 0;

	switch (decl->Semantic.Name) {
	case TGSI_SEMANTIC_INSTANCEID:
		value = LLVMGetParam(radeon_bld->main_fn,
				     si_shader_ctx->param_instance_id);
		break;

	case TGSI_SEMANTIC_VERTEXID:
		value = LLVMBuildAdd(gallivm->builder,
				     LLVMGetParam(radeon_bld->main_fn,
						  si_shader_ctx->param_vertex_id),
				     LLVMGetParam(radeon_bld->main_fn,
						  SI_PARAM_BASE_VERTEX), "");
		break;

	case TGSI_SEMANTIC_VERTEXID_NOBASE:
		value = LLVMGetParam(radeon_bld->main_fn,
				     si_shader_ctx->param_vertex_id);
		break;

	case TGSI_SEMANTIC_BASEVERTEX:
		value = LLVMGetParam(radeon_bld->main_fn,
				     SI_PARAM_BASE_VERTEX);
		break;

	case TGSI_SEMANTIC_INVOCATIONID:
		value = LLVMGetParam(radeon_bld->main_fn,
				     SI_PARAM_GS_INSTANCE_ID);
		break;

	case TGSI_SEMANTIC_SAMPLEID:
		value = get_sample_id(radeon_bld);
		break;

	case TGSI_SEMANTIC_SAMPLEPOS:
	{
		LLVMBuilderRef builder = gallivm->builder;
		LLVMValueRef desc = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST);
		LLVMValueRef buf_index = lp_build_const_int32(gallivm, SI_DRIVER_STATE_CONST_BUF);
		LLVMValueRef resource = build_indexed_load_const(si_shader_ctx, desc, buf_index);

		/* offset = sample_id * 8  (8 = 2 floats containing samplepos.xy) */
		LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, get_sample_id(radeon_bld), 8);
		LLVMValueRef offset1 = LLVMBuildAdd(builder, offset0, lp_build_const_int32(gallivm, 4), "");

		LLVMValueRef pos[4] = {
			buffer_load_const(builder, resource, offset0, radeon_bld->soa.bld_base.base.elem_type),
			buffer_load_const(builder, resource, offset1, radeon_bld->soa.bld_base.base.elem_type),
			lp_build_const_float(gallivm, 0),
			lp_build_const_float(gallivm, 0)
		};
		value = lp_build_gather_values(gallivm, pos, 4);
		break;
	}

	case TGSI_SEMANTIC_SAMPLEMASK:
		/* Smoothing isn't MSAA in GL, but it's MSAA in hardware.
		 * Therefore, force gl_SampleMaskIn to 1 for GL. */
		if (si_shader_ctx->shader->key.ps.poly_line_smoothing)
			value = uint_bld->one;
		else
			value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_SAMPLE_COVERAGE);
		break;

	default:
		assert(!"unknown system value");
		return;
	}

	radeon_bld->system_values[index] = value;
}

static LLVMValueRef fetch_constant(
	struct lp_build_tgsi_context * bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type,
	unsigned swizzle)
{
	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
	struct lp_build_context * base = &bld_base->base;
	const struct tgsi_ind_register *ireg = &reg->Indirect;
	unsigned buf, idx;

	LLVMValueRef addr;
	LLVMValueRef result;

	if (swizzle == LP_CHAN_ALL) {
		unsigned chan;
		LLVMValueRef values[4];
		for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
			values[chan] = fetch_constant(bld_base, reg, type, chan);

		return lp_build_gather_values(bld_base->base.gallivm, values, 4);
	}

	buf = reg->Register.Dimension ? reg->Dimension.Index : 0;
	idx = reg->Register.Index * 4 + swizzle;

	if (!reg->Register.Indirect) {
		if (type != TGSI_TYPE_DOUBLE)
			return bitcast(bld_base, type, si_shader_ctx->constants[buf][idx]);
		else {
			return radeon_llvm_emit_fetch_double(bld_base,
							     si_shader_ctx->constants[buf][idx],
							     si_shader_ctx->constants[buf][idx + 1]);
		}
	}

	addr = si_shader_ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle];
	addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg");
	addr = lp_build_mul_imm(&bld_base->uint_bld, addr, 16);
	addr = lp_build_add(&bld_base->uint_bld, addr,
			    lp_build_const_int32(base->gallivm, idx * 4));

	result = buffer_load_const(base->gallivm->builder, si_shader_ctx->const_resource[buf],
				   addr, bld_base->base.elem_type);

	if (type != TGSI_TYPE_DOUBLE)
		result = bitcast(bld_base, type, result);
	else {
		LLVMValueRef addr2, result2;
		addr2 = si_shader_ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle + 1];
		addr2 = LLVMBuildLoad(base->gallivm->builder, addr2, "load addr reg2");
		addr2 = lp_build_mul_imm(&bld_base->uint_bld, addr2, 16);
		addr2 = lp_build_add(&bld_base->uint_bld, addr2,
				     lp_build_const_int32(base->gallivm, idx * 4));

		result2 = buffer_load_const(base->gallivm->builder, si_shader_ctx->const_resource[buf],
					    addr2, bld_base->base.elem_type);

		result = radeon_llvm_emit_fetch_double(bld_base,
						       result, result2);
	}
	return result;
}
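
/* Doubles (TGSI_TYPE_DOUBLE) occupy two consecutive dword slots, which is why
 * the code above fetches both constants[buf][idx] and constants[buf][idx + 1]
 * (or issues two buffer loads, at addr and addr2) and merges the halves with
 * radeon_llvm_emit_fetch_double(). E.g. a double in CONST[1].xy is assembled
 * from the 32-bit values at dword indices 4 and 5.
 */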

/* Initialize arguments for the shader export intrinsic */
static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
				     LLVMValueRef *values,
				     unsigned target,
				     LLVMValueRef *args)
{
	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
	struct lp_build_context *uint =
		&si_shader_ctx->radeon_bld.soa.bld_base.uint_bld;
	struct lp_build_context *base = &bld_base->base;
	unsigned compressed = 0;
	unsigned chan;

	if (si_shader_ctx->type == TGSI_PROCESSOR_FRAGMENT) {
		int cbuf = target - V_008DFC_SQ_EXP_MRT;

		if (cbuf >= 0 && cbuf < 8) {
			compressed = (si_shader_ctx->shader->key.ps.export_16bpc >> cbuf) & 0x1;

			if (compressed)
				si_shader_ctx->shader->spi_shader_col_format |=
					V_028714_SPI_SHADER_FP16_ABGR << (4 * cbuf);
			else
				si_shader_ctx->shader->spi_shader_col_format |=
					V_028714_SPI_SHADER_32_ABGR << (4 * cbuf);

			si_shader_ctx->shader->cb_shader_mask |= 0xf << (4 * cbuf);
		}
	}

	if (compressed) {
		/* Pixel shader needs to pack output values before export */
		for (chan = 0; chan < 2; chan++ ) {
			args[0] = values[2 * chan];
			args[1] = values[2 * chan + 1];
			args[chan + 5] =
				build_intrinsic(base->gallivm->builder,
						"llvm.SI.packf16",
						LLVMInt32TypeInContext(base->gallivm->context),
						args, 2,
						LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
			args[chan + 7] = args[chan + 5] =
				LLVMBuildBitCast(base->gallivm->builder,
						 args[chan + 5],
						 LLVMFloatTypeInContext(base->gallivm->context),
						 "");
		}

		/* Set COMPR flag */
		args[4] = uint->one;
	} else {
		for (chan = 0; chan < 4; chan++ )
			/* +5 because the first output value will be
			 * the 6th argument to the intrinsic. */
			args[chan + 5] = values[chan];

		/* Clear COMPR flag */
		args[4] = uint->zero;
	}

	/* XXX: This controls which components of the output
	 * registers actually get exported. (e.g. bit 0 means export
	 * X component, bit 1 means export Y component, etc.) I'm
	 * hard coding this to 0xf for now. In the future, we might
	 * want to do something else. */
	args[0] = lp_build_const_int32(base->gallivm, 0xf);

	/* Specify whether the EXEC mask represents the valid mask */
	args[1] = uint->zero;

	/* Specify whether this is the last export */
	args[2] = uint->zero;

	/* Specify the target we are exporting */
	args[3] = lp_build_const_int32(base->gallivm, target);

	/* XXX: We probably need to keep track of the output
	 * values, so we know what we are passing to the next
	 * stage. */
}

/* Load from output pointers and initialize arguments for the shader export intrinsic */
static void si_llvm_init_export_args_load(struct lp_build_tgsi_context *bld_base,
					  LLVMValueRef *out_ptr,
					  unsigned target,
					  LLVMValueRef *args)
{
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef values[4];
	int i;

	for (i = 0; i < 4; i++)
		values[i] = LLVMBuildLoad(gallivm->builder, out_ptr[i], "");

	si_llvm_init_export_args(bld_base, values, target, args);
}

static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
			  LLVMValueRef alpha_ptr)
{
	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;

	if (si_shader_ctx->shader->key.ps.alpha_func != PIPE_FUNC_NEVER) {
		LLVMValueRef alpha_ref = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
						      SI_PARAM_ALPHA_REF);

		LLVMValueRef alpha_pass =
			lp_build_cmp(&bld_base->base,
				     si_shader_ctx->shader->key.ps.alpha_func,
				     LLVMBuildLoad(gallivm->builder, alpha_ptr, ""),
				     alpha_ref);
		LLVMValueRef arg =
			lp_build_select(&bld_base->base,
					alpha_pass,
					lp_build_const_float(gallivm, 1.0f),
					lp_build_const_float(gallivm, -1.0f));

		build_intrinsic(gallivm->builder,
				"llvm.AMDGPU.kill",
				LLVMVoidTypeInContext(gallivm->context),
				&arg, 1, 0);
	} else {
		build_intrinsic(gallivm->builder,
				"llvm.AMDGPU.kilp",
				LLVMVoidTypeInContext(gallivm->context),
				NULL, 0, 0);
	}

	si_shader_ctx->shader->db_shader_control |= S_02880C_KILL_ENABLE(1);
}
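
/* Both branches above discard the pixel: llvm.AMDGPU.kill kills when its
 * float argument is negative (hence the +1.0/-1.0 select for the pass/fail
 * result), while llvm.AMDGPU.kilp kills unconditionally, which implements
 * PIPE_FUNC_NEVER (every pixel fails the alpha test).
 */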

static void si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
					  LLVMValueRef alpha_ptr)
{
	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef coverage, alpha;

	/* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
	coverage = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
				SI_PARAM_SAMPLE_COVERAGE);
	coverage = bitcast(bld_base, TGSI_TYPE_SIGNED, coverage);

	coverage = build_intrinsic(gallivm->builder, "llvm.ctpop.i32",
				   bld_base->int_bld.elem_type,
				   &coverage, 1, LLVMReadNoneAttribute);

	coverage = LLVMBuildUIToFP(gallivm->builder, coverage,
				   bld_base->base.elem_type, "");

	coverage = LLVMBuildFMul(gallivm->builder, coverage,
				 lp_build_const_float(gallivm,
						      1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");

	alpha = LLVMBuildLoad(gallivm->builder, alpha_ptr, "");
	alpha = LLVMBuildFMul(gallivm->builder, alpha, coverage, "");
	LLVMBuildStore(gallivm->builder, alpha, alpha_ptr);
}

static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context * bld_base,
				    LLVMValueRef (*pos)[9], LLVMValueRef *out_elts)
{
	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	struct lp_build_context *uint = &si_shader_ctx->radeon_bld.soa.bld_base.uint_bld;
	unsigned reg_index;
	unsigned chan;
	unsigned const_chan;
	LLVMValueRef base_elt;
	LLVMValueRef ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST);
	LLVMValueRef constbuf_index = lp_build_const_int32(base->gallivm, SI_DRIVER_STATE_CONST_BUF);
	LLVMValueRef const_resource = build_indexed_load_const(si_shader_ctx, ptr, constbuf_index);

	for (reg_index = 0; reg_index < 2; reg_index ++) {
		LLVMValueRef *args = pos[2 + reg_index];

		args[5] =
		args[6] =
		args[7] =
		args[8] = lp_build_const_float(base->gallivm, 0.0f);

		/* Compute dot products of position and user clip plane vectors */
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			for (const_chan = 0; const_chan < TGSI_NUM_CHANNELS; const_chan++) {
				args[1] = lp_build_const_int32(base->gallivm,
							       ((reg_index * 4 + chan) * 4 +
								const_chan) * 4);
				base_elt = buffer_load_const(base->gallivm->builder, const_resource,
							     args[1], base->elem_type);
				args[5 + chan] =
					lp_build_add(base, args[5 + chan],
						     lp_build_mul(base, base_elt,
								  out_elts[const_chan]));
			}
		}

		args[0] = lp_build_const_int32(base->gallivm, 0xf);
		args[1] = uint->zero;
		args[2] = uint->zero;
		args[3] = lp_build_const_int32(base->gallivm,
					       V_008DFC_SQ_EXP_POS + 2 + reg_index);
		args[4] = uint->zero;
	}
}
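
/* The loop above computes, for each of the eight possible clip distances i:
 *
 *	dist[i] = dot(clipvertex, user_clip_plane[i])
 *
 * with the plane coefficients read from the driver-state constant buffer,
 * and stages the results for export through the POS+2 and POS+3 targets.
 */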

static void si_dump_streamout(struct pipe_stream_output_info *so)
{
	unsigned i;

	if (so->num_outputs)
		fprintf(stderr, "STREAMOUT\n");

	for (i = 0; i < so->num_outputs; i++) {
		unsigned mask = ((1 << so->output[i].num_components) - 1) <<
				so->output[i].start_component;
		fprintf(stderr, "  %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
			i, so->output[i].output_buffer,
			so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
			so->output[i].register_index,
			mask & 1 ? "x" : "",
			mask & 2 ? "y" : "",
			mask & 4 ? "z" : "",
			mask & 8 ? "w" : "");
	}
}

/* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4.
 * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2),
 * or v4i32 (num_channels=3,4). */
static void build_tbuffer_store(struct si_shader_context *shader,
				LLVMValueRef rsrc,
				LLVMValueRef vdata,
				unsigned num_channels,
				LLVMValueRef vaddr,
				LLVMValueRef soffset,
				unsigned inst_offset,
				unsigned dfmt,
				unsigned nfmt,
				unsigned offen,
				unsigned idxen,
				unsigned glc,
				unsigned slc,
				unsigned tfe)
{
	struct gallivm_state *gallivm = &shader->radeon_bld.gallivm;
	LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context);
	LLVMValueRef args[] = {
		rsrc,
		vdata,
		LLVMConstInt(i32, num_channels, 0),
		vaddr,
		soffset,
		LLVMConstInt(i32, inst_offset, 0),
		LLVMConstInt(i32, dfmt, 0),
		LLVMConstInt(i32, nfmt, 0),
		LLVMConstInt(i32, offen, 0),
		LLVMConstInt(i32, idxen, 0),
		LLVMConstInt(i32, glc, 0),
		LLVMConstInt(i32, slc, 0),
		LLVMConstInt(i32, tfe, 0)
	};

	/* The instruction offset field has 12 bits */
	assert(offen || inst_offset < (1 << 12));

	/* The intrinsic is overloaded, we need to add a type suffix for overloading to work. */
	unsigned func = CLAMP(num_channels, 1, 3) - 1;
	const char *types[] = {"i32", "v2i32", "v4i32"};
	char name[256];
	snprintf(name, sizeof(name), "llvm.SI.tbuffer.store.%s", types[func]);

	lp_build_intrinsic(gallivm->builder, name,
			   LLVMVoidTypeInContext(gallivm->context),
			   args, Elements(args));
}

static void build_streamout_store(struct si_shader_context *shader,
				  LLVMValueRef rsrc,
				  LLVMValueRef vdata,
				  unsigned num_channels,
				  LLVMValueRef vaddr,
				  LLVMValueRef soffset,
				  unsigned inst_offset)
{
	static unsigned dfmt[] = {
		V_008F0C_BUF_DATA_FORMAT_32,
		V_008F0C_BUF_DATA_FORMAT_32_32,
		V_008F0C_BUF_DATA_FORMAT_32_32_32,
		V_008F0C_BUF_DATA_FORMAT_32_32_32_32
	};
	assert(num_channels >= 1 && num_channels <= 4);

	build_tbuffer_store(shader, rsrc, vdata, num_channels, vaddr, soffset,
			    inst_offset, dfmt[num_channels-1],
			    V_008F0C_BUF_NUM_FORMAT_UINT, 1, 0, 1, 1, 0);
}

/* On SI, the vertex shader is responsible for writing streamout data
 * to buffers. */
static void si_llvm_emit_streamout(struct si_shader_context *shader,
				   struct si_shader_output_values *outputs,
				   unsigned noutput)
{
	struct pipe_stream_output_info *so = &shader->shader->selector->so;
	struct gallivm_state *gallivm = &shader->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	int i, j;
	struct lp_build_if_state if_ctx;

	LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context);

	/* Get bits [22:16], i.e. (so_param >> 16) & 127; */
	LLVMValueRef so_vtx_count =
		unpack_param(shader, shader->param_streamout_config, 16, 7);

	LLVMValueRef tid = build_intrinsic(builder, "llvm.SI.tid", i32,
					   NULL, 0, LLVMReadNoneAttribute);

	/* can_emit = tid < so_vtx_count; */
	LLVMValueRef can_emit =
		LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");

	/* Emit the streamout code conditionally. This actually avoids
	 * out-of-bounds buffer access. The hw tells us via the SGPR
	 * (so_vtx_count) which threads are allowed to emit streamout data. */
	lp_build_if(&if_ctx, gallivm, can_emit);
	{
		/* The buffer offset is computed as follows:
		 *   ByteOffset = streamout_offset[buffer_id]*4 +
		 *                (streamout_write_index + thread_id)*stride[buffer_id] +
		 *                attrib_offset
		 */

		LLVMValueRef so_write_index =
			LLVMGetParam(shader->radeon_bld.main_fn,
				     shader->param_streamout_write_index);

		/* Compute (streamout_write_index + thread_id). */
		so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");

		/* Compute the write offset for each enabled buffer. */
		LLVMValueRef so_write_offset[4] = {};
		for (i = 0; i < 4; i++) {
			if (!so->stride[i])
				continue;

			LLVMValueRef so_offset = LLVMGetParam(shader->radeon_bld.main_fn,
							      shader->param_streamout_offset[i]);
			so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(i32, 4, 0), "");

			so_write_offset[i] = LLVMBuildMul(builder, so_write_index,
							  LLVMConstInt(i32, so->stride[i]*4, 0), "");
			so_write_offset[i] = LLVMBuildAdd(builder, so_write_offset[i], so_offset, "");
		}

		/* Write streamout data. */
		for (i = 0; i < so->num_outputs; i++) {
			unsigned buf_idx = so->output[i].output_buffer;
			unsigned reg = so->output[i].register_index;
			unsigned start = so->output[i].start_component;
			unsigned num_comps = so->output[i].num_components;
			LLVMValueRef out[4];

			assert(num_comps && num_comps <= 4);
			if (!num_comps || num_comps > 4)
				continue;

			if (reg >= noutput)
				continue;

			/* Load the output as int. */
			for (j = 0; j < num_comps; j++) {
				out[j] = LLVMBuildBitCast(builder,
							  outputs[reg].values[start+j],
							  i32, "");
			}

			/* Pack the output. */
			LLVMValueRef vdata = NULL;

			switch (num_comps) {
			case 1: /* as i32 */
				vdata = out[0];
				break;
			case 2: /* as v2i32 */
			case 3: /* as v4i32 (aligned to 4) */
			case 4: /* as v4i32 */
				vdata = LLVMGetUndef(LLVMVectorType(i32, util_next_power_of_two(num_comps)));
				for (j = 0; j < num_comps; j++) {
					vdata = LLVMBuildInsertElement(builder, vdata, out[j],
								       LLVMConstInt(i32, j, 0), "");
				}
				break;
			}

			build_streamout_store(shader, shader->so_buffers[buf_idx],
					      vdata, num_comps,
					      so_write_offset[buf_idx],
					      LLVMConstInt(i32, 0, 0),
					      so->output[i].dst_offset*4);
		}
	}
	lp_build_endif(&if_ctx);
}
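
/* Worked example of the ByteOffset formula above (illustrative numbers):
 * with streamout_offset[0] = 16, stride[0] = 4 dwords and dst_offset = 2,
 * thread 5 at write_index 100 stores at
 * 16*4 + (100 + 5)*16 + 2*4 = 64 + 1680 + 8 = 1752 bytes into buffer 0.
 */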


/* Generate export instructions for hardware VS shader stage */
static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base,
			      struct si_shader_output_values *outputs,
			      unsigned noutput)
{
	struct si_shader_context * si_shader_ctx = si_shader_context(bld_base);
	struct si_shader * shader = si_shader_ctx->shader;
	struct lp_build_context * base = &bld_base->base;
	struct lp_build_context * uint =
		&si_shader_ctx->radeon_bld.soa.bld_base.uint_bld;
	LLVMValueRef args[9];
	LLVMValueRef pos_args[4][9] = { { 0 } };
	LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
	unsigned semantic_name, semantic_index;
	unsigned target;
	unsigned param_count = 0;
	unsigned pos_idx;
	int i;

	if (outputs && si_shader_ctx->shader->selector->so.num_outputs) {
		si_llvm_emit_streamout(si_shader_ctx, outputs, noutput);
	}

	for (i = 0; i < noutput; i++) {
		semantic_name = outputs[i].name;
		semantic_index = outputs[i].sid;

handle_semantic:
		/* Select the correct target */
		switch(semantic_name) {
		case TGSI_SEMANTIC_PSIZE:
			psize_value = outputs[i].values[0];
			continue;
		case TGSI_SEMANTIC_EDGEFLAG:
			edgeflag_value = outputs[i].values[0];
			continue;
		case TGSI_SEMANTIC_LAYER:
			layer_value = outputs[i].values[0];
			semantic_name = TGSI_SEMANTIC_GENERIC;
			goto handle_semantic;
		case TGSI_SEMANTIC_VIEWPORT_INDEX:
			viewport_index_value = outputs[i].values[0];
			semantic_name = TGSI_SEMANTIC_GENERIC;
			goto handle_semantic;
		case TGSI_SEMANTIC_POSITION:
			target = V_008DFC_SQ_EXP_POS;
			break;
		case TGSI_SEMANTIC_COLOR:
		case TGSI_SEMANTIC_BCOLOR:
			target = V_008DFC_SQ_EXP_PARAM + param_count;
			shader->vs_output_param_offset[i] = param_count;
			param_count++;
			break;
		case TGSI_SEMANTIC_CLIPDIST:
			target = V_008DFC_SQ_EXP_POS + 2 + semantic_index;
			break;
		case TGSI_SEMANTIC_CLIPVERTEX:
			si_llvm_emit_clipvertex(bld_base, pos_args, outputs[i].values);
			continue;
		case TGSI_SEMANTIC_PRIMID:
		case TGSI_SEMANTIC_FOG:
		case TGSI_SEMANTIC_TEXCOORD:
		case TGSI_SEMANTIC_GENERIC:
			target = V_008DFC_SQ_EXP_PARAM + param_count;
			shader->vs_output_param_offset[i] = param_count;
			param_count++;
			break;
		default:
			target = 0;
			fprintf(stderr,
				"Warning: SI unhandled vs output type:%d\n",
				semantic_name);
		}

		si_llvm_init_export_args(bld_base, outputs[i].values, target, args);

		if (target >= V_008DFC_SQ_EXP_POS &&
		    target <= (V_008DFC_SQ_EXP_POS + 3)) {
			memcpy(pos_args[target - V_008DFC_SQ_EXP_POS],
			       args, sizeof(args));
		} else {
			lp_build_intrinsic(base->gallivm->builder,
					   "llvm.SI.export",
					   LLVMVoidTypeInContext(base->gallivm->context),
					   args, 9);
		}

		if (semantic_name == TGSI_SEMANTIC_CLIPDIST) {
			semantic_name = TGSI_SEMANTIC_GENERIC;
			goto handle_semantic;
		}
	}

	shader->nr_param_exports = param_count;

	/* We need to add the position output manually if it's missing. */
	if (!pos_args[0][0]) {
		pos_args[0][0] = lp_build_const_int32(base->gallivm, 0xf); /* writemask */
		pos_args[0][1] = uint->zero; /* EXEC mask */
		pos_args[0][2] = uint->zero; /* last export? */
		pos_args[0][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS);
		pos_args[0][4] = uint->zero; /* COMPR flag */
		pos_args[0][5] = base->zero; /* X */
		pos_args[0][6] = base->zero; /* Y */
		pos_args[0][7] = base->zero; /* Z */
		pos_args[0][8] = base->one;  /* W */
	}

	/* Write the misc vector (point size, edgeflag, layer, viewport). */
	if (shader->selector->info.writes_psize ||
	    shader->selector->info.writes_edgeflag ||
	    shader->selector->info.writes_viewport_index ||
	    shader->selector->info.writes_layer) {
		pos_args[1][0] = lp_build_const_int32(base->gallivm, /* writemask */
						      shader->selector->info.writes_psize |
						      (shader->selector->info.writes_edgeflag << 1) |
						      (shader->selector->info.writes_layer << 2) |
						      (shader->selector->info.writes_viewport_index << 3));
		pos_args[1][1] = uint->zero; /* EXEC mask */
		pos_args[1][2] = uint->zero; /* last export? */
		pos_args[1][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS + 1);
		pos_args[1][4] = uint->zero; /* COMPR flag */
		pos_args[1][5] = base->zero; /* X */
		pos_args[1][6] = base->zero; /* Y */
		pos_args[1][7] = base->zero; /* Z */
		pos_args[1][8] = base->zero; /* W */

		if (shader->selector->info.writes_psize)
			pos_args[1][5] = psize_value;

		if (shader->selector->info.writes_edgeflag) {
			/* The output is a float, but the hw expects an integer
			 * with the first bit containing the edge flag. */
			edgeflag_value = LLVMBuildFPToUI(base->gallivm->builder,
							 edgeflag_value,
							 bld_base->uint_bld.elem_type, "");
			edgeflag_value = lp_build_min(&bld_base->int_bld,
						      edgeflag_value,
						      bld_base->int_bld.one);

			/* The LLVM intrinsic expects a float. */
			pos_args[1][6] = LLVMBuildBitCast(base->gallivm->builder,
							  edgeflag_value,
							  base->elem_type, "");
		}

		if (shader->selector->info.writes_layer)
			pos_args[1][7] = layer_value;

		if (shader->selector->info.writes_viewport_index)
			pos_args[1][8] = viewport_index_value;
	}

	for (i = 0; i < 4; i++)
		if (pos_args[i][0])
			shader->nr_pos_exports++;

	pos_idx = 0;
	for (i = 0; i < 4; i++) {
		if (!pos_args[i][0])
			continue;

		/* Specify the target we are exporting */
		pos_args[i][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS + pos_idx++);

		if (pos_idx == shader->nr_pos_exports)
			/* Specify that this is the last export */
			pos_args[i][2] = uint->one;

		lp_build_intrinsic(base->gallivm->builder,
				   "llvm.SI.export",
				   LLVMVoidTypeInContext(base->gallivm->context),
				   pos_args[i], 9);
	}
}

static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context * bld_base)
{
	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	struct si_shader *es = si_shader_ctx->shader;
	struct tgsi_shader_info *info = &es->selector->info;
	LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context);
	LLVMValueRef soffset = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
					    SI_PARAM_ES2GS_OFFSET);
	unsigned chan;
	int i;

	for (i = 0; i < info->num_outputs; i++) {
		LLVMValueRef *out_ptr =
			si_shader_ctx->radeon_bld.soa.outputs[i];
		int param_index;

		if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
		    info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
			continue;

		param_index = get_param_index(info->output_semantic_name[i],
					      info->output_semantic_index[i],
					      es->key.vs.gs_used_inputs);
		if (param_index < 0)
			continue;

		for (chan = 0; chan < 4; chan++) {
			LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
			out_val = LLVMBuildBitCast(gallivm->builder, out_val, i32, "");

			build_tbuffer_store(si_shader_ctx,
					    si_shader_ctx->esgs_ring,
					    out_val, 1,
					    LLVMGetUndef(i32), soffset,
					    (4 * param_index + chan) * 4,
					    V_008F0C_BUF_DATA_FORMAT_32,
					    V_008F0C_BUF_NUM_FORMAT_UINT,
					    0, 0, 1, 1, 0);
		}
	}
}

static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef args[2];

	args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_NOP | SENDMSG_GS_DONE);
	args[1] = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
	build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
			LLVMVoidTypeInContext(gallivm->context), args, 2,
			LLVMNoUnwindAttribute);
}

static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context * bld_base)
{
	struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	struct tgsi_shader_info *info = &si_shader_ctx->shader->selector->info;
	struct si_shader_output_values *outputs = NULL;
	int i,j;

	outputs = MALLOC(info->num_outputs * sizeof(outputs[0]));

	for (i = 0; i < info->num_outputs; i++) {
		outputs[i].name = info->output_semantic_name[i];
		outputs[i].sid = info->output_semantic_index[i];

		for (j = 0; j < 4; j++)
			outputs[i].values[j] =
				LLVMBuildLoad(gallivm->builder,
					      si_shader_ctx->radeon_bld.soa.outputs[i][j],
					      "");
	}

	si_llvm_export_vs(bld_base, outputs, info->num_outputs);
	FREE(outputs);
}

static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context * bld_base)
{
	struct si_shader_context * si_shader_ctx = si_shader_context(bld_base);
	struct si_shader * shader = si_shader_ctx->shader;
	struct lp_build_context * base = &bld_base->base;
	struct lp_build_context * uint = &bld_base->uint_bld;
	struct tgsi_shader_info *info = &shader->selector->info;
	LLVMValueRef args[9];
	LLVMValueRef last_args[9] = { 0 };
	int depth_index = -1, stencil_index = -1, samplemask_index = -1;
	int i;

	for (i = 0; i < info->num_outputs; i++) {
		unsigned semantic_name = info->output_semantic_name[i];
		unsigned semantic_index = info->output_semantic_index[i];
		unsigned target;
		LLVMValueRef alpha_ptr;

		/* Select the correct target */
		switch (semantic_name) {
		case TGSI_SEMANTIC_POSITION:
			depth_index = i;
			continue;
		case TGSI_SEMANTIC_STENCIL:
			stencil_index = i;
			continue;
		case TGSI_SEMANTIC_SAMPLEMASK:
			samplemask_index = i;
			continue;
		case TGSI_SEMANTIC_COLOR:
			target = V_008DFC_SQ_EXP_MRT + semantic_index;
			alpha_ptr = si_shader_ctx->radeon_bld.soa.outputs[i][3];

			if (si_shader_ctx->shader->key.ps.alpha_to_one)
				LLVMBuildStore(base->gallivm->builder,
					       base->one, alpha_ptr);

			if (semantic_index == 0 &&
			    si_shader_ctx->shader->key.ps.alpha_func != PIPE_FUNC_ALWAYS)
				si_alpha_test(bld_base, alpha_ptr);

			if (si_shader_ctx->shader->key.ps.poly_line_smoothing)
				si_scale_alpha_by_sample_mask(bld_base, alpha_ptr);
			break;
		default:
			target = 0;
			fprintf(stderr,
				"Warning: SI unhandled fs output type:%d\n",
				semantic_name);
		}

		si_llvm_init_export_args_load(bld_base,
					      si_shader_ctx->radeon_bld.soa.outputs[i],
					      target, args);

		if (semantic_name == TGSI_SEMANTIC_COLOR) {
			/* If there is an export instruction waiting to be emitted, do so now. */
			if (last_args[0]) {
				lp_build_intrinsic(base->gallivm->builder,
						   "llvm.SI.export",
						   LLVMVoidTypeInContext(base->gallivm->context),
						   last_args, 9);
			}

			/* This instruction will be emitted at the end of the shader. */
			memcpy(last_args, args, sizeof(args));

			/* Handle FS_COLOR0_WRITES_ALL_CBUFS. */
			if (shader->selector->info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] &&
			    semantic_index == 0 &&
			    si_shader_ctx->shader->key.ps.last_cbuf > 0) {
				for (int c = 1; c <= si_shader_ctx->shader->key.ps.last_cbuf; c++) {
					si_llvm_init_export_args_load(bld_base,
								      si_shader_ctx->radeon_bld.soa.outputs[i],
								      V_008DFC_SQ_EXP_MRT + c, args);
					lp_build_intrinsic(base->gallivm->builder,
							   "llvm.SI.export",
							   LLVMVoidTypeInContext(base->gallivm->context),
							   args, 9);
				}
			}
		} else {
			lp_build_intrinsic(base->gallivm->builder,
					   "llvm.SI.export",
					   LLVMVoidTypeInContext(base->gallivm->context),
					   args, 9);
		}
	}

	if (depth_index >= 0 || stencil_index >= 0 || samplemask_index >= 0) {
		LLVMValueRef out_ptr;
		unsigned mask = 0;

		/* Specify the target we are exporting */
		args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_MRTZ);

		args[5] = base->zero; /* R, depth */
		args[6] = base->zero; /* G, stencil test value[0:7], stencil op value[8:15] */
		args[7] = base->zero; /* B, sample mask */
		args[8] = base->zero; /* A, alpha to mask */

		if (depth_index >= 0) {
			out_ptr = si_shader_ctx->radeon_bld.soa.outputs[depth_index][2];
			args[5] = LLVMBuildLoad(base->gallivm->builder, out_ptr, "");
			mask |= 0x1;
			si_shader_ctx->shader->db_shader_control |= S_02880C_Z_EXPORT_ENABLE(1);
		}

		if (stencil_index >= 0) {
			out_ptr = si_shader_ctx->radeon_bld.soa.outputs[stencil_index][1];
			args[6] = LLVMBuildLoad(base->gallivm->builder, out_ptr, "");
			mask |= 0x2;
			si_shader_ctx->shader->db_shader_control |=
				S_02880C_STENCIL_TEST_VAL_EXPORT_ENABLE(1);
		}

		if (samplemask_index >= 0) {
			out_ptr = si_shader_ctx->radeon_bld.soa.outputs[samplemask_index][0];
			args[7] = LLVMBuildLoad(base->gallivm->builder, out_ptr, "");
			mask |= 0x4;
			si_shader_ctx->shader->db_shader_control |= S_02880C_MASK_EXPORT_ENABLE(1);
		}

		/* SI (except OLAND) has a hardware bug: it only looks
		 * at the X writemask component. */
1533 if (si_shader_ctx->screen->b.chip_class == SI &&
1534 si_shader_ctx->screen->b.family != CHIP_OLAND)
1535 mask |= 0x1;
1536
1537 if (samplemask_index >= 0)
1538 si_shader_ctx->shader->spi_shader_z_format = V_028710_SPI_SHADER_32_ABGR;
1539 else if (stencil_index >= 0)
1540 si_shader_ctx->shader->spi_shader_z_format = V_028710_SPI_SHADER_32_GR;
1541 else
1542 si_shader_ctx->shader->spi_shader_z_format = V_028710_SPI_SHADER_32_R;
1543
1544 /* Specify which components to enable */
1545 args[0] = lp_build_const_int32(base->gallivm, mask);
1546
1547 args[1] =
1548 args[2] =
1549 args[4] = uint->zero;
1550
1551 if (last_args[0])
1552 lp_build_intrinsic(base->gallivm->builder,
1553 "llvm.SI.export",
1554 LLVMVoidTypeInContext(base->gallivm->context),
1555 args, 9);
1556 else
1557 memcpy(last_args, args, sizeof(args));
1558 }
1559
1560 if (!last_args[0]) {
1561 /* Specify which components to enable */
1562 last_args[0] = lp_build_const_int32(base->gallivm, 0x0);
1563
1564 /* Specify the target we are exporting */
1565 last_args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_MRT);
1566
1567 /* Set COMPR flag to zero to export data as 32-bit */
1568 last_args[4] = uint->zero;
1569
1570 /* dummy bits */
1571 last_args[5]= uint->zero;
1572 last_args[6]= uint->zero;
1573 last_args[7]= uint->zero;
1574 last_args[8]= uint->zero;
1575 }
1576
1577 /* Specify whether the EXEC mask represents the valid mask */
1578 last_args[1] = uint->one;
1579
1580 /* Specify that this is the last export */
1581 last_args[2] = lp_build_const_int32(base->gallivm, 1);
1582
1583 lp_build_intrinsic(base->gallivm->builder,
1584 "llvm.SI.export",
1585 LLVMVoidTypeInContext(base->gallivm->context),
1586 last_args, 9);
1587 }
1588
1589 static void build_tex_intrinsic(const struct lp_build_tgsi_action * action,
1590 struct lp_build_tgsi_context * bld_base,
1591 struct lp_build_emit_data * emit_data);
1592
1593 static bool tgsi_is_shadow_sampler(unsigned target)
1594 {
1595 return target == TGSI_TEXTURE_SHADOW1D ||
1596 target == TGSI_TEXTURE_SHADOW1D_ARRAY ||
1597 target == TGSI_TEXTURE_SHADOW2D ||
1598 target == TGSI_TEXTURE_SHADOW2D_ARRAY ||
1599 target == TGSI_TEXTURE_SHADOWCUBE ||
1600 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY ||
1601 target == TGSI_TEXTURE_SHADOWRECT;
1602 }
1603
1604 static const struct lp_build_tgsi_action tex_action;
1605
1606 static void tex_fetch_args(
1607 struct lp_build_tgsi_context * bld_base,
1608 struct lp_build_emit_data * emit_data)
1609 {
1610 struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
1611 struct gallivm_state *gallivm = bld_base->base.gallivm;
1612 const struct tgsi_full_instruction * inst = emit_data->inst;
1613 unsigned opcode = inst->Instruction.Opcode;
1614 unsigned target = inst->Texture.Texture;
1615 LLVMValueRef coords[5];
1616 LLVMValueRef address[16];
1617 int ref_pos;
1618 unsigned num_coords = tgsi_util_get_texture_coord_dim(target, &ref_pos);
1619 unsigned count = 0;
1620 unsigned chan;
1621 unsigned sampler_src = emit_data->inst->Instruction.NumSrcRegs - 1;
1622 unsigned sampler_index = emit_data->inst->Src[sampler_src].Register.Index;
1623 bool has_offset = HAVE_LLVM >= 0x0305 ? inst->Texture.NumOffsets > 0 : false;
1624
1625 if (target == TGSI_TEXTURE_BUFFER) {
1626 LLVMTypeRef i128 = LLVMIntTypeInContext(gallivm->context, 128);
1627 LLVMTypeRef v2i128 = LLVMVectorType(i128, 2);
1628 LLVMTypeRef i8 = LLVMInt8TypeInContext(gallivm->context);
1629 LLVMTypeRef v16i8 = LLVMVectorType(i8, 16);
1630
1631 /* Bitcast and truncate v8i32 to v16i8. */
1632 LLVMValueRef res = si_shader_ctx->resources[sampler_index];
1633 res = LLVMBuildBitCast(gallivm->builder, res, v2i128, "");
1634 res = LLVMBuildExtractElement(gallivm->builder, res, bld_base->uint_bld.one, "");
1635 res = LLVMBuildBitCast(gallivm->builder, res, v16i8, "");
1636
1637 emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
1638 emit_data->args[0] = res;
1639 emit_data->args[1] = bld_base->uint_bld.zero;
1640 emit_data->args[2] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, 0);
1641 emit_data->arg_count = 3;
1642 return;
1643 }
1644
1645 /* Fetch and project texture coordinates */
1646 coords[3] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_W);
1647 for (chan = 0; chan < 3; chan++ ) {
1648 coords[chan] = lp_build_emit_fetch(bld_base,
1649 emit_data->inst, 0,
1650 chan);
1651 if (opcode == TGSI_OPCODE_TXP)
1652 coords[chan] = lp_build_emit_llvm_binary(bld_base,
1653 TGSI_OPCODE_DIV,
1654 coords[chan],
1655 coords[3]);
1656 }
1657
1658 if (opcode == TGSI_OPCODE_TXP)
1659 coords[3] = bld_base->base.one;
1660
1661 /* Pack offsets. */
1662 if (has_offset && opcode != TGSI_OPCODE_TXF) {
1663 /* The offsets are six-bit signed integers packed like this:
1664 * X=[5:0], Y=[13:8], and Z=[21:16].
1665 */
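/* For example, offsets (1, -2, 0) would pack as: X = 1 -> 0x01,
 * Y = -2 -> 0x3e (six-bit two's complement) shifted left by 8 -> 0x3e00,
 * Z = 0, giving pack = 0x3e01.
 */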
1666 LLVMValueRef offset[3], pack;
1667
1668 assert(inst->Texture.NumOffsets == 1);
1669
1670 for (chan = 0; chan < 3; chan++) {
1671 offset[chan] = lp_build_emit_fetch_texoffset(bld_base,
1672 emit_data->inst, 0, chan);
1673 offset[chan] = LLVMBuildAnd(gallivm->builder, offset[chan],
1674 lp_build_const_int32(gallivm, 0x3f), "");
1675 if (chan)
1676 offset[chan] = LLVMBuildShl(gallivm->builder, offset[chan],
1677 lp_build_const_int32(gallivm, chan*8), "");
1678 }
1679
1680 pack = LLVMBuildOr(gallivm->builder, offset[0], offset[1], "");
1681 pack = LLVMBuildOr(gallivm->builder, pack, offset[2], "");
1682 address[count++] = pack;
1683 }
1684
1685 /* Pack LOD bias value */
1686 if (opcode == TGSI_OPCODE_TXB)
1687 address[count++] = coords[3];
1688 if (opcode == TGSI_OPCODE_TXB2)
1689 address[count++] = lp_build_emit_fetch(bld_base, inst, 1, 0);
1690
1691 /* Pack depth comparison value */
1692 if (tgsi_is_shadow_sampler(target) && opcode != TGSI_OPCODE_LODQ) {
1693 if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
1694 address[count++] = lp_build_emit_fetch(bld_base, inst, 1, 0);
1695 } else {
1696 assert(ref_pos >= 0);
1697 address[count++] = coords[ref_pos];
1698 }
1699 }
1700
1701 if (target == TGSI_TEXTURE_CUBE ||
1702 target == TGSI_TEXTURE_CUBE_ARRAY ||
1703 target == TGSI_TEXTURE_SHADOWCUBE ||
1704 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
1705 radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords);
1706
1707 /* Pack user derivatives */
1708 if (opcode == TGSI_OPCODE_TXD) {
1709 int num_deriv_channels, param;
1710
1711 switch (target) {
1712 case TGSI_TEXTURE_3D:
1713 num_deriv_channels = 3;
1714 break;
1715 case TGSI_TEXTURE_2D:
1716 case TGSI_TEXTURE_SHADOW2D:
1717 case TGSI_TEXTURE_RECT:
1718 case TGSI_TEXTURE_SHADOWRECT:
1719 case TGSI_TEXTURE_2D_ARRAY:
1720 case TGSI_TEXTURE_SHADOW2D_ARRAY:
1721 case TGSI_TEXTURE_CUBE:
1722 case TGSI_TEXTURE_SHADOWCUBE:
1723 case TGSI_TEXTURE_CUBE_ARRAY:
1724 case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
1725 num_deriv_channels = 2;
1726 break;
1727 case TGSI_TEXTURE_1D:
1728 case TGSI_TEXTURE_SHADOW1D:
1729 case TGSI_TEXTURE_1D_ARRAY:
1730 case TGSI_TEXTURE_SHADOW1D_ARRAY:
1731 num_deriv_channels = 1;
1732 break;
1733 default:
1734 assert(0); num_deriv_channels = 0; /* no other targets are valid here; keep release builds defined */
1735 }
1736
1737 for (param = 1; param <= 2; param++)
1738 for (chan = 0; chan < num_deriv_channels; chan++)
1739 address[count++] = lp_build_emit_fetch(bld_base, inst, param, chan);
1740 }
1741
1742 /* Pack texture coordinates */
1743 address[count++] = coords[0];
1744 if (num_coords > 1)
1745 address[count++] = coords[1];
1746 if (num_coords > 2)
1747 address[count++] = coords[2];
1748
1749 /* Pack LOD or sample index */
1750 if (opcode == TGSI_OPCODE_TXL || opcode == TGSI_OPCODE_TXF)
1751 address[count++] = coords[3];
1752 else if (opcode == TGSI_OPCODE_TXL2)
1753 address[count++] = lp_build_emit_fetch(bld_base, inst, 1, 0);
1754
1755 if (count > 16) {
1756 assert(!"Cannot handle more than 16 texture address parameters");
1757 count = 16;
1758 }
1759
1760 for (chan = 0; chan < count; chan++) {
1761 address[chan] = LLVMBuildBitCast(gallivm->builder,
1762 address[chan],
1763 LLVMInt32TypeInContext(gallivm->context),
1764 "");
1765 }
1766
1767 /* Adjust the sample index according to FMASK.
1768 *
1769 * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
1770 * which is the identity mapping. Each nibble says which physical sample
1771 * should be fetched to get that sample.
1772 *
1773 * For example, 0x11111100 means there are only 2 samples stored and
1774 * the second sample covers 3/4 of the pixel. When reading samples 0
1775 * and 1, return physical sample 0 (determined by the first two 0s
1776 * in FMASK), otherwise return physical sample 1.
1777 *
1778 * The sample index should be adjusted as follows:
1779 * sample_index = (fmask >> (sample_index * 4)) & 0xF;
1780 */
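/* Worked example for the FMASK value above: with fmask = 0x11111100,
 * reading sample 3 yields (0x11111100 >> (3 * 4)) & 0xF = 0x1, i.e.
 * physical sample 1, as expected since that sample covers 3/4 of the pixel.
 */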
1781 if (target == TGSI_TEXTURE_2D_MSAA ||
1782 target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
1783 struct lp_build_context *uint_bld = &bld_base->uint_bld;
1784 struct lp_build_emit_data txf_emit_data = *emit_data;
1785 LLVMValueRef txf_address[4];
1786 unsigned txf_count = count;
1787 struct tgsi_full_instruction inst = {};
1788
1789 memcpy(txf_address, address, sizeof(txf_address));
1790
1791 if (target == TGSI_TEXTURE_2D_MSAA) {
1792 txf_address[2] = bld_base->uint_bld.zero;
1793 }
1794 txf_address[3] = bld_base->uint_bld.zero;
1795
1796 /* Pad to a power-of-two size. */
1797 while (txf_count < util_next_power_of_two(txf_count))
1798 txf_address[txf_count++] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
1799
1800 /* Read FMASK using TXF. */
1801 inst.Instruction.Opcode = TGSI_OPCODE_TXF;
1802 inst.Texture.Texture = target == TGSI_TEXTURE_2D_MSAA ? TGSI_TEXTURE_2D : TGSI_TEXTURE_2D_ARRAY;
1803 txf_emit_data.inst = &inst;
1804 txf_emit_data.chan = 0;
1805 txf_emit_data.dst_type = LLVMVectorType(
1806 LLVMInt32TypeInContext(gallivm->context), 4);
1807 txf_emit_data.args[0] = lp_build_gather_values(gallivm, txf_address, txf_count);
1808 txf_emit_data.args[1] = si_shader_ctx->resources[SI_FMASK_TEX_OFFSET + sampler_index];
1809 txf_emit_data.args[2] = lp_build_const_int32(gallivm, inst.Texture.Texture);
1810 txf_emit_data.arg_count = 3;
1811
1812 build_tex_intrinsic(&tex_action, bld_base, &txf_emit_data);
1813
1814 /* Initialize some constants. */
1815 LLVMValueRef four = LLVMConstInt(uint_bld->elem_type, 4, 0);
1816 LLVMValueRef F = LLVMConstInt(uint_bld->elem_type, 0xF, 0);
1817
1818 /* Apply the formula. */
1819 LLVMValueRef fmask =
1820 LLVMBuildExtractElement(gallivm->builder,
1821 txf_emit_data.output[0],
1822 uint_bld->zero, "");
1823
1824 unsigned sample_chan = target == TGSI_TEXTURE_2D_MSAA ? 2 : 3;
1825
1826 LLVMValueRef sample_index4 =
1827 LLVMBuildMul(gallivm->builder, address[sample_chan], four, "");
1828
1829 LLVMValueRef shifted_fmask =
1830 LLVMBuildLShr(gallivm->builder, fmask, sample_index4, "");
1831
1832 LLVMValueRef final_sample =
1833 LLVMBuildAnd(gallivm->builder, shifted_fmask, F, "");
1834
1835 /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
1836 * resource descriptor is 0 (invalid).
1837 */
1838 LLVMValueRef fmask_desc =
1839 LLVMBuildBitCast(gallivm->builder,
1840 si_shader_ctx->resources[SI_FMASK_TEX_OFFSET + sampler_index],
1841 LLVMVectorType(uint_bld->elem_type, 8), "");
1842
1843 LLVMValueRef fmask_word1 =
1844 LLVMBuildExtractElement(gallivm->builder, fmask_desc,
1845 uint_bld->one, "");
1846
1847 LLVMValueRef word1_is_nonzero =
1848 LLVMBuildICmp(gallivm->builder, LLVMIntNE,
1849 fmask_word1, uint_bld->zero, "");
1850
1851 /* Replace the MSAA sample index. */
1852 address[sample_chan] =
1853 LLVMBuildSelect(gallivm->builder, word1_is_nonzero,
1854 final_sample, address[sample_chan], "");
1855 }
1856
1857 /* Resource */
1858 emit_data->args[1] = si_shader_ctx->resources[sampler_index];
1859
1860 if (opcode == TGSI_OPCODE_TXF) {
1861 /* add tex offsets */
1862 if (inst->Texture.NumOffsets) {
1863 struct lp_build_context *uint_bld = &bld_base->uint_bld;
1864 struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
1865 const struct tgsi_texture_offset * off = inst->TexOffsets;
1866
1867 assert(inst->Texture.NumOffsets == 1);
1868
1869 switch (target) {
1870 case TGSI_TEXTURE_3D:
1871 address[2] = lp_build_add(uint_bld, address[2],
1872 bld->immediates[off->Index][off->SwizzleZ]);
1873 /* fall through */
1874 case TGSI_TEXTURE_2D:
1875 case TGSI_TEXTURE_SHADOW2D:
1876 case TGSI_TEXTURE_RECT:
1877 case TGSI_TEXTURE_SHADOWRECT:
1878 case TGSI_TEXTURE_2D_ARRAY:
1879 case TGSI_TEXTURE_SHADOW2D_ARRAY:
1880 address[1] =
1881 lp_build_add(uint_bld, address[1],
1882 bld->immediates[off->Index][off->SwizzleY]);
1883 /* fall through */
1884 case TGSI_TEXTURE_1D:
1885 case TGSI_TEXTURE_SHADOW1D:
1886 case TGSI_TEXTURE_1D_ARRAY:
1887 case TGSI_TEXTURE_SHADOW1D_ARRAY:
1888 address[0] =
1889 lp_build_add(uint_bld, address[0],
1890 bld->immediates[off->Index][off->SwizzleX]);
1891 break;
1892 /* texture offsets do not apply to other texture targets */
1893 }
1894 }
1895
1896 emit_data->args[2] = lp_build_const_int32(gallivm, target);
1897 emit_data->arg_count = 3;
1898
1899 emit_data->dst_type = LLVMVectorType(
1900 LLVMInt32TypeInContext(gallivm->context),
1901 4);
1902 } else if (opcode == TGSI_OPCODE_TG4 ||
1903 opcode == TGSI_OPCODE_LODQ ||
1904 has_offset) {
1905 unsigned is_array = target == TGSI_TEXTURE_1D_ARRAY ||
1906 target == TGSI_TEXTURE_SHADOW1D_ARRAY ||
1907 target == TGSI_TEXTURE_2D_ARRAY ||
1908 target == TGSI_TEXTURE_SHADOW2D_ARRAY ||
1909 target == TGSI_TEXTURE_CUBE_ARRAY ||
1910 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY;
1911 unsigned is_rect = target == TGSI_TEXTURE_RECT;
1912 unsigned dmask = 0xf;
1913
1914 if (opcode == TGSI_OPCODE_TG4) {
1915 unsigned gather_comp = 0;
1916
1917 /* DMASK was repurposed for GATHER4. 4 components are always
1918 * returned and DMASK works like a swizzle - it selects
1919 * the component to fetch. The only valid DMASK values are
1920 * 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
1921 * (red,red,red,red) etc.) The ISA document doesn't mention
1922 * this.
1923 */
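/* For example, gathering the green component (src1.x == 1) yields
 * dmask = 1 << 1 = 0x2, so all four returned values come from green.
 */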
1924
1925 /* Get the component index from src1.x for Gather4. */
1926 if (!tgsi_is_shadow_sampler(target)) {
1927 LLVMValueRef (*imms)[4] = lp_soa_context(bld_base)->immediates;
1928 LLVMValueRef comp_imm;
1929 struct tgsi_src_register src1 = inst->Src[1].Register;
1930
1931 assert(src1.File == TGSI_FILE_IMMEDIATE);
1932
1933 comp_imm = imms[src1.Index][src1.SwizzleX];
1934 gather_comp = LLVMConstIntGetZExtValue(comp_imm);
1935 gather_comp = CLAMP(gather_comp, 0, 3);
1936 }
1937
1938 dmask = 1 << gather_comp;
1939 }
1940
1941 emit_data->args[2] = si_shader_ctx->samplers[sampler_index];
1942 emit_data->args[3] = lp_build_const_int32(gallivm, dmask);
1943 emit_data->args[4] = lp_build_const_int32(gallivm, is_rect); /* unorm */
1944 emit_data->args[5] = lp_build_const_int32(gallivm, 0); /* r128 */
1945 emit_data->args[6] = lp_build_const_int32(gallivm, is_array); /* da */
1946 emit_data->args[7] = lp_build_const_int32(gallivm, 0); /* glc */
1947 emit_data->args[8] = lp_build_const_int32(gallivm, 0); /* slc */
1948 emit_data->args[9] = lp_build_const_int32(gallivm, 0); /* tfe */
1949 emit_data->args[10] = lp_build_const_int32(gallivm, 0); /* lwe */
1950
1951 emit_data->arg_count = 11;
1952
1953 emit_data->dst_type = LLVMVectorType(
1954 LLVMFloatTypeInContext(gallivm->context),
1955 4);
1956 } else {
1957 emit_data->args[2] = si_shader_ctx->samplers[sampler_index];
1958 emit_data->args[3] = lp_build_const_int32(gallivm, target);
1959 emit_data->arg_count = 4;
1960
1961 emit_data->dst_type = LLVMVectorType(
1962 LLVMFloatTypeInContext(gallivm->context),
1963 4);
1964 }
1965
1966 /* The fetch opcode has been converted to a 2D array fetch.
1967 * This simplifies the LLVM backend. */
1968 if (target == TGSI_TEXTURE_CUBE_ARRAY)
1969 target = TGSI_TEXTURE_2D_ARRAY;
1970 else if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
1971 target = TGSI_TEXTURE_SHADOW2D_ARRAY;
1972
1973 /* Pad to power of two vector */
1974 while (count < util_next_power_of_two(count))
1975 address[count++] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
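/* E.g. five address dwords are padded to eight; presumably only
 * power-of-two vector widths can be named via the .v%ui32 suffix used
 * when the intrinsic name is built.
 */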
1976
1977 emit_data->args[0] = lp_build_gather_values(gallivm, address, count);
1978 }
1979
1980 static void build_tex_intrinsic(const struct lp_build_tgsi_action * action,
1981 struct lp_build_tgsi_context * bld_base,
1982 struct lp_build_emit_data * emit_data)
1983 {
1984 struct lp_build_context * base = &bld_base->base;
1985 unsigned opcode = emit_data->inst->Instruction.Opcode;
1986 unsigned target = emit_data->inst->Texture.Texture;
1987 char intr_name[127];
1988 bool has_offset = HAVE_LLVM >= 0x0305 ?
1989 emit_data->inst->Texture.NumOffsets > 0 : false;
1990
1991 if (target == TGSI_TEXTURE_BUFFER) {
1992 emit_data->output[emit_data->chan] = build_intrinsic(
1993 base->gallivm->builder,
1994 "llvm.SI.vs.load.input", emit_data->dst_type,
1995 emit_data->args, emit_data->arg_count,
1996 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
1997 return;
1998 }
1999
2000 if (opcode == TGSI_OPCODE_TG4 ||
2001 opcode == TGSI_OPCODE_LODQ ||
2002 (opcode != TGSI_OPCODE_TXF && has_offset)) {
2003 bool is_shadow = tgsi_is_shadow_sampler(target);
2004 const char *name = "llvm.SI.image.sample";
2005 const char *infix = "";
2006
2007 switch (opcode) {
2008 case TGSI_OPCODE_TEX:
2009 case TGSI_OPCODE_TEX2:
2010 case TGSI_OPCODE_TXP:
2011 break;
2012 case TGSI_OPCODE_TXB:
2013 case TGSI_OPCODE_TXB2:
2014 infix = ".b";
2015 break;
2016 case TGSI_OPCODE_TXL:
2017 case TGSI_OPCODE_TXL2:
2018 infix = ".l";
2019 break;
2020 case TGSI_OPCODE_TXD:
2021 infix = ".d";
2022 break;
2023 case TGSI_OPCODE_TG4:
2024 name = "llvm.SI.gather4";
2025 break;
2026 case TGSI_OPCODE_LODQ:
2027 name = "llvm.SI.getlod";
2028 is_shadow = false;
2029 has_offset = false;
2030 break;
2031 default:
2032 assert(0);
2033 return;
2034 }
2035
2036 /* Add the type and suffixes .c, .o if needed. */
2037 sprintf(intr_name, "%s%s%s%s.v%ui32", name,
2038 is_shadow ? ".c" : "", infix, has_offset ? ".o" : "",
2039 LLVMGetVectorSize(LLVMTypeOf(emit_data->args[0])));
2040
2041 emit_data->output[emit_data->chan] = build_intrinsic(
2042 base->gallivm->builder, intr_name, emit_data->dst_type,
2043 emit_data->args, emit_data->arg_count,
2044 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
2045 } else {
2046 LLVMTypeRef i8, v16i8, v32i8;
2047 const char *name;
2048
2049 switch (opcode) {
2050 case TGSI_OPCODE_TEX:
2051 case TGSI_OPCODE_TEX2:
2052 case TGSI_OPCODE_TXP:
2053 name = "llvm.SI.sample";
2054 break;
2055 case TGSI_OPCODE_TXB:
2056 case TGSI_OPCODE_TXB2:
2057 name = "llvm.SI.sampleb";
2058 break;
2059 case TGSI_OPCODE_TXD:
2060 name = "llvm.SI.sampled";
2061 break;
2062 case TGSI_OPCODE_TXF:
2063 name = "llvm.SI.imageload";
2064 break;
2065 case TGSI_OPCODE_TXL:
2066 case TGSI_OPCODE_TXL2:
2067 name = "llvm.SI.samplel";
2068 break;
2069 default:
2070 assert(0);
2071 return;
2072 }
2073
2074 i8 = LLVMInt8TypeInContext(base->gallivm->context);
2075 v16i8 = LLVMVectorType(i8, 16);
2076 v32i8 = LLVMVectorType(i8, 32);
2077
2078 emit_data->args[1] = LLVMBuildBitCast(base->gallivm->builder,
2079 emit_data->args[1], v32i8, "");
2080 if (opcode != TGSI_OPCODE_TXF) {
2081 emit_data->args[2] = LLVMBuildBitCast(base->gallivm->builder,
2082 emit_data->args[2], v16i8, "");
2083 }
2084
2085 sprintf(intr_name, "%s.v%ui32", name,
2086 LLVMGetVectorSize(LLVMTypeOf(emit_data->args[0])));
2087
2088 emit_data->output[emit_data->chan] = build_intrinsic(
2089 base->gallivm->builder, intr_name, emit_data->dst_type,
2090 emit_data->args, emit_data->arg_count,
2091 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
2092 }
2093 }
2094
2095 static void txq_fetch_args(
2096 struct lp_build_tgsi_context * bld_base,
2097 struct lp_build_emit_data * emit_data)
2098 {
2099 struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
2100 const struct tgsi_full_instruction *inst = emit_data->inst;
2101 struct gallivm_state *gallivm = bld_base->base.gallivm;
2102 unsigned target = inst->Texture.Texture;
2103
2104 if (target == TGSI_TEXTURE_BUFFER) {
2105 LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context);
2106 LLVMTypeRef v8i32 = LLVMVectorType(i32, 8);
2107
2108 /* Read the size from the buffer descriptor directly. */
2109 LLVMValueRef size = si_shader_ctx->resources[inst->Src[1].Register.Index];
2110 size = LLVMBuildBitCast(gallivm->builder, size, v8i32, "");
2111 size = LLVMBuildExtractElement(gallivm->builder, size,
2112 lp_build_const_int32(gallivm, 6), "");
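/* Element 6 of the eight-dword view is presumably the NUM_RECORDS field
 * of the buffer descriptor held in dwords 4-7, matching the second
 * 128-bit half extracted for buffer fetches in tex_fetch_args above.
 */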
2113 emit_data->args[0] = size;
2114 return;
2115 }
2116
2117 /* Mip level */
2118 emit_data->args[0] = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_X);
2119
2120 /* Resource */
2121 emit_data->args[1] = si_shader_ctx->resources[inst->Src[1].Register.Index];
2122
2123 /* Texture target */
2124 if (target == TGSI_TEXTURE_CUBE_ARRAY ||
2125 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
2126 target = TGSI_TEXTURE_2D_ARRAY;
2127
2128 emit_data->args[2] = lp_build_const_int32(bld_base->base.gallivm,
2129 target);
2130
2131 emit_data->arg_count = 3;
2132
2133 emit_data->dst_type = LLVMVectorType(
2134 LLVMInt32TypeInContext(bld_base->base.gallivm->context),
2135 4);
2136 }
2137
2138 static void build_txq_intrinsic(const struct lp_build_tgsi_action * action,
2139 struct lp_build_tgsi_context * bld_base,
2140 struct lp_build_emit_data * emit_data)
2141 {
2142 unsigned target = emit_data->inst->Texture.Texture;
2143
2144 if (target == TGSI_TEXTURE_BUFFER) {
2145 /* Just return the buffer size. */
2146 emit_data->output[emit_data->chan] = emit_data->args[0];
2147 return;
2148 }
2149
2150 build_tgsi_intrinsic_nomem(action, bld_base, emit_data);
2151
2152 /* Divide the number of layers by 6 to get the number of cubes. */
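/* E.g. a cube array resource with 12 layers reports 2 cubes; the layer
 * count sits in the .z component, hence the divide of element 2 below.
 */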
2153 if (target == TGSI_TEXTURE_CUBE_ARRAY ||
2154 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
2155 LLVMBuilderRef builder = bld_base->base.gallivm->builder;
2156 LLVMValueRef two = lp_build_const_int32(bld_base->base.gallivm, 2);
2157 LLVMValueRef six = lp_build_const_int32(bld_base->base.gallivm, 6);
2158
2159 LLVMValueRef v4 = emit_data->output[emit_data->chan];
2160 LLVMValueRef z = LLVMBuildExtractElement(builder, v4, two, "");
2161 z = LLVMBuildSDiv(builder, z, six, "");
2162
2163 emit_data->output[emit_data->chan] =
2164 LLVMBuildInsertElement(builder, v4, z, two, "");
2165 }
2166 }
2167
2168 static void si_llvm_emit_ddxy(
2169 const struct lp_build_tgsi_action * action,
2170 struct lp_build_tgsi_context * bld_base,
2171 struct lp_build_emit_data * emit_data)
2172 {
2173 struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
2174 struct gallivm_state *gallivm = bld_base->base.gallivm;
2175 struct lp_build_context * base = &bld_base->base;
2176 const struct tgsi_full_instruction *inst = emit_data->inst;
2177 unsigned opcode = inst->Instruction.Opcode;
2178 LLVMValueRef indices[2];
2179 LLVMValueRef store_ptr, load_ptr0, load_ptr1;
2180 LLVMValueRef tl, trbl, result[4];
2181 LLVMTypeRef i32;
2182 unsigned swizzle[4];
2183 unsigned c;
2184
2185 i32 = LLVMInt32TypeInContext(gallivm->context);
2186
2187 indices[0] = bld_base->uint_bld.zero;
2188 indices[1] = build_intrinsic(gallivm->builder, "llvm.SI.tid", i32,
2189 NULL, 0, LLVMReadNoneAttribute);
2190 store_ptr = LLVMBuildGEP(gallivm->builder, si_shader_ctx->ddxy_lds,
2191 indices, 2, "");
2192
2193 indices[1] = LLVMBuildAnd(gallivm->builder, indices[1],
2194 lp_build_const_int32(gallivm, 0xfffffffc), "");
2195 load_ptr0 = LLVMBuildGEP(gallivm->builder, si_shader_ctx->ddxy_lds,
2196 indices, 2, "");
2197
2198 indices[1] = LLVMBuildAdd(gallivm->builder, indices[1],
2199 lp_build_const_int32(gallivm,
2200 opcode == TGSI_OPCODE_DDX ? 1 : 2),
2201 "");
2202 load_ptr1 = LLVMBuildGEP(gallivm->builder, si_shader_ctx->ddxy_lds,
2203 indices, 2, "");
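/* A sketch of the idea, assuming the usual 2x2 quad layout of lanes:
 * tid & ~3 is the top-left lane of the quad, +1 is its right neighbour
 * and +2 the lane below it, so the trbl - tl difference computed below
 * approximates d/dx or d/dy of the fetched value across the quad.
 */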
2204
2205 for (c = 0; c < 4; ++c) {
2206 unsigned i;
2207
2208 swizzle[c] = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], c);
2209 for (i = 0; i < c; ++i) {
2210 if (swizzle[i] == swizzle[c]) {
2211 result[c] = result[i];
2212 break;
2213 }
2214 }
2215 if (i != c)
2216 continue;
2217
2218 LLVMBuildStore(gallivm->builder,
2219 LLVMBuildBitCast(gallivm->builder,
2220 lp_build_emit_fetch(bld_base, inst, 0, c),
2221 i32, ""),
2222 store_ptr);
2223
2224 tl = LLVMBuildLoad(gallivm->builder, load_ptr0, "");
2225 tl = LLVMBuildBitCast(gallivm->builder, tl, base->elem_type, "");
2226
2227 trbl = LLVMBuildLoad(gallivm->builder, load_ptr1, "");
2228 trbl = LLVMBuildBitCast(gallivm->builder, trbl, base->elem_type, "");
2229
2230 result[c] = LLVMBuildFSub(gallivm->builder, trbl, tl, "");
2231 }
2232
2233 emit_data->output[0] = lp_build_gather_values(gallivm, result, 4);
2234 }
2235
2236 /* Emit one vertex from the geometry shader */
2237 static void si_llvm_emit_vertex(
2238 const struct lp_build_tgsi_action *action,
2239 struct lp_build_tgsi_context *bld_base,
2240 struct lp_build_emit_data *emit_data)
2241 {
2242 struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
2243 struct lp_build_context *uint = &bld_base->uint_bld;
2244 struct si_shader *shader = si_shader_ctx->shader;
2245 struct tgsi_shader_info *info = &shader->selector->info;
2246 struct gallivm_state *gallivm = bld_base->base.gallivm;
2247 LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context);
2248 LLVMValueRef soffset = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
2249 SI_PARAM_GS2VS_OFFSET);
2250 LLVMValueRef gs_next_vertex;
2251 LLVMValueRef can_emit, kill;
2252 LLVMValueRef args[2];
2253 unsigned chan;
2254 int i;
2255
2256 /* Write vertex attribute values to GSVS ring */
2257 gs_next_vertex = LLVMBuildLoad(gallivm->builder, si_shader_ctx->gs_next_vertex, "");
2258
2259 /* If this thread has already emitted the declared maximum number of
2260 * vertices, kill it: excessive vertex emissions are not supposed to
2261 * have any effect, and GS threads have no externally observable
2262 * effects other than emitting vertices.
2263 */
2264 can_emit = LLVMBuildICmp(gallivm->builder, LLVMIntULT, gs_next_vertex,
2265 lp_build_const_int32(gallivm,
2266 shader->selector->gs_max_out_vertices), "");
2267 kill = lp_build_select(&bld_base->base, can_emit,
2268 lp_build_const_float(gallivm, 1.0f),
2269 lp_build_const_float(gallivm, -1.0f));
2270 build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
2271 LLVMVoidTypeInContext(gallivm->context), &kill, 1, 0);
2272
2273 for (i = 0; i < info->num_outputs; i++) {
2274 LLVMValueRef *out_ptr =
2275 si_shader_ctx->radeon_bld.soa.outputs[i];
2276
2277 for (chan = 0; chan < 4; chan++) {
2278 LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
2279 LLVMValueRef voffset =
2280 lp_build_const_int32(gallivm, (i * 4 + chan) *
2281 shader->selector->gs_max_out_vertices);
2282
2283 voffset = lp_build_add(uint, voffset, gs_next_vertex);
2284 voffset = lp_build_mul_imm(uint, voffset, 4);
2285
2286 out_val = LLVMBuildBitCast(gallivm->builder, out_val, i32, "");
2287
2288 build_tbuffer_store(si_shader_ctx,
2289 si_shader_ctx->gsvs_ring,
2290 out_val, 1,
2291 voffset, soffset, 0,
2292 V_008F0C_BUF_DATA_FORMAT_32,
2293 V_008F0C_BUF_NUM_FORMAT_UINT,
2294 1, 0, 1, 1, 0);
2295 }
2296 }
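/* Ring addressing sketch: for output slot i, channel chan and the current
 * vertex v, the dword written above is
 *   (i * 4 + chan) * gs_max_out_vertices + v
 * scaled by 4 to a byte offset, with GS2VS_OFFSET as the base; i.e. the
 * GSVS ring is laid out component-major, one vertex per dword.
 */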
2297 gs_next_vertex = lp_build_add(uint, gs_next_vertex,
2298 lp_build_const_int32(gallivm, 1));
2299 LLVMBuildStore(gallivm->builder, gs_next_vertex, si_shader_ctx->gs_next_vertex);
2300
2301 /* Signal vertex emission */
2302 args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_EMIT | SENDMSG_GS);
2303 args[1] = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
2304 build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
2305 LLVMVoidTypeInContext(gallivm->context), args, 2,
2306 LLVMNoUnwindAttribute);
2307 }
2308
2309 /* Cut one primitive from the geometry shader */
2310 static void si_llvm_emit_primitive(
2311 const struct lp_build_tgsi_action *action,
2312 struct lp_build_tgsi_context *bld_base,
2313 struct lp_build_emit_data *emit_data)
2314 {
2315 struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
2316 struct gallivm_state *gallivm = bld_base->base.gallivm;
2317 LLVMValueRef args[2];
2318
2319 /* Signal primitive cut */
2320 args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_CUT | SENDMSG_GS);
2321 args[1] = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
2322 build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
2323 LLVMVoidTypeInContext(gallivm->context), args, 2,
2324 LLVMNoUnwindAttribute);
2325 }
2326
2327 static const struct lp_build_tgsi_action tex_action = {
2328 .fetch_args = tex_fetch_args,
2329 .emit = build_tex_intrinsic,
2330 };
2331
2332 static const struct lp_build_tgsi_action txq_action = {
2333 .fetch_args = txq_fetch_args,
2334 .emit = build_txq_intrinsic,
2335 .intr_name = "llvm.SI.resinfo"
2336 };
2337
2338 static void create_meta_data(struct si_shader_context *si_shader_ctx)
2339 {
2340 struct gallivm_state *gallivm = si_shader_ctx->radeon_bld.soa.bld_base.base.gallivm;
2341 LLVMValueRef args[3];
2342
2343 args[0] = LLVMMDStringInContext(gallivm->context, "const", 5);
2344 args[1] = NULL;
2345 args[2] = lp_build_const_int32(gallivm, 1);
2346
2347 si_shader_ctx->const_md = LLVMMDNodeInContext(gallivm->context, args, 3);
2348 }
2349
2350 static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements)
2351 {
2352 return LLVMPointerType(LLVMArrayType(elem_type, num_elements),
2353 CONST_ADDR_SPACE);
2354 }
2355
2356 static void create_function(struct si_shader_context *si_shader_ctx)
2357 {
2358 struct lp_build_tgsi_context *bld_base = &si_shader_ctx->radeon_bld.soa.bld_base;
2359 struct gallivm_state *gallivm = bld_base->base.gallivm;
2360 struct si_shader *shader = si_shader_ctx->shader;
2361 LLVMTypeRef params[SI_NUM_PARAMS], f32, i8, i32, v2i32, v3i32, v16i8, v4i32, v8i32;
2362 unsigned i, last_array_pointer, last_sgpr, num_params;
2363
2364 i8 = LLVMInt8TypeInContext(gallivm->context);
2365 i32 = LLVMInt32TypeInContext(gallivm->context);
2366 f32 = LLVMFloatTypeInContext(gallivm->context);
2367 v2i32 = LLVMVectorType(i32, 2);
2368 v3i32 = LLVMVectorType(i32, 3);
2369 v4i32 = LLVMVectorType(i32, 4);
2370 v8i32 = LLVMVectorType(i32, 8);
2371 v16i8 = LLVMVectorType(i8, 16);
2372
2373 params[SI_PARAM_RW_BUFFERS] = const_array(v16i8, SI_NUM_RW_BUFFERS);
2374 params[SI_PARAM_CONST] = const_array(v16i8, SI_NUM_CONST_BUFFERS);
2375 params[SI_PARAM_SAMPLER] = const_array(v4i32, SI_NUM_SAMPLER_STATES);
2376 params[SI_PARAM_RESOURCE] = const_array(v8i32, SI_NUM_SAMPLER_VIEWS);
2377 last_array_pointer = SI_PARAM_RESOURCE;
2378
2379 switch (si_shader_ctx->type) {
2380 case TGSI_PROCESSOR_VERTEX:
2381 params[SI_PARAM_VERTEX_BUFFER] = const_array(v16i8, SI_NUM_VERTEX_BUFFERS);
2382 last_array_pointer = SI_PARAM_VERTEX_BUFFER;
2383 params[SI_PARAM_BASE_VERTEX] = i32;
2384 params[SI_PARAM_START_INSTANCE] = i32;
2385 num_params = SI_PARAM_START_INSTANCE+1;
2386
2387 if (shader->key.vs.as_es) {
2388 params[SI_PARAM_ES2GS_OFFSET] = i32;
2389 num_params++;
2390 } else {
2391 if (shader->is_gs_copy_shader) {
2392 last_array_pointer = SI_PARAM_CONST;
2393 num_params = SI_PARAM_CONST+1;
2394 }
2395
2396 /* The locations of the other parameters are assigned dynamically. */
2397
2398 /* Streamout SGPRs. */
2399 if (shader->selector->so.num_outputs) {
2400 params[si_shader_ctx->param_streamout_config = num_params++] = i32;
2401 params[si_shader_ctx->param_streamout_write_index = num_params++] = i32;
2402 }
2403 /* A streamout buffer offset is loaded if the stride is non-zero. */
2404 for (i = 0; i < 4; i++) {
2405 if (!shader->selector->so.stride[i])
2406 continue;
2407
2408 params[si_shader_ctx->param_streamout_offset[i] = num_params++] = i32;
2409 }
2410 }
2411
2412 last_sgpr = num_params-1;
2413
2414 /* VGPRs */
2415 params[si_shader_ctx->param_vertex_id = num_params++] = i32;
2416 params[num_params++] = i32; /* unused */
2417 params[num_params++] = i32; /* unused */
2418 params[si_shader_ctx->param_instance_id = num_params++] = i32;
2419 break;
2420
2421 case TGSI_PROCESSOR_GEOMETRY:
2422 params[SI_PARAM_GS2VS_OFFSET] = i32;
2423 params[SI_PARAM_GS_WAVE_ID] = i32;
2424 last_sgpr = SI_PARAM_GS_WAVE_ID;
2425
2426 /* VGPRs */
2427 params[SI_PARAM_VTX0_OFFSET] = i32;
2428 params[SI_PARAM_VTX1_OFFSET] = i32;
2429 params[SI_PARAM_PRIMITIVE_ID] = i32;
2430 params[SI_PARAM_VTX2_OFFSET] = i32;
2431 params[SI_PARAM_VTX3_OFFSET] = i32;
2432 params[SI_PARAM_VTX4_OFFSET] = i32;
2433 params[SI_PARAM_VTX5_OFFSET] = i32;
2434 params[SI_PARAM_GS_INSTANCE_ID] = i32;
2435 num_params = SI_PARAM_GS_INSTANCE_ID+1;
2436 break;
2437
2438 case TGSI_PROCESSOR_FRAGMENT:
2439 params[SI_PARAM_ALPHA_REF] = f32;
2440 params[SI_PARAM_PRIM_MASK] = i32;
2441 last_sgpr = SI_PARAM_PRIM_MASK;
2442 params[SI_PARAM_PERSP_SAMPLE] = v2i32;
2443 params[SI_PARAM_PERSP_CENTER] = v2i32;
2444 params[SI_PARAM_PERSP_CENTROID] = v2i32;
2445 params[SI_PARAM_PERSP_PULL_MODEL] = v3i32;
2446 params[SI_PARAM_LINEAR_SAMPLE] = v2i32;
2447 params[SI_PARAM_LINEAR_CENTER] = v2i32;
2448 params[SI_PARAM_LINEAR_CENTROID] = v2i32;
2449 params[SI_PARAM_LINE_STIPPLE_TEX] = f32;
2450 params[SI_PARAM_POS_X_FLOAT] = f32;
2451 params[SI_PARAM_POS_Y_FLOAT] = f32;
2452 params[SI_PARAM_POS_Z_FLOAT] = f32;
2453 params[SI_PARAM_POS_W_FLOAT] = f32;
2454 params[SI_PARAM_FRONT_FACE] = f32;
2455 params[SI_PARAM_ANCILLARY] = i32;
2456 params[SI_PARAM_SAMPLE_COVERAGE] = f32;
2457 params[SI_PARAM_POS_FIXED_PT] = f32;
2458 num_params = SI_PARAM_POS_FIXED_PT+1;
2459 break;
2460
2461 default:
2462 assert(0 && "unimplemented shader");
2463 return;
2464 }
2465
2466 assert(num_params <= Elements(params));
2467 radeon_llvm_create_func(&si_shader_ctx->radeon_bld, params, num_params);
2468 radeon_llvm_shader_type(si_shader_ctx->radeon_bld.main_fn, si_shader_ctx->type);
2469
2470 if (shader->dx10_clamp_mode)
2471 LLVMAddTargetDependentFunctionAttr(si_shader_ctx->radeon_bld.main_fn,
2472 "enable-no-nans-fp-math", "true");
2473
2474 for (i = 0; i <= last_sgpr; ++i) {
2475 LLVMValueRef P = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, i);
2476
2477 /* We tell LLVM that array inputs are passed by value so that the Sinking
2478 * pass can move the loads. The inputs are constant, so this is fine. */
2479 if (i <= last_array_pointer)
2480 LLVMAddAttribute(P, LLVMByValAttribute);
2481 else
2482 LLVMAddAttribute(P, LLVMInRegAttribute);
2483 }
2484
2485 if (bld_base->info &&
2486 (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 ||
2487 bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0))
2488 si_shader_ctx->ddxy_lds =
2489 LLVMAddGlobalInAddressSpace(gallivm->module,
2490 LLVMArrayType(i32, 64),
2491 "ddxy_lds",
2492 LOCAL_ADDR_SPACE);
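/* The 64 i32 slots presumably give each lane of a 64-lane wave its own
 * LDS word, indexed by llvm.SI.tid in si_llvm_emit_ddxy above.
 */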
2493 }
2494
2495 static void preload_constants(struct si_shader_context *si_shader_ctx)
2496 {
2497 struct lp_build_tgsi_context * bld_base = &si_shader_ctx->radeon_bld.soa.bld_base;
2498 struct gallivm_state * gallivm = bld_base->base.gallivm;
2499 const struct tgsi_shader_info * info = bld_base->info;
2500 unsigned buf;
2501 LLVMValueRef ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_CONST);
2502
2503 for (buf = 0; buf < SI_NUM_CONST_BUFFERS; buf++) {
2504 unsigned i, num_const = info->const_file_max[buf] + 1;
2505
2506 if (num_const == 0)
2507 continue;
2508
2509 /* Allocate space for the constant values */
2510 si_shader_ctx->constants[buf] = CALLOC(num_const * 4, sizeof(LLVMValueRef));
2511
2512 /* Load the resource descriptor */
2513 si_shader_ctx->const_resource[buf] =
2514 build_indexed_load_const(si_shader_ctx, ptr, lp_build_const_int32(gallivm, buf));
2515
2516 /* Load the constants, we rely on the code sinking to do the rest */
2517 for (i = 0; i < num_const * 4; ++i) {
2518 si_shader_ctx->constants[buf][i] =
2519 buffer_load_const(gallivm->builder,
2520 si_shader_ctx->const_resource[buf],
2521 lp_build_const_int32(gallivm, i * 4),
2522 bld_base->base.elem_type);
2523 }
2524 }
2525 }
2526
2527 static void preload_samplers(struct si_shader_context *si_shader_ctx)
2528 {
2529 struct lp_build_tgsi_context * bld_base = &si_shader_ctx->radeon_bld.soa.bld_base;
2530 struct gallivm_state * gallivm = bld_base->base.gallivm;
2531 const struct tgsi_shader_info * info = bld_base->info;
2532
2533 unsigned i, num_samplers = info->file_max[TGSI_FILE_SAMPLER] + 1;
2534
2535 LLVMValueRef res_ptr, samp_ptr;
2536 LLVMValueRef offset;
2537
2538 if (num_samplers == 0)
2539 return;
2540
2541 res_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_RESOURCE);
2542 samp_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER);
2543
2544 /* Load the resources and samplers, we rely on the code sinking to do the rest */
2545 for (i = 0; i < num_samplers; ++i) {
2546 /* Resource */
2547 offset = lp_build_const_int32(gallivm, i);
2548 si_shader_ctx->resources[i] = build_indexed_load_const(si_shader_ctx, res_ptr, offset);
2549
2550 /* Sampler */
2551 offset = lp_build_const_int32(gallivm, i);
2552 si_shader_ctx->samplers[i] = build_indexed_load_const(si_shader_ctx, samp_ptr, offset);
2553
2554 /* FMASK resource */
2555 if (info->is_msaa_sampler[i]) {
2556 offset = lp_build_const_int32(gallivm, SI_FMASK_TEX_OFFSET + i);
2557 si_shader_ctx->resources[SI_FMASK_TEX_OFFSET + i] =
2558 build_indexed_load_const(si_shader_ctx, res_ptr, offset);
2559 }
2560 }
2561 }
2562
2563 static void preload_streamout_buffers(struct si_shader_context *si_shader_ctx)
2564 {
2565 struct lp_build_tgsi_context * bld_base = &si_shader_ctx->radeon_bld.soa.bld_base;
2566 struct gallivm_state * gallivm = bld_base->base.gallivm;
2567 unsigned i;
2568
2569 if (si_shader_ctx->type != TGSI_PROCESSOR_VERTEX ||
2570 si_shader_ctx->shader->key.vs.as_es ||
2571 !si_shader_ctx->shader->selector->so.num_outputs)
2572 return;
2573
2574 LLVMValueRef buf_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
2575 SI_PARAM_RW_BUFFERS);
2576
2577 /* Load the resources, we rely on the code sinking to do the rest */
2578 for (i = 0; i < 4; ++i) {
2579 if (si_shader_ctx->shader->selector->so.stride[i]) {
2580 LLVMValueRef offset = lp_build_const_int32(gallivm,
2581 SI_SO_BUF_OFFSET + i);
2582
2583 si_shader_ctx->so_buffers[i] = build_indexed_load_const(si_shader_ctx, buf_ptr, offset);
2584 }
2585 }
2586 }
2587
2588 /**
2589 * Load ESGS and GSVS ring buffer resource descriptors and save the variables
2590 * for later use.
2591 */
2592 static void preload_ring_buffers(struct si_shader_context *si_shader_ctx)
2593 {
2594 struct gallivm_state *gallivm =
2595 si_shader_ctx->radeon_bld.soa.bld_base.base.gallivm;
2596
2597 LLVMValueRef buf_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
2598 SI_PARAM_RW_BUFFERS);
2599
2600 if ((si_shader_ctx->type == TGSI_PROCESSOR_VERTEX &&
2601 si_shader_ctx->shader->key.vs.as_es) ||
2602 si_shader_ctx->type == TGSI_PROCESSOR_GEOMETRY) {
2603 LLVMValueRef offset = lp_build_const_int32(gallivm, SI_RING_ESGS);
2604
2605 si_shader_ctx->esgs_ring =
2606 build_indexed_load_const(si_shader_ctx, buf_ptr, offset);
2607 }
2608
2609 if (si_shader_ctx->type == TGSI_PROCESSOR_GEOMETRY ||
2610 si_shader_ctx->shader->is_gs_copy_shader) {
2611 LLVMValueRef offset = lp_build_const_int32(gallivm, SI_RING_GSVS);
2612
2613 si_shader_ctx->gsvs_ring =
2614 build_indexed_load_const(si_shader_ctx, buf_ptr, offset);
2615 }
2616 }
2617
2618 void si_shader_binary_read_config(const struct si_screen *sscreen,
2619 struct si_shader *shader,
2620 unsigned symbol_offset)
2621 {
2622 unsigned i;
2623 const unsigned char *config =
2624 radeon_shader_binary_config_start(&shader->binary,
2625 symbol_offset);
2626
2627 /* XXX: We may be able to emit some of these values directly rather than
2628 * extracting fields to be emitted later.
2629 */
2630
2631 for (i = 0; i < shader->binary.config_size_per_symbol; i += 8) {
2632 unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
2633 unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
2634 switch (reg) {
2635 case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
2636 case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
2637 case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
2638 case R_00B848_COMPUTE_PGM_RSRC1:
2639 shader->num_sgprs = MAX2(shader->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
2640 shader->num_vgprs = MAX2(shader->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
2641 shader->float_mode = G_00B028_FLOAT_MODE(value);
2642 break;
2643 case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
2644 shader->lds_size = MAX2(shader->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
2645 break;
2646 case R_00B84C_COMPUTE_PGM_RSRC2:
2647 shader->lds_size = MAX2(shader->lds_size, G_00B84C_LDS_SIZE(value));
2648 break;
2649 case R_0286CC_SPI_PS_INPUT_ENA:
2650 shader->spi_ps_input_ena = value;
2651 break;
2652 case R_0286E8_SPI_TMPRING_SIZE:
2653 case R_00B860_COMPUTE_TMPRING_SIZE:
2654 /* WAVESIZE is in units of 256 dwords. */
2655 shader->scratch_bytes_per_wave =
2656 G_00B860_WAVESIZE(value) * 256 * 4 * 1;
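/* E.g. WAVESIZE = 2 means 2 * 256 dwords = 2048 bytes of scratch per
 * wave (256 dwords of 4 bytes each).
 */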
2657 break;
2658 default:
2659 fprintf(stderr, "Warning: Compiler emitted unknown "
2660 "config register: 0x%x\n", reg);
2661 break;
2662 }
2663 }
2664 }
2665
2666 void si_shader_apply_scratch_relocs(struct si_context *sctx,
2667 struct si_shader *shader,
2668 uint64_t scratch_va)
2669 {
2670 unsigned i;
2671 uint32_t scratch_rsrc_dword0 = scratch_va & 0xffffffff;
2672 uint32_t scratch_rsrc_dword1 =
2673 S_008F04_BASE_ADDRESS_HI(scratch_va >> 32)
2674 | S_008F04_STRIDE(shader->scratch_bytes_per_wave / 64);
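/* The STRIDE field presumably holds the per-lane scratch size: with 64
 * lanes per wave, that is scratch_bytes_per_wave / 64 bytes per lane.
 */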
2675
2676 for (i = 0; i < shader->binary.reloc_count; i++) {
2677 const struct radeon_shader_reloc *reloc =
2678 &shader->binary.relocs[i];
2679 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
2680 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
2681 &scratch_rsrc_dword0, 4);
2682 } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
2683 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
2684 &scratch_rsrc_dword1, 4);
2685 }
2686 }
2687 }
2688
2689 int si_shader_binary_read(struct si_screen *sscreen,
2690 struct si_shader *shader,
2691 const struct radeon_shader_binary *binary)
2692 {
2693
2694 unsigned i;
2695 unsigned code_size;
2696 unsigned char *ptr;
2697 bool dump = r600_can_dump_shader(&sscreen->b,
2698 shader->selector ? shader->selector->tokens : NULL);
2699
2700 si_shader_binary_read_config(sscreen, shader, 0);
2701
2702 if (dump) {
2703 if (!binary->disassembled) {
2704 fprintf(stderr, "SI CODE:\n");
2705 for (i = 0; i < binary->code_size; i += 4) {
2706 fprintf(stderr, "@0x%x: %02x%02x%02x%02x\n", i, binary->code[i + 3],
2707 binary->code[i + 2], binary->code[i + 1],
2708 binary->code[i]);
2709 }
2710 }
2711
2712 fprintf(stderr, "*** SHADER STATS ***\n"
2713 "SGPRS: %d\nVGPRS: %d\nCode Size: %d bytes\nLDS: %d blocks\n"
2714 "Scratch: %d bytes per wave\n********************\n",
2715 shader->num_sgprs, shader->num_vgprs, binary->code_size,
2716 shader->lds_size, shader->scratch_bytes_per_wave);
2717 }
2718
2719 /* copy new shader */
2720 code_size = binary->code_size + binary->rodata_size;
2721 r600_resource_reference(&shader->bo, NULL);
2722 shader->bo = si_resource_create_custom(&sscreen->b.b, PIPE_USAGE_IMMUTABLE,
2723 code_size);
2724 if (shader->bo == NULL) {
2725 return -ENOMEM;
2726 }
2727
2728
2729 ptr = sscreen->b.ws->buffer_map(shader->bo->cs_buf, NULL, PIPE_TRANSFER_READ_WRITE);
2730 util_memcpy_cpu_to_le32(ptr, binary->code, binary->code_size);
2731 if (binary->rodata_size > 0) {
2732 ptr += binary->code_size;
2733 util_memcpy_cpu_to_le32(ptr, binary->rodata, binary->rodata_size);
2734 }
2735
2736 sscreen->b.ws->buffer_unmap(shader->bo->cs_buf);
2737
2738 return 0;
2739 }
2740
2741 int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
2742 LLVMTargetMachineRef tm, LLVMModuleRef mod)
2743 {
2744 int r = 0;
2745 bool dump = r600_can_dump_shader(&sscreen->b,
2746 shader->selector ? shader->selector->tokens : NULL);
2747 r = radeon_llvm_compile(mod, &shader->binary,
2748 r600_get_llvm_processor_name(sscreen->b.family), dump, tm);
2749
2750 if (r) {
2751 return r;
2752 }
2753 r = si_shader_binary_read(sscreen, shader, &shader->binary);
2754
2755 FREE(shader->binary.config);
2756 FREE(shader->binary.rodata);
2757 FREE(shader->binary.global_symbol_offsets);
2758 if (shader->scratch_bytes_per_wave == 0) {
2759 FREE(shader->binary.code);
2760 FREE(shader->binary.relocs);
2761 memset(&shader->binary, 0, sizeof(shader->binary));
2762 }
2763 return r;
2764 }
2765
2766 /* Generate code for the hardware VS shader stage to go with a geometry shader */
2767 static int si_generate_gs_copy_shader(struct si_screen *sscreen,
2768 struct si_shader_context *si_shader_ctx,
2769 struct si_shader *gs, bool dump)
2770 {
2771 struct gallivm_state *gallivm = &si_shader_ctx->radeon_bld.gallivm;
2772 struct lp_build_tgsi_context *bld_base = &si_shader_ctx->radeon_bld.soa.bld_base;
2773 struct lp_build_context *base = &bld_base->base;
2774 struct lp_build_context *uint = &bld_base->uint_bld;
2775 struct si_shader *shader = si_shader_ctx->shader;
2776 struct si_shader_output_values *outputs;
2777 struct tgsi_shader_info *gsinfo = &gs->selector->info;
2778 LLVMValueRef args[9];
2779 int i, r;
2780
2781 outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0]));
2782
2783 si_shader_ctx->type = TGSI_PROCESSOR_VERTEX;
2784 shader->is_gs_copy_shader = true;
2785
2786 radeon_llvm_context_init(&si_shader_ctx->radeon_bld);
2787
2788 create_meta_data(si_shader_ctx);
2789 create_function(si_shader_ctx);
2790 preload_streamout_buffers(si_shader_ctx);
2791 preload_ring_buffers(si_shader_ctx);
2792
2793 args[0] = si_shader_ctx->gsvs_ring;
2794 args[1] = lp_build_mul_imm(uint,
2795 LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
2796 si_shader_ctx->param_vertex_id),
2797 4);
2798 args[3] = uint->zero;
2799 args[4] = uint->one; /* OFFEN */
2800 args[5] = uint->zero; /* IDXEN */
2801 args[6] = uint->one; /* GLC */
2802 args[7] = uint->one; /* SLC */
2803 args[8] = uint->zero; /* TFE */
2804
2805 /* Fetch vertex data from GSVS ring */
2806 for (i = 0; i < gsinfo->num_outputs; ++i) {
2807 unsigned chan;
2808
2809 outputs[i].name = gsinfo->output_semantic_name[i];
2810 outputs[i].sid = gsinfo->output_semantic_index[i];
2811
2812 for (chan = 0; chan < 4; chan++) {
2813 args[2] = lp_build_const_int32(gallivm,
2814 (i * 4 + chan) *
2815 gs->selector->gs_max_out_vertices * 16 * 4);
2816
2817 outputs[i].values[chan] =
2818 LLVMBuildBitCast(gallivm->builder,
2819 build_intrinsic(gallivm->builder,
2820 "llvm.SI.buffer.load.dword.i32.i32",
2821 LLVMInt32TypeInContext(gallivm->context),
2822 args, 9,
2823 LLVMReadOnlyAttribute | LLVMNoUnwindAttribute),
2824 base->elem_type, "");
2825 }
2826 }
2827
2828 si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs);
2829
2830 radeon_llvm_finalize_module(&si_shader_ctx->radeon_bld);
2831
2832 if (dump)
2833 fprintf(stderr, "Copy Vertex Shader for Geometry Shader:\n\n");
2834
2835 r = si_compile_llvm(sscreen, si_shader_ctx->shader,
2836 si_shader_ctx->tm, bld_base->base.gallivm->module);
2837
2838 radeon_llvm_dispose(&si_shader_ctx->radeon_bld);
2839
2840 FREE(outputs);
2841 return r;
2842 }
2843
2844 static void si_dump_key(unsigned shader, union si_shader_key *key)
2845 {
2846 int i;
2847
2848 fprintf(stderr, "SHADER KEY\n");
2849
2850 switch (shader) {
2851 case PIPE_SHADER_VERTEX:
2852 fprintf(stderr, " instance_divisors = {");
2853 for (i = 0; i < Elements(key->vs.instance_divisors); i++)
2854 fprintf(stderr, !i ? "%u" : ", %u",
2855 key->vs.instance_divisors[i]);
2856 fprintf(stderr, "}\n");
2857
2858 if (key->vs.as_es)
2859 fprintf(stderr, " gs_used_inputs = 0x%"PRIx64"\n",
2860 key->vs.gs_used_inputs);
2861 fprintf(stderr, " as_es = %u\n", key->vs.as_es);
2862 break;
2863
2864 case PIPE_SHADER_GEOMETRY:
2865 break;
2866
2867 case PIPE_SHADER_FRAGMENT:
2868 fprintf(stderr, " export_16bpc = 0x%X\n", key->ps.export_16bpc);
2869 fprintf(stderr, " last_cbuf = %u\n", key->ps.last_cbuf);
2870 fprintf(stderr, " color_two_side = %u\n", key->ps.color_two_side);
2871 fprintf(stderr, " alpha_func = %u\n", key->ps.alpha_func);
2872 fprintf(stderr, " alpha_to_one = %u\n", key->ps.alpha_to_one);
2873 fprintf(stderr, " poly_stipple = %u\n", key->ps.poly_stipple);
2874 break;
2875
2876 default:
2877 assert(0);
2878 }
2879 }
2880
2881 int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
2882 struct si_shader *shader)
2883 {
2884 struct si_shader_selector *sel = shader->selector;
2885 struct tgsi_token *tokens = sel->tokens;
2886 struct si_shader_context si_shader_ctx;
2887 struct lp_build_tgsi_context * bld_base;
2888 struct tgsi_shader_info stipple_shader_info;
2889 LLVMModuleRef mod;
2890 int r = 0;
2891 bool poly_stipple = sel->type == PIPE_SHADER_FRAGMENT &&
2892 shader->key.ps.poly_stipple;
2893 bool dump = r600_can_dump_shader(&sscreen->b, sel->tokens);
2894
2895 if (poly_stipple) {
2896 tokens = util_pstipple_create_fragment_shader(tokens, NULL,
2897 SI_POLY_STIPPLE_SAMPLER);
2898 tgsi_scan_shader(tokens, &stipple_shader_info);
2899 }
2900
2901 /* Dump TGSI code before doing TGSI->LLVM conversion in case the
2902 * conversion fails. */
2903 if (dump) {
2904 si_dump_key(sel->type, &shader->key);
2905 tgsi_dump(tokens, 0);
2906 si_dump_streamout(&sel->so);
2907 }
2908
2909 assert(shader->nparam == 0);
2910
2911 memset(&si_shader_ctx, 0, sizeof(si_shader_ctx));
2912 radeon_llvm_context_init(&si_shader_ctx.radeon_bld);
2913 bld_base = &si_shader_ctx.radeon_bld.soa.bld_base;
2914
2915 if (sel->type != PIPE_SHADER_COMPUTE)
2916 shader->dx10_clamp_mode = true;
2917
2918 if (sel->info.uses_kill)
2919 shader->db_shader_control |= S_02880C_KILL_ENABLE(1);
2920
2921 shader->uses_instanceid = sel->info.uses_instanceid;
2922 bld_base->info = poly_stipple ? &stipple_shader_info : &sel->info;
2923 bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;
2924
2925 bld_base->op_actions[TGSI_OPCODE_TEX] = tex_action;
2926 bld_base->op_actions[TGSI_OPCODE_TEX2] = tex_action;
2927 bld_base->op_actions[TGSI_OPCODE_TXB] = tex_action;
2928 bld_base->op_actions[TGSI_OPCODE_TXB2] = tex_action;
2929 bld_base->op_actions[TGSI_OPCODE_TXD] = tex_action;
2930 bld_base->op_actions[TGSI_OPCODE_TXF] = tex_action;
2931 bld_base->op_actions[TGSI_OPCODE_TXL] = tex_action;
2932 bld_base->op_actions[TGSI_OPCODE_TXL2] = tex_action;
2933 bld_base->op_actions[TGSI_OPCODE_TXP] = tex_action;
2934 bld_base->op_actions[TGSI_OPCODE_TXQ] = txq_action;
2935 bld_base->op_actions[TGSI_OPCODE_TG4] = tex_action;
2936 bld_base->op_actions[TGSI_OPCODE_LODQ] = tex_action;
2937
2938 bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy;
2939 bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy;
2940
2941 bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_llvm_emit_vertex;
2942 bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive;
2943
2944 if (HAVE_LLVM >= 0x0306) {
2945 bld_base->op_actions[TGSI_OPCODE_MAX].emit = build_tgsi_intrinsic_nomem;
2946 bld_base->op_actions[TGSI_OPCODE_MAX].intr_name = "llvm.maxnum.f32";
2947 bld_base->op_actions[TGSI_OPCODE_MIN].emit = build_tgsi_intrinsic_nomem;
2948 bld_base->op_actions[TGSI_OPCODE_MIN].intr_name = "llvm.minnum.f32";
2949 }
2950
2951 si_shader_ctx.radeon_bld.load_system_value = declare_system_value;
2952 si_shader_ctx.shader = shader;
2953 si_shader_ctx.type = tgsi_get_processor_type(tokens);
2954 si_shader_ctx.screen = sscreen;
2955 si_shader_ctx.tm = tm;
2956
2957 switch (si_shader_ctx.type) {
2958 case TGSI_PROCESSOR_VERTEX:
2959 si_shader_ctx.radeon_bld.load_input = declare_input_vs;
2960 if (shader->key.vs.as_es) {
2961 bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
2962 } else {
2963 bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
2964 }
2965 break;
2966 case TGSI_PROCESSOR_GEOMETRY:
2967 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs;
2968 bld_base->emit_epilogue = si_llvm_emit_gs_epilogue;
2969 break;
2970 case TGSI_PROCESSOR_FRAGMENT:
2971 si_shader_ctx.radeon_bld.load_input = declare_input_fs;
2972 bld_base->emit_epilogue = si_llvm_emit_fs_epilogue;
2973
2974 switch (sel->info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT]) {
2975 case TGSI_FS_DEPTH_LAYOUT_GREATER:
2976 shader->db_shader_control |=
2977 S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_GREATER_THAN_Z);
2978 break;
2979 case TGSI_FS_DEPTH_LAYOUT_LESS:
2980 shader->db_shader_control |=
2981 S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_LESS_THAN_Z);
2982 break;
2983 }
2984 break;
2985 default:
2986 assert(!"Unsupported shader type");
2987 return -1;
2988 }
2989
2990 create_meta_data(&si_shader_ctx);
2991 create_function(&si_shader_ctx);
2992 preload_constants(&si_shader_ctx);
2993 preload_samplers(&si_shader_ctx);
2994 preload_streamout_buffers(&si_shader_ctx);
2995 preload_ring_buffers(&si_shader_ctx);
2996
2997 if (si_shader_ctx.type == TGSI_PROCESSOR_GEOMETRY) {
2998 si_shader_ctx.gs_next_vertex =
2999 lp_build_alloca(bld_base->base.gallivm,
3000 bld_base->uint_bld.elem_type, "");
3001 }
3002
3003 if (!lp_build_tgsi_llvm(bld_base, tokens)) {
3004 fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
3005 goto out;
3006 }
3007
3008 radeon_llvm_finalize_module(&si_shader_ctx.radeon_bld);
3009
3010 mod = bld_base->base.gallivm->module;
3011 r = si_compile_llvm(sscreen, shader, tm, mod);
3012 if (r) {
3013 fprintf(stderr, "LLVM failed to compile shader\n");
3014 goto out;
3015 }
3016
3017 radeon_llvm_dispose(&si_shader_ctx.radeon_bld);
3018
3019 if (si_shader_ctx.type == TGSI_PROCESSOR_GEOMETRY) {
3020 shader->gs_copy_shader = CALLOC_STRUCT(si_shader);
3021 shader->gs_copy_shader->selector = shader->selector;
3022 shader->gs_copy_shader->key = shader->key;
3023 si_shader_ctx.shader = shader->gs_copy_shader;
3024 if ((r = si_generate_gs_copy_shader(sscreen, &si_shader_ctx,
3025 shader, dump))) {
3026 FREE(shader->gs_copy_shader);
3027 shader->gs_copy_shader = NULL;
3028 goto out;
3029 }
3030 }
3031
3032 out:
3033 for (int i = 0; i < SI_NUM_CONST_BUFFERS; i++)
3034 FREE(si_shader_ctx.constants[i]);
3035 if (poly_stipple)
3036 tgsi_free_tokens(tokens);
3037 return r;
3038 }
3039
3040 void si_shader_destroy(struct pipe_context *ctx, struct si_shader *shader)
3041 {
3042 if (shader->gs_copy_shader)
3043 si_shader_destroy(ctx, shader->gs_copy_shader);
3044
3045 if (shader->scratch_bo)
3046 r600_resource_reference(&shader->scratch_bo, NULL);
3047
3048 r600_resource_reference(&shader->bo, NULL);
3049
3050 FREE(shader->binary.code);
3051 FREE(shader->binary.relocs);
3052 }