radeonsi: move VS shader code into si_shader_llvm_vs.c
/*
 * Copyright 2020 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "si_shader_internal.h"
#include "si_pipe.h"
#include "sid.h"
#include "util/u_memory.h"

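/* Extract the low (index 0) or high (index 1) signed 16-bit half of a
 * 32-bit value, sign-extended to 32 bits.
 */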
static LLVMValueRef unpack_sint16(struct si_shader_context *ctx,
                                  LLVMValueRef i32, unsigned index)
{
   assert(index <= 1);

   if (index == 1)
      return LLVMBuildAShr(ctx->ac.builder, i32,
                           LLVMConstInt(ctx->ac.i32, 16, 0), "");

   return LLVMBuildSExt(ctx->ac.builder,
                        LLVMBuildTrunc(ctx->ac.builder, i32,
                                       ctx->ac.i16, ""),
                        ctx->ac.i32, "");
}

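/* Load one vertex attribute into out[0..3]. Blit shaders get their inputs
 * from user SGPRs (positions as packed signed 16-bit pairs, colors and
 * texcoords as floats); everything else is fetched from the bound vertex
 * buffers.
 */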
static void load_input_vs(struct si_shader_context *ctx, unsigned input_index,
                          LLVMValueRef out[4])
{
   const struct si_shader_info *info = &ctx->shader->selector->info;
   unsigned vs_blit_property = info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD];

   if (vs_blit_property) {
      LLVMValueRef vertex_id = ctx->abi.vertex_id;
      LLVMValueRef sel_x1 = LLVMBuildICmp(ctx->ac.builder,
                                          LLVMIntULE, vertex_id,
                                          ctx->ac.i32_1, "");
      /* Use LLVMIntNE, because we have 3 vertices and only
       * the middle one should use y2.
       */
      LLVMValueRef sel_y1 = LLVMBuildICmp(ctx->ac.builder,
                                          LLVMIntNE, vertex_id,
                                          ctx->ac.i32_1, "");

      unsigned param_vs_blit_inputs = ctx->vs_blit_inputs.arg_index;
      if (input_index == 0) {
         /* Position: */
         LLVMValueRef x1y1 = LLVMGetParam(ctx->main_fn,
                                          param_vs_blit_inputs);
         LLVMValueRef x2y2 = LLVMGetParam(ctx->main_fn,
                                          param_vs_blit_inputs + 1);

         LLVMValueRef x1 = unpack_sint16(ctx, x1y1, 0);
         LLVMValueRef y1 = unpack_sint16(ctx, x1y1, 1);
         LLVMValueRef x2 = unpack_sint16(ctx, x2y2, 0);
         LLVMValueRef y2 = unpack_sint16(ctx, x2y2, 1);

         LLVMValueRef x = LLVMBuildSelect(ctx->ac.builder, sel_x1,
                                          x1, x2, "");
         LLVMValueRef y = LLVMBuildSelect(ctx->ac.builder, sel_y1,
                                          y1, y2, "");

         out[0] = LLVMBuildSIToFP(ctx->ac.builder, x, ctx->ac.f32, "");
         out[1] = LLVMBuildSIToFP(ctx->ac.builder, y, ctx->ac.f32, "");
         out[2] = LLVMGetParam(ctx->main_fn,
                               param_vs_blit_inputs + 2);
         out[3] = ctx->ac.f32_1;
         return;
      }

      /* Color or texture coordinates: */
      assert(input_index == 1);

      if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
         for (int i = 0; i < 4; i++) {
            out[i] = LLVMGetParam(ctx->main_fn,
                                  param_vs_blit_inputs + 3 + i);
         }
      } else {
         assert(vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD);
         LLVMValueRef x1 = LLVMGetParam(ctx->main_fn,
                                        param_vs_blit_inputs + 3);
         LLVMValueRef y1 = LLVMGetParam(ctx->main_fn,
                                        param_vs_blit_inputs + 4);
         LLVMValueRef x2 = LLVMGetParam(ctx->main_fn,
                                        param_vs_blit_inputs + 5);
         LLVMValueRef y2 = LLVMGetParam(ctx->main_fn,
                                        param_vs_blit_inputs + 6);

         out[0] = LLVMBuildSelect(ctx->ac.builder, sel_x1,
                                  x1, x2, "");
         out[1] = LLVMBuildSelect(ctx->ac.builder, sel_y1,
                                  y1, y2, "");
         out[2] = LLVMGetParam(ctx->main_fn,
                               param_vs_blit_inputs + 7);
         out[3] = LLVMGetParam(ctx->main_fn,
                               param_vs_blit_inputs + 8);
      }
      return;
   }

   unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs;
   union si_vs_fix_fetch fix_fetch;
   LLVMValueRef vb_desc;
   LLVMValueRef vertex_index;
   LLVMValueRef tmp;

   if (input_index < num_vbos_in_user_sgprs) {
      vb_desc = ac_get_arg(&ctx->ac, ctx->vb_descriptors[input_index]);
   } else {
      unsigned index = input_index - num_vbos_in_user_sgprs;
      vb_desc = ac_build_load_to_sgpr(&ctx->ac,
                                      ac_get_arg(&ctx->ac, ctx->vertex_buffers),
                                      LLVMConstInt(ctx->ac.i32, index, 0));
   }

   vertex_index = LLVMGetParam(ctx->main_fn,
                               ctx->vertex_index0.arg_index +
                               input_index);

   /* Use the open-coded implementation for all loads of doubles and
    * of dword-sized data that needs fixups. We need to insert conversion
    * code anyway, and the amd/common code does it for us.
    *
    * Note: On LLVM <= 8, we can only open-code formats with
    * channel size >= 4 bytes.
    */
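   /* log_size is log2 of the channel size in bytes: 2 selects all
    * dword-sized formats (which may need fixups), and 3 combined with
    * FLOAT selects doubles.
    */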
   bool opencode = ctx->shader->key.mono.vs_fetch_opencode & (1 << input_index);
   fix_fetch.bits = ctx->shader->key.mono.vs_fix_fetch[input_index].bits;
   if (opencode ||
       (fix_fetch.u.log_size == 3 && fix_fetch.u.format == AC_FETCH_FORMAT_FLOAT) ||
       (fix_fetch.u.log_size == 2)) {
      tmp = ac_build_opencoded_load_format(
            &ctx->ac, fix_fetch.u.log_size, fix_fetch.u.num_channels_m1 + 1,
            fix_fetch.u.format, fix_fetch.u.reverse, !opencode,
            vb_desc, vertex_index, ctx->ac.i32_0, ctx->ac.i32_0, 0, true);
      for (unsigned i = 0; i < 4; ++i)
         out[i] = LLVMBuildExtractElement(ctx->ac.builder, tmp,
                                          LLVMConstInt(ctx->ac.i32, i, false), "");
      return;
   }

   /* Do multiple loads for special formats. */
   unsigned required_channels = util_last_bit(info->input_usage_mask[input_index]);
   LLVMValueRef fetches[4];
   unsigned num_fetches;
   unsigned fetch_stride;
   unsigned channels_per_fetch;

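   /* 3-channel formats with 8- or 16-bit channels have no native buffer
    * format, so fetch the channels one by one with a per-channel stride.
    */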
   if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2) {
      num_fetches = MIN2(required_channels, 3);
      fetch_stride = 1 << fix_fetch.u.log_size;
      channels_per_fetch = 1;
   } else {
      num_fetches = 1;
      fetch_stride = 0;
      channels_per_fetch = required_channels;
   }

   for (unsigned i = 0; i < num_fetches; ++i) {
      LLVMValueRef voffset = LLVMConstInt(ctx->ac.i32, fetch_stride * i, 0);
      fetches[i] = ac_build_buffer_load_format(&ctx->ac, vb_desc, vertex_index, voffset,
                                               channels_per_fetch, 0, true);
   }

   if (num_fetches == 1 && channels_per_fetch > 1) {
      LLVMValueRef fetch = fetches[0];
      for (unsigned i = 0; i < channels_per_fetch; ++i) {
         tmp = LLVMConstInt(ctx->ac.i32, i, false);
         fetches[i] = LLVMBuildExtractElement(
            ctx->ac.builder, fetch, tmp, "");
      }
      num_fetches = channels_per_fetch;
      channels_per_fetch = 1;
   }

   for (unsigned i = num_fetches; i < 4; ++i)
      fetches[i] = LLVMGetUndef(ctx->ac.f32);

   if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2 &&
       required_channels == 4) {
      if (fix_fetch.u.format == AC_FETCH_FORMAT_UINT || fix_fetch.u.format == AC_FETCH_FORMAT_SINT)
         fetches[3] = ctx->ac.i32_1;
      else
         fetches[3] = ctx->ac.f32_1;
   } else if (fix_fetch.u.log_size == 3 &&
              (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ||
               fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED ||
               fix_fetch.u.format == AC_FETCH_FORMAT_SINT) &&
              required_channels == 4) {
      /* For 2_10_10_10, the hardware returns an unsigned value;
       * convert it to a signed one.
       */
      LLVMValueRef tmp = fetches[3];
      LLVMValueRef c30 = LLVMConstInt(ctx->ac.i32, 30, 0);

      /* First, recover the sign-extended signed integer value. */
      if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED)
         tmp = LLVMBuildFPToUI(ctx->ac.builder, tmp, ctx->ac.i32, "");
      else
         tmp = ac_to_integer(&ctx->ac, tmp);

      /* For the integer-like cases, do a natural sign extension.
       *
       * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
       * and happen to contain 0, 1, 2, 3 as the two LSBs of the
       * exponent.
       */
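      /* Worked example for the SNORM trick: 2/3 is 0x3f2aaaab with biased
       * exponent 126 = 0b1111110, so its two exponent LSBs (bits 23-24)
       * are 0b10 = 2. The shl by 7 moves them to bits 30-31, and the ashr
       * by 30 below recovers the 2-bit signed value.
       */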
      tmp = LLVMBuildShl(ctx->ac.builder, tmp,
                         fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ?
                            LLVMConstInt(ctx->ac.i32, 7, 0) : c30, "");
      tmp = LLVMBuildAShr(ctx->ac.builder, tmp, c30, "");

      /* Convert back to the right type. */
      if (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM) {
         LLVMValueRef clamp;
         LLVMValueRef neg_one = LLVMConstReal(ctx->ac.f32, -1.0);
         tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->ac.f32, "");
         clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, tmp, neg_one, "");
         tmp = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, tmp, "");
      } else if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED) {
         tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->ac.f32, "");
      }

      fetches[3] = tmp;
   }

   for (unsigned i = 0; i < 4; ++i)
      out[i] = ac_to_float(&ctx->ac, fetches[i]);
}

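/* Load all 4 components of one attribute and cache them as i32 in
 * ctx->inputs, where the NIR input load intrinsics pick them up.
 */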
static void declare_input_vs(struct si_shader_context *ctx, unsigned input_index)
{
   LLVMValueRef input[4];

   load_input_vs(ctx, input_index / 4, input);

   for (unsigned chan = 0; chan < 4; chan++) {
      ctx->inputs[input_index + chan] =
         LLVMBuildBitCast(ctx->ac.builder, input[chan], ctx->ac.i32, "");
   }
}

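/* Walk all input variables declared by the NIR shader and load them,
 * loading dual-slot (64-bit) attributes twice and counting each location
 * only once.
 */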
void si_llvm_load_vs_inputs(struct si_shader_context *ctx, struct nir_shader *nir)
{
   uint64_t processed_inputs = 0;

   nir_foreach_variable(variable, &nir->inputs) {
      unsigned attrib_count = glsl_count_attribute_slots(variable->type,
                                                         true);
      unsigned input_idx = variable->data.driver_location;
      unsigned loc = variable->data.location;

      for (unsigned i = 0; i < attrib_count; i++) {
         /* Packed components share the same location so skip
          * them if we have already processed the location.
          */
         if (processed_inputs & ((uint64_t)1 << (loc + i))) {
            input_idx += 4;
            continue;
         }

         declare_input_vs(ctx, input_idx);
         if (glsl_type_is_dual_slot(variable->type)) {
            input_idx += 4;
            declare_input_vs(ctx, input_idx);
         }

         processed_inputs |= ((uint64_t)1 << (loc + i));
         input_idx += 4;
      }
   }
}

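/* Store the components of one output selected by a pipe_stream_output into
 * its streamout buffer, packing 1-4 dwords into a single typed store.
 */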
void si_llvm_streamout_store_output(struct si_shader_context *ctx,
                                    LLVMValueRef const *so_buffers,
                                    LLVMValueRef const *so_write_offsets,
                                    struct pipe_stream_output *stream_out,
                                    struct si_shader_output_values *shader_out)
{
   unsigned buf_idx = stream_out->output_buffer;
   unsigned start = stream_out->start_component;
   unsigned num_comps = stream_out->num_components;
   LLVMValueRef out[4];

   assert(num_comps && num_comps <= 4);
   if (!num_comps || num_comps > 4)
      return;

   /* Load the output as int. */
   for (int j = 0; j < num_comps; j++) {
      assert(stream_out->stream == shader_out->vertex_stream[start + j]);

      out[j] = ac_to_integer(&ctx->ac, shader_out->values[start + j]);
   }

   /* Pack the output. */
   LLVMValueRef vdata = NULL;

   switch (num_comps) {
   case 1: /* as i32 */
      vdata = out[0];
      break;
   case 2: /* as v2i32 */
   case 3: /* as v3i32 */
      if (ac_has_vec3_support(ctx->screen->info.chip_class, false)) {
         vdata = ac_build_gather_values(&ctx->ac, out, num_comps);
         break;
      }
      /* as v4i32 (aligned to 4) */
      out[3] = LLVMGetUndef(ctx->ac.i32);
      /* fall through */
   case 4: /* as v4i32 */
      vdata = ac_build_gather_values(&ctx->ac, out, util_next_power_of_two(num_comps));
      break;
   }

   ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx],
                               vdata, num_comps,
                               so_write_offsets[buf_idx],
                               ctx->ac.i32_0,
                               stream_out->dst_offset * 4, ac_glc | ac_slc);
}

/**
 * Write streamout data to buffers for vertex stream @p stream (different
 * vertex streams can occur for GS copy shaders).
 */
void si_llvm_emit_streamout(struct si_shader_context *ctx,
                            struct si_shader_output_values *outputs,
                            unsigned noutput, unsigned stream)
{
   struct si_shader_selector *sel = ctx->shader->selector;
   struct pipe_stream_output_info *so = &sel->so;
   LLVMBuilderRef builder = ctx->ac.builder;
   int i;

   /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
   LLVMValueRef so_vtx_count =
      si_unpack_param(ctx, ctx->streamout_config, 16, 7);

   LLVMValueRef tid = ac_get_thread_id(&ctx->ac);

   /* can_emit = tid < so_vtx_count; */
   LLVMValueRef can_emit =
      LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");

   /* Emit the streamout code conditionally. This actually avoids
    * out-of-bounds buffer access. The hw tells us via the SGPR
    * (so_vtx_count) which threads are allowed to emit streamout data.
    */
   ac_build_ifcc(&ctx->ac, can_emit, 6501);
   {
      /* The buffer offset is computed as follows:
       *   ByteOffset = streamout_offset[buffer_id]*4 +
       *                (streamout_write_index + thread_id)*stride[buffer_id] +
       *                attrib_offset
       */

      LLVMValueRef so_write_index =
         ac_get_arg(&ctx->ac, ctx->streamout_write_index);

      /* Compute (streamout_write_index + thread_id). */
      so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");

      /* Load the descriptor and compute the write offset for each
       * enabled buffer.
       */
      LLVMValueRef so_write_offset[4] = {};
      LLVMValueRef so_buffers[4];
      LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);

      for (i = 0; i < 4; i++) {
         if (!so->stride[i])
            continue;

         LLVMValueRef offset = LLVMConstInt(ctx->ac.i32,
                                            SI_VS_STREAMOUT_BUF0 + i, 0);

         so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);

         LLVMValueRef so_offset = ac_get_arg(&ctx->ac,
                                             ctx->streamout_offset[i]);
         so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->ac.i32, 4, 0), "");

         so_write_offset[i] = ac_build_imad(&ctx->ac, so_write_index,
                                            LLVMConstInt(ctx->ac.i32, so->stride[i] * 4, 0),
                                            so_offset);
      }

      /* Write streamout data. */
      for (i = 0; i < so->num_outputs; i++) {
         unsigned reg = so->output[i].register_index;

         if (reg >= noutput)
            continue;

         if (stream != so->output[i].stream)
            continue;

         si_llvm_streamout_store_output(ctx, so_buffers, so_write_offset,
                                        &so->output[i], &outputs[reg]);
      }
   }
   ac_build_endif(&ctx->ac, 6501);
}

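/* Compute clip distances for the user clip planes as dot products of the
 * clip vertex with each plane from the SI_VS_CONST_CLIP_PLANES constant
 * buffer, and fill position exports 2 and 3 with the results.
 */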
static void si_llvm_emit_clipvertex(struct si_shader_context *ctx,
                                    struct ac_export_args *pos, LLVMValueRef *out_elts)
{
   unsigned reg_index;
   unsigned chan;
   unsigned const_chan;
   LLVMValueRef base_elt;
   LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
   LLVMValueRef constbuf_index = LLVMConstInt(ctx->ac.i32,
                                              SI_VS_CONST_CLIP_PLANES, 0);
   LLVMValueRef const_resource = ac_build_load_to_sgpr(&ctx->ac, ptr, constbuf_index);

   for (reg_index = 0; reg_index < 2; reg_index++) {
      struct ac_export_args *args = &pos[2 + reg_index];

      args->out[0] =
      args->out[1] =
      args->out[2] =
      args->out[3] = LLVMConstReal(ctx->ac.f32, 0.0f);

      /* Compute dot products of position and user clip plane vectors */
      for (chan = 0; chan < 4; chan++) {
         for (const_chan = 0; const_chan < 4; const_chan++) {
            LLVMValueRef addr =
               LLVMConstInt(ctx->ac.i32, ((reg_index * 4 + chan) * 4 +
                                          const_chan) * 4, 0);
            base_elt = si_buffer_load_const(ctx, const_resource, addr);
            args->out[chan] = ac_build_fmad(&ctx->ac, base_elt,
                                            out_elts[const_chan], args->out[chan]);
         }
      }

      args->enabled_channels = 0xf;
      args->valid_mask = 0;
      args->done = 0;
      args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index;
      args->compr = 0;
   }
}

/* Initialize arguments for the shader export intrinsic */
static void si_llvm_init_vs_export_args(struct si_shader_context *ctx,
                                        LLVMValueRef *values,
                                        unsigned target,
                                        struct ac_export_args *args)
{
   args->enabled_channels = 0xf; /* writemask - default is 0xf */
   args->valid_mask = 0; /* Specify whether the EXEC mask represents the valid mask */
   args->done = 0; /* Specify whether this is the last export */
   args->target = target; /* Specify the target we are exporting */
   args->compr = false;

   memcpy(&args->out[0], values, sizeof(values[0]) * 4);
}

static void si_export_param(struct si_shader_context *ctx, unsigned index,
                            LLVMValueRef *values)
{
   struct ac_export_args args;

   si_llvm_init_vs_export_args(ctx, values,
                               V_008DFC_SQ_EXP_PARAM + index, &args);
   ac_build_export(&ctx->ac, &args);
}

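/* Export all parameter (non-position) outputs consumed by the fragment
 * shader, skipping those only written to non-zero vertex streams or killed
 * by the shader key, and record each one's export slot.
 */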
static void si_build_param_exports(struct si_shader_context *ctx,
                                   struct si_shader_output_values *outputs,
                                   unsigned noutput)
{
   struct si_shader *shader = ctx->shader;
   unsigned param_count = 0;

   for (unsigned i = 0; i < noutput; i++) {
      unsigned semantic_name = outputs[i].semantic_name;
      unsigned semantic_index = outputs[i].semantic_index;

      if (outputs[i].vertex_stream[0] != 0 &&
          outputs[i].vertex_stream[1] != 0 &&
          outputs[i].vertex_stream[2] != 0 &&
          outputs[i].vertex_stream[3] != 0)
         continue;

      switch (semantic_name) {
      case TGSI_SEMANTIC_LAYER:
      case TGSI_SEMANTIC_VIEWPORT_INDEX:
      case TGSI_SEMANTIC_CLIPDIST:
      case TGSI_SEMANTIC_COLOR:
      case TGSI_SEMANTIC_BCOLOR:
      case TGSI_SEMANTIC_PRIMID:
      case TGSI_SEMANTIC_FOG:
      case TGSI_SEMANTIC_TEXCOORD:
      case TGSI_SEMANTIC_GENERIC:
         break;
      default:
         continue;
      }

      if ((semantic_name != TGSI_SEMANTIC_GENERIC ||
           semantic_index < SI_MAX_IO_GENERIC) &&
          shader->key.opt.kill_outputs &
          (1ull << si_shader_io_get_unique_index(semantic_name,
                                                 semantic_index, true)))
         continue;

      si_export_param(ctx, param_count, outputs[i].values);

      assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
      shader->info.vs_output_param_offset[i] = param_count++;
   }

   shader->info.nr_param_exports = param_count;
}

/**
 * Vertex color clamping.
 *
 * This uses a state constant loaded from a user data SGPR; an IF statement
 * is added that clamps all colors if the constant is true.
 */
static void si_vertex_color_clamping(struct si_shader_context *ctx,
                                     struct si_shader_output_values *outputs,
                                     unsigned noutput)
{
   LLVMValueRef addr[SI_MAX_VS_OUTPUTS][4];
   bool has_colors = false;

   /* Store original colors to alloca variables. */
   for (unsigned i = 0; i < noutput; i++) {
      if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR &&
          outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR)
         continue;

      for (unsigned j = 0; j < 4; j++) {
         addr[i][j] = ac_build_alloca_undef(&ctx->ac, ctx->ac.f32, "");
         LLVMBuildStore(ctx->ac.builder, outputs[i].values[j], addr[i][j]);
      }
      has_colors = true;
   }

   if (!has_colors)
      return;

   /* The state is in the first bit of the user SGPR. */
   LLVMValueRef cond = ac_get_arg(&ctx->ac, ctx->vs_state_bits);
   cond = LLVMBuildTrunc(ctx->ac.builder, cond, ctx->ac.i1, "");

   ac_build_ifcc(&ctx->ac, cond, 6502);

   /* Store clamped colors to alloca variables within the conditional block. */
   for (unsigned i = 0; i < noutput; i++) {
      if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR &&
          outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR)
         continue;

      for (unsigned j = 0; j < 4; j++) {
         LLVMBuildStore(ctx->ac.builder,
                        ac_build_clamp(&ctx->ac, outputs[i].values[j]),
                        addr[i][j]);
      }
   }
   ac_build_endif(&ctx->ac, 6502);

   /* Load clamped colors */
   for (unsigned i = 0; i < noutput; i++) {
      if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR &&
          outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR)
         continue;

      for (unsigned j = 0; j < 4; j++) {
         outputs[i].values[j] =
            LLVMBuildLoad(ctx->ac.builder, addr[i][j], "");
      }
   }
}

/* Generate export instructions for hardware VS shader stage or NGG GS stage
 * (position and parameter data only).
 */
void si_llvm_build_vs_exports(struct si_shader_context *ctx,
                              struct si_shader_output_values *outputs,
                              unsigned noutput)
{
   struct si_shader *shader = ctx->shader;
   struct ac_export_args pos_args[4] = {};
   LLVMValueRef psize_value = NULL, edgeflag_value = NULL,
                layer_value = NULL, viewport_index_value = NULL;
   unsigned pos_idx;
   int i;

   si_vertex_color_clamping(ctx, outputs, noutput);

   /* Build position exports. */
   for (i = 0; i < noutput; i++) {
      switch (outputs[i].semantic_name) {
      case TGSI_SEMANTIC_POSITION:
         si_llvm_init_vs_export_args(ctx, outputs[i].values,
                                     V_008DFC_SQ_EXP_POS, &pos_args[0]);
         break;
      case TGSI_SEMANTIC_PSIZE:
         psize_value = outputs[i].values[0];
         break;
      case TGSI_SEMANTIC_LAYER:
         layer_value = outputs[i].values[0];
         break;
      case TGSI_SEMANTIC_VIEWPORT_INDEX:
         viewport_index_value = outputs[i].values[0];
         break;
      case TGSI_SEMANTIC_EDGEFLAG:
         edgeflag_value = outputs[i].values[0];
         break;
      case TGSI_SEMANTIC_CLIPDIST:
         if (!shader->key.opt.clip_disable) {
            unsigned index = 2 + outputs[i].semantic_index;
            si_llvm_init_vs_export_args(ctx, outputs[i].values,
                                        V_008DFC_SQ_EXP_POS + index,
                                        &pos_args[index]);
         }
         break;
      case TGSI_SEMANTIC_CLIPVERTEX:
         if (!shader->key.opt.clip_disable) {
            si_llvm_emit_clipvertex(ctx, pos_args, outputs[i].values);
         }
         break;
      }
   }

   /* We need to add the position output manually if it's missing. */
   if (!pos_args[0].out[0]) {
      pos_args[0].enabled_channels = 0xf; /* writemask */
      pos_args[0].valid_mask = 0; /* EXEC mask */
      pos_args[0].done = 0; /* last export? */
      pos_args[0].target = V_008DFC_SQ_EXP_POS;
      pos_args[0].compr = 0; /* COMPR flag */
      pos_args[0].out[0] = ctx->ac.f32_0; /* X */
      pos_args[0].out[1] = ctx->ac.f32_0; /* Y */
      pos_args[0].out[2] = ctx->ac.f32_0; /* Z */
      pos_args[0].out[3] = ctx->ac.f32_1; /* W */
   }

   bool pos_writes_edgeflag = shader->selector->info.writes_edgeflag &&
                              !shader->key.as_ngg;

   /* Write the misc vector (point size, edgeflag, layer, viewport). */
   if (shader->selector->info.writes_psize ||
       pos_writes_edgeflag ||
       shader->selector->info.writes_viewport_index ||
       shader->selector->info.writes_layer) {
      pos_args[1].enabled_channels = shader->selector->info.writes_psize |
                                     (pos_writes_edgeflag << 1) |
                                     (shader->selector->info.writes_layer << 2);

      pos_args[1].valid_mask = 0; /* EXEC mask */
      pos_args[1].done = 0; /* last export? */
      pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
      pos_args[1].compr = 0; /* COMPR flag */
      pos_args[1].out[0] = ctx->ac.f32_0; /* X */
      pos_args[1].out[1] = ctx->ac.f32_0; /* Y */
      pos_args[1].out[2] = ctx->ac.f32_0; /* Z */
      pos_args[1].out[3] = ctx->ac.f32_0; /* W */

      if (shader->selector->info.writes_psize)
         pos_args[1].out[0] = psize_value;

      if (pos_writes_edgeflag) {
         /* The output is a float, but the hw expects an integer
          * with the first bit containing the edge flag.
          */
         edgeflag_value = LLVMBuildFPToUI(ctx->ac.builder,
                                          edgeflag_value,
                                          ctx->ac.i32, "");
         edgeflag_value = ac_build_umin(&ctx->ac,
                                        edgeflag_value,
                                        ctx->ac.i32_1);

         /* The LLVM intrinsic expects a float. */
         pos_args[1].out[1] = ac_to_float(&ctx->ac, edgeflag_value);
      }

      if (ctx->screen->info.chip_class >= GFX9) {
         /* GFX9 has the layer in out.z[10:0] and the viewport
          * index in out.z[19:16].
          */
         if (shader->selector->info.writes_layer)
            pos_args[1].out[2] = layer_value;

         if (shader->selector->info.writes_viewport_index) {
            LLVMValueRef v = viewport_index_value;

            v = ac_to_integer(&ctx->ac, v);
            v = LLVMBuildShl(ctx->ac.builder, v,
                             LLVMConstInt(ctx->ac.i32, 16, 0), "");
            v = LLVMBuildOr(ctx->ac.builder, v,
                            ac_to_integer(&ctx->ac, pos_args[1].out[2]), "");
            pos_args[1].out[2] = ac_to_float(&ctx->ac, v);
            pos_args[1].enabled_channels |= 1 << 2;
         }
      } else {
         if (shader->selector->info.writes_layer)
            pos_args[1].out[2] = layer_value;

         if (shader->selector->info.writes_viewport_index) {
            pos_args[1].out[3] = viewport_index_value;
            pos_args[1].enabled_channels |= 1 << 3;
         }
      }
   }

   for (i = 0; i < 4; i++)
      if (pos_args[i].out[0])
         shader->info.nr_pos_exports++;

   /* Navi10-14 skip POS0 exports if EXEC=0 and DONE=0, causing a hang.
    * Setting valid_mask=1 prevents it and has no other effect.
    */
   if (ctx->screen->info.family == CHIP_NAVI10 ||
       ctx->screen->info.family == CHIP_NAVI12 ||
       ctx->screen->info.family == CHIP_NAVI14)
      pos_args[0].valid_mask = 1;

   pos_idx = 0;
   for (i = 0; i < 4; i++) {
      if (!pos_args[i].out[0])
         continue;

      /* Specify the target we are exporting */
      pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;

      if (pos_idx == shader->info.nr_pos_exports)
         /* Specify that this is the last export */
         pos_args[i].done = 1;

      ac_build_export(&ctx->ac, &pos_args[i]);
   }

   /* Build parameter exports. */
   si_build_param_exports(ctx, outputs, noutput);
}

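/* Epilogue of the hardware vertex shader: read back the outputs that were
 * stored to allocas, run streamout if enabled, optionally append the
 * exported PrimitiveID, and emit all position/parameter exports.
 */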
void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs,
                              LLVMValueRef *addrs)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   struct si_shader_info *info = &ctx->shader->selector->info;
   struct si_shader_output_values *outputs = NULL;
   int i, j;

   assert(!ctx->shader->is_gs_copy_shader);
   assert(info->num_outputs <= max_outputs);

   outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));

   for (i = 0; i < info->num_outputs; i++) {
      outputs[i].semantic_name = info->output_semantic_name[i];
      outputs[i].semantic_index = info->output_semantic_index[i];

      for (j = 0; j < 4; j++) {
         outputs[i].values[j] =
            LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + j], "");
         outputs[i].vertex_stream[j] =
            (info->output_streams[i] >> (2 * j)) & 3;
      }
   }

   if (!ctx->screen->use_ngg_streamout &&
       ctx->shader->selector->so.num_outputs)
      si_llvm_emit_streamout(ctx, outputs, i, 0);

   /* Export PrimitiveID. */
   if (ctx->shader->key.mono.u.vs_export_prim_id) {
      outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID;
      outputs[i].semantic_index = 0;
      outputs[i].values[0] = ac_to_float(&ctx->ac, si_get_primitive_id(ctx, 0));
      for (j = 1; j < 4; j++)
         outputs[i].values[j] = LLVMConstReal(ctx->ac.f32, 0);

      memset(outputs[i].vertex_stream, 0,
             sizeof(outputs[i].vertex_stream));
      i++;
   }

   si_llvm_build_vs_exports(ctx, outputs, i);
   FREE(outputs);
}

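/* Epilogue used when the VS is compiled as a primitive discard compute
 * shader: only the position output matters, and it is returned in the
 * function's return value instead of being exported.
 */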
static void si_llvm_emit_prim_discard_cs_epilogue(struct ac_shader_abi *abi,
                                                  unsigned max_outputs,
                                                  LLVMValueRef *addrs)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   struct si_shader_info *info = &ctx->shader->selector->info;
   LLVMValueRef pos[4] = {};

   assert(info->num_outputs <= max_outputs);

   for (unsigned i = 0; i < info->num_outputs; i++) {
      if (info->output_semantic_name[i] != TGSI_SEMANTIC_POSITION)
         continue;

      for (unsigned chan = 0; chan < 4; chan++)
         pos[chan] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
      break;
   }
   assert(pos[0] != NULL);

   /* Return the position output. */
   LLVMValueRef ret = ctx->return_value;
   for (unsigned chan = 0; chan < 4; chan++)
      ret = LLVMBuildInsertValue(ctx->ac.builder, ret, pos[chan], chan, "");
   ctx->return_value = ret;
}

/**
 * Build the vertex shader prolog function.
 *
 * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
 * All inputs are returned unmodified. The vertex load indices are
 * stored after them, which will be used by the API VS for fetching inputs.
 *
 * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
 *   input_v0,
 *   input_v1,
 *   input_v2,
 *   input_v3,
 *   (VertexID + BaseVertex),
 *   (InstanceID + StartInstance),
 *   (InstanceID / 2 + StartInstance)
 */
void si_llvm_build_vs_prolog(struct si_shader_context *ctx,
                             union si_shader_part_key *key)
{
   LLVMTypeRef *returns;
   LLVMValueRef ret, func;
   int num_returns, i;
   unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs;
   unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4;
   struct ac_arg input_sgpr_param[key->vs_prolog.num_input_sgprs];
   struct ac_arg input_vgpr_param[9];
   LLVMValueRef input_vgprs[9];
   unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs +
                                 num_input_vgprs;
   unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;

   memset(&ctx->args, 0, sizeof(ctx->args));

   /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
   returns = alloca((num_all_input_regs + key->vs_prolog.num_inputs) *
                    sizeof(LLVMTypeRef));
   num_returns = 0;

   /* Declare input and output SGPRs. */
   for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
                 &input_sgpr_param[i]);
      returns[num_returns++] = ctx->ac.i32;
   }

   struct ac_arg merged_wave_info = input_sgpr_param[3];

   /* Preloaded VGPRs (outputs must be floats) */
   for (i = 0; i < num_input_vgprs; i++) {
      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &input_vgpr_param[i]);
      returns[num_returns++] = ctx->ac.f32;
   }

   /* Vertex load indices. */
   for (i = 0; i < key->vs_prolog.num_inputs; i++)
      returns[num_returns++] = ctx->ac.f32;

   /* Create the function. */
   si_llvm_create_func(ctx, "vs_prolog", returns, num_returns, 0);
   func = ctx->main_fn;

   for (i = 0; i < num_input_vgprs; i++) {
      input_vgprs[i] = ac_get_arg(&ctx->ac, input_vgpr_param[i]);
   }

   if (key->vs_prolog.num_merged_next_stage_vgprs) {
      if (!key->vs_prolog.is_monolithic)
         si_init_exec_from_input(ctx, merged_wave_info, 0);

      if (key->vs_prolog.as_ls &&
          ctx->screen->info.has_ls_vgpr_init_bug) {
         /* If there are no HS threads, SPI loads the LS VGPRs
          * starting at VGPR 0. Shift them back to where they
          * belong.
          */
         LLVMValueRef has_hs_threads =
            LLVMBuildICmp(ctx->ac.builder, LLVMIntNE,
                          si_unpack_param(ctx, input_sgpr_param[3], 8, 8),
                          ctx->ac.i32_0, "");

         for (i = 4; i > 0; --i) {
            input_vgprs[i + 1] =
               LLVMBuildSelect(ctx->ac.builder, has_hs_threads,
                               input_vgprs[i + 1],
                               input_vgprs[i - 1], "");
         }
      }
   }

   if (key->vs_prolog.gs_fast_launch_tri_list ||
       key->vs_prolog.gs_fast_launch_tri_strip) {
      LLVMValueRef wave_id, thread_id_in_tg;

      wave_id = si_unpack_param(ctx, input_sgpr_param[3], 24, 4);
      thread_id_in_tg = ac_build_imad(&ctx->ac, wave_id,
                                      LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false),
                                      ac_get_thread_id(&ctx->ac));

      /* The GS fast launch initializes all VGPRs to the value of
       * the first thread, so we have to add the thread ID.
       *
       * Only these are initialized by the hw:
       *   VGPR2: Base Primitive ID
       *   VGPR5: Base Vertex ID
       *   VGPR6: Instance ID
       */

      /* Put the vertex thread IDs into VGPRs as-is instead of packing them.
       * The NGG cull shader will read them from there.
       */
      if (key->vs_prolog.gs_fast_launch_tri_list) {
         input_vgprs[0] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx01_offset */
                                        LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 0 */
                                        LLVMConstInt(ctx->ac.i32, 0, 0));
         input_vgprs[1] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx23_offset */
                                        LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 1 */
                                        LLVMConstInt(ctx->ac.i32, 1, 0));
         input_vgprs[4] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx45_offset */
                                        LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 2 */
                                        LLVMConstInt(ctx->ac.i32, 2, 0));
      } else {
         assert(key->vs_prolog.gs_fast_launch_tri_strip);
         LLVMBuilderRef builder = ctx->ac.builder;
         /* Triangle indices: */
         LLVMValueRef index[3] = {
            thread_id_in_tg,
            LLVMBuildAdd(builder, thread_id_in_tg,
                         LLVMConstInt(ctx->ac.i32, 1, 0), ""),
            LLVMBuildAdd(builder, thread_id_in_tg,
                         LLVMConstInt(ctx->ac.i32, 2, 0), ""),
         };
         LLVMValueRef is_odd = LLVMBuildTrunc(ctx->ac.builder,
                                              thread_id_in_tg, ctx->ac.i1, "");
         LLVMValueRef flatshade_first =
            LLVMBuildICmp(builder, LLVMIntEQ,
                          si_unpack_param(ctx, ctx->vs_state_bits, 4, 2),
                          ctx->ac.i32_0, "");

         ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd,
                                                     flatshade_first, index);
         input_vgprs[0] = index[0];
         input_vgprs[1] = index[1];
         input_vgprs[4] = index[2];
      }

      /* Triangles always have all edge flags set initially. */
      input_vgprs[3] = LLVMConstInt(ctx->ac.i32, 0x7 << 8, 0);

      input_vgprs[2] = LLVMBuildAdd(ctx->ac.builder, input_vgprs[2],
                                    thread_id_in_tg, ""); /* PrimID */
      input_vgprs[5] = LLVMBuildAdd(ctx->ac.builder, input_vgprs[5],
                                    thread_id_in_tg, ""); /* VertexID */
      input_vgprs[8] = input_vgprs[6]; /* InstanceID */
   }

   unsigned vertex_id_vgpr = first_vs_vgpr;
   unsigned instance_id_vgpr =
      ctx->screen->info.chip_class >= GFX10 ?
         first_vs_vgpr + 3 :
         first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1);

   ctx->abi.vertex_id = input_vgprs[vertex_id_vgpr];
   ctx->abi.instance_id = input_vgprs[instance_id_vgpr];

   /* InstanceID = VertexID >> 16;
    * VertexID = VertexID & 0xffff;
    */
   if (key->vs_prolog.states.unpack_instance_id_from_vertex_id) {
      ctx->abi.instance_id = LLVMBuildLShr(ctx->ac.builder, ctx->abi.vertex_id,
                                           LLVMConstInt(ctx->ac.i32, 16, 0), "");
      ctx->abi.vertex_id = LLVMBuildAnd(ctx->ac.builder, ctx->abi.vertex_id,
                                        LLVMConstInt(ctx->ac.i32, 0xffff, 0), "");
   }

   /* Copy inputs to outputs. This should be a no-op, as the registers match,
    * but it will prevent the compiler from overwriting them unintentionally.
    */
   ret = ctx->return_value;
   for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
      LLVMValueRef p = LLVMGetParam(func, i);
      ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, "");
   }
   for (i = 0; i < num_input_vgprs; i++) {
      LLVMValueRef p = input_vgprs[i];

      if (i == vertex_id_vgpr)
         p = ctx->abi.vertex_id;
      else if (i == instance_id_vgpr)
         p = ctx->abi.instance_id;

      p = ac_to_float(&ctx->ac, p);
      ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p,
                                 key->vs_prolog.num_input_sgprs + i, "");
   }

   /* Compute vertex load indices from instance divisors. */
   LLVMValueRef instance_divisor_constbuf = NULL;

   if (key->vs_prolog.states.instance_divisor_is_fetched) {
      LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
      LLVMValueRef buf_index =
         LLVMConstInt(ctx->ac.i32, SI_VS_CONST_INSTANCE_DIVISORS, 0);
      instance_divisor_constbuf =
         ac_build_load_to_sgpr(&ctx->ac, list, buf_index);
   }

   for (i = 0; i < key->vs_prolog.num_inputs; i++) {
      bool divisor_is_one =
         key->vs_prolog.states.instance_divisor_is_one & (1u << i);
      bool divisor_is_fetched =
         key->vs_prolog.states.instance_divisor_is_fetched & (1u << i);
      LLVMValueRef index = NULL;

      if (divisor_is_one) {
         index = ctx->abi.instance_id;
      } else if (divisor_is_fetched) {
         LLVMValueRef udiv_factors[4];

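         /* Each divisor is stored as four precomputed dwords: the
          * fast-division factors consumed by ac_build_fast_udiv_nuw below.
          */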
         for (unsigned j = 0; j < 4; j++) {
            udiv_factors[j] =
               si_buffer_load_const(ctx, instance_divisor_constbuf,
                                    LLVMConstInt(ctx->ac.i32, i * 16 + j * 4, 0));
            udiv_factors[j] = ac_to_integer(&ctx->ac, udiv_factors[j]);
         }
         /* The faster NUW version doesn't work when InstanceID == UINT_MAX.
          * Such InstanceID might not be achievable in a reasonable time though.
          */
         index = ac_build_fast_udiv_nuw(&ctx->ac, ctx->abi.instance_id,
                                        udiv_factors[0], udiv_factors[1],
                                        udiv_factors[2], udiv_factors[3]);
      }

      if (divisor_is_one || divisor_is_fetched) {
         /* Add StartInstance. */
         index = LLVMBuildAdd(ctx->ac.builder, index,
                              LLVMGetParam(ctx->main_fn, user_sgpr_base +
                                           SI_SGPR_START_INSTANCE), "");
      } else {
         /* VertexID + BaseVertex */
         index = LLVMBuildAdd(ctx->ac.builder,
                              ctx->abi.vertex_id,
                              LLVMGetParam(func, user_sgpr_base +
                                           SI_SGPR_BASE_VERTEX), "");
      }

      index = ac_to_float(&ctx->ac, index);
      ret = LLVMBuildInsertValue(ctx->ac.builder, ret, index,
                                 ctx->args.arg_count + i, "");
   }

   si_llvm_build_ret(ctx, ret);
}

static LLVMValueRef get_base_vertex(struct ac_shader_abi *abi)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);

   /* For non-indexed draws, the base vertex set by the driver
    * (for direct draws) or the CP (for indirect draws) is the
    * first vertex ID, but GLSL expects 0 to be returned.
    */
   LLVMValueRef vs_state = ac_get_arg(&ctx->ac, ctx->vs_state_bits);
   LLVMValueRef indexed;

   indexed = LLVMBuildLShr(ctx->ac.builder, vs_state, ctx->ac.i32_1, "");
   indexed = LLVMBuildTrunc(ctx->ac.builder, indexed, ctx->ac.i1, "");

   return LLVMBuildSelect(ctx->ac.builder, indexed,
                          ac_get_arg(&ctx->ac, ctx->args.base_vertex),
                          ctx->ac.i32_0, "");
}

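/* Pick the emit_outputs epilogue matching how this VS is being compiled:
 * LS (before tessellation), ES (before GS), primitive discard CS, NGG
 * culling shader, NGG, or legacy hardware VS.
 */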
void si_llvm_init_vs_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader)
{
   struct si_shader *shader = ctx->shader;

   if (shader->key.as_ls)
      ctx->abi.emit_outputs = si_llvm_emit_ls_epilogue;
   else if (shader->key.as_es)
      ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
   else if (shader->key.opt.vs_as_prim_discard_cs)
      ctx->abi.emit_outputs = si_llvm_emit_prim_discard_cs_epilogue;
   else if (ngg_cull_shader)
      ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue_4x_wave32;
   else if (shader->key.as_ngg)
      ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue;
   else
      ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;

   ctx->abi.load_base_vertex = get_base_vertex;
}