src/gallium/drivers/radeonsi/si_shader_llvm_vs.c

   1 /*
   2  * Copyright 2020 Advanced Micro Devices, Inc.
   3  * All Rights Reserved.
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * on the rights to use, copy, modify, merge, publish, distribute, sub
   9  * license, and/or sell copies of the Software, and to permit persons to whom
  10  * the Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice (including the next
  13  * paragraph) shall be included in all copies or substantial portions of the
  14  * Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
  20  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  21  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  22  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  */
  24
  25 #include "si_pipe.h"
  26 #include "si_shader_internal.h"
  27 #include "sid.h"
  28 #include "util/u_memory.h"
  29
  30 static LLVMValueRef unpack_sint16(struct si_shader_context *ctx, LLVMValueRef i32, unsigned index)
  31 {
  32    assert(index <= 1);
  33
  34    if (index == 1)
  35       return LLVMBuildAShr(ctx->ac.builder, i32, LLVMConstInt(ctx->ac.i32, 16, 0), "");
  36
  37    return LLVMBuildSExt(ctx->ac.builder, LLVMBuildTrunc(ctx->ac.builder, i32, ctx->ac.i16, ""),
  38                         ctx->ac.i32, "");
  39 }
  40
  41 static void load_input_vs(struct si_shader_context *ctx, unsigned input_index, LLVMValueRef out[4])
  42 {
  43    const struct si_shader_info *info = &ctx->shader->selector->info;
  44    unsigned vs_blit_property = info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD];
  45
  46    if (vs_blit_property) {
  47       LLVMValueRef vertex_id = ctx->abi.vertex_id;
  48       LLVMValueRef sel_x1 =
  49          LLVMBuildICmp(ctx->ac.builder, LLVMIntULE, vertex_id, ctx->ac.i32_1, "");
  50       /* Use LLVMIntNE, because we have 3 vertices and only
  51        * the middle one should use y2.
  52        */
  53       LLVMValueRef sel_y1 = LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, vertex_id, ctx->ac.i32_1, "");
  54
  55       unsigned param_vs_blit_inputs = ctx->vs_blit_inputs.arg_index;
  56       if (input_index == 0) {
  57          /* Position: */
  58          LLVMValueRef x1y1 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs);
  59          LLVMValueRef x2y2 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 1);
  60
  61          LLVMValueRef x1 = unpack_sint16(ctx, x1y1, 0);
  62          LLVMValueRef y1 = unpack_sint16(ctx, x1y1, 1);
  63          LLVMValueRef x2 = unpack_sint16(ctx, x2y2, 0);
  64          LLVMValueRef y2 = unpack_sint16(ctx, x2y2, 1);
  65
  66          LLVMValueRef x = LLVMBuildSelect(ctx->ac.builder, sel_x1, x1, x2, "");
  67          LLVMValueRef y = LLVMBuildSelect(ctx->ac.builder, sel_y1, y1, y2, "");
  68
  69          out[0] = LLVMBuildSIToFP(ctx->ac.builder, x, ctx->ac.f32, "");
  70          out[1] = LLVMBuildSIToFP(ctx->ac.builder, y, ctx->ac.f32, "");
  71          out[2] = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 2);
  72          out[3] = ctx->ac.f32_1;
  73          return;
  74       }
  75
  76       /* Color or texture coordinates: */
  77       assert(input_index == 1);
  78
  79       if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
  80          for (int i = 0; i < 4; i++) {
  81             out[i] = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 3 + i);
  82          }
  83       } else {
  84          assert(vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD);
  85          LLVMValueRef x1 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 3);
  86          LLVMValueRef y1 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 4);
  87          LLVMValueRef x2 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 5);
  88          LLVMValueRef y2 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 6);
  89
  90          out[0] = LLVMBuildSelect(ctx->ac.builder, sel_x1, x1, x2, "");
  91          out[1] = LLVMBuildSelect(ctx->ac.builder, sel_y1, y1, y2, "");
  92          out[2] = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 7);
  93          out[3] = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 8);
  94       }
  95       return;
  96    }
  97
  98    unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs;
  99    union si_vs_fix_fetch fix_fetch;
 100    LLVMValueRef vb_desc;
 101    LLVMValueRef vertex_index;
 102    LLVMValueRef tmp;
 103
 104    if (input_index < num_vbos_in_user_sgprs) {
 105       vb_desc = ac_get_arg(&ctx->ac, ctx->vb_descriptors[input_index]);
 106    } else {
 107       unsigned index = input_index - num_vbos_in_user_sgprs;
 108       vb_desc = ac_build_load_to_sgpr(&ctx->ac, ac_get_arg(&ctx->ac, ctx->vertex_buffers),
 109                                       LLVMConstInt(ctx->ac.i32, index, 0));
 110    }
 111
 112    vertex_index = LLVMGetParam(ctx->main_fn, ctx->vertex_index0.arg_index + input_index);
 113
 114    /* Use the open-coded implementation for all loads of doubles and
 115     * of dword-sized data that needs fixups. We need to insert conversion
 116     * code anyway, and the amd/common code does it for us.
 117     *
 118     * Note: On LLVM <= 8, we can only open-code formats with
 119     * channel size >= 4 bytes.
 120     */
 121    bool opencode = ctx->shader->key.mono.vs_fetch_opencode & (1 << input_index);
 122    fix_fetch.bits = ctx->shader->key.mono.vs_fix_fetch[input_index].bits;
 123    if (opencode || (fix_fetch.u.log_size == 3 && fix_fetch.u.format == AC_FETCH_FORMAT_FLOAT) ||
 124        (fix_fetch.u.log_size == 2)) {
 125       tmp = ac_build_opencoded_load_format(&ctx->ac, fix_fetch.u.log_size,
 126                                            fix_fetch.u.num_channels_m1 + 1, fix_fetch.u.format,
 127                                            fix_fetch.u.reverse, !opencode, vb_desc, vertex_index,
 128                                            ctx->ac.i32_0, ctx->ac.i32_0, 0, true);
 129       for (unsigned i = 0; i < 4; ++i)
 130          out[i] =
 131             LLVMBuildExtractElement(ctx->ac.builder, tmp, LLVMConstInt(ctx->ac.i32, i, false), "");
 132       return;
 133    }
 134
 135    unsigned required_channels = util_last_bit(info->input_usage_mask[input_index]);
 136    if (required_channels == 0) {
 137       for (unsigned i = 0; i < 4; ++i)
 138          out[i] = LLVMGetUndef(ctx->ac.f32);
 139       return;
 140    }
 141
 142    /* Do multiple loads for special formats. */
 143    LLVMValueRef fetches[4];
 144    unsigned num_fetches;
 145    unsigned fetch_stride;
 146    unsigned channels_per_fetch;
 147
 148    if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2) {
 149       num_fetches = MIN2(required_channels, 3);
 150       fetch_stride = 1 << fix_fetch.u.log_size;
 151       channels_per_fetch = 1;
 152    } else {
 153       num_fetches = 1;
 154       fetch_stride = 0;
 155       channels_per_fetch = required_channels;
 156    }
 157
 158    for (unsigned i = 0; i < num_fetches; ++i) {
 159       LLVMValueRef voffset = LLVMConstInt(ctx->ac.i32, fetch_stride * i, 0);
 160       fetches[i] = ac_build_buffer_load_format(&ctx->ac, vb_desc, vertex_index, voffset,
 161                                                channels_per_fetch, 0, true, false);
 162    }
 163
 164    if (num_fetches == 1 && channels_per_fetch > 1) {
 165       LLVMValueRef fetch = fetches[0];
 166       for (unsigned i = 0; i < channels_per_fetch; ++i) {
 167          tmp = LLVMConstInt(ctx->ac.i32, i, false);
 168          fetches[i] = LLVMBuildExtractElement(ctx->ac.builder, fetch, tmp, "");
 169       }
 170       num_fetches = channels_per_fetch;
 171       channels_per_fetch = 1;
 172    }
 173
 174    for (unsigned i = num_fetches; i < 4; ++i)
 175       fetches[i] = LLVMGetUndef(ctx->ac.f32);
 176
 177    if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2 && required_channels == 4) {
 178       if (fix_fetch.u.format == AC_FETCH_FORMAT_UINT || fix_fetch.u.format == AC_FETCH_FORMAT_SINT)
 179          fetches[3] = ctx->ac.i32_1;
 180       else
 181          fetches[3] = ctx->ac.f32_1;
 182    } else if (fix_fetch.u.log_size == 3 &&
 183               (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ||
 184                fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED ||
 185                fix_fetch.u.format == AC_FETCH_FORMAT_SINT) &&
 186               required_channels == 4) {
 187       /* For 2_10_10_10, the hardware returns an unsigned value;
 188        * convert it to a signed one.
 189        */
 190       LLVMValueRef tmp = fetches[3];
 191       LLVMValueRef c30 = LLVMConstInt(ctx->ac.i32, 30, 0);
 192
 193       /* First, recover the sign-extended signed integer value. */
 194       if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED)
 195          tmp = LLVMBuildFPToUI(ctx->ac.builder, tmp, ctx->ac.i32, "");
 196       else
 197          tmp = ac_to_integer(&ctx->ac, tmp);
 198
 199       /* For the integer-like cases, do a natural sign extension.
 200        *
 201        * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
 202        * and happen to contain 0, 1, 2, 3 as the two LSBs of the
 203        * exponent.
 204        */
 205       tmp = LLVMBuildShl(
 206          ctx->ac.builder, tmp,
 207          fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ? LLVMConstInt(ctx->ac.i32, 7, 0) : c30, "");
 208       tmp = LLVMBuildAShr(ctx->ac.builder, tmp, c30, "");
 209
 210       /* Convert back to the right type. */
 211       if (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM) {
 212          LLVMValueRef clamp;
 213          LLVMValueRef neg_one = LLVMConstReal(ctx->ac.f32, -1.0);
 214          tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->ac.f32, "");
 215          clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, tmp, neg_one, "");
 216          tmp = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, tmp, "");
 217       } else if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED) {
 218          tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->ac.f32, "");
 219       }
 220
 221       fetches[3] = tmp;
 222    }
 223
 224    for (unsigned i = 0; i < 4; ++i)
 225       out[i] = ac_to_float(&ctx->ac, fetches[i]);
 226 }
 227
 228 static void declare_input_vs(struct si_shader_context *ctx, unsigned input_index)
 229 {
 230    LLVMValueRef input[4];
 231
 232    load_input_vs(ctx, input_index / 4, input);
 233
 234    for (unsigned chan = 0; chan < 4; chan++) {
 235       ctx->inputs[input_index + chan] =
 236          LLVMBuildBitCast(ctx->ac.builder, input[chan], ctx->ac.i32, "");
 237    }
 238 }
 239
 240 void si_llvm_load_vs_inputs(struct si_shader_context *ctx, struct nir_shader *nir)
 241 {
 242    uint64_t processed_inputs = 0;
 243
 244    nir_foreach_shader_in_variable (variable, nir) {
 245       unsigned attrib_count = glsl_count_attribute_slots(variable->type, true);
 246       unsigned input_idx = variable->data.driver_location;
 247       unsigned loc = variable->data.location;
 248
 249       for (unsigned i = 0; i < attrib_count; i++) {
 250          /* Packed components share the same location so skip
 251           * them if we have already processed the location.
 252           */
 253          if (processed_inputs & ((uint64_t)1 << (loc + i))) {
 254             input_idx += 4;
 255             continue;
 256          }
 257
 258          declare_input_vs(ctx, input_idx);
 259          if (glsl_type_is_dual_slot(variable->type)) {
 260             input_idx += 4;
 261             declare_input_vs(ctx, input_idx);
 262          }
 263
 264          processed_inputs |= ((uint64_t)1 << (loc + i));
 265          input_idx += 4;
 266       }
 267    }
 268 }
 269
 270 void si_llvm_streamout_store_output(struct si_shader_context *ctx, LLVMValueRef const *so_buffers,
 271                                     LLVMValueRef const *so_write_offsets,
 272                                     struct pipe_stream_output *stream_out,
 273                                     struct si_shader_output_values *shader_out)
 274 {
 275    unsigned buf_idx = stream_out->output_buffer;
 276    unsigned start = stream_out->start_component;
 277    unsigned num_comps = stream_out->num_components;
 278    LLVMValueRef out[4];
 279
 280    assert(num_comps && num_comps <= 4);
 281    if (!num_comps || num_comps > 4)
 282       return;
 283
 284    /* Load the output as int. */
 285    for (int j = 0; j < num_comps; j++) {
 286       assert(stream_out->stream == shader_out->vertex_stream[start + j]);
 287
 288       out[j] = ac_to_integer(&ctx->ac, shader_out->values[start + j]);
 289    }
 290
 291    /* Pack the output. */
 292    LLVMValueRef vdata = NULL;
 293
 294    switch (num_comps) {
 295    case 1: /* as i32 */
 296       vdata = out[0];
 297       break;
 298    case 2: /* as v2i32 */
 299    case 3: /* as v3i32 */
 300       if (ac_has_vec3_support(ctx->screen->info.chip_class, false)) {
 301          vdata = ac_build_gather_values(&ctx->ac, out, num_comps);
 302          break;
 303       }
 304       /* as v4i32 (aligned to 4) */
 305       out[3] = LLVMGetUndef(ctx->ac.i32);
 306       /* fall through */
 307    case 4: /* as v4i32 */
 308       vdata = ac_build_gather_values(&ctx->ac, out, util_next_power_of_two(num_comps));
 309       break;
 310    }
 311
 312    ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx], vdata, num_comps,
 313                                so_write_offsets[buf_idx], ctx->ac.i32_0, stream_out->dst_offset * 4,
 314                                ac_glc | ac_slc);
 315 }
 316
 317 /**
 318  * Write streamout data to buffers for vertex stream @p stream (different
 319  * vertex streams can occur for GS copy shaders).
 320  */
 321 void si_llvm_emit_streamout(struct si_shader_context *ctx, struct si_shader_output_values *outputs,
 322                             unsigned noutput, unsigned stream)
 323 {
 324    struct si_shader_selector *sel = ctx->shader->selector;
 325    struct pipe_stream_output_info *so = &sel->so;
 326    LLVMBuilderRef builder = ctx->ac.builder;
 327    int i;
 328
 329    /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
 330    LLVMValueRef so_vtx_count = si_unpack_param(ctx, ctx->streamout_config, 16, 7);
 331
 332    LLVMValueRef tid = ac_get_thread_id(&ctx->ac);
 333
 334    /* can_emit = tid < so_vtx_count; */
 335    LLVMValueRef can_emit = LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");
 336
 337    /* Emit the streamout code conditionally. This actually avoids
 338     * out-of-bounds buffer access. The hw tells us via the SGPR
 339     * (so_vtx_count) which threads are allowed to emit streamout data. */
 340    ac_build_ifcc(&ctx->ac, can_emit, 6501);
 341    {
 342       /* The buffer offset is computed as follows:
 343        *   ByteOffset = streamout_offset[buffer_id]*4 +
 344        *                (streamout_write_index + thread_id)*stride[buffer_id] +
 345        *                attrib_offset
 346        */
 347
 348       LLVMValueRef so_write_index = ac_get_arg(&ctx->ac, ctx->streamout_write_index);
 349
 350       /* Compute (streamout_write_index + thread_id). */
 351       so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");
 352
 353       /* Load the descriptor and compute the write offset for each
 354        * enabled buffer. */
 355       LLVMValueRef so_write_offset[4] = {};
 356       LLVMValueRef so_buffers[4];
 357       LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
 358
 359       for (i = 0; i < 4; i++) {
 360          if (!so->stride[i])
 361             continue;
 362
 363          LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, SI_VS_STREAMOUT_BUF0 + i, 0);
 364
 365          so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
 366
 367          LLVMValueRef so_offset = ac_get_arg(&ctx->ac, ctx->streamout_offset[i]);
 368          so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->ac.i32, 4, 0), "");
 369
 370          so_write_offset[i] = ac_build_imad(
 371             &ctx->ac, so_write_index, LLVMConstInt(ctx->ac.i32, so->stride[i] * 4, 0), so_offset);
 372       }
 373
 374       /* Write streamout data. */
 375       for (i = 0; i < so->num_outputs; i++) {
 376          unsigned reg = so->output[i].register_index;
 377
 378          if (reg >= noutput)
 379             continue;
 380
 381          if (stream != so->output[i].stream)
 382             continue;
 383
 384          si_llvm_streamout_store_output(ctx, so_buffers, so_write_offset, &so->output[i],
 385                                         &outputs[reg]);
 386       }
 387    }
 388    ac_build_endif(&ctx->ac, 6501);
 389 }
 390
 391 static void si_llvm_emit_clipvertex(struct si_shader_context *ctx, struct ac_export_args *pos,
 392                                     LLVMValueRef *out_elts)
 393 {
 394    unsigned reg_index;
 395    unsigned chan;
 396    unsigned const_chan;
 397    LLVMValueRef base_elt;
 398    LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
 399    LLVMValueRef constbuf_index = LLVMConstInt(ctx->ac.i32, SI_VS_CONST_CLIP_PLANES, 0);
 400    LLVMValueRef const_resource = ac_build_load_to_sgpr(&ctx->ac, ptr, constbuf_index);
 401
 402    for (reg_index = 0; reg_index < 2; reg_index++) {
 403       struct ac_export_args *args = &pos[2 + reg_index];
 404
 405       args->out[0] = args->out[1] = args->out[2] = args->out[3] = LLVMConstReal(ctx->ac.f32, 0.0f);
 406
 407       /* Compute dot products of position and user clip plane vectors */
 408       for (chan = 0; chan < 4; chan++) {
 409          for (const_chan = 0; const_chan < 4; const_chan++) {
 410             LLVMValueRef addr =
 411                LLVMConstInt(ctx->ac.i32, ((reg_index * 4 + chan) * 4 + const_chan) * 4, 0);
 412             base_elt = si_buffer_load_const(ctx, const_resource, addr);
 413             args->out[chan] =
 414                ac_build_fmad(&ctx->ac, base_elt, out_elts[const_chan], args->out[chan]);
 415          }
 416       }
 417
 418       args->enabled_channels = 0xf;
 419       args->valid_mask = 0;
 420       args->done = 0;
 421       args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index;
 422       args->compr = 0;
 423    }
 424 }
 425
 426 /* Initialize arguments for the shader export intrinsic */
 427 static void si_llvm_init_vs_export_args(struct si_shader_context *ctx, LLVMValueRef *values,
 428                                         unsigned target, struct ac_export_args *args)
 429 {
 430    args->enabled_channels = 0xf; /* writemask - default is 0xf */
 431    args->valid_mask = 0;         /* Specify whether the EXEC mask represents the valid mask */
 432    args->done = 0;               /* Specify whether this is the last export */
 433    args->target = target;        /* Specify the target we are exporting */
 434    args->compr = false;
 435
 436    memcpy(&args->out[0], values, sizeof(values[0]) * 4);
 437 }
 438
 439 static void si_export_param(struct si_shader_context *ctx, unsigned index, LLVMValueRef *values)
 440 {
 441    struct ac_export_args args;
 442
 443    si_llvm_init_vs_export_args(ctx, values, V_008DFC_SQ_EXP_PARAM + index, &args);
 444    ac_build_export(&ctx->ac, &args);
 445 }
 446
 447 static void si_build_param_exports(struct si_shader_context *ctx,
 448                                    struct si_shader_output_values *outputs, unsigned noutput)
 449 {
 450    struct si_shader *shader = ctx->shader;
 451    unsigned param_count = 0;
 452
 453    for (unsigned i = 0; i < noutput; i++) {
 454       unsigned semantic_name = outputs[i].semantic_name;
 455       unsigned semantic_index = outputs[i].semantic_index;
 456
 457       if (outputs[i].vertex_stream[0] != 0 && outputs[i].vertex_stream[1] != 0 &&
 458           outputs[i].vertex_stream[2] != 0 && outputs[i].vertex_stream[3] != 0)
 459          continue;
 460
 461       switch (semantic_name) {
 462       case TGSI_SEMANTIC_LAYER:
 463       case TGSI_SEMANTIC_VIEWPORT_INDEX:
 464       case TGSI_SEMANTIC_CLIPDIST:
 465       case TGSI_SEMANTIC_COLOR:
 466       case TGSI_SEMANTIC_BCOLOR:
 467       case TGSI_SEMANTIC_PRIMID:
 468       case TGSI_SEMANTIC_FOG:
 469       case TGSI_SEMANTIC_TEXCOORD:
 470       case TGSI_SEMANTIC_GENERIC:
 471          break;
 472       default:
 473          continue;
 474       }
 475
 476       if ((semantic_name != TGSI_SEMANTIC_GENERIC || semantic_index < SI_MAX_IO_GENERIC) &&
 477           shader->key.opt.kill_outputs &
 478              (1ull << si_shader_io_get_unique_index(semantic_name, semantic_index, true)))
 479          continue;
 480
 481       si_export_param(ctx, param_count, outputs[i].values);
 482
 483       assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
 484       shader->info.vs_output_param_offset[i] = param_count++;
 485    }
 486
 487    shader->info.nr_param_exports = param_count;
 488 }
 489
 490 /**
 491  * Vertex color clamping.
 492  *
 493  * This uses a state constant loaded in a user data SGPR and
 494  * an IF statement is added that clamps all colors if the constant
 495  * is true.
 496  */
 497 static void si_vertex_color_clamping(struct si_shader_context *ctx,
 498                                      struct si_shader_output_values *outputs, unsigned noutput)
 499 {
 500    LLVMValueRef addr[SI_MAX_VS_OUTPUTS][4];
 501    bool has_colors = false;
 502
 503    /* Store original colors to alloca variables. */
 504    for (unsigned i = 0; i < noutput; i++) {
 505       if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR &&
 506           outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR)
 507          continue;
 508
 509       for (unsigned j = 0; j < 4; j++) {
 510          addr[i][j] = ac_build_alloca_undef(&ctx->ac, ctx->ac.f32, "");
 511          LLVMBuildStore(ctx->ac.builder, outputs[i].values[j], addr[i][j]);
 512       }
 513       has_colors = true;
 514    }
 515
 516    if (!has_colors)
 517       return;
 518
 519    /* The state is in the first bit of the user SGPR. */
 520    LLVMValueRef cond = ac_get_arg(&ctx->ac, ctx->vs_state_bits);
 521    cond = LLVMBuildTrunc(ctx->ac.builder, cond, ctx->ac.i1, "");
 522
 523    ac_build_ifcc(&ctx->ac, cond, 6502);
 524
 525    /* Store clamped colors to alloca variables within the conditional block. */
 526    for (unsigned i = 0; i < noutput; i++) {
 527       if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR &&
 528           outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR)
 529          continue;
 530
 531       for (unsigned j = 0; j < 4; j++) {
 532          LLVMBuildStore(ctx->ac.builder, ac_build_clamp(&ctx->ac, outputs[i].values[j]),
 533                         addr[i][j]);
 534       }
 535    }
 536    ac_build_endif(&ctx->ac, 6502);
 537
 538    /* Load clamped colors */
 539    for (unsigned i = 0; i < noutput; i++) {
 540       if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR &&
 541           outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR)
 542          continue;
 543
 544       for (unsigned j = 0; j < 4; j++) {
 545          outputs[i].values[j] = LLVMBuildLoad(ctx->ac.builder, addr[i][j], "");
 546       }
 547    }
 548 }
 549
 550 /* Generate export instructions for hardware VS shader stage or NGG GS stage
 551  * (position and parameter data only).
 552  */
 553 void si_llvm_build_vs_exports(struct si_shader_context *ctx,
 554                               struct si_shader_output_values *outputs, unsigned noutput)
 555 {
 556    struct si_shader *shader = ctx->shader;
 557    struct ac_export_args pos_args[4] = {};
 558    LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL,
 559                 viewport_index_value = NULL;
 560    unsigned pos_idx;
 561    int i;
 562
 563    si_vertex_color_clamping(ctx, outputs, noutput);
 564
 565    /* Build position exports. */
 566    for (i = 0; i < noutput; i++) {
 567       switch (outputs[i].semantic_name) {
 568       case TGSI_SEMANTIC_POSITION:
 569          si_llvm_init_vs_export_args(ctx, outputs[i].values, V_008DFC_SQ_EXP_POS, &pos_args[0]);
 570          break;
 571       case TGSI_SEMANTIC_PSIZE:
 572          psize_value = outputs[i].values[0];
 573          break;
 574       case TGSI_SEMANTIC_LAYER:
 575          layer_value = outputs[i].values[0];
 576          break;
 577       case TGSI_SEMANTIC_VIEWPORT_INDEX:
 578          viewport_index_value = outputs[i].values[0];
 579          break;
 580       case TGSI_SEMANTIC_EDGEFLAG:
 581          edgeflag_value = outputs[i].values[0];
 582          break;
 583       case TGSI_SEMANTIC_CLIPDIST:
 584          if (!shader->key.opt.clip_disable) {
 585             unsigned index = 2 + outputs[i].semantic_index;
 586             si_llvm_init_vs_export_args(ctx, outputs[i].values, V_008DFC_SQ_EXP_POS + index,
 587                                         &pos_args[index]);
 588          }
 589          break;
 590       case TGSI_SEMANTIC_CLIPVERTEX:
 591          if (!shader->key.opt.clip_disable) {
 592             si_llvm_emit_clipvertex(ctx, pos_args, outputs[i].values);
 593          }
 594          break;
 595       }
 596    }
 597
 598    /* We need to add the position output manually if it's missing. */
 599    if (!pos_args[0].out[0]) {
 600       pos_args[0].enabled_channels = 0xf; /* writemask */
 601       pos_args[0].valid_mask = 0;         /* EXEC mask */
 602       pos_args[0].done = 0;               /* last export? */
 603       pos_args[0].target = V_008DFC_SQ_EXP_POS;
 604       pos_args[0].compr = 0;              /* COMPR flag */
 605       pos_args[0].out[0] = ctx->ac.f32_0; /* X */
 606       pos_args[0].out[1] = ctx->ac.f32_0; /* Y */
 607       pos_args[0].out[2] = ctx->ac.f32_0; /* Z */
 608       pos_args[0].out[3] = ctx->ac.f32_1; /* W */
 609    }
 610
 611    bool pos_writes_edgeflag = shader->selector->info.writes_edgeflag && !shader->key.as_ngg;
 612
 613    /* Write the misc vector (point size, edgeflag, layer, viewport). */
 614    if (shader->selector->info.writes_psize || pos_writes_edgeflag ||
 615        shader->selector->info.writes_viewport_index || shader->selector->info.writes_layer) {
 616       pos_args[1].enabled_channels = shader->selector->info.writes_psize |
 617                                      (pos_writes_edgeflag << 1) |
 618                                      (shader->selector->info.writes_layer << 2);
 619
 620       pos_args[1].valid_mask = 0; /* EXEC mask */
 621       pos_args[1].done = 0;       /* last export? */
 622       pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
 623       pos_args[1].compr = 0;              /* COMPR flag */
 624       pos_args[1].out[0] = ctx->ac.f32_0; /* X */
 625       pos_args[1].out[1] = ctx->ac.f32_0; /* Y */
 626       pos_args[1].out[2] = ctx->ac.f32_0; /* Z */
 627       pos_args[1].out[3] = ctx->ac.f32_0; /* W */
 628
 629       if (shader->selector->info.writes_psize)
 630          pos_args[1].out[0] = psize_value;
 631
 632       if (pos_writes_edgeflag) {
 633          /* The output is a float, but the hw expects an integer
 634           * with the first bit containing the edge flag. */
 635          edgeflag_value = LLVMBuildFPToUI(ctx->ac.builder, edgeflag_value, ctx->ac.i32, "");
 636          edgeflag_value = ac_build_umin(&ctx->ac, edgeflag_value, ctx->ac.i32_1);
 637
 638          /* The LLVM intrinsic expects a float. */
 639          pos_args[1].out[1] = ac_to_float(&ctx->ac, edgeflag_value);
 640       }
 641
 642       if (ctx->screen->info.chip_class >= GFX9) {
 643          /* GFX9 has the layer in out.z[10:0] and the viewport
 644           * index in out.z[19:16].
 645           */
 646          if (shader->selector->info.writes_layer)
 647             pos_args[1].out[2] = layer_value;
 648
 649          if (shader->selector->info.writes_viewport_index) {
 650             LLVMValueRef v = viewport_index_value;
 651
 652             v = ac_to_integer(&ctx->ac, v);
 653             v = LLVMBuildShl(ctx->ac.builder, v, LLVMConstInt(ctx->ac.i32, 16, 0), "");
 654             v = LLVMBuildOr(ctx->ac.builder, v, ac_to_integer(&ctx->ac, pos_args[1].out[2]), "");
 655             pos_args[1].out[2] = ac_to_float(&ctx->ac, v);
 656             pos_args[1].enabled_channels |= 1 << 2;
 657          }
 658       } else {
 659          if (shader->selector->info.writes_layer)
 660             pos_args[1].out[2] = layer_value;
 661
 662          if (shader->selector->info.writes_viewport_index) {
 663             pos_args[1].out[3] = viewport_index_value;
 664             pos_args[1].enabled_channels |= 1 << 3;
 665          }
 666       }
 667    }
 668
 669    for (i = 0; i < 4; i++)
 670       if (pos_args[i].out[0])
 671          shader->info.nr_pos_exports++;
 672
 673    /* GFX10 (Navi1x) skip POS0 exports if EXEC=0 and DONE=0, causing a hang.
 674     * Setting valid_mask=1 prevents it and has no other effect.
 675     */
 676    if (ctx->screen->info.chip_class == GFX10)
 677       pos_args[0].valid_mask = 1;
 678
 679    pos_idx = 0;
 680    for (i = 0; i < 4; i++) {
 681       if (!pos_args[i].out[0])
 682          continue;
 683
 684       /* Specify the target we are exporting */
 685       pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;
 686
 687       if (pos_idx == shader->info.nr_pos_exports)
 688          /* Specify that this is the last export */
 689          pos_args[i].done = 1;
 690
 691       ac_build_export(&ctx->ac, &pos_args[i]);
 692    }
 693
 694    /* Build parameter exports. */
 695    si_build_param_exports(ctx, outputs, noutput);
 696 }
 697
 698 void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs)
 699 {
 700    struct si_shader_context *ctx = si_shader_context_from_abi(abi);
 701    struct si_shader_info *info = &ctx->shader->selector->info;
 702    struct si_shader_output_values *outputs = NULL;
 703    int i, j;
 704
 705    assert(!ctx->shader->is_gs_copy_shader);
 706    assert(info->num_outputs <= max_outputs);
 707
 708    outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));
 709
 710    for (i = 0; i < info->num_outputs; i++) {
 711       outputs[i].semantic_name = info->output_semantic_name[i];
 712       outputs[i].semantic_index = info->output_semantic_index[i];
 713
 714       for (j = 0; j < 4; j++) {
 715          outputs[i].values[j] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + j], "");
 716          outputs[i].vertex_stream[j] = (info->output_streams[i] >> (2 * j)) & 3;
 717       }
 718    }
 719
 720    if (!ctx->screen->use_ngg_streamout && ctx->shader->selector->so.num_outputs)
 721       si_llvm_emit_streamout(ctx, outputs, i, 0);
 722
 723    /* Export PrimitiveID. */
 724    if (ctx->shader->key.mono.u.vs_export_prim_id) {
 725       outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID;
 726       outputs[i].semantic_index = 0;
 727       outputs[i].values[0] = ac_to_float(&ctx->ac, si_get_primitive_id(ctx, 0));
 728       for (j = 1; j < 4; j++)
 729          outputs[i].values[j] = LLVMConstReal(ctx->ac.f32, 0);
 730
 731       memset(outputs[i].vertex_stream, 0, sizeof(outputs[i].vertex_stream));
 732       i++;
 733    }
 734
 735    si_llvm_build_vs_exports(ctx, outputs, i);
 736    FREE(outputs);
 737 }
 738
 739 static void si_llvm_emit_prim_discard_cs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs,
 740                                                   LLVMValueRef *addrs)
 741 {
 742    struct si_shader_context *ctx = si_shader_context_from_abi(abi);
 743    struct si_shader_info *info = &ctx->shader->selector->info;
 744    LLVMValueRef pos[4] = {};
 745
 746    assert(info->num_outputs <= max_outputs);
 747
 748    for (unsigned i = 0; i < info->num_outputs; i++) {
 749       if (info->output_semantic_name[i] != TGSI_SEMANTIC_POSITION)
 750          continue;
 751
 752       for (unsigned chan = 0; chan < 4; chan++)
 753          pos[chan] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
 754       break;
 755    }
 756    assert(pos[0] != NULL);
 757
 758    /* Return the position output. */
 759    LLVMValueRef ret = ctx->return_value;
 760    for (unsigned chan = 0; chan < 4; chan++)
 761       ret = LLVMBuildInsertValue(ctx->ac.builder, ret, pos[chan], chan, "");
 762    ctx->return_value = ret;
 763 }
 764
/**
 * Build the vertex shader prolog function.
 *
 * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
 * All inputs are returned unmodified. The vertex load indices are
 * stored after them, which will be used by the API VS for fetching inputs.
 *
 * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
 *   input_v0,
 *   input_v1,
 *   input_v2,
 *   input_v3,
 *   (VertexID + BaseVertex),
 *   (InstanceID + StartInstance),
 *   (InstanceID / 2 + StartInstance)
 *
 * The number of input VGPRs declared here is
 * num_merged_next_stage_vgprs + 4, plus one more when has_ngg_cull_inputs
 * is set. Some of the "unmodified" inputs are in fact rewritten before being
 * returned: see the LS VGPR init workaround, the GS fast launch path and the
 * InstanceID unpacking below.
 *
 * \param ctx  shader context; ctx->args is cleared and the "vs_prolog"
 *             function is created in it
 * \param key  shader part key; only key->vs_prolog is read
 */
void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key)
{
   LLVMTypeRef *returns;
   LLVMValueRef ret, func;
   int num_returns, i;
   /* Index of the first VS VGPR; merged next-stage VGPRs come before it. */
   unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs;
   unsigned num_input_vgprs =
      key->vs_prolog.num_merged_next_stage_vgprs + 4 + (key->vs_prolog.has_ngg_cull_inputs ? 1 : 0);
   struct ac_arg input_sgpr_param[key->vs_prolog.num_input_sgprs];
   /* NOTE(review): fixed capacity of 10 entries; assumes num_input_vgprs (and
    * the hard-coded indices used below) never exceed it - confirm against the
    * code that fills in the key.
    */
   struct ac_arg input_vgpr_param[10];
   LLVMValueRef input_vgprs[10];
   unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs + num_input_vgprs;
   /* SI_SGPR_* user SGPR indices are offset by 8 when there are merged
    * next-stage VGPRs, and by 0 otherwise.
    */
   unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;

   memset(&ctx->args, 0, sizeof(ctx->args));

   /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
   returns = alloca((num_all_input_regs + key->vs_prolog.num_inputs) * sizeof(LLVMTypeRef));
   num_returns = 0;

   /* Declare input and output SGPRs. */
   for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &input_sgpr_param[i]);
      returns[num_returns++] = ctx->ac.i32;
   }

   /* NOTE(review): reads input SGPR 3 unconditionally, so this assumes
    * num_input_sgprs >= 4 - confirm. The value is only used below when
    * num_merged_next_stage_vgprs is non-zero.
    */
   struct ac_arg merged_wave_info = input_sgpr_param[3];

   /* Preloaded VGPRs (outputs must be floats) */
   for (i = 0; i < num_input_vgprs; i++) {
      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &input_vgpr_param[i]);
      returns[num_returns++] = ctx->ac.f32;
   }

   /* Vertex load indices. */
   for (i = 0; i < key->vs_prolog.num_inputs; i++)
      returns[num_returns++] = ctx->ac.f32;

   /* Create the function. */
   si_llvm_create_func(ctx, "vs_prolog", returns, num_returns, 0);
   func = ctx->main_fn;

   /* Work on a local copy of the VGPR inputs; the code below may replace
    * individual entries before they are written to the return value.
    */
   for (i = 0; i < num_input_vgprs; i++) {
      input_vgprs[i] = ac_get_arg(&ctx->ac, input_vgpr_param[i]);
   }

   if (key->vs_prolog.num_merged_next_stage_vgprs) {
      if (!key->vs_prolog.is_monolithic)
         si_init_exec_from_input(ctx, merged_wave_info, 0);

      if (key->vs_prolog.as_ls && ctx->screen->info.has_ls_vgpr_init_bug) {
         /* If there are no HS threads, SPI loads the LS VGPRs
          * starting at VGPR 0. Shift them back to where they
          * belong.
          */
         LLVMValueRef has_hs_threads =
            LLVMBuildICmp(ctx->ac.builder, LLVMIntNE,
                          si_unpack_param(ctx, input_sgpr_param[3], 8, 8), ctx->ac.i32_0, "");

         /* VGPRs 2..5 take the values of VGPRs 0..3 when has_hs_threads is
          * false. Going from the highest index down reads every source
          * before this loop overwrites it.
          */
         for (i = 4; i > 0; --i) {
            input_vgprs[i + 1] = LLVMBuildSelect(ctx->ac.builder, has_hs_threads,
                                                 input_vgprs[i + 1], input_vgprs[i - 1], "");
         }
      }
   }

   if (key->vs_prolog.gs_fast_launch_tri_list || key->vs_prolog.gs_fast_launch_tri_strip) {
      LLVMValueRef wave_id, thread_id_in_tg;

      /* thread_id_in_tg is built from wave_id, the wave size and the thread
       * ID within the wave (presumably wave_id * wave_size + thread_id,
       * going by the ac_build_imad name - confirm).
       */
      wave_id = si_unpack_param(ctx, input_sgpr_param[3], 24, 4);
      thread_id_in_tg =
         ac_build_imad(&ctx->ac, wave_id, LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false),
                       ac_get_thread_id(&ctx->ac));

      /* The GS fast launch initializes all VGPRs to the value of
       * the first thread, so we have to add the thread ID.
       *
       * Only these are initialized by the hw:
       *   VGPR2: Base Primitive ID
       *   VGPR5: Base Vertex ID
       *   VGPR6: Instance ID
       */

      /* Put the vertex thread IDs into VGPRs as-is instead of packing them.
       * The NGG cull shader will read them from there.
       */
      if (key->vs_prolog.gs_fast_launch_tri_list) {
         input_vgprs[0] = ac_build_imad(&ctx->ac, thread_id_in_tg,       /* gs_vtx01_offset */
                                        LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 0 */
                                        LLVMConstInt(ctx->ac.i32, 0, 0));
         input_vgprs[1] = ac_build_imad(&ctx->ac, thread_id_in_tg,       /* gs_vtx23_offset */
                                        LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 1 */
                                        LLVMConstInt(ctx->ac.i32, 1, 0));
         input_vgprs[4] = ac_build_imad(&ctx->ac, thread_id_in_tg,       /* gs_vtx45_offset */
                                        LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 2 */
                                        LLVMConstInt(ctx->ac.i32, 2, 0));
      } else {
         assert(key->vs_prolog.gs_fast_launch_tri_strip);
         LLVMBuilderRef builder = ctx->ac.builder;
         /* Triangle indices: */
         LLVMValueRef index[3] = {
            thread_id_in_tg,
            LLVMBuildAdd(builder, thread_id_in_tg, LLVMConstInt(ctx->ac.i32, 1, 0), ""),
            LLVMBuildAdd(builder, thread_id_in_tg, LLVMConstInt(ctx->ac.i32, 2, 0), ""),
         };
         /* Bit 0 of the thread ID tells whether the triangle is odd. */
         LLVMValueRef is_odd = LLVMBuildTrunc(ctx->ac.builder, thread_id_in_tg, ctx->ac.i1, "");
         /* NOTE(review): flatshade_first is true when the field unpacked from
          * vs_state_bits with (4, 2) equals 0; the encoding of that field is
          * defined outside this function - confirm.
          */
         LLVMValueRef flatshade_first = LLVMBuildICmp(
            builder, LLVMIntEQ, si_unpack_param(ctx, ctx->vs_state_bits, 4, 2), ctx->ac.i32_0, "");

         /* index[] is updated in place by the helper. */
         ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd, flatshade_first, index);
         input_vgprs[0] = index[0];
         input_vgprs[1] = index[1];
         input_vgprs[4] = index[2];
      }

      /* Triangles always have all edge flags set initially. */
      input_vgprs[3] = LLVMConstInt(ctx->ac.i32, 0x7 << 8, 0);

      input_vgprs[2] =
         LLVMBuildAdd(ctx->ac.builder, input_vgprs[2], thread_id_in_tg, ""); /* PrimID */
      input_vgprs[5] =
         LLVMBuildAdd(ctx->ac.builder, input_vgprs[5], thread_id_in_tg, ""); /* VertexID */
      input_vgprs[8] = input_vgprs[6];                                       /* InstanceID */
   }

   /* VertexID is the first VS VGPR. InstanceID is 3 VGPRs after it on GFX10+;
    * on older chips it is 2 after it for LS and 1 after it otherwise.
    */
   unsigned vertex_id_vgpr = first_vs_vgpr;
   unsigned instance_id_vgpr = ctx->screen->info.chip_class >= GFX10
                                  ? first_vs_vgpr + 3
                                  : first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1);

   ctx->abi.vertex_id = input_vgprs[vertex_id_vgpr];
   ctx->abi.instance_id = input_vgprs[instance_id_vgpr];

   /* InstanceID = VertexID >> 16;
    * VertexID   = VertexID & 0xffff;
    */
   if (key->vs_prolog.states.unpack_instance_id_from_vertex_id) {
      ctx->abi.instance_id =
         LLVMBuildLShr(ctx->ac.builder, ctx->abi.vertex_id, LLVMConstInt(ctx->ac.i32, 16, 0), "");
      ctx->abi.vertex_id = LLVMBuildAnd(ctx->ac.builder, ctx->abi.vertex_id,
                                        LLVMConstInt(ctx->ac.i32, 0xffff, 0), "");
   }

   /* Copy inputs to outputs. This should be no-op, as the registers match,
    * but it will prevent the compiler from overwriting them unintentionally.
    */
   ret = ctx->return_value;
   for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
      LLVMValueRef p = LLVMGetParam(func, i);
      ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, "");
   }
   for (i = 0; i < num_input_vgprs; i++) {
      LLVMValueRef p = input_vgprs[i];

      /* Return the (possibly unpacked) VertexID/InstanceID values rather
       * than the raw VGPR contents.
       */
      if (i == vertex_id_vgpr)
         p = ctx->abi.vertex_id;
      else if (i == instance_id_vgpr)
         p = ctx->abi.instance_id;

      p = ac_to_float(&ctx->ac, p);
      ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, key->vs_prolog.num_input_sgprs + i, "");
   }

   /* Compute vertex load indices from instance divisors. */
   LLVMValueRef instance_divisor_constbuf = NULL;

   /* The divisor buffer is only loaded when at least one input has its
    * bit set in instance_divisor_is_fetched.
    */
   if (key->vs_prolog.states.instance_divisor_is_fetched) {
      LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
      LLVMValueRef buf_index = LLVMConstInt(ctx->ac.i32, SI_VS_CONST_INSTANCE_DIVISORS, 0);
      instance_divisor_constbuf = ac_build_load_to_sgpr(&ctx->ac, list, buf_index);
   }

   for (i = 0; i < key->vs_prolog.num_inputs; i++) {
      /* Both masks have one bit per vertex input. */
      bool divisor_is_one = key->vs_prolog.states.instance_divisor_is_one & (1u << i);
      bool divisor_is_fetched = key->vs_prolog.states.instance_divisor_is_fetched & (1u << i);
      LLVMValueRef index = NULL;

      if (divisor_is_one) {
         index = ctx->abi.instance_id;
      } else if (divisor_is_fetched) {
         LLVMValueRef udiv_factors[4];

         /* Each input owns 16 bytes in the buffer: four dwords that are
          * passed to ac_build_fast_udiv_nuw() as the division factors.
          */
         for (unsigned j = 0; j < 4; j++) {
            udiv_factors[j] = si_buffer_load_const(ctx, instance_divisor_constbuf,
                                                   LLVMConstInt(ctx->ac.i32, i * 16 + j * 4, 0));
            udiv_factors[j] = ac_to_integer(&ctx->ac, udiv_factors[j]);
         }
         /* The faster NUW version doesn't work when InstanceID == UINT_MAX.
          * Such InstanceID might not be achievable in a reasonable time though.
          */
         index = ac_build_fast_udiv_nuw(&ctx->ac, ctx->abi.instance_id, udiv_factors[0],
                                        udiv_factors[1], udiv_factors[2], udiv_factors[3]);
      }

      if (divisor_is_one || divisor_is_fetched) {
         /* Add StartInstance. */
         index =
            LLVMBuildAdd(ctx->ac.builder, index,
                         LLVMGetParam(ctx->main_fn, user_sgpr_base + SI_SGPR_START_INSTANCE), "");
      } else {
         /* VertexID + BaseVertex */
         index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.vertex_id,
                              LLVMGetParam(func, user_sgpr_base + SI_SGPR_BASE_VERTEX), "");
      }

      /* The load indices are stored right after all the input arguments. */
      index = ac_to_float(&ctx->ac, index);
      ret = LLVMBuildInsertValue(ctx->ac.builder, ret, index, ctx->args.arg_count + i, "");
   }

   si_llvm_build_ret(ctx, ret);
}
 992
 993 static LLVMValueRef get_base_vertex(struct ac_shader_abi *abi)
 994 {
 995    struct si_shader_context *ctx = si_shader_context_from_abi(abi);
 996
 997    /* For non-indexed draws, the base vertex set by the driver
 998     * (for direct draws) or the CP (for indirect draws) is the
 999     * first vertex ID, but GLSL expects 0 to be returned.
1000     */
1001    LLVMValueRef vs_state = ac_get_arg(&ctx->ac, ctx->vs_state_bits);
1002    LLVMValueRef indexed;
1003
1004    indexed = LLVMBuildLShr(ctx->ac.builder, vs_state, ctx->ac.i32_1, "");
1005    indexed = LLVMBuildTrunc(ctx->ac.builder, indexed, ctx->ac.i1, "");
1006
1007    return LLVMBuildSelect(ctx->ac.builder, indexed, ac_get_arg(&ctx->ac, ctx->args.base_vertex),
1008                           ctx->ac.i32_0, "");
1009 }
1010
1011 void si_llvm_init_vs_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader)
1012 {
1013    struct si_shader *shader = ctx->shader;
1014
1015    if (shader->key.as_ls)
1016       ctx->abi.emit_outputs = si_llvm_emit_ls_epilogue;
1017    else if (shader->key.as_es)
1018       ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
1019    else if (shader->key.opt.vs_as_prim_discard_cs)
1020       ctx->abi.emit_outputs = si_llvm_emit_prim_discard_cs_epilogue;
1021    else if (ngg_cull_shader)
1022       ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue;
1023    else if (shader->key.as_ngg)
1024       ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue;
1025    else
1026       ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
1027
1028    ctx->abi.load_base_vertex = get_base_vertex;
1029 }