/*
 * Copyright 2020 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "si_pipe.h"
#include "si_shader_internal.h"
#include "sid.h"
#include "util/u_memory.h"

LLVMValueRef si_is_es_thread(struct si_shader_context *ctx)
{
   /* Return true if the current thread should execute an ES thread. */
   return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, ac_get_thread_id(&ctx->ac),
                        si_unpack_param(ctx, ctx->merged_wave_info, 0, 8), "");
}

LLVMValueRef si_is_gs_thread(struct si_shader_context *ctx)
{
   /* Return true if the current thread should execute a GS thread. */
   return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, ac_get_thread_id(&ctx->ac),
                        si_unpack_param(ctx, ctx->merged_wave_info, 8, 8), "");
}
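
/* Layout of merged_wave_info, as consumed throughout this file:
 *   bits [0:8)   - ES thread count in the wave (see si_is_es_thread)
 *   bits [8:16)  - GS thread count in the wave (see si_is_gs_thread)
 *   bits [16:24) - GS wave ID (see si_get_gs_wave_id)
 *   bits [24:28) - wave index within the threadgroup (see the ES epilogue)
 */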

static LLVMValueRef si_llvm_load_input_gs(struct ac_shader_abi *abi, unsigned input_index,
                                          unsigned vtx_offset_param, LLVMTypeRef type,
                                          unsigned swizzle)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   struct si_shader *shader = ctx->shader;
   LLVMValueRef vtx_offset, soffset;
   struct si_shader_info *info = &shader->selector->info;
   unsigned semantic_name = info->input_semantic_name[input_index];
   unsigned semantic_index = info->input_semantic_index[input_index];
   unsigned param;
   LLVMValueRef value;

   param = si_shader_io_get_unique_index(semantic_name, semantic_index, false);

   /* GFX9 has the ESGS ring in LDS. */
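   /* The six vertex dword offsets arrive as 16-bit values packed two per
    * SGPR in gs_vtx01/23/45_offset; the component's LDS address is that
    * offset plus (param * 4 + swizzle) dwords.
    */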
   if (ctx->screen->info.chip_class >= GFX9) {
      unsigned index = vtx_offset_param;

      switch (index / 2) {
      case 0:
         vtx_offset = si_unpack_param(ctx, ctx->gs_vtx01_offset, index % 2 ? 16 : 0, 16);
         break;
      case 1:
         vtx_offset = si_unpack_param(ctx, ctx->gs_vtx23_offset, index % 2 ? 16 : 0, 16);
         break;
      case 2:
         vtx_offset = si_unpack_param(ctx, ctx->gs_vtx45_offset, index % 2 ? 16 : 0, 16);
         break;
      default:
         assert(0);
         return NULL;
      }

      unsigned offset = param * 4 + swizzle;
      vtx_offset =
         LLVMBuildAdd(ctx->ac.builder, vtx_offset, LLVMConstInt(ctx->ac.i32, offset, false), "");

      LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->esgs_ring, vtx_offset);
      value = LLVMBuildLoad(ctx->ac.builder, ptr, "");
      if (ac_get_type_size(type) == 8) {
         ptr = LLVMBuildGEP(ctx->ac.builder, ptr, &ctx->ac.i32_1, 1, "");
         LLVMValueRef values[2] = {value, LLVMBuildLoad(ctx->ac.builder, ptr, "")};
         value = ac_build_gather_values(&ctx->ac, values, 2);
      }
      return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
   }

   /* GFX6: input load from the ESGS ring in memory. */
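   /* Each (param * 4 + swizzle) dword slot is 256 bytes apart in the ring;
    * this presumably matches the swizzled layout of 64 lanes * 4 bytes per
    * slot, with voffset (vertex offset * 4) selecting the lane's vertex.
    */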
   if (swizzle == ~0) {
      LLVMValueRef values[4];
      unsigned chan;
      for (chan = 0; chan < 4; chan++) {
         values[chan] = si_llvm_load_input_gs(abi, input_index, vtx_offset_param, type, chan);
      }
      return ac_build_gather_values(&ctx->ac, values, 4);
   }

   /* Get the vertex offset parameter on GFX6. */
   LLVMValueRef gs_vtx_offset = ac_get_arg(&ctx->ac, ctx->gs_vtx_offset[vtx_offset_param]);

   vtx_offset = LLVMBuildMul(ctx->ac.builder, gs_vtx_offset, LLVMConstInt(ctx->ac.i32, 4, 0), "");

   soffset = LLVMConstInt(ctx->ac.i32, (param * 4 + swizzle) * 256, 0);

   value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->ac.i32_0, vtx_offset, soffset, 0,
                                ac_glc, true, false);
   if (ac_get_type_size(type) == 8) {
      LLVMValueRef value2;
      soffset = LLVMConstInt(ctx->ac.i32, (param * 4 + swizzle + 1) * 256, 0);

      value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->ac.i32_0, vtx_offset, soffset,
                                    0, ac_glc, true, false);
      return si_build_gather_64bit(ctx, type, value, value2);
   }
   return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
}

static LLVMValueRef si_nir_load_input_gs(struct ac_shader_abi *abi, unsigned location,
                                         unsigned driver_location, unsigned component,
                                         unsigned num_components, unsigned vertex_index,
                                         unsigned const_index, LLVMTypeRef type)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);

   LLVMValueRef value[4];
   for (unsigned i = 0; i < num_components; i++) {
      unsigned offset = i;
      if (ac_get_type_size(type) == 8)
         offset *= 2;

      offset += component;
      value[i + component] = si_llvm_load_input_gs(&ctx->abi, driver_location / 4 + const_index,
                                                   vertex_index, type, offset);
   }

   return ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
}

/* Pass GS inputs from ES to GS on GFX9. */
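/* On GFX9 the ES and GS stages run as one merged shader, so the ES part
 * hands its inputs to the GS part through the return value; each slot here
 * must line up with the register layout the GS part expects (the "8 +"
 * offsets below skip what appear to be 8 reserved system SGPRs).
 */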
static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
{
   LLVMValueRef ret = ctx->return_value;

   ret = si_insert_input_ptr(ctx, ret, ctx->other_const_and_shader_buffers, 0);
   ret = si_insert_input_ptr(ctx, ret, ctx->other_samplers_and_images, 1);
   if (ctx->shader->key.as_ngg)
      ret = si_insert_input_ptr(ctx, ret, ctx->gs_tg_info, 2);
   else
      ret = si_insert_input_ret(ctx, ret, ctx->gs2vs_offset, 2);
   ret = si_insert_input_ret(ctx, ret, ctx->merged_wave_info, 3);
   ret = si_insert_input_ret(ctx, ret, ctx->merged_scratch_offset, 5);

   ret = si_insert_input_ptr(ctx, ret, ctx->rw_buffers, 8 + SI_SGPR_RW_BUFFERS);
   ret = si_insert_input_ptr(ctx, ret, ctx->bindless_samplers_and_images,
                             8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
   if (ctx->screen->use_ngg) {
      ret = si_insert_input_ptr(ctx, ret, ctx->vs_state_bits, 8 + SI_SGPR_VS_STATE_BITS);
   }

   unsigned vgpr;
   if (ctx->type == PIPE_SHADER_VERTEX)
      vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR;
   else
      vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR;

   ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx01_offset, vgpr++);
   ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx23_offset, vgpr++);
   ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_prim_id, vgpr++);
   ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_invocation_id, vgpr++);
   ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx45_offset, vgpr++);
   ctx->return_value = ret;
}

void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   struct si_shader *es = ctx->shader;
   struct si_shader_info *info = &es->selector->info;
   LLVMValueRef lds_base = NULL;
   unsigned chan;
   int i;

   if (ctx->screen->info.chip_class >= GFX9 && info->num_outputs) {
      unsigned itemsize_dw = es->selector->esgs_itemsize / 4;
      LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac);
      LLVMValueRef wave_idx = si_unpack_param(ctx, ctx->merged_wave_info, 24, 4);
      vertex_idx =
         LLVMBuildOr(ctx->ac.builder, vertex_idx,
                     LLVMBuildMul(ctx->ac.builder, wave_idx,
                                  LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), ""),
                     "");
      lds_base =
         LLVMBuildMul(ctx->ac.builder, vertex_idx, LLVMConstInt(ctx->ac.i32, itemsize_dw, 0), "");
   }

   for (i = 0; i < info->num_outputs; i++) {
      int param;

      if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
          info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
         continue;

      param = si_shader_io_get_unique_index(info->output_semantic_name[i],
                                            info->output_semantic_index[i], false);

      for (chan = 0; chan < 4; chan++) {
         if (!(info->output_usagemask[i] & (1 << chan)))
            continue;

         LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
         out_val = ac_to_integer(&ctx->ac, out_val);

         /* GFX9 has the ESGS ring in LDS. */
         if (ctx->screen->info.chip_class >= GFX9) {
            LLVMValueRef idx = LLVMConstInt(ctx->ac.i32, param * 4 + chan, false);
            idx = LLVMBuildAdd(ctx->ac.builder, lds_base, idx, "");
            ac_build_indexed_store(&ctx->ac, ctx->esgs_ring, idx, out_val);
            continue;
         }

         ac_build_buffer_store_dword(&ctx->ac, ctx->esgs_ring, out_val, 1, NULL,
                                     ac_get_arg(&ctx->ac, ctx->es2gs_offset),
                                     (4 * param + chan) * 4, ac_glc | ac_slc | ac_swizzled);
      }
   }

   if (ctx->screen->info.chip_class >= GFX9)
      si_set_es_return_value_for_gs(ctx);
}

static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx)
{
   if (ctx->screen->info.chip_class >= GFX9)
      return si_unpack_param(ctx, ctx->merged_wave_info, 16, 8);
   else
      return ac_get_arg(&ctx->ac, ctx->gs_wave_id);
}

static void emit_gs_epilogue(struct si_shader_context *ctx)
{
   if (ctx->shader->key.as_ngg) {
      gfx10_ngg_gs_emit_epilogue(ctx);
      return;
   }

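   /* On GFX10, a release fence is presumably required here so that the ring
    * stores performed by this shader are visible before GS_DONE is signaled.
    */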
   if (ctx->screen->info.chip_class >= GFX10)
      LLVMBuildFence(ctx->ac.builder, LLVMAtomicOrderingRelease, false, "");

   ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE, si_get_gs_wave_id(ctx));

   if (ctx->screen->info.chip_class >= GFX9)
      ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
}

static void si_llvm_emit_gs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs,
                                     LLVMValueRef *addrs)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   struct si_shader_info UNUSED *info = &ctx->shader->selector->info;

   assert(info->num_outputs <= max_outputs);

   emit_gs_epilogue(ctx);
}

/* Emit one vertex from the geometry shader */
static void si_llvm_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addrs)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);

   if (ctx->shader->key.as_ngg) {
      gfx10_ngg_gs_emit_vertex(ctx, stream, addrs);
      return;
   }

   struct si_shader_info *info = &ctx->shader->selector->info;
   struct si_shader *shader = ctx->shader;
   LLVMValueRef soffset = ac_get_arg(&ctx->ac, ctx->gs2vs_offset);
   LLVMValueRef gs_next_vertex;
   LLVMValueRef can_emit;
   unsigned chan, offset;
   int i;

   /* Write vertex attribute values to GSVS ring */
   gs_next_vertex = LLVMBuildLoad(ctx->ac.builder, ctx->gs_next_vertex[stream], "");

   /* If this thread has already emitted the declared maximum number of
    * vertices, skip the write: excessive vertex emissions are not
    * supposed to have any effect.
    *
    * If the shader has no writes to memory, kill it instead. This skips
    * further memory loads and may allow LLVM to skip to the end
    * altogether.
    */
   can_emit =
      LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex,
                    LLVMConstInt(ctx->ac.i32, shader->selector->gs_max_out_vertices, 0), "");

   bool use_kill = !info->writes_memory;
   if (use_kill) {
      ac_build_kill_if_false(&ctx->ac, can_emit);
   } else {
      ac_build_ifcc(&ctx->ac, can_emit, 6505);
   }

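   /* GSVS ring addressing: each (output, component) slot occupies
    * gs_max_out_vertices dwords, so the dword index of a store is
    * slot * gs_max_out_vertices + gs_next_vertex, scaled by 4 into bytes.
    */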
   offset = 0;
   for (i = 0; i < info->num_outputs; i++) {
      for (chan = 0; chan < 4; chan++) {
         if (!(info->output_usagemask[i] & (1 << chan)) ||
             ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
            continue;

         LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
         LLVMValueRef voffset =
            LLVMConstInt(ctx->ac.i32, offset * shader->selector->gs_max_out_vertices, 0);
         offset++;

         voffset = LLVMBuildAdd(ctx->ac.builder, voffset, gs_next_vertex, "");
         voffset = LLVMBuildMul(ctx->ac.builder, voffset, LLVMConstInt(ctx->ac.i32, 4, 0), "");

         out_val = ac_to_integer(&ctx->ac, out_val);

         ac_build_buffer_store_dword(&ctx->ac, ctx->gsvs_ring[stream], out_val, 1, voffset, soffset,
                                     0, ac_glc | ac_slc | ac_swizzled);
      }
   }

   gs_next_vertex = LLVMBuildAdd(ctx->ac.builder, gs_next_vertex, ctx->ac.i32_1, "");
   LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex[stream]);

   /* Signal vertex emission if vertex data was written. */
   if (offset) {
      ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
                       si_get_gs_wave_id(ctx));
   }

   if (!use_kill)
      ac_build_endif(&ctx->ac, 6505);
}

/* Cut one primitive from the geometry shader */
static void si_llvm_emit_primitive(struct ac_shader_abi *abi, unsigned stream)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);

   if (ctx->shader->key.as_ngg) {
      LLVMBuildStore(ctx->ac.builder, ctx->ac.i32_0, ctx->gs_curprim_verts[stream]);
      return;
   }

   /* Signal primitive cut */
   ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
                    si_get_gs_wave_id(ctx));
}

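/* On GFX6-8 the ESGS ring is a buffer in memory whose descriptor is loaded
 * from rw_buffers; on GFX9+ the ring lives in LDS instead.
 */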
void si_preload_esgs_ring(struct si_shader_context *ctx)
{
   if (ctx->screen->info.chip_class <= GFX8) {
      unsigned ring = ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS : SI_ES_RING_ESGS;
      LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, ring, 0);
      LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);

      ctx->esgs_ring = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
   } else {
      if (USE_LDS_SYMBOLS && LLVM_VERSION_MAJOR >= 9) {
         /* Declare the ESGS ring as an explicit LDS symbol. */
         si_llvm_declare_esgs_ring(ctx);
      } else {
         ac_declare_lds_as_pointer(&ctx->ac);
         ctx->esgs_ring = ctx->ac.lds;
      }
   }
}

void si_preload_gs_rings(struct si_shader_context *ctx)
{
   const struct si_shader_selector *sel = ctx->shader->selector;
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, SI_RING_GSVS, 0);
   LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
   LLVMValueRef base_ring = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);

   /* The conceptual layout of the GSVS ring is
    *   v0c0 .. vLc0 v0c1 .. vLc1 ..
    * but the real memory layout is swizzled across
    * threads:
    *   t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
    *   t16v0c0 ..
    * Override the buffer descriptor accordingly.
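    *
    * Each stream's slice is stride = 4 * num_components * gs_max_out_vertices
    * bytes per thread, and consecutive streams are stride * wave_size bytes
    * apart in the ring (see stream_offset below).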
    */
   LLVMTypeRef v2i64 = LLVMVectorType(ctx->ac.i64, 2);
   uint64_t stream_offset = 0;

   for (unsigned stream = 0; stream < 4; ++stream) {
      unsigned num_components;
      unsigned stride;
      unsigned num_records;
      LLVMValueRef ring, tmp;

      num_components = sel->info.num_stream_output_components[stream];
      if (!num_components)
         continue;

      stride = 4 * num_components * sel->gs_max_out_vertices;

      /* Limit on the stride field for <= GFX7. */
      assert(stride < (1 << 14));

      num_records = ctx->ac.wave_size;

      ring = LLVMBuildBitCast(builder, base_ring, v2i64, "");
      tmp = LLVMBuildExtractElement(builder, ring, ctx->ac.i32_0, "");
      tmp = LLVMBuildAdd(builder, tmp, LLVMConstInt(ctx->ac.i64, stream_offset, 0), "");
      stream_offset += stride * ctx->ac.wave_size;

      ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->ac.i32_0, "");
      ring = LLVMBuildBitCast(builder, ring, ctx->ac.v4i32, "");
      tmp = LLVMBuildExtractElement(builder, ring, ctx->ac.i32_1, "");
      tmp = LLVMBuildOr(
         builder, tmp,
         LLVMConstInt(ctx->ac.i32, S_008F04_STRIDE(stride) | S_008F04_SWIZZLE_ENABLE(1), 0), "");
      ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->ac.i32_1, "");
      ring = LLVMBuildInsertElement(builder, ring, LLVMConstInt(ctx->ac.i32, num_records, 0),
                                    LLVMConstInt(ctx->ac.i32, 2, 0), "");

      uint32_t rsrc3 =
         S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
         S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
         S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
         S_008F0C_ADD_TID_ENABLE(1);

      if (ctx->ac.chip_class >= GFX10) {
         rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
                  S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) | S_008F0C_RESOURCE_LEVEL(1);
      } else {
         rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
                  S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
                  S_008F0C_ELEMENT_SIZE(1); /* element_size = 4 (bytes) */
      }

      ring = LLVMBuildInsertElement(builder, ring, LLVMConstInt(ctx->ac.i32, rsrc3, false),
                                    LLVMConstInt(ctx->ac.i32, 3, 0), "");

      ctx->gsvs_ring[stream] = ring;
   }
}

/* Generate code for the hardware VS shader stage to go with a geometry shader */
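/* With the legacy (non-NGG) pipeline, the GS writes its outputs to the GSVS
 * ring in memory; this copy shader then runs as the hardware VS stage, reads
 * the ring back, performs streamout, and exports the stream-0 outputs.
 */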
struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen,
                                             struct ac_llvm_compiler *compiler,
                                             struct si_shader_selector *gs_selector,
                                             struct pipe_debug_callback *debug)
{
   struct si_shader_context ctx;
   struct si_shader *shader;
   LLVMBuilderRef builder;
   struct si_shader_output_values outputs[SI_MAX_VS_OUTPUTS];
   struct si_shader_info *gsinfo = &gs_selector->info;
   int i;

   shader = CALLOC_STRUCT(si_shader);
   if (!shader)
      return NULL;

   /* We can leave the fence as permanently signaled because the GS copy
    * shader only becomes visible globally after it has been compiled. */
   util_queue_fence_init(&shader->ready);

   shader->selector = gs_selector;
   shader->is_gs_copy_shader = true;

   si_llvm_context_init(&ctx, sscreen, compiler,
                        si_get_wave_size(sscreen, PIPE_SHADER_VERTEX, false, false, false));
   ctx.shader = shader;
   ctx.type = PIPE_SHADER_VERTEX;

   builder = ctx.ac.builder;

   si_create_function(&ctx, false);

   LLVMValueRef buf_ptr = ac_get_arg(&ctx.ac, ctx.rw_buffers);
   ctx.gsvs_ring[0] =
      ac_build_load_to_sgpr(&ctx.ac, buf_ptr, LLVMConstInt(ctx.ac.i32, SI_RING_GSVS, 0));

   LLVMValueRef voffset =
      LLVMBuildMul(ctx.ac.builder, ctx.abi.vertex_id, LLVMConstInt(ctx.ac.i32, 4, 0), "");

   /* Fetch the vertex stream ID. */
   LLVMValueRef stream_id;

   if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs)
      stream_id = si_unpack_param(&ctx, ctx.streamout_config, 24, 2);
   else
      stream_id = ctx.ac.i32_0;

   /* Fill in output information. */
   for (i = 0; i < gsinfo->num_outputs; ++i) {
      outputs[i].semantic_name = gsinfo->output_semantic_name[i];
      outputs[i].semantic_index = gsinfo->output_semantic_index[i];

      for (int chan = 0; chan < 4; chan++) {
         outputs[i].vertex_stream[chan] = (gsinfo->output_streams[i] >> (2 * chan)) & 3;
      }
   }

   LLVMBasicBlockRef end_bb;
   LLVMValueRef switch_inst;

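   /* Emit one switch case per vertex stream. Only stream 0 is exported to
    * the next stage; the other streams are fetched solely for streamout.
    */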
   end_bb = LLVMAppendBasicBlockInContext(ctx.ac.context, ctx.main_fn, "end");
   switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4);

   for (int stream = 0; stream < 4; stream++) {
      LLVMBasicBlockRef bb;
      unsigned offset;

      if (!gsinfo->num_stream_output_components[stream])
         continue;

      if (stream > 0 && !gs_selector->so.num_outputs)
         continue;

      bb = LLVMInsertBasicBlockInContext(ctx.ac.context, end_bb, "out");
      LLVMAddCase(switch_inst, LLVMConstInt(ctx.ac.i32, stream, 0), bb);
      LLVMPositionBuilderAtEnd(builder, bb);

      /* Fetch vertex data from GSVS ring */
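      /* Slots are gs_max_out_vertices * 16 * 4 bytes apart, which appears to
       * match the descriptor built in si_preload_gs_rings (16-element index
       * stride, 4-byte elements); voffset = vertex_id * 4 picks this
       * thread's vertex within the slot.
       */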
      offset = 0;
      for (i = 0; i < gsinfo->num_outputs; ++i) {
         for (unsigned chan = 0; chan < 4; chan++) {
            if (!(gsinfo->output_usagemask[i] & (1 << chan)) ||
                outputs[i].vertex_stream[chan] != stream) {
               outputs[i].values[chan] = LLVMGetUndef(ctx.ac.f32);
               continue;
            }

            LLVMValueRef soffset =
               LLVMConstInt(ctx.ac.i32, offset * gs_selector->gs_max_out_vertices * 16 * 4, 0);
            offset++;

            outputs[i].values[chan] =
               ac_build_buffer_load(&ctx.ac, ctx.gsvs_ring[0], 1, ctx.ac.i32_0, voffset, soffset, 0,
                                    ac_glc | ac_slc, true, false);
         }
      }

      /* Streamout and exports. */
      if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs) {
         si_llvm_emit_streamout(&ctx, outputs, gsinfo->num_outputs, stream);
      }

      if (stream == 0)
         si_llvm_build_vs_exports(&ctx, outputs, gsinfo->num_outputs);

      LLVMBuildBr(builder, end_bb);
   }

   LLVMPositionBuilderAtEnd(builder, end_bb);

   LLVMBuildRetVoid(ctx.ac.builder);

   ctx.type = PIPE_SHADER_GEOMETRY; /* override for shader dumping */
   si_llvm_optimize_module(&ctx);

   bool ok = false;
   if (si_compile_llvm(sscreen, &ctx.shader->binary, &ctx.shader->config, ctx.compiler, &ctx.ac,
                       debug, PIPE_SHADER_GEOMETRY, "GS Copy Shader", false)) {
      if (si_can_dump_shader(sscreen, PIPE_SHADER_GEOMETRY))
         fprintf(stderr, "GS Copy Shader:\n");
      si_shader_dump(sscreen, ctx.shader, debug, stderr, true);

      if (!ctx.shader->config.scratch_bytes_per_wave)
         ok = si_shader_binary_upload(sscreen, ctx.shader, 0);
      else
         ok = true;
   }

   si_llvm_dispose(&ctx);

   if (!ok) {
      FREE(shader);
      shader = NULL;
   } else {
      si_fix_resource_usage(sscreen, shader);
   }
   return shader;
}

/**
 * Build the GS prolog function. Rotate the input vertices for triangle strips
 * with adjacency.
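 *
 * When tri_strip_adj_fix is set, every other primitive (selected by the low
 * bit of the primitive ID) has its six input vertices remapped as
 * out[i] = in[(i + 4) % 6], i.e. (v0..v5) becomes (v4, v5, v0, v1, v2, v3).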
 */
void si_llvm_build_gs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key)
{
   unsigned num_sgprs, num_vgprs;
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMTypeRef returns[AC_MAX_ARGS];
   LLVMValueRef func, ret;

   memset(&ctx->args, 0, sizeof(ctx->args));

   if (ctx->screen->info.chip_class >= GFX9) {
      if (key->gs_prolog.states.gfx9_prev_is_vs)
         num_sgprs = 8 + GFX9_VSGS_NUM_USER_SGPR;
      else
         num_sgprs = 8 + GFX9_TESGS_NUM_USER_SGPR;
      num_vgprs = 5; /* ES inputs are not needed by GS */
   } else {
      num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
      num_vgprs = 8;
   }

   for (unsigned i = 0; i < num_sgprs; ++i) {
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
      returns[i] = ctx->ac.i32;
   }

   for (unsigned i = 0; i < num_vgprs; ++i) {
      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL);
      returns[num_sgprs + i] = ctx->ac.f32;
   }

   /* Create the function. */
   si_llvm_create_func(ctx, "gs_prolog", returns, num_sgprs + num_vgprs, 0);
   func = ctx->main_fn;

   /* Set the full EXEC mask for the prolog, because we are only fiddling
    * with registers here. The main shader part will set the correct EXEC
    * mask.
    */
   if (ctx->screen->info.chip_class >= GFX9 && !key->gs_prolog.is_monolithic)
      ac_init_exec_full_mask(&ctx->ac);

   /* Copy inputs to outputs. This should be a no-op, as the registers match,
    * but it will prevent the compiler from overwriting them unintentionally.
    */
   ret = ctx->return_value;
   for (unsigned i = 0; i < num_sgprs; i++) {
      LLVMValueRef p = LLVMGetParam(func, i);
      ret = LLVMBuildInsertValue(builder, ret, p, i, "");
   }
   for (unsigned i = 0; i < num_vgprs; i++) {
      LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
      p = ac_to_float(&ctx->ac, p);
      ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, "");
   }

   if (key->gs_prolog.states.tri_strip_adj_fix) {
      /* Remap the input vertices for every other primitive. */
      const struct ac_arg gfx6_vtx_params[6] = {
         {.used = true, .arg_index = num_sgprs}, {.used = true, .arg_index = num_sgprs + 1},
         {.used = true, .arg_index = num_sgprs + 3}, {.used = true, .arg_index = num_sgprs + 4},
         {.used = true, .arg_index = num_sgprs + 5}, {.used = true, .arg_index = num_sgprs + 6},
      };
      const struct ac_arg gfx9_vtx_params[3] = {
         {.used = true, .arg_index = num_sgprs},
         {.used = true, .arg_index = num_sgprs + 1},
         {.used = true, .arg_index = num_sgprs + 4},
      };
      LLVMValueRef vtx_in[6], vtx_out[6];
      LLVMValueRef prim_id, rotate;

      if (ctx->screen->info.chip_class >= GFX9) {
         for (unsigned i = 0; i < 3; i++) {
            vtx_in[i * 2] = si_unpack_param(ctx, gfx9_vtx_params[i], 0, 16);
            vtx_in[i * 2 + 1] = si_unpack_param(ctx, gfx9_vtx_params[i], 16, 16);
         }
      } else {
         for (unsigned i = 0; i < 6; i++)
            vtx_in[i] = ac_get_arg(&ctx->ac, gfx6_vtx_params[i]);
      }

      prim_id = LLVMGetParam(func, num_sgprs + 2);
      rotate = LLVMBuildTrunc(builder, prim_id, ctx->ac.i1, "");

      for (unsigned i = 0; i < 6; ++i) {
         LLVMValueRef base, rotated;
         base = vtx_in[i];
         rotated = vtx_in[(i + 4) % 6];
         vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, "");
      }

      if (ctx->screen->info.chip_class >= GFX9) {
         for (unsigned i = 0; i < 3; i++) {
            LLVMValueRef hi, out;

            hi = LLVMBuildShl(builder, vtx_out[i * 2 + 1], LLVMConstInt(ctx->ac.i32, 16, 0), "");
            out = LLVMBuildOr(builder, vtx_out[i * 2], hi, "");
            out = ac_to_float(&ctx->ac, out);
            ret = LLVMBuildInsertValue(builder, ret, out, gfx9_vtx_params[i].arg_index, "");
         }
      } else {
         for (unsigned i = 0; i < 6; i++) {
            LLVMValueRef out;

            out = ac_to_float(&ctx->ac, vtx_out[i]);
            ret = LLVMBuildInsertValue(builder, ret, out, gfx6_vtx_params[i].arg_index, "");
         }
      }
   }

   LLVMBuildRet(builder, ret);
}

void si_llvm_init_gs_callbacks(struct si_shader_context *ctx)
{
   ctx->abi.load_inputs = si_nir_load_input_gs;
   ctx->abi.emit_vertex = si_llvm_emit_vertex;
   ctx->abi.emit_primitive = si_llvm_emit_primitive;
   ctx->abi.emit_outputs = si_llvm_emit_gs_epilogue;
}