src/gallium/drivers/radeonsi/gfx10_shader_ngg.c

   1 /*
   2  * Copyright 2017 Advanced Micro Devices, Inc.
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * on the rights to use, copy, modify, merge, publish, distribute, sub
   8  * license, and/or sell copies of the Software, and to permit persons to whom
   9  * the Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  18  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
  19  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  20  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  21  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  22  */
  23
  24 #include "ac_llvm_cull.h"
  25 #include "si_pipe.h"
  26 #include "si_shader_internal.h"
  27 #include "sid.h"
  28 #include "util/u_memory.h"
  29 #include "util/u_prim.h"
  30
  31 static LLVMValueRef get_wave_id_in_tg(struct si_shader_context *ctx)
  32 {
  33    return si_unpack_param(ctx, ctx->merged_wave_info, 24, 4);
  34 }
  35
  36 static LLVMValueRef get_tgsize(struct si_shader_context *ctx)
  37 {
  38    return si_unpack_param(ctx, ctx->merged_wave_info, 28, 4);
  39 }
  40
  41 static LLVMValueRef get_thread_id_in_tg(struct si_shader_context *ctx)
  42 {
  43    LLVMBuilderRef builder = ctx->ac.builder;
  44    LLVMValueRef tmp;
  45    tmp = LLVMBuildMul(builder, get_wave_id_in_tg(ctx),
  46                       LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), "");
  47    return LLVMBuildAdd(builder, tmp, ac_get_thread_id(&ctx->ac), "");
  48 }
  49
  50 static LLVMValueRef ngg_get_vtx_cnt(struct si_shader_context *ctx)
  51 {
  52    return si_unpack_param(ctx, ctx->gs_tg_info, 12, 9);
  53 }
  54
  55 static LLVMValueRef ngg_get_prim_cnt(struct si_shader_context *ctx)
  56 {
  57    return si_unpack_param(ctx, ctx->gs_tg_info, 22, 9);
  58 }
  59
  60 static LLVMValueRef ngg_get_ordered_id(struct si_shader_context *ctx)
  61 {
  62    return si_unpack_param(ctx, ctx->gs_tg_info, 0, 12);
  63 }
  64
  65 static LLVMValueRef ngg_get_query_buf(struct si_shader_context *ctx)
  66 {
  67    LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
  68
  69    return ac_build_load_to_sgpr(&ctx->ac, buf_ptr,
  70                                 LLVMConstInt(ctx->ac.i32, GFX10_GS_QUERY_BUF, false));
  71 }
  72
  73 static LLVMValueRef ngg_get_initial_edgeflag(struct si_shader_context *ctx, unsigned index)
  74 {
  75    if (ctx->type == PIPE_SHADER_VERTEX) {
  76       LLVMValueRef tmp;
  77       tmp = LLVMBuildLShr(ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args.gs_invocation_id),
  78                           LLVMConstInt(ctx->ac.i32, 8 + index, false), "");
  79       return LLVMBuildTrunc(ctx->ac.builder, tmp, ctx->ac.i1, "");
  80    }
  81    return ctx->ac.i1false;
  82 }
  83
  84 /**
  85  * Return the number of vertices as a constant in \p num_vertices,
  86  * and return a more precise value as LLVMValueRef from the function.
  87  */
  88 static LLVMValueRef ngg_get_vertices_per_prim(struct si_shader_context *ctx, unsigned *num_vertices)
  89 {
  90    const struct si_shader_info *info = &ctx->shader->selector->info;
  91
  92    if (ctx->type == PIPE_SHADER_VERTEX) {
  93       if (info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]) {
  94          /* Blits always use axis-aligned rectangles with 3 vertices. */
  95          *num_vertices = 3;
  96          return LLVMConstInt(ctx->ac.i32, 3, 0);
  97       } else {
  98          /* We always build up all three indices for the prim export
  99           * independent of the primitive type. The additional garbage
 100           * data shouldn't hurt. This number doesn't matter with
 101           * NGG passthrough.
 102           */
 103          *num_vertices = 3;
 104
 105          /* Extract OUTPRIM field. */
 106          LLVMValueRef num = si_unpack_param(ctx, ctx->vs_state_bits, 2, 2);
 107          return LLVMBuildAdd(ctx->ac.builder, num, ctx->ac.i32_1, "");
 108       }
 109    } else {
 110       assert(ctx->type == PIPE_SHADER_TESS_EVAL);
 111
 112       if (info->properties[TGSI_PROPERTY_TES_POINT_MODE])
 113          *num_vertices = 1;
 114       else if (info->properties[TGSI_PROPERTY_TES_PRIM_MODE] == PIPE_PRIM_LINES)
 115          *num_vertices = 2;
 116       else
 117          *num_vertices = 3;
 118
 119       return LLVMConstInt(ctx->ac.i32, *num_vertices, false);
 120    }
 121 }
 122
 123 bool gfx10_ngg_export_prim_early(struct si_shader *shader)
 124 {
 125    struct si_shader_selector *sel = shader->selector;
 126
 127    assert(shader->key.as_ngg && !shader->key.as_es);
 128
 129    return sel->type != PIPE_SHADER_GEOMETRY && !sel->info.writes_edgeflag;
 130 }
 131
 132 void gfx10_ngg_build_sendmsg_gs_alloc_req(struct si_shader_context *ctx)
 133 {
 134    ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), ngg_get_vtx_cnt(ctx),
 135                                  ngg_get_prim_cnt(ctx));
 136 }
 137
 138 void gfx10_ngg_build_export_prim(struct si_shader_context *ctx, LLVMValueRef user_edgeflags[3],
 139                                  LLVMValueRef prim_passthrough)
 140 {
 141    LLVMBuilderRef builder = ctx->ac.builder;
 142
 143    if (gfx10_is_ngg_passthrough(ctx->shader) || ctx->shader->key.opt.ngg_culling) {
 144       ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 6001);
 145       {
 146          struct ac_ngg_prim prim = {};
 147
 148          if (prim_passthrough)
 149             prim.passthrough = prim_passthrough;
 150          else
 151             prim.passthrough = ac_get_arg(&ctx->ac, ctx->gs_vtx01_offset);
 152
 153          /* This is only used with NGG culling, which returns the NGG
 154           * passthrough prim export encoding.
 155           */
 156          if (ctx->shader->selector->info.writes_edgeflag) {
 157             unsigned all_bits_no_edgeflags = ~SI_NGG_PRIM_EDGE_FLAG_BITS;
 158             LLVMValueRef edgeflags = LLVMConstInt(ctx->ac.i32, all_bits_no_edgeflags, 0);
 159
 160             unsigned num_vertices;
 161             ngg_get_vertices_per_prim(ctx, &num_vertices);
 162
 163             for (unsigned i = 0; i < num_vertices; i++) {
 164                unsigned shift = 9 + i * 10;
 165                LLVMValueRef edge;
 166
 167                edge = LLVMBuildLoad(builder, user_edgeflags[i], "");
 168                edge = LLVMBuildZExt(builder, edge, ctx->ac.i32, "");
 169                edge = LLVMBuildShl(builder, edge, LLVMConstInt(ctx->ac.i32, shift, 0), "");
 170                edgeflags = LLVMBuildOr(builder, edgeflags, edge, "");
 171             }
 172             prim.passthrough = LLVMBuildAnd(builder, prim.passthrough, edgeflags, "");
 173          }
 174
 175          ac_build_export_prim(&ctx->ac, &prim);
 176       }
 177       ac_build_endif(&ctx->ac, 6001);
 178       return;
 179    }
 180
 181    ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 6001);
 182    {
 183       struct ac_ngg_prim prim = {};
 184
 185       ngg_get_vertices_per_prim(ctx, &prim.num_vertices);
 186
 187       prim.isnull = ctx->ac.i1false;
 188       prim.index[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16);
 189       prim.index[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16);
 190       prim.index[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16);
 191
 192       for (unsigned i = 0; i < prim.num_vertices; ++i) {
 193          prim.edgeflag[i] = ngg_get_initial_edgeflag(ctx, i);
 194
 195          if (ctx->shader->selector->info.writes_edgeflag) {
 196             LLVMValueRef edge;
 197
 198             edge = LLVMBuildLoad(ctx->ac.builder, user_edgeflags[i], "");
 199             edge = LLVMBuildAnd(ctx->ac.builder, prim.edgeflag[i], edge, "");
 200             prim.edgeflag[i] = edge;
 201          }
 202       }
 203
 204       ac_build_export_prim(&ctx->ac, &prim);
 205    }
 206    ac_build_endif(&ctx->ac, 6001);
 207 }
 208
 209 static void build_streamout_vertex(struct si_shader_context *ctx, LLVMValueRef *so_buffer,
 210                                    LLVMValueRef *wg_offset_dw, unsigned stream,
 211                                    LLVMValueRef offset_vtx, LLVMValueRef vertexptr)
 212 {
 213    struct si_shader_info *info = &ctx->shader->selector->info;
 214    struct pipe_stream_output_info *so = &ctx->shader->selector->so;
 215    LLVMBuilderRef builder = ctx->ac.builder;
 216    LLVMValueRef offset[4] = {};
 217    LLVMValueRef tmp;
 218
 219    for (unsigned buffer = 0; buffer < 4; ++buffer) {
 220       if (!wg_offset_dw[buffer])
 221          continue;
 222
 223       tmp = LLVMBuildMul(builder, offset_vtx, LLVMConstInt(ctx->ac.i32, so->stride[buffer], false),
 224                          "");
 225       tmp = LLVMBuildAdd(builder, wg_offset_dw[buffer], tmp, "");
 226       offset[buffer] = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->ac.i32, 2, false), "");
 227    }
 228
 229    for (unsigned i = 0; i < so->num_outputs; ++i) {
 230       if (so->output[i].stream != stream)
 231          continue;
 232
 233       unsigned reg = so->output[i].register_index;
 234       struct si_shader_output_values out;
 235       out.semantic_name = info->output_semantic_name[reg];
 236       out.semantic_index = info->output_semantic_index[reg];
 237
 238       for (unsigned comp = 0; comp < 4; comp++) {
 239          tmp = ac_build_gep0(&ctx->ac, vertexptr, LLVMConstInt(ctx->ac.i32, 4 * reg + comp, false));
 240          out.values[comp] = LLVMBuildLoad(builder, tmp, "");
 241          out.vertex_stream[comp] = (info->output_streams[reg] >> (2 * comp)) & 3;
 242       }
 243
 244       si_llvm_streamout_store_output(ctx, so_buffer, offset, &so->output[i], &out);
 245    }
 246 }
 247
/* Inputs and outputs of build_streamout(). */
struct ngg_streamout {
   /* Number of vertices per primitive as an LLVM value. build_streamout()
    * multiplies it with the buffer strides and uses it as the bound on how
    * many vertices are written per primitive.
    */
   LLVMValueRef num_vertices;

   /* per-thread data */
   LLVMValueRef prim_enable[4]; /* i1 per stream */
   /* vertices[1] and vertices[2] may be NULL; build_streamout() derives the
    * maximum vertex count from which entries are set.
    */
   LLVMValueRef vertices[3];    /* [N x i32] addrspace(LDS)* */

   /* Output */
   LLVMValueRef emit[4]; /* per-stream emitted primitives (only valid for used streams) */
};
 258
/**
 * Build streamout logic.
 *
 * Implies a barrier.
 *
 * gs_ngg_scratch usage, following the scratch_*_base values computed below:
 *  - GS:     lanes 0..3 of the first wave read the per-stream generated
 *            primitive counts from gs_ngg_scratch[0:4]. The number of
 *            emitted primitives is written to gs_ngg_scratch[4:8] and the
 *            per-buffer offsets to gs_ngg_scratch[8:12]. The per-stream
 *            workgroup scans use scratch starting at dword 12 + 8 * stream,
 *            i.e. gs_ngg_scratch[8:] is clobbered.
 *  - non-GS: the number of emitted primitives is written to
 *            gs_ngg_scratch[0:4] and the per-buffer offsets to
 *            gs_ngg_scratch[4:8].
 *
 * \param nggso  In: num_vertices, prim_enable[] and vertices[] (vertices[1]
 *               and vertices[2] may be NULL). Out: emit[] is filled in for
 *               every stream that has streamout components.
 */
static void build_streamout(struct si_shader_context *ctx, struct ngg_streamout *nggso)
{
   struct si_shader_info *info = &ctx->shader->selector->info;
   struct pipe_stream_output_info *so = &ctx->shader->selector->so;
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
   LLVMValueRef tid = get_thread_id_in_tg(ctx);
   LLVMValueRef tmp, tmp2;
   LLVMValueRef i32_2 = LLVMConstInt(ctx->ac.i32, 2, false);
   LLVMValueRef i32_4 = LLVMConstInt(ctx->ac.i32, 4, false);
   LLVMValueRef i32_8 = LLVMConstInt(ctx->ac.i32, 8, false);
   LLVMValueRef so_buffer[4] = {};
   /* Compile-time upper bound on vertices per primitive: one plus each
    * optional vertex pointer that was provided. */
   unsigned max_num_vertices = 1 + (nggso->vertices[1] ? 1 : 0) + (nggso->vertices[2] ? 1 : 0);
   LLVMValueRef prim_stride_dw[4] = {};
   /* Same values as prim_stride_dw[], but with buffer N's stride in lane N. */
   LLVMValueRef prim_stride_dw_vgpr = LLVMGetUndef(ctx->ac.i32);
   int stream_for_buffer[4] = {-1, -1, -1, -1};
   unsigned bufmask_for_stream[4] = {};
   bool isgs = ctx->type == PIPE_SHADER_GEOMETRY;
   /* Scratch dword indices of the emit counts and buffer offsets; both the
    * C constant and the matching LLVM constant are kept. */
   unsigned scratch_emit_base = isgs ? 4 : 0;
   LLVMValueRef scratch_emit_basev = isgs ? i32_4 : ctx->ac.i32_0;
   unsigned scratch_offset_base = isgs ? 8 : 4;
   LLVMValueRef scratch_offset_basev = isgs ? i32_8 : i32_4;

   ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size", 256);

   /* Determine the mapping of streamout buffers to vertex streams. */
   for (unsigned i = 0; i < so->num_outputs; ++i) {
      unsigned buf = so->output[i].output_buffer;
      unsigned stream = so->output[i].stream;
      /* A buffer must not receive outputs from two different streams. */
      assert(stream_for_buffer[buf] < 0 || stream_for_buffer[buf] == stream);
      stream_for_buffer[buf] = stream;
      bufmask_for_stream[stream] |= 1 << buf;
   }

   /* For every used buffer: compute the per-primitive stride
    * (stride * num_vertices) and load the buffer from the rw_buffers list. */
   for (unsigned buffer = 0; buffer < 4; ++buffer) {
      if (stream_for_buffer[buffer] == -1)
         continue;

      assert(so->stride[buffer]);

      tmp = LLVMConstInt(ctx->ac.i32, so->stride[buffer], false);
      prim_stride_dw[buffer] = LLVMBuildMul(builder, tmp, nggso->num_vertices, "");
      prim_stride_dw_vgpr =
         ac_build_writelane(&ctx->ac, prim_stride_dw_vgpr, prim_stride_dw[buffer],
                            LLVMConstInt(ctx->ac.i32, buffer, false));

      so_buffer[buffer] = ac_build_load_to_sgpr(
         &ctx->ac, buf_ptr, LLVMConstInt(ctx->ac.i32, SI_VS_STREAMOUT_BUF0 + buffer, false));
   }

   /* Only the wave with ID 0 in the threadgroup executes the block below.
    * Inside it, tid equals ac_get_thread_id() because the wave ID is 0. */
   tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, "");
   ac_build_ifcc(&ctx->ac, tmp, 5200);
   {
      LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS);
      LLVMValueRef gdsbase = LLVMBuildIntToPtr(builder, ctx->ac.i32_0, gdsptr, "");

      /* Advance the streamout offsets in GDS. */
      LLVMValueRef offsets_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
      LLVMValueRef generated_by_stream_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");

      /* Lanes 0..3 only: one lane per stream / buffer. */
      tmp = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), i32_4, "");
      ac_build_ifcc(&ctx->ac, tmp, 5210);
      {
         /* Number of generated primitives: for GS it is read from
          * gs_ngg_scratch[tid]; otherwise the primitive count from
          * gs_tg_info is written into lane 0 and the other lanes are 0. */
         if (isgs) {
            tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid);
            tmp = LLVMBuildLoad(builder, tmp, "");
         } else {
            tmp = ac_build_writelane(&ctx->ac, ctx->ac.i32_0, ngg_get_prim_cnt(ctx), ctx->ac.i32_0);
         }
         LLVMBuildStore(builder, tmp, generated_by_stream_vgpr);

         /* Build a quad swizzle that moves, into the lane of each buffer,
          * the generated count of the stream feeding that buffer. Unused
          * buffers are pointed at the first stream that has no streamout
          * components. */
         unsigned swizzle[4];
         int unused_stream = -1;
         for (unsigned stream = 0; stream < 4; ++stream) {
            if (!info->num_stream_output_components[stream]) {
               unused_stream = stream;
               break;
            }
         }
         for (unsigned buffer = 0; buffer < 4; ++buffer) {
            if (stream_for_buffer[buffer] >= 0) {
               swizzle[buffer] = stream_for_buffer[buffer];
            } else {
               assert(unused_stream >= 0);
               swizzle[buffer] = unused_stream;
            }
         }

         tmp = ac_build_quad_swizzle(&ctx->ac, tmp, swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
         /* Per-buffer amount to add: generated primitives * primitive stride. */
         tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, "");

         LLVMValueRef args[] = {
            LLVMBuildIntToPtr(builder, ngg_get_ordered_id(ctx), gdsptr, ""),
            tmp,
            ctx->ac.i32_0,                             // ordering
            ctx->ac.i32_0,                             // scope
            ctx->ac.i1false,                           // isVolatile
            LLVMConstInt(ctx->ac.i32, 4 << 24, false), // OA index
            ctx->ac.i1true,                            // wave release
            ctx->ac.i1true,                            // wave done
         };
         tmp = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ds.ordered.add", ctx->ac.i32, args,
                                  ARRAY_SIZE(args), 0);

         /* Keep offsets in a VGPR for quick retrieval via readlane by
          * the first wave for bounds checking, and also store in LDS
          * for retrieval by all waves later. */
         LLVMBuildStore(builder, tmp, offsets_vgpr);

         tmp2 = LLVMBuildAdd(builder, ac_get_thread_id(&ctx->ac), scratch_offset_basev, "");
         tmp2 = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tmp2);
         LLVMBuildStore(builder, tmp, tmp2);
      }
      ac_build_endif(&ctx->ac, 5210);

      /* Determine the max emit per buffer. This is done via the SALU, in part
       * because LLVM can't generate divide-by-multiply if we try to do this
       * via VALU with one lane per buffer.
       */
      LLVMValueRef max_emit[4] = {};
      for (unsigned buffer = 0; buffer < 4; ++buffer) {
         if (stream_for_buffer[buffer] == -1)
            continue;

         /* Element 2 of the buffer value shifted right by 2. */
         LLVMValueRef bufsize_dw = LLVMBuildLShr(
            builder, LLVMBuildExtractElement(builder, so_buffer[buffer], i32_2, ""), i32_2, "");

         tmp = LLVMBuildLoad(builder, offsets_vgpr, "");
         LLVMValueRef offset_dw =
            ac_build_readlane(&ctx->ac, tmp, LLVMConstInt(ctx->ac.i32, buffer, false));

         tmp = LLVMBuildSub(builder, bufsize_dw, offset_dw, "");
         tmp = LLVMBuildUDiv(builder, tmp, prim_stride_dw[buffer], "");

         /* No primitive fits if the offset is already past the buffer size
          * (the unsigned subtraction above wrapped around in that case). */
         tmp2 = LLVMBuildICmp(builder, LLVMIntULT, bufsize_dw, offset_dw, "");
         max_emit[buffer] = LLVMBuildSelect(builder, tmp2, ctx->ac.i32_0, tmp, "");
      }

      /* Determine the number of emitted primitives per stream and fixup the
       * GDS counter if necessary.
       *
       * This is complicated by the fact that a single stream can emit to
       * multiple buffers (but luckily not vice versa).
       */
      LLVMValueRef emit_vgpr = ctx->ac.i32_0;

      for (unsigned stream = 0; stream < 4; ++stream) {
         if (!info->num_stream_output_components[stream])
            continue;

         tmp = LLVMBuildLoad(builder, generated_by_stream_vgpr, "");
         LLVMValueRef generated =
            ac_build_readlane(&ctx->ac, tmp, LLVMConstInt(ctx->ac.i32, stream, false));

         /* emit = min(generated, max_emit of every buffer fed by this stream) */
         LLVMValueRef emit = generated;
         for (unsigned buffer = 0; buffer < 4; ++buffer) {
            if (stream_for_buffer[buffer] == stream)
               emit = ac_build_umin(&ctx->ac, emit, max_emit[buffer]);
         }

         emit_vgpr =
            ac_build_writelane(&ctx->ac, emit_vgpr, emit, LLVMConstInt(ctx->ac.i32, stream, false));

         /* Fixup the offset using a plain GDS atomic if we overflowed. */
         tmp = LLVMBuildICmp(builder, LLVMIntULT, emit, generated, "");
         ac_build_ifcc(&ctx->ac, tmp, 5221); /* scalar branch */
         /* Only lanes whose index is a buffer fed by this stream take part. */
         tmp = LLVMBuildLShr(builder, LLVMConstInt(ctx->ac.i32, bufmask_for_stream[stream], false),
                             ac_get_thread_id(&ctx->ac), "");
         tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
         ac_build_ifcc(&ctx->ac, tmp, 5222);
         {
            /* Subtract the part that was not emitted:
             * (generated - emit) * primitive stride of this lane's buffer. */
            tmp = LLVMBuildSub(builder, generated, emit, "");
            tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, "");
            tmp2 = LLVMBuildGEP(builder, gdsbase, &tid, 1, "");
            LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpSub, tmp2, tmp,
                               LLVMAtomicOrderingMonotonic, false);
         }
         ac_build_endif(&ctx->ac, 5222);
         ac_build_endif(&ctx->ac, 5221);
      }

      /* Lanes 0..3 store the per-stream emit counts to scratch. */
      tmp = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), i32_4, "");
      ac_build_ifcc(&ctx->ac, tmp, 5225);
      {
         tmp = LLVMBuildAdd(builder, ac_get_thread_id(&ctx->ac), scratch_emit_basev, "");
         tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tmp);
         LLVMBuildStore(builder, emit_vgpr, tmp);
      }
      ac_build_endif(&ctx->ac, 5225);
   }
   ac_build_endif(&ctx->ac, 5200);

   /* Determine the workgroup-relative per-thread / primitive offset into
    * the streamout buffers */
   struct ac_wg_scan primemit_scan[4] = {};

   /* GS only: start an exclusive add-scan of prim_enable over the workgroup
    * for every stream that has streamout components. */
   if (isgs) {
      for (unsigned stream = 0; stream < 4; ++stream) {
         if (!info->num_stream_output_components[stream])
            continue;

         primemit_scan[stream].enable_exclusive = true;
         primemit_scan[stream].op = nir_op_iadd;
         primemit_scan[stream].src = nggso->prim_enable[stream];
         primemit_scan[stream].scratch = ac_build_gep0(
            &ctx->ac, ctx->gs_ngg_scratch, LLVMConstInt(ctx->ac.i32, 12 + 8 * stream, false));
         primemit_scan[stream].waveidx = get_wave_id_in_tg(ctx);
         primemit_scan[stream].numwaves = get_tgsize(ctx);
         primemit_scan[stream].maxwaves = 8;
         ac_build_wg_scan_top(&ctx->ac, &primemit_scan[stream]);
      }
   }

   ac_build_s_barrier(&ctx->ac);

   /* Fetch the per-buffer offsets and per-stream emit counts in all waves. */
   LLVMValueRef wgoffset_dw[4] = {};

   {
      LLVMValueRef scratch_vgpr;

      /* Each lane loads the scratch dword at its lane index; the values are
       * then picked out with readlane. */
      tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ac_get_thread_id(&ctx->ac));
      scratch_vgpr = LLVMBuildLoad(builder, tmp, "");

      for (unsigned buffer = 0; buffer < 4; ++buffer) {
         if (stream_for_buffer[buffer] >= 0) {
            wgoffset_dw[buffer] =
               ac_build_readlane(&ctx->ac, scratch_vgpr,
                                 LLVMConstInt(ctx->ac.i32, scratch_offset_base + buffer, false));
         }
      }

      for (unsigned stream = 0; stream < 4; ++stream) {
         if (info->num_stream_output_components[stream]) {
            nggso->emit[stream] =
               ac_build_readlane(&ctx->ac, scratch_vgpr,
                                 LLVMConstInt(ctx->ac.i32, scratch_emit_base + stream, false));
         }
      }
   }

   /* Write out primitive data */
   for (unsigned stream = 0; stream < 4; ++stream) {
      if (!info->num_stream_output_components[stream])
         continue;

      /* Index of this thread's primitive within the stream: the scan result
       * for GS, the thread ID in the threadgroup otherwise. */
      if (isgs) {
         ac_build_wg_scan_bottom(&ctx->ac, &primemit_scan[stream]);
      } else {
         primemit_scan[stream].result_exclusive = tid;
      }

      /* Write only enabled primitives whose index is below the emit count. */
      tmp = LLVMBuildICmp(builder, LLVMIntULT, primemit_scan[stream].result_exclusive,
                          nggso->emit[stream], "");
      tmp = LLVMBuildAnd(builder, tmp, nggso->prim_enable[stream], "");
      ac_build_ifcc(&ctx->ac, tmp, 5240);
      {
         LLVMValueRef offset_vtx =
            LLVMBuildMul(builder, primemit_scan[stream].result_exclusive, nggso->num_vertices, "");

         /* The loop covers the compile-time maximum; each iteration is
          * additionally guarded by the run-time vertex count. */
         for (unsigned i = 0; i < max_num_vertices; ++i) {
            tmp = LLVMBuildICmp(builder, LLVMIntULT, LLVMConstInt(ctx->ac.i32, i, false),
                                nggso->num_vertices, "");
            ac_build_ifcc(&ctx->ac, tmp, 5241);
            build_streamout_vertex(ctx, so_buffer, wgoffset_dw, stream, offset_vtx,
                                   nggso->vertices[i]);
            ac_build_endif(&ctx->ac, 5241);
            offset_vtx = LLVMBuildAdd(builder, offset_vtx, ctx->ac.i32_1, "");
         }
      }
      ac_build_endif(&ctx->ac, 5240);
   }
}
 541
/* LDS layout of ES vertex data for NGG culling.
 *
 * The lds_byteN_* values are byte indices (0..3) within the dword at
 * lds_packed_data. All other values are dword indices into the per-vertex
 * LDS storage. The VS-only and TES-only entries share the same dword slots.
 */
enum
{
   /* Byte 0: Boolean ES thread accepted (unculled) flag, and later the old
    *         ES thread ID. After vertex compaction, compacted ES threads
    *         store the old thread ID here to copy input VGPRs from uncompacted
    *         ES threads.
    * Byte 1: New ES thread ID, loaded by GS to prepare the prim export value.
    * Byte 2: TES rel patch ID
    * Byte 3: Unused
    */
   lds_byte0_accept_flag = 0,
   lds_byte0_old_thread_id = 0, /* same byte as the accept flag, used later */
   lds_byte1_new_thread_id,
   lds_byte2_tes_rel_patch_id,
   lds_byte3_unused,

   /* Dword 0 holds the four packed bytes described above. */
   lds_packed_data = 0, /* lds_byteN_... */

   /* Dwords 1..6 */
   lds_pos_x,
   lds_pos_y,
   lds_pos_z,
   lds_pos_w,
   lds_pos_x_div_w,
   lds_pos_y_div_w,
   /* If VS: */
   lds_vertex_id,   /* dword 7 */
   lds_instance_id, /* optional */
   /* If TES: */
   lds_tes_u = lds_vertex_id,   /* dword 7 */
   lds_tes_v = lds_instance_id, /* dword 8 */
   lds_tes_patch_id, /* optional */
};
 575
 576 static LLVMValueRef si_build_gep_i8(struct si_shader_context *ctx, LLVMValueRef ptr,
 577                                     unsigned byte_index)
 578 {
 579    assert(byte_index < 4);
 580    LLVMTypeRef pi8 = LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_LDS);
 581    LLVMValueRef index = LLVMConstInt(ctx->ac.i32, byte_index, 0);
 582
 583    return LLVMBuildGEP(ctx->ac.builder, LLVMBuildPointerCast(ctx->ac.builder, ptr, pi8, ""), &index,
 584                        1, "");
 585 }
 586
 587 static unsigned ngg_nogs_vertex_size(struct si_shader *shader)
 588 {
 589    unsigned lds_vertex_size = 0;
 590
 591    /* The edgeflag is always stored in the last element that's also
 592     * used for padding to reduce LDS bank conflicts. */
 593    if (shader->selector->so.num_outputs)
 594       lds_vertex_size = 4 * shader->selector->info.num_outputs + 1;
 595    if (shader->selector->info.writes_edgeflag)
 596       lds_vertex_size = MAX2(lds_vertex_size, 1);
 597
 598    /* LDS size for passing data from GS to ES.
 599     * GS stores Primitive IDs into LDS at the address corresponding
 600     * to the ES thread of the provoking vertex. All ES threads
 601     * load and export PrimitiveID for their thread.
 602     */
 603    if (shader->selector->type == PIPE_SHADER_VERTEX && shader->key.mono.u.vs_export_prim_id)
 604       lds_vertex_size = MAX2(lds_vertex_size, 1);
 605
 606    if (shader->key.opt.ngg_culling) {
 607       if (shader->selector->type == PIPE_SHADER_VERTEX) {
 608          STATIC_ASSERT(lds_instance_id + 1 == 9);
 609          lds_vertex_size = MAX2(lds_vertex_size, 9);
 610       } else {
 611          assert(shader->selector->type == PIPE_SHADER_TESS_EVAL);
 612
 613          if (shader->selector->info.uses_primid || shader->key.mono.u.vs_export_prim_id) {
 614             STATIC_ASSERT(lds_tes_patch_id + 2 == 11);
 615             lds_vertex_size = MAX2(lds_vertex_size, 11);
 616          } else {
 617             STATIC_ASSERT(lds_tes_v + 1 == 9);
 618             lds_vertex_size = MAX2(lds_vertex_size, 9);
 619          }
 620       }
 621    }
 622
 623    return lds_vertex_size;
 624 }
 625
 626 /**
 627  * Returns an `[N x i32] addrspace(LDS)*` pointing at contiguous LDS storage
 628  * for the vertex outputs.
 629  */
 630 static LLVMValueRef ngg_nogs_vertex_ptr(struct si_shader_context *ctx, LLVMValueRef vtxid)
 631 {
 632    /* The extra dword is used to avoid LDS bank conflicts. */
 633    unsigned vertex_size = ngg_nogs_vertex_size(ctx->shader);
 634    LLVMTypeRef ai32 = LLVMArrayType(ctx->ac.i32, vertex_size);
 635    LLVMTypeRef pai32 = LLVMPointerType(ai32, AC_ADDR_SPACE_LDS);
 636    LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, ctx->esgs_ring, pai32, "");
 637    return LLVMBuildGEP(ctx->ac.builder, tmp, &vtxid, 1, "");
 638 }
 639
 640 static LLVMValueRef si_insert_input_v4i32(struct si_shader_context *ctx, LLVMValueRef ret,
 641                                           struct ac_arg param, unsigned return_index)
 642 {
 643    LLVMValueRef v = ac_get_arg(&ctx->ac, param);
 644
 645    for (unsigned i = 0; i < 4; i++) {
 646       ret = LLVMBuildInsertValue(ctx->ac.builder, ret, ac_llvm_extract_elem(&ctx->ac, v, i),
 647                                  return_index + i, "");
 648    }
 649    return ret;
 650 }
 651
 652 static void load_bitmasks_2x64(struct si_shader_context *ctx, LLVMValueRef lds_ptr,
 653                                unsigned dw_offset, LLVMValueRef mask[2],
 654                                LLVMValueRef *total_bitcount)
 655 {
 656    LLVMBuilderRef builder = ctx->ac.builder;
 657    LLVMValueRef ptr64 = LLVMBuildPointerCast(
 658       builder, lds_ptr, LLVMPointerType(LLVMArrayType(ctx->ac.i64, 2), AC_ADDR_SPACE_LDS), "");
 659    for (unsigned i = 0; i < 2; i++) {
 660       LLVMValueRef index = LLVMConstInt(ctx->ac.i32, dw_offset / 2 + i, 0);
 661       mask[i] = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ptr64, index), "");
 662    }
 663
 664    /* We get better code if we don't use the 128-bit bitcount. */
 665    *total_bitcount = LLVMBuildAdd(builder, ac_build_bit_count(&ctx->ac, mask[0]),
 666                                   ac_build_bit_count(&ctx->ac, mask[1]), "");
 667 }
 668
 669 /**
 670  * Given a total thread count, update total and per-wave thread counts in input SGPRs
 671  * and return the per-wave thread count.
 672  *
 673  * \param new_num_threads    Total thread count on the input, per-wave thread count on the output.
 674  * \param tg_info            tg_info SGPR value
 675  * \param tg_info_num_bits   the bit size of thread count field in tg_info
 676  * \param tg_info_shift      the bit offset of the thread count field in tg_info
 677  * \param wave_info          merged_wave_info SGPR value
 678  * \param wave_info_num_bits the bit size of thread count field in merged_wave_info
 679  * \param wave_info_shift    the bit offset of the thread count field in merged_wave_info
 680  */
 681 static void update_thread_counts(struct si_shader_context *ctx, LLVMValueRef *new_num_threads,
 682                                  LLVMValueRef *tg_info, unsigned tg_info_num_bits,
 683                                  unsigned tg_info_shift, LLVMValueRef *wave_info,
 684                                  unsigned wave_info_num_bits, unsigned wave_info_shift)
 685 {
 686    LLVMBuilderRef builder = ctx->ac.builder;
 687
 688    /* Update the total thread count. */
 689    unsigned tg_info_mask = ~(u_bit_consecutive(0, tg_info_num_bits) << tg_info_shift);
 690    *tg_info = LLVMBuildAnd(builder, *tg_info, LLVMConstInt(ctx->ac.i32, tg_info_mask, 0), "");
 691    *tg_info = LLVMBuildOr(
 692       builder, *tg_info,
 693       LLVMBuildShl(builder, *new_num_threads, LLVMConstInt(ctx->ac.i32, tg_info_shift, 0), ""), "");
 694
 695    /* Update the per-wave thread count. */
 696    LLVMValueRef prev_threads = LLVMBuildMul(builder, get_wave_id_in_tg(ctx),
 697                                             LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, 0), "");
 698    *new_num_threads = LLVMBuildSub(builder, *new_num_threads, prev_threads, "");
 699    *new_num_threads = ac_build_imax(&ctx->ac, *new_num_threads, ctx->ac.i32_0);
 700    *new_num_threads =
 701       ac_build_imin(&ctx->ac, *new_num_threads, LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, 0));
 702    unsigned wave_info_mask = ~(u_bit_consecutive(0, wave_info_num_bits) << wave_info_shift);
 703    *wave_info = LLVMBuildAnd(builder, *wave_info, LLVMConstInt(ctx->ac.i32, wave_info_mask, 0), "");
 704    *wave_info = LLVMBuildOr(
 705       builder, *wave_info,
 706       LLVMBuildShl(builder, *new_num_threads, LLVMConstInt(ctx->ac.i32, wave_info_shift, 0), ""),
 707       "");
 708 }
 709
 710 /**
 711  * Cull primitives for NGG VS or TES, then compact vertices, which happens
 712  * before the VS or TES main function. Return values for the main function.
 713  * The position is stored in LDS and the pre-compaction thread ID is returned,
 714  * so that a later shader part can reload the position instead of computing it twice.
      *
      * \param abi          shader ABI; the si_shader_context is recovered from it
      * \param max_outputs  not referenced by this function
      * \param addrs        output slots indexed as addrs[4 * output + channel];
      *                     only the POSITION output is loaded here
      *
      * The updated return struct is written to ctx->return_value.
 715  */
 716 void gfx10_emit_ngg_culling_epilogue_4x_wave32(struct ac_shader_abi *abi, unsigned max_outputs,
 717                                                LLVMValueRef *addrs)
 718 {
 719    struct si_shader_context *ctx = si_shader_context_from_abi(abi);
 720    struct si_shader *shader = ctx->shader;
 721    struct si_shader_selector *sel = shader->selector;
 722    struct si_shader_info *info = &sel->info;
 723    LLVMBuilderRef builder = ctx->ac.builder;
 724
        /* Only valid with NGG culling enabled, for a VS or for a TES not compiled as ES. */
 725    assert(shader->key.opt.ngg_culling);
 726    assert(shader->key.as_ngg);
 727    assert(sel->type == PIPE_SHADER_VERTEX ||
 728           (sel->type == PIPE_SHADER_TESS_EVAL && !shader->key.as_es));
 729
        /* Find the POSITION output and load its 4 channels. */
 730    LLVMValueRef position[4] = {};
 731    for (unsigned i = 0; i < info->num_outputs; i++) {
 732       switch (info->output_semantic_name[i]) {
 733       case TGSI_SEMANTIC_POSITION:
 734          for (unsigned j = 0; j < 4; j++) {
 735             position[j] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + j], "");
 736          }
 737          break;
 738       }
 739    }
 740    assert(position[0]);
 741
 742    /* Store Position.XYZW into LDS. */
 743    LLVMValueRef es_vtxptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));
 744    for (unsigned chan = 0; chan < 4; chan++) {
 745       LLVMBuildStore(
 746          builder, ac_to_integer(&ctx->ac, position[chan]),
 747          ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_pos_x + chan, 0)));
 748    }
 749    /* Store Position.XY / W into LDS. */
 750    for (unsigned chan = 0; chan < 2; chan++) {
 751       LLVMValueRef val = ac_build_fdiv(&ctx->ac, position[chan], position[3]);
 752       LLVMBuildStore(
 753          builder, ac_to_integer(&ctx->ac, val),
 754          ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_pos_x_div_w + chan, 0)));
 755    }
 756
 757    /* Store VertexID and InstanceID. ES threads will have to load them
 758     * from LDS after vertex compaction and use them instead of their own
 759     * system values.
 760     */
 761    bool uses_instance_id = false;
 762    bool uses_tes_prim_id = false;
 763    LLVMValueRef packed_data = ctx->ac.i32_0;
 764
 765    if (ctx->type == PIPE_SHADER_VERTEX) {
 766       uses_instance_id = sel->info.uses_instanceid ||
 767                          shader->key.part.vs.prolog.instance_divisor_is_one ||
 768                          shader->key.part.vs.prolog.instance_divisor_is_fetched;
 769
 770       LLVMBuildStore(
 771          builder, ctx->abi.vertex_id,
 772          ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_vertex_id, 0)));
 773       if (uses_instance_id) {
 774          LLVMBuildStore(
 775             builder, ctx->abi.instance_id,
 776             ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_instance_id, 0)));
 777       }
 778    } else {
 779       uses_tes_prim_id = sel->info.uses_primid || shader->key.mono.u.vs_export_prim_id;
 780
 781       assert(ctx->type == PIPE_SHADER_TESS_EVAL);
 782       LLVMBuildStore(builder, ac_to_integer(&ctx->ac, ac_get_arg(&ctx->ac, ctx->tes_u)),
 783                      ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_tes_u, 0)));
 784       LLVMBuildStore(builder, ac_to_integer(&ctx->ac, ac_get_arg(&ctx->ac, ctx->tes_v)),
 785                      ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_tes_v, 0)));
          /* The relative patch ID is placed in the packed dword at byte lds_byte2_tes_rel_patch_id. */
 786       packed_data = LLVMBuildShl(builder, ac_get_arg(&ctx->ac, ctx->tes_rel_patch_id),
 787                                  LLVMConstInt(ctx->ac.i32, lds_byte2_tes_rel_patch_id * 8, 0), "");
 788       if (uses_tes_prim_id) {
 789          LLVMBuildStore(
 790             builder, ac_get_arg(&ctx->ac, ctx->args.tes_patch_id),
 791             ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_tes_patch_id, 0)));
 792       }
 793    }
 794    /* Initialize the packed data. */
 795    LLVMBuildStore(
 796       builder, packed_data,
 797       ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_packed_data, 0)));
        /* Close the if-block labelled merged_wrap_if_label; it is not opened in this
         * function (presumably by the code that wraps the ES part - confirm).
         */
 798    ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
 799
 800    LLVMValueRef tid = ac_get_thread_id(&ctx->ac);
 801
 802    /* Initialize the last 3 gs_ngg_scratch dwords to 0, because we may have less
 803     * than 4 waves, but we always read all 4 values. This is where the thread
 804     * bitmasks of unculled threads will be stored.
 805     *
 806     * gs_ngg_scratch layout: esmask[0..3]
 807     */
 808    ac_build_ifcc(&ctx->ac,
 809                  LLVMBuildICmp(builder, LLVMIntULT, get_thread_id_in_tg(ctx),
 810                                LLVMConstInt(ctx->ac.i32, 3, 0), ""),
 811                  16101);
 812    {
 813       LLVMValueRef index = LLVMBuildAdd(builder, tid, ctx->ac.i32_1, "");
 814       LLVMBuildStore(builder, ctx->ac.i32_0, ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, index));
 815    }
 816    ac_build_endif(&ctx->ac, 16101);
 817    ac_build_s_barrier(&ctx->ac);
 818
 819    /* The hardware requires that there are no holes between unculled vertices,
 820     * which means we have to pack ES threads, i.e. reduce the ES thread count
 821     * and move ES input VGPRs to lower threads. The upside is that varyings
 822     * are only fetched and computed for unculled vertices.
 823     *
 824     * Vertex compaction in GS threads:
 825     *
 826     * Part 1: Compute the surviving vertex mask in GS threads:
 827     * - Compute 4 32-bit surviving vertex masks in LDS. (max 4 waves)
 828     *   - In GS, notify ES threads whether the vertex survived.
 829     *   - Barrier
 830     *   - ES threads will create the mask and store it in LDS.
 831     * - Barrier
 832     * - Each GS thread loads the vertex masks from LDS.
 833     *
 834     * Part 2: Compact ES threads in GS threads:
 835     * - Compute the prefix sum for all 3 vertices from the masks. These are the new
 836     *   thread IDs for each vertex within the primitive.
 837     * - Write the value of the old thread ID into the LDS address of the new thread ID.
 838     *   The ES thread will load the old thread ID and use it to load the position, VertexID,
 839     *   and InstanceID.
 840     * - Update vertex indices and null flag in the GS input VGPRs.
 841     * - Barrier
 842     *
 843     * Part 3: Update input GPRs
 844     * - For all waves, update per-wave thread counts in input SGPRs.
 845     * - In ES threads, update the ES input VGPRs (VertexID, InstanceID, TES inputs).
 846     */
 847
 848    LLVMValueRef vtxindex[3];
 849    if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL) {
 850       /* For the GS fast launch, the VS prolog simply puts the Vertex IDs
 851        * into these VGPRs.
 852        */
 853       vtxindex[0] = ac_get_arg(&ctx->ac, ctx->gs_vtx01_offset);
 854       vtxindex[1] = ac_get_arg(&ctx->ac, ctx->gs_vtx23_offset);
 855       vtxindex[2] = ac_get_arg(&ctx->ac, ctx->gs_vtx45_offset);
 856    } else {
 857       vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16);
 858       vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16);
 859       vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16);
 860    };
        /* LDS pointers of the 3 vertices of this thread's primitive, and of this thread's own vertex. */
 861    LLVMValueRef gs_vtxptr[] = {
 862       ngg_nogs_vertex_ptr(ctx, vtxindex[0]),
 863       ngg_nogs_vertex_ptr(ctx, vtxindex[1]),
 864       ngg_nogs_vertex_ptr(ctx, vtxindex[2]),
 865    };
 866    es_vtxptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));
 867
        /* i32 flag that is set to 1 below only when the primitive is accepted.
         * NOTE(review): presumably ac_build_alloca zero-initializes the slot - confirm.
         */
 868    LLVMValueRef gs_accepted = ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
 869
 870    /* Do culling in GS threads. */
 871    ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 16002);
 872    {
 873       /* Load positions. */
          /* Only X/W, Y/W and W are loaded; channel 2 (Z) is skipped and stays NULL. */
 874       LLVMValueRef pos[3][4] = {};
 875       for (unsigned vtx = 0; vtx < 3; vtx++) {
 876          for (unsigned chan = 0; chan < 4; chan++) {
 877             unsigned index;
 878             if (chan == 0 || chan == 1)
 879                index = lds_pos_x_div_w + chan;
 880             else if (chan == 3)
 881                index = lds_pos_w;
 882             else
 883                continue;
 884
 885             LLVMValueRef addr =
 886                ac_build_gep0(&ctx->ac, gs_vtxptr[vtx], LLVMConstInt(ctx->ac.i32, index, 0));
 887             pos[vtx][chan] = LLVMBuildLoad(builder, addr, "");
 888             pos[vtx][chan] = ac_to_float(&ctx->ac, pos[vtx][chan]);
 889          }
 890       }
 891
 892       /* Load the viewport state for small prim culling. */
 893       LLVMValueRef vp = ac_build_load_invariant(
 894          &ctx->ac, ac_get_arg(&ctx->ac, ctx->small_prim_cull_info), ctx->ac.i32_0);
 895       vp = LLVMBuildBitCast(builder, vp, ctx->ac.v4f32, "");
          /* Elements 0-1 are used as the XY scale, elements 2-3 as the XY translate. */
 896       LLVMValueRef vp_scale[2], vp_translate[2];
 897       vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0);
 898       vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1);
 899       vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2);
 900       vp_translate[1] = ac_llvm_extract_elem(&ctx->ac, vp, 3);
 901
 902       /* Get the small prim filter precision. */
          /* (0x70 | field) << 23 lands in the exponent bits of an IEEE-754 single, so the
           * bitcast yields a power of two; assuming si_unpack_param(..., 7, 4) returns a
           * 4-bit value, that is 2^(field - 15).
           */
 903       LLVMValueRef small_prim_precision = si_unpack_param(ctx, ctx->vs_state_bits, 7, 4);
 904       small_prim_precision =
 905          LLVMBuildOr(builder, small_prim_precision, LLVMConstInt(ctx->ac.i32, 0x70, 0), "");
 906       small_prim_precision =
 907          LLVMBuildShl(builder, small_prim_precision, LLVMConstInt(ctx->ac.i32, 23, 0), "");
 908       small_prim_precision = LLVMBuildBitCast(builder, small_prim_precision, ctx->ac.f32, "");
 909
 910       /* Execute culling code. */
 911       struct ac_cull_options options = {};
 912       options.cull_front = shader->key.opt.ngg_culling & SI_NGG_CULL_FRONT_FACE;
 913       options.cull_back = shader->key.opt.ngg_culling & SI_NGG_CULL_BACK_FACE;
 914       options.cull_view_xy = shader->key.opt.ngg_culling & SI_NGG_CULL_VIEW_SMALLPRIMS;
 915       options.cull_small_prims = options.cull_view_xy;
 916       options.cull_zero_area = options.cull_front || options.cull_back;
 917       options.cull_w = true;
 918
 919       /* Tell ES threads whether their vertex survived. */
 920       ac_build_ifcc(&ctx->ac,
 921                     ac_cull_triangle(&ctx->ac, pos, ctx->ac.i1true, vp_scale, vp_translate,
 922                                      small_prim_precision, &options),
 923                     16003);
 924       {
 925          LLVMBuildStore(builder, ctx->ac.i32_1, gs_accepted);
 926          for (unsigned vtx = 0; vtx < 3; vtx++) {
 927             LLVMBuildStore(builder, ctx->ac.i8_1,
 928                            si_build_gep_i8(ctx, gs_vtxptr[vtx], lds_byte0_accept_flag));
 929          }
 930       }
 931       ac_build_endif(&ctx->ac, 16003);
 932    }
 933    ac_build_endif(&ctx->ac, 16002);
 934    ac_build_s_barrier(&ctx->ac);
 935
        /* From here on gs_accepted holds the loaded i32 value, not the alloca pointer. */
 936    gs_accepted = LLVMBuildLoad(builder, gs_accepted, "");
 937
 938    LLVMValueRef es_accepted = ac_build_alloca(&ctx->ac, ctx->ac.i1, "");
 939
 940    /* Convert the per-vertex flag to a thread bitmask in ES threads and store it in LDS. */
 941    ac_build_ifcc(&ctx->ac, si_is_es_thread(ctx), 16007);
 942    {
 943       LLVMValueRef es_accepted_flag =
 944          LLVMBuildLoad(builder, si_build_gep_i8(ctx, es_vtxptr, lds_byte0_accept_flag), "");
 945
 946       LLVMValueRef es_accepted_bool =
 947          LLVMBuildICmp(builder, LLVMIntNE, es_accepted_flag, ctx->ac.i8_0, "");
 948       LLVMValueRef es_mask = ac_get_i1_sgpr_mask(&ctx->ac, es_accepted_bool);
 949
 950       LLVMBuildStore(builder, es_accepted_bool, es_accepted);
 951
          /* Only thread 0 of each wave writes the wave's mask, at index wave_id. */
 952       ac_build_ifcc(&ctx->ac, LLVMBuildICmp(builder, LLVMIntEQ, tid, ctx->ac.i32_0, ""), 16008);
 953       {
 954          LLVMBuildStore(builder, es_mask,
 955                         ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, get_wave_id_in_tg(ctx)));
 956       }
 957       ac_build_endif(&ctx->ac, 16008);
 958    }
 959    ac_build_endif(&ctx->ac, 16007);
 960    ac_build_s_barrier(&ctx->ac);
 961
 962    /* Load the vertex masks and compute the new ES thread count. */
 963    LLVMValueRef es_mask[2], new_num_es_threads, kill_wave;
 964    load_bitmasks_2x64(ctx, ctx->gs_ngg_scratch, 0, es_mask, &new_num_es_threads);
 965    new_num_es_threads = ac_build_readlane_no_opt_barrier(&ctx->ac, new_num_es_threads, NULL);
 966
 967    /* ES threads compute their prefix sum, which is the new ES thread ID.
 968     * Then they write the value of the old thread ID into the LDS address
 969     * of the new thread ID. It will be used to load input VGPRs from
 970     * the old thread's LDS location.
 971     */
 972    ac_build_ifcc(&ctx->ac, LLVMBuildLoad(builder, es_accepted, ""), 16009);
 973    {
 974       LLVMValueRef old_id = get_thread_id_in_tg(ctx);
 975       LLVMValueRef new_id = ac_prefix_bitcount_2x64(&ctx->ac, es_mask, old_id);
 976
          /* Both IDs are truncated to 8 bits for their byte-sized LDS slots. */
 977       LLVMBuildStore(
 978          builder, LLVMBuildTrunc(builder, old_id, ctx->ac.i8, ""),
 979          si_build_gep_i8(ctx, ngg_nogs_vertex_ptr(ctx, new_id), lds_byte0_old_thread_id));
 980       LLVMBuildStore(builder, LLVMBuildTrunc(builder, new_id, ctx->ac.i8, ""),
 981                      si_build_gep_i8(ctx, es_vtxptr, lds_byte1_new_thread_id));
 982    }
 983    ac_build_endif(&ctx->ac, 16009);
 984
 985    /* Kill waves that have inactive threads. */
        /* Condition: max(new ES thread count, prim count) <= wave_id * wave_size. */
 986    kill_wave = LLVMBuildICmp(builder, LLVMIntULE,
 987                              ac_build_imax(&ctx->ac, new_num_es_threads, ngg_get_prim_cnt(ctx)),
 988                              LLVMBuildMul(builder, get_wave_id_in_tg(ctx),
 989                                           LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, 0), ""),
 990                              "");
 991    ac_build_ifcc(&ctx->ac, kill_wave, 19202);
 992    {
 993       /* If we are killing wave 0, send that there are no primitives
 994        * in this threadgroup.
 995        */
 996       ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), ctx->ac.i32_0, ctx->ac.i32_0);
 997       ac_build_s_endpgm(&ctx->ac);
 998    }
 999    ac_build_endif(&ctx->ac, 19202);
1000    ac_build_s_barrier(&ctx->ac);
1001
1002    /* Send the final vertex and primitive counts. */
1003    ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), new_num_es_threads,
1004                                  ngg_get_prim_cnt(ctx));
1005
1006    /* Update thread counts in SGPRs. */
1007    LLVMValueRef new_gs_tg_info = ac_get_arg(&ctx->ac, ctx->gs_tg_info);
1008    LLVMValueRef new_merged_wave_info = ac_get_arg(&ctx->ac, ctx->merged_wave_info);
1009
1010    /* This also converts the thread count from the total count to the per-wave count. */
        /* Fields updated: 9 bits at bit 12 of gs_tg_info, 8 bits at bit 0 of merged_wave_info. */
1011    update_thread_counts(ctx, &new_num_es_threads, &new_gs_tg_info, 9, 12, &new_merged_wave_info, 8,
1012                         0);
1013
1014    /* Update vertex indices in VGPR0 (same format as NGG passthrough). */
1015    LLVMValueRef new_vgpr0 = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
1016
1017    /* Set the null flag at the beginning (culled), and then
1018     * overwrite it for accepted primitives.
1019     */
1020    LLVMBuildStore(builder, LLVMConstInt(ctx->ac.i32, 1u << 31, 0), new_vgpr0);
1021
1022    /* Get vertex indices after vertex compaction. */
1023    ac_build_ifcc(&ctx->ac, LLVMBuildTrunc(builder, gs_accepted, ctx->ac.i1, ""), 16011);
1024    {
1025       struct ac_ngg_prim prim = {};
1026       prim.num_vertices = 3;
1027       prim.isnull = ctx->ac.i1false;
1028
          /* The new index of each vertex is the byte its ES thread stored at lds_byte1_new_thread_id. */
1029       for (unsigned vtx = 0; vtx < 3; vtx++) {
1030          prim.index[vtx] = LLVMBuildLoad(
1031             builder, si_build_gep_i8(ctx, gs_vtxptr[vtx], lds_byte1_new_thread_id), "");
1032          prim.index[vtx] = LLVMBuildZExt(builder, prim.index[vtx], ctx->ac.i32, "");
1033          prim.edgeflag[vtx] = ngg_get_initial_edgeflag(ctx, vtx);
1034       }
1035
1036       /* Set the new GS input VGPR. */
1037       LLVMBuildStore(builder, ac_pack_prim_export(&ctx->ac, &prim), new_vgpr0);
1038    }
1039    ac_build_endif(&ctx->ac, 16011);
1040
1041    if (gfx10_ngg_export_prim_early(shader))
1042       gfx10_ngg_build_export_prim(ctx, NULL, LLVMBuildLoad(builder, new_vgpr0, ""));
1043
1044    /* Set the new ES input VGPRs. */
1045    LLVMValueRef es_data[4];
1046    LLVMValueRef old_thread_id = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
1047
1048    for (unsigned i = 0; i < 4; i++)
1049       es_data[i] = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
1050
        /* new_num_es_threads is the per-wave count at this point (see update_thread_counts above). */
1051    ac_build_ifcc(&ctx->ac, LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, tid, new_num_es_threads, ""),
1052                  16012);
1053    {
1054       LLVMValueRef old_id, old_es_vtxptr, tmp;
1055
1056       /* Load ES input VGPRs from the ES thread before compaction. */
1057       old_id = LLVMBuildLoad(builder, si_build_gep_i8(ctx, es_vtxptr, lds_byte0_old_thread_id), "");
1058       old_id = LLVMBuildZExt(builder, old_id, ctx->ac.i32, "");
1059
1060       LLVMBuildStore(builder, old_id, old_thread_id);
1061       old_es_vtxptr = ngg_nogs_vertex_ptr(ctx, old_id);
1062
          /* Copy the two dwords at lds_vertex_id and lds_vertex_id + 1 into es_data[0..1].
           * NOTE(review): they are returned below as VertexID/InstanceID (VS) or as the first
           * two TES VGPRs, which presumably relies on the lds_* enum layout
           * (e.g. lds_instance_id == lds_vertex_id + 1) - confirm against the enum.
           */
1063       for (unsigned i = 0; i < 2; i++) {
1064          tmp = LLVMBuildLoad(
1065             builder,
1066             ac_build_gep0(&ctx->ac, old_es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_vertex_id + i, 0)),
1067             "");
1068          LLVMBuildStore(builder, tmp, es_data[i]);
1069       }
1070
          /* TES only: es_data[2] = relative patch ID byte, es_data[3] = patch ID (if used). */
1071       if (ctx->type == PIPE_SHADER_TESS_EVAL) {
1072          tmp = LLVMBuildLoad(builder,
1073                              si_build_gep_i8(ctx, old_es_vtxptr, lds_byte2_tes_rel_patch_id), "");
1074          tmp = LLVMBuildZExt(builder, tmp, ctx->ac.i32, "");
1075          LLVMBuildStore(builder, tmp, es_data[2]);
1076
1077          if (uses_tes_prim_id) {
1078             tmp = LLVMBuildLoad(builder,
1079                                 ac_build_gep0(&ctx->ac, old_es_vtxptr,
1080                                               LLVMConstInt(ctx->ac.i32, lds_tes_patch_id, 0)),
1081                                 "");
1082             LLVMBuildStore(builder, tmp, es_data[3]);
1083          }
1084       }
1085    }
1086    ac_build_endif(&ctx->ac, 16012);
1087
1088    /* Return values for the main function. */
1089    LLVMValueRef ret = ctx->return_value;
1090    LLVMValueRef val;
1091
        /* Return slots 2 and 3 receive the updated gs_tg_info and merged_wave_info. */
1092    ret = LLVMBuildInsertValue(ctx->ac.builder, ret, new_gs_tg_info, 2, "");
1093    ret = LLVMBuildInsertValue(ctx->ac.builder, ret, new_merged_wave_info, 3, "");
1094    if (ctx->type == PIPE_SHADER_TESS_EVAL)
1095       ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset, 4);
1096
        /* The remaining SGPR inputs are forwarded unchanged, at return index 8 + SGPR index. */
1097    ret = si_insert_input_ptr(ctx, ret, ctx->rw_buffers, 8 + SI_SGPR_RW_BUFFERS);
1098    ret = si_insert_input_ptr(ctx, ret, ctx->bindless_samplers_and_images,
1099                              8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
1100    ret = si_insert_input_ptr(ctx, ret, ctx->const_and_shader_buffers,
1101                              8 + SI_SGPR_CONST_AND_SHADER_BUFFERS);
1102    ret = si_insert_input_ptr(ctx, ret, ctx->samplers_and_images, 8 + SI_SGPR_SAMPLERS_AND_IMAGES);
1103    ret = si_insert_input_ptr(ctx, ret, ctx->vs_state_bits, 8 + SI_SGPR_VS_STATE_BITS);
1104
1105    if (ctx->type == PIPE_SHADER_VERTEX) {
1106       ret = si_insert_input_ptr(ctx, ret, ctx->args.base_vertex, 8 + SI_SGPR_BASE_VERTEX);
1107       ret = si_insert_input_ptr(ctx, ret, ctx->args.start_instance, 8 + SI_SGPR_START_INSTANCE);
1108       ret = si_insert_input_ptr(ctx, ret, ctx->args.draw_id, 8 + SI_SGPR_DRAWID);
1109       ret = si_insert_input_ptr(ctx, ret, ctx->vertex_buffers, 8 + SI_VS_NUM_USER_SGPR);
1110
1111       for (unsigned i = 0; i < shader->selector->num_vbos_in_user_sgprs; i++) {
1112          ret = si_insert_input_v4i32(ctx, ret, ctx->vb_descriptors[i],
1113                                      8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST + i * 4);
1114       }
1115    } else {
1116       assert(ctx->type == PIPE_SHADER_TESS_EVAL);
1117       ret = si_insert_input_ptr(ctx, ret, ctx->tcs_offchip_layout, 8 + SI_SGPR_TES_OFFCHIP_LAYOUT);
1118       ret = si_insert_input_ptr(ctx, ret, ctx->tes_offchip_addr, 8 + SI_SGPR_TES_OFFCHIP_ADDR);
1119    }
1120
        /* Return index of the first VGPR; it follows the last SGPR slot used for this shader type. */
1121    unsigned vgpr;
1122    if (ctx->type == PIPE_SHADER_VERTEX) {
1123       if (shader->selector->num_vbos_in_user_sgprs) {
1124          vgpr = 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST + shader->selector->num_vbos_in_user_sgprs * 4;
1125       } else {
1126          vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR + 1;
1127       }
1128    } else {
1129       vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR;
1130    }
1131
        /* First VGPR: the repacked primitive (or the null flag if it was culled). */
1132    val = LLVMBuildLoad(builder, new_vgpr0, "");
1133    ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++, "");
1134    vgpr++; /* gs_vtx23_offset */
1135
1136    ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_prim_id, vgpr++);
1137    ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_invocation_id, vgpr++);
1138    vgpr++; /* gs_vtx45_offset */
1139
1140    if (ctx->type == PIPE_SHADER_VERTEX) {
1141       val = LLVMBuildLoad(builder, es_data[0], "");
1142       ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++,
1143                                  ""); /* VGPR5 - VertexID */
          /* Two return slots are skipped here; nothing is written to them. */
1144       vgpr += 2;
1145       if (uses_instance_id) {
1146          val = LLVMBuildLoad(builder, es_data[1], "");
1147          ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++,
1148                                     ""); /* VGPR8 - InstanceID */
1149       } else {
1150          vgpr++;
1151       }
1152    } else {
1153       assert(ctx->type == PIPE_SHADER_TESS_EVAL);
          /* es_data[3] (patch ID) is only returned when the shader uses it; otherwise skip its slot. */
1154       unsigned num_vgprs = uses_tes_prim_id ? 4 : 3;
1155       for (unsigned i = 0; i < num_vgprs; i++) {
1156          val = LLVMBuildLoad(builder, es_data[i], "");
1157          ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++, "");
1158       }
1159       if (num_vgprs == 3)
1160          vgpr++;
1161    }
1162    /* Return the old thread ID. */
1163    val = LLVMBuildLoad(builder, old_thread_id, "");
1164    ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++, "");
1165
1166    /* These two also use LDS. */
1167    if (sel->info.writes_edgeflag ||
1168        (ctx->type == PIPE_SHADER_VERTEX && shader->key.mono.u.vs_export_prim_id))
1169       ac_build_s_barrier(&ctx->ac);
1170
1171    ctx->return_value = ret;
1172 }
1173
1174 /**
1175  * Emit the epilogue of an API VS or TES shader compiled as ESGS shader.
      *
      * Depending on the shader, this stores outputs to LDS for streamout and edge
      * flags, runs streamout, passes primitive IDs from GS threads to ES threads
      * through LDS, updates the streamout query buffer, builds the primitive
      * export, and finally exports per-vertex positions and parameters.
      *
      * \param abi          shader ABI; the si_shader_context is recovered from it
      * \param max_outputs  upper bound for info->num_outputs (only asserted)
      * \param addrs        output slots indexed as addrs[4 * output + channel]
1176  */
1177 void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs)
1178 {
1179    struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1180    struct si_shader_selector *sel = ctx->shader->selector;
1181    struct si_shader_info *info = &sel->info;
1182    struct si_shader_output_values outputs[PIPE_MAX_SHADER_OUTPUTS];
1183    LLVMBuilderRef builder = ctx->ac.builder;
1184    LLVMValueRef tmp, tmp2;
1185
1186    assert(!ctx->shader->is_gs_copy_shader);
1187    assert(info->num_outputs <= max_outputs);
1188
1189    LLVMValueRef vertex_ptr = NULL;
1190
        /* The per-vertex LDS pointer is only needed when streamout or edge flags store data below. */
1191    if (sel->so.num_outputs || sel->info.writes_edgeflag)
1192       vertex_ptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));
1193
1194    for (unsigned i = 0; i < info->num_outputs; i++) {
1195       outputs[i].semantic_name = info->output_semantic_name[i];
1196       outputs[i].semantic_index = info->output_semantic_index[i];
1197
1198       for (unsigned j = 0; j < 4; j++) {
             /* output_streams packs one 2-bit stream index per channel. */
1199          outputs[i].vertex_stream[j] = (info->output_streams[i] >> (2 * j)) & 3;
1200
1201          /* TODO: we may store more outputs than streamout needs,
1202           * but streamout performance isn't that important.
1203           */
1204          if (sel->so.num_outputs) {
1205             tmp = ac_build_gep0(&ctx->ac, vertex_ptr, LLVMConstInt(ctx->ac.i32, 4 * i + j, false));
1206             tmp2 = LLVMBuildLoad(builder, addrs[4 * i + j], "");
1207             tmp2 = ac_to_integer(&ctx->ac, tmp2);
1208             LLVMBuildStore(builder, tmp2, tmp);
1209          }
1210       }
1211
1212       /* Store the edgeflag at the end (if streamout is enabled) */
1213       if (info->output_semantic_name[i] == TGSI_SEMANTIC_EDGEFLAG && sel->info.writes_edgeflag) {
1214          LLVMValueRef edgeflag = LLVMBuildLoad(builder, addrs[4 * i], "");
1215          /* The output is a float, but the hw expects a 1-bit integer. */
1216          edgeflag = LLVMBuildFPToUI(ctx->ac.builder, edgeflag, ctx->ac.i32, "");
1217          edgeflag = ac_build_umin(&ctx->ac, edgeflag, ctx->ac.i32_1);
1218
             /* The edge flag goes into the last dword of the vertex's LDS storage. */
1219          tmp = LLVMConstInt(ctx->ac.i32, ngg_nogs_vertex_size(ctx->shader) - 1, 0);
1220          tmp = ac_build_gep0(&ctx->ac, vertex_ptr, tmp);
1221          LLVMBuildStore(builder, edgeflag, tmp);
1222       }
1223    }
1224
        /* When none of the features handled below are active (streamout, edge flags,
         * NGG streamout queries, VS primitive ID export), the if-block labelled
         * merged_wrap_if_label is left open and reused for the exports at the end
         * of this function instead of opening a new one.
         */
1225    bool unterminated_es_if_block =
1226       !sel->so.num_outputs && !sel->info.writes_edgeflag &&
1227       !ctx->screen->use_ngg_streamout && /* no query buffer */
1228       (ctx->type != PIPE_SHADER_VERTEX || !ctx->shader->key.mono.u.vs_export_prim_id);
1229
1230    if (!unterminated_es_if_block)
1231       ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
1232
1233    LLVMValueRef is_gs_thread = si_is_gs_thread(ctx);
1234    LLVMValueRef is_es_thread = si_is_es_thread(ctx);
1235    LLVMValueRef vtxindex[3];
1236
        /* With NGG culling all three indices are unpacked from gs_vtx01_offset with
         * (0, 9), (10, 9) and (20, 9); otherwise with 16-bit fields of gs_vtx01_offset
         * and gs_vtx23_offset. Presumably the arguments are (shift, bit count) - confirm.
         */
1237    if (ctx->shader->key.opt.ngg_culling) {
1238       vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 9);
1239       vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 10, 9);
1240       vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 20, 9);
1241    } else {
1242       vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16);
1243       vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16);
1244       vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16);
1245    }
1246
1247    /* Determine the number of vertices per primitive. */
1248    unsigned num_vertices;
1249    LLVMValueRef num_vertices_val = ngg_get_vertices_per_prim(ctx, &num_vertices);
1250
1251    /* Streamout */
1252    LLVMValueRef emitted_prims = NULL;
1253
1254    if (sel->so.num_outputs) {
1255       assert(!unterminated_es_if_block);
1256
1257       struct ngg_streamout nggso = {};
1258       nggso.num_vertices = num_vertices_val;
1259       nggso.prim_enable[0] = is_gs_thread;
1260
1261       for (unsigned i = 0; i < num_vertices; ++i)
1262          nggso.vertices[i] = ngg_nogs_vertex_ptr(ctx, vtxindex[i]);
1263
1264       build_streamout(ctx, &nggso);
1265       emitted_prims = nggso.emit[0];
1266    }
1267
        /* Stays all-NULL unless the shader writes edge flags. */
1268    LLVMValueRef user_edgeflags[3] = {};
1269
1270    if (sel->info.writes_edgeflag) {
1271       assert(!unterminated_es_if_block);
1272
1273       /* Streamout already inserted the barrier, so don't insert it again. */
1274       if (!sel->so.num_outputs)
1275          ac_build_s_barrier(&ctx->ac);
1276
1277       ac_build_ifcc(&ctx->ac, is_gs_thread, 5400);
1278       /* Load edge flags from ES threads and store them into VGPRs in GS threads. */
1279       for (unsigned i = 0; i < num_vertices; i++) {
1280          tmp = ngg_nogs_vertex_ptr(ctx, vtxindex[i]);
1281          tmp2 = LLVMConstInt(ctx->ac.i32, ngg_nogs_vertex_size(ctx->shader) - 1, 0);
1282          tmp = ac_build_gep0(&ctx->ac, tmp, tmp2);
1283          tmp = LLVMBuildLoad(builder, tmp, "");
1284          tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
1285
1286          user_edgeflags[i] = ac_build_alloca_undef(&ctx->ac, ctx->ac.i1, "");
1287          LLVMBuildStore(builder, tmp, user_edgeflags[i]);
1288       }
1289       ac_build_endif(&ctx->ac, 5400);
1290    }
1291
1292    /* Copy Primitive IDs from GS threads to the LDS address corresponding
1293     * to the ES thread of the provoking vertex.
1294     */
1295    if (ctx->type == PIPE_SHADER_VERTEX && ctx->shader->key.mono.u.vs_export_prim_id) {
1296       assert(!unterminated_es_if_block);
1297
1298       /* Streamout and edge flags use LDS. Make it idle, so that we can reuse it. */
1299       if (sel->so.num_outputs || sel->info.writes_edgeflag)
1300          ac_build_s_barrier(&ctx->ac);
1301
          /* NOTE(review): label 5400 is also used by the edge flag block above. */
1302       ac_build_ifcc(&ctx->ac, is_gs_thread, 5400);
1303       /* Extract the PROVOKING_VTX_INDEX field. */
1304       LLVMValueRef provoking_vtx_in_prim = si_unpack_param(ctx, ctx->vs_state_bits, 4, 2);
1305
1306       /* provoking_vtx_index = vtxindex[provoking_vtx_in_prim]; */
1307       LLVMValueRef indices = ac_build_gather_values(&ctx->ac, vtxindex, 3);
1308       LLVMValueRef provoking_vtx_index =
1309          LLVMBuildExtractElement(builder, indices, provoking_vtx_in_prim, "");
          /* This local shadows the function-level vertex_ptr within this block. */
1310       LLVMValueRef vertex_ptr = ngg_nogs_vertex_ptr(ctx, provoking_vtx_index);
1311
          /* The primitive ID is written to dword 0 of the provoking vertex's LDS storage. */
1312       LLVMBuildStore(builder, ac_get_arg(&ctx->ac, ctx->args.gs_prim_id),
1313                      ac_build_gep0(&ctx->ac, vertex_ptr, ctx->ac.i32_0));
1314       ac_build_endif(&ctx->ac, 5400);
1315    }
1316
1317    /* Update query buffer */
1318    if (ctx->screen->use_ngg_streamout && !info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]) {
1319       assert(!unterminated_es_if_block);
1320
1321       tmp = si_unpack_param(ctx, ctx->vs_state_bits, 6, 1);
1322       tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
1323       ac_build_ifcc(&ctx->ac, tmp, 5029); /* if (STREAMOUT_QUERY_ENABLED) */
          /* Only wave 0 of the threadgroup updates the query buffer. */
1324       tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, "");
1325       ac_build_ifcc(&ctx->ac, tmp, 5030);
          /* Thread 0 always participates; thread 1 only when streamout is enabled. */
1326       tmp = LLVMBuildICmp(builder, LLVMIntULE, ac_get_thread_id(&ctx->ac),
1327                           sel->so.num_outputs ? ctx->ac.i32_1 : ctx->ac.i32_0, "");
1328       ac_build_ifcc(&ctx->ac, tmp, 5031);
1329       {
1330          LLVMValueRef args[] = {
1331             ngg_get_prim_cnt(ctx),
1332             ngg_get_query_buf(ctx),
1333             LLVMConstInt(ctx->ac.i32, 16, false), /* offset of stream[0].generated_primitives */
1334             ctx->ac.i32_0,                        /* soffset */
1335             ctx->ac.i32_0,                        /* cachepolicy */
1336          };
1337
             /* With streamout, lane 1 presumably gets emitted_prims as the value and 24 as
              * the offset via ac_build_writelane. NOTE(review): offset 24 is presumably the
              * emitted-primitives counter of stream 0 - confirm.
              */
1338          if (sel->so.num_outputs) {
1339             args[0] = ac_build_writelane(&ctx->ac, args[0], emitted_prims, ctx->ac.i32_1);
1340             args[2] = ac_build_writelane(&ctx->ac, args[2], LLVMConstInt(ctx->ac.i32, 24, false),
1341                                          ctx->ac.i32_1);
1342          }
1343
1344          /* TODO: should this be 64-bit atomics? */
1345          ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32", ctx->ac.i32, args, 5,
1346                             0);
1347       }
1348       ac_build_endif(&ctx->ac, 5031);
1349       ac_build_endif(&ctx->ac, 5030);
1350       ac_build_endif(&ctx->ac, 5029);
1351    }
1352
1353    /* Build the primitive export. */
1354    if (!gfx10_ngg_export_prim_early(ctx->shader)) {
1355       assert(!unterminated_es_if_block);
1356       gfx10_ngg_build_export_prim(ctx, user_edgeflags, NULL);
1357    }
1358
1359    /* Export per-vertex data (positions and parameters). */
1360    if (!unterminated_es_if_block)
1361       ac_build_ifcc(&ctx->ac, is_es_thread, 6002);
1362    {
1363       unsigned i;
1364
1365       /* Unconditionally (re-)load the values for proper SSA form. */
1366       for (i = 0; i < info->num_outputs; i++) {
1367          /* If the NGG cull shader part computed the position, don't
1368           * use the position from the current shader part. Instead,
1369           * load it from LDS.
1370           */
1371          if (info->output_semantic_name[i] == TGSI_SEMANTIC_POSITION &&
1372              ctx->shader->key.opt.ngg_culling) {
                /* The position lives in the LDS slot of the thread ID from before compaction. */
1373             vertex_ptr = ngg_nogs_vertex_ptr(ctx, ac_get_arg(&ctx->ac, ctx->ngg_old_thread_id));
1374
1375             for (unsigned j = 0; j < 4; j++) {
1376                tmp = LLVMConstInt(ctx->ac.i32, lds_pos_x + j, 0);
1377                tmp = ac_build_gep0(&ctx->ac, vertex_ptr, tmp);
1378                tmp = LLVMBuildLoad(builder, tmp, "");
1379                outputs[i].values[j] = ac_to_float(&ctx->ac, tmp);
1380             }
1381          } else {
1382             for (unsigned j = 0; j < 4; j++) {
1383                outputs[i].values[j] = LLVMBuildLoad(builder, addrs[4 * i + j], "");
1384             }
1385          }
1386       }
1387
          /* Append PRIMID as one extra output after the regular ones (i == info->num_outputs here).
           * NOTE(review): this needs info->num_outputs < PIPE_MAX_SHADER_OUTPUTS to stay within
           * outputs[] - confirm that callers guarantee it.
           */
1388       if (ctx->shader->key.mono.u.vs_export_prim_id) {
1389          outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID;
1390          outputs[i].semantic_index = 0;
1391
1392          if (ctx->type == PIPE_SHADER_VERTEX) {
1393             /* Wait for GS stores to finish. */
1394             ac_build_s_barrier(&ctx->ac);
1395
                /* Read back dword 0 of this thread's LDS storage, where the GS thread stored the ID. */
1396             tmp = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));
1397             tmp = ac_build_gep0(&ctx->ac, tmp, ctx->ac.i32_0);
1398             outputs[i].values[0] = LLVMBuildLoad(builder, tmp, "");
1399          } else {
1400             assert(ctx->type == PIPE_SHADER_TESS_EVAL);
1401             outputs[i].values[0] = si_get_primitive_id(ctx, 0);
1402          }
1403
1404          outputs[i].values[0] = ac_to_float(&ctx->ac, outputs[i].values[0]);
1405          for (unsigned j = 1; j < 4; j++)
1406             outputs[i].values[j] = LLVMGetUndef(ctx->ac.f32);
1407
1408          memset(outputs[i].vertex_stream, 0, sizeof(outputs[i].vertex_stream));
1409          i++;
1410       }
1411
1412       si_llvm_build_vs_exports(ctx, outputs, i);
1413    }
        /* NOTE(review): 6002 is passed even when unterminated_es_if_block is set and no
         * if-block with that label was opened here; presumably ac_build_endif closes the
         * innermost open block regardless of the label - confirm.
         */
1414    ac_build_endif(&ctx->ac, 6002);
1415 }
1416
1417 static LLVMValueRef ngg_gs_get_vertex_storage(struct si_shader_context *ctx)
1418 {
1419    const struct si_shader_selector *sel = ctx->shader->selector;
1420    const struct si_shader_info *info = &sel->info;
1421
1422    LLVMTypeRef elements[2] = {
1423       LLVMArrayType(ctx->ac.i32, 4 * info->num_outputs),
1424       LLVMArrayType(ctx->ac.i8, 4),
1425    };
1426    LLVMTypeRef type = LLVMStructTypeInContext(ctx->ac.context, elements, 2, false);
1427    type = LLVMPointerType(LLVMArrayType(type, 0), AC_ADDR_SPACE_LDS);
1428    return LLVMBuildBitCast(ctx->ac.builder, ctx->gs_ngg_emit, type, "");
1429 }
1430
1431 /**
1432  * Return a pointer to the LDS storage reserved for the N'th vertex, where N
1433  * is in emit order; that is:
1434  * - during the epilogue, N is the threadidx (relative to the entire threadgroup)
1435  * - during vertex emit, i.e. while the API GS shader invocation is running,
1436  *   N = threadidx * gs_max_out_vertices + emitidx
1437  *
1438  * Goals of the LDS memory layout:
1439  * 1. Eliminate bank conflicts on write for geometry shaders that have all emits
1440  *    in uniform control flow
1441  * 2. Eliminate bank conflicts on read for export if, additionally, there is no
1442  *    culling
1443  * 3. Agnostic to the number of waves (since we don't know it before compiling)
1444  * 4. Allow coalescing of LDS instructions (ds_write_b128 etc.)
1445  * 5. Avoid wasting memory.
1446  *
1447  * We use an AoS layout due to point 4 (this also helps point 3). In an AoS
1448  * layout, elimination of bank conflicts requires that each vertex occupy an
1449  * odd number of dwords. We use the additional dword to store the output stream
1450  * index as well as a flag to indicate whether this vertex ends a primitive
1451  * for rasterization.
1452  *
1453  * Swizzling is required to satisfy points 1 and 2 simultaneously.
1454  *
1455  * Vertices are stored in export order (gsthread * gs_max_out_vertices + emitidx).
1456  * Indices are swizzled in groups of 32, which ensures point 1 without
1457  * disturbing point 2.
1458  *
1459  * \return an LDS pointer to type {[N x i32], [4 x i8]}
1460  */
1461 static LLVMValueRef ngg_gs_vertex_ptr(struct si_shader_context *ctx, LLVMValueRef vertexidx)
1462 {
1463    struct si_shader_selector *sel = ctx->shader->selector;
1464    LLVMBuilderRef builder = ctx->ac.builder;
1465    LLVMValueRef storage = ngg_gs_get_vertex_storage(ctx);
1466
1467    /* gs_max_out_vertices = 2^(write_stride_2exp) * some odd number */
1468    unsigned write_stride_2exp = ffs(sel->gs_max_out_vertices) - 1;
1469    if (write_stride_2exp) {
1470       LLVMValueRef row = LLVMBuildLShr(builder, vertexidx, LLVMConstInt(ctx->ac.i32, 5, false), "");
1471       LLVMValueRef swizzle = LLVMBuildAnd(
1472          builder, row, LLVMConstInt(ctx->ac.i32, (1u << write_stride_2exp) - 1, false), "");
1473       vertexidx = LLVMBuildXor(builder, vertexidx, swizzle, "");
1474    }
1475
1476    return ac_build_gep0(&ctx->ac, storage, vertexidx);
1477 }
1478
1479 static LLVMValueRef ngg_gs_emit_vertex_ptr(struct si_shader_context *ctx, LLVMValueRef gsthread,
1480                                            LLVMValueRef emitidx)
1481 {
1482    struct si_shader_selector *sel = ctx->shader->selector;
1483    LLVMBuilderRef builder = ctx->ac.builder;
1484    LLVMValueRef tmp;
1485
1486    tmp = LLVMConstInt(ctx->ac.i32, sel->gs_max_out_vertices, false);
1487    tmp = LLVMBuildMul(builder, tmp, gsthread, "");
1488    const LLVMValueRef vertexidx = LLVMBuildAdd(builder, tmp, emitidx, "");
1489    return ngg_gs_vertex_ptr(ctx, vertexidx);
1490 }
1491
1492 static LLVMValueRef ngg_gs_get_emit_output_ptr(struct si_shader_context *ctx,
1493                                                LLVMValueRef vertexptr, unsigned out_idx)
1494 {
1495    LLVMValueRef gep_idx[3] = {
1496       ctx->ac.i32_0, /* implied C-style array */
1497       ctx->ac.i32_0, /* first struct entry */
1498       LLVMConstInt(ctx->ac.i32, out_idx, false),
1499    };
1500    return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, "");
1501 }
1502
1503 static LLVMValueRef ngg_gs_get_emit_primflag_ptr(struct si_shader_context *ctx,
1504                                                  LLVMValueRef vertexptr, unsigned stream)
1505 {
1506    LLVMValueRef gep_idx[3] = {
1507       ctx->ac.i32_0, /* implied C-style array */
1508       ctx->ac.i32_1, /* second struct entry */
1509       LLVMConstInt(ctx->ac.i32, stream, false),
1510    };
1511    return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, "");
1512 }
1513
1514 void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx, unsigned stream, LLVMValueRef *addrs)
1515 {
1516    const struct si_shader_selector *sel = ctx->shader->selector;
1517    const struct si_shader_info *info = &sel->info;
1518    LLVMBuilderRef builder = ctx->ac.builder;
1519    LLVMValueRef tmp;
1520    const LLVMValueRef vertexidx = LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], "");
1521
1522    /* If this thread has already emitted the declared maximum number of
1523     * vertices, skip the write: excessive vertex emissions are not
1524     * supposed to have any effect.
1525     */
1526    const LLVMValueRef can_emit =
1527       LLVMBuildICmp(builder, LLVMIntULT, vertexidx,
1528                     LLVMConstInt(ctx->ac.i32, sel->gs_max_out_vertices, false), "");
1529
1530    tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");
1531    tmp = LLVMBuildSelect(builder, can_emit, tmp, vertexidx, "");
1532    LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]);
1533
1534    ac_build_ifcc(&ctx->ac, can_emit, 9001);
1535
1536    const LLVMValueRef vertexptr = ngg_gs_emit_vertex_ptr(ctx, get_thread_id_in_tg(ctx), vertexidx);
1537    unsigned out_idx = 0;
1538    for (unsigned i = 0; i < info->num_outputs; i++) {
1539       for (unsigned chan = 0; chan < 4; chan++, out_idx++) {
1540          if (!(info->output_usagemask[i] & (1 << chan)) ||
1541              ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
1542             continue;
1543
1544          LLVMValueRef out_val = LLVMBuildLoad(builder, addrs[4 * i + chan], "");
1545          out_val = ac_to_integer(&ctx->ac, out_val);
1546          LLVMBuildStore(builder, out_val, ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx));
1547       }
1548    }
1549    assert(out_idx * 4 == sel->gsvs_vertex_size);
1550
1551    /* Determine and store whether this vertex completed a primitive. */
1552    const LLVMValueRef curverts = LLVMBuildLoad(builder, ctx->gs_curprim_verts[stream], "");
1553
1554    tmp = LLVMConstInt(ctx->ac.i32, u_vertices_per_prim(sel->gs_output_prim) - 1, false);
1555    const LLVMValueRef iscompleteprim = LLVMBuildICmp(builder, LLVMIntUGE, curverts, tmp, "");
1556
1557    /* Since the geometry shader emits triangle strips, we need to
1558     * track which primitive is odd and swap vertex indices to get
1559     * the correct vertex order.
1560     */
1561    LLVMValueRef is_odd = ctx->ac.i1false;
1562    if (stream == 0 && u_vertices_per_prim(sel->gs_output_prim) == 3) {
1563       tmp = LLVMBuildAnd(builder, curverts, ctx->ac.i32_1, "");
1564       is_odd = LLVMBuildICmp(builder, LLVMIntEQ, tmp, ctx->ac.i32_1, "");
1565    }
1566
1567    tmp = LLVMBuildAdd(builder, curverts, ctx->ac.i32_1, "");
1568    LLVMBuildStore(builder, tmp, ctx->gs_curprim_verts[stream]);
1569
1570    /* The per-vertex primitive flag encoding:
1571     *   bit 0: whether this vertex finishes a primitive
1572     *   bit 1: whether the primitive is odd (if we are emitting triangle strips)
1573     */
1574    tmp = LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i8, "");
1575    tmp = LLVMBuildOr(
1576       builder, tmp,
1577       LLVMBuildShl(builder, LLVMBuildZExt(builder, is_odd, ctx->ac.i8, ""), ctx->ac.i8_1, ""), "");
1578    LLVMBuildStore(builder, tmp, ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream));
1579
1580    tmp = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], "");
1581    tmp = LLVMBuildAdd(builder, tmp, LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i32, ""), "");
1582    LLVMBuildStore(builder, tmp, ctx->gs_generated_prims[stream]);
1583
1584    ac_build_endif(&ctx->ac, 9001);
1585 }
1586
1587 void gfx10_ngg_gs_emit_prologue(struct si_shader_context *ctx)
1588 {
1589    /* Zero out the part of LDS scratch that is used to accumulate the
1590     * per-stream generated primitive count.
1591     */
1592    LLVMBuilderRef builder = ctx->ac.builder;
1593    LLVMValueRef scratchptr = ctx->gs_ngg_scratch;
1594    LLVMValueRef tid = get_thread_id_in_tg(ctx);
1595    LLVMValueRef tmp;
1596
1597    tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, LLVMConstInt(ctx->ac.i32, 4, false), "");
1598    ac_build_ifcc(&ctx->ac, tmp, 5090);
1599    {
1600       LLVMValueRef ptr = ac_build_gep0(&ctx->ac, scratchptr, tid);
1601       LLVMBuildStore(builder, ctx->ac.i32_0, ptr);
1602    }
1603    ac_build_endif(&ctx->ac, 5090);
1604
1605    ac_build_s_barrier(&ctx->ac);
1606 }
1607
/**
 * Build the code that finishes an NGG geometry shader.
 *
 * In order, the generated code:
 *  1. clears the primitive flags of the vertex slots each GS thread did not
 *     emit,
 *  2. adds each wave's generated-primitive count to the per-stream counters
 *     in LDS scratch,
 *  3. closes the if-block labelled ctx->merged_wrap_if_label and waits on a
 *     barrier,
 *  4. performs streamout (if the shader has streamout outputs) and writes
 *     shader query data (if the screen uses NGG streamout),
 *  5. determines which vertices are live and computes a compaction of them
 *     with a workgroup scan,
 *  6. sends the GS alloc request, then exports primitives and finally the
 *     positions/parameters of the compacted vertices.
 */
void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx)
{
   const struct si_shader_selector *sel = ctx->shader->selector;
   const struct si_shader_info *info = &sel->info;
   const unsigned verts_per_prim = u_vertices_per_prim(sel->gs_output_prim);
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef i8_0 = LLVMConstInt(ctx->ac.i8, 0, false);
   LLVMValueRef tmp, tmp2;

   /* Zero out remaining (non-emitted) primitive flags.
    *
    * Note: Alternatively, we could pass the relevant gs_next_vertex to
    *       the emit threads via LDS. This is likely worse in the expected
    *       typical case where each GS thread emits the full set of
    *       vertices.
    */
   for (unsigned stream = 0; stream < 4; ++stream) {
      if (!info->num_stream_output_components[stream])
         continue;

      const LLVMValueRef gsthread = get_thread_id_in_tg(ctx);

      /* Loop from this thread's next free emit index up to
       * gs_max_out_vertices, clearing the stream's flag byte in each slot.
       */
      ac_build_bgnloop(&ctx->ac, 5100);

      const LLVMValueRef vertexidx = LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], "");
      tmp = LLVMBuildICmp(builder, LLVMIntUGE, vertexidx,
                          LLVMConstInt(ctx->ac.i32, sel->gs_max_out_vertices, false), "");
      ac_build_ifcc(&ctx->ac, tmp, 5101);
      ac_build_break(&ctx->ac);
      ac_build_endif(&ctx->ac, 5101);

      tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");
      LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]);

      tmp = ngg_gs_emit_vertex_ptr(ctx, gsthread, vertexidx);
      LLVMBuildStore(builder, i8_0, ngg_gs_get_emit_primflag_ptr(ctx, tmp, stream));

      ac_build_endloop(&ctx->ac, 5100);
   }

   /* Accumulate generated primitives counts across the entire threadgroup. */
   for (unsigned stream = 0; stream < 4; ++stream) {
      if (!info->num_stream_output_components[stream])
         continue;

      /* Sum the per-thread counts over ctx->ac.wave_size lanes. */
      LLVMValueRef numprims = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], "");
      numprims = ac_build_reduce(&ctx->ac, numprims, nir_op_iadd, ctx->ac.wave_size);

      /* Only the thread whose ac_get_thread_id() is 0 adds the sum to the
       * scratch dword of this stream, using an atomic add.
       */
      tmp = LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(&ctx->ac), ctx->ac.i32_0, "");
      ac_build_ifcc(&ctx->ac, tmp, 5105);
      {
         LLVMBuildAtomicRMW(
            builder, LLVMAtomicRMWBinOpAdd,
            ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, LLVMConstInt(ctx->ac.i32, stream, false)),
            numprims, LLVMAtomicOrderingMonotonic, false);
      }
      ac_build_endif(&ctx->ac, 5105);
   }

   /* Close the if-block labelled merged_wrap_if_label; it is not opened
    * anywhere in this function.
    */
   ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);

   ac_build_s_barrier(&ctx->ac);

   const LLVMValueRef tid = get_thread_id_in_tg(ctx);
   LLVMValueRef num_emit_threads = ngg_get_prim_cnt(ctx);

   /* Streamout */
   if (sel->so.num_outputs) {
      struct ngg_streamout nggso = {};

      nggso.num_vertices = LLVMConstInt(ctx->ac.i32, verts_per_prim, false);

      /* A primitive is enabled for a stream when bit 0 of the stream's flag
       * byte of vertex `tid` is set and tid is below num_emit_threads.
       */
      LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tid);
      for (unsigned stream = 0; stream < 4; ++stream) {
         if (!info->num_stream_output_components[stream])
            continue;

         tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream), "");
         tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
         tmp2 = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
         nggso.prim_enable[stream] = LLVMBuildAnd(builder, tmp, tmp2, "");
      }

      /* The vertices of the primitive are the slots
       * tid - (verts_per_prim - 1) ... tid, in that order.
       */
      for (unsigned i = 0; i < verts_per_prim; ++i) {
         tmp = LLVMBuildSub(builder, tid, LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false),
                            "");
         tmp = ngg_gs_vertex_ptr(ctx, tmp);
         nggso.vertices[i] = ac_build_gep0(&ctx->ac, tmp, ctx->ac.i32_0);
      }

      build_streamout(ctx, &nggso);
   }

   /* Write shader query data. */
   if (ctx->screen->use_ngg_streamout) {
      /* Bit 6 of vs_state_bits gates the whole query write. */
      tmp = si_unpack_param(ctx, ctx->vs_state_bits, 6, 1);
      tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
      ac_build_ifcc(&ctx->ac, tmp, 5109); /* if (STREAMOUT_QUERY_ENABLED) */
      /* One thread per scratch dword to write: 4 without streamout outputs,
       * 8 with them.
       */
      unsigned num_query_comps = sel->so.num_outputs ? 8 : 4;
      tmp = LLVMBuildICmp(builder, LLVMIntULT, tid,
                          LLVMConstInt(ctx->ac.i32, num_query_comps, false), "");
      ac_build_ifcc(&ctx->ac, tmp, 5110);
      {
         /* Buffer offset of the counter updated by this thread:
          *   without streamout outputs: tid * 32
          *   with streamout outputs:    (tid & 3) * 32 + (tid >> 2) * 8
          */
         LLVMValueRef offset;
         tmp = tid;
         if (sel->so.num_outputs)
            tmp = LLVMBuildAnd(builder, tmp, LLVMConstInt(ctx->ac.i32, 3, false), "");
         offset = LLVMBuildNUWMul(builder, tmp, LLVMConstInt(ctx->ac.i32, 32, false), "");
         if (sel->so.num_outputs) {
            tmp = LLVMBuildLShr(builder, tid, LLVMConstInt(ctx->ac.i32, 2, false), "");
            tmp = LLVMBuildNUWMul(builder, tmp, LLVMConstInt(ctx->ac.i32, 8, false), "");
            offset = LLVMBuildAdd(builder, offset, tmp, "");
         }

         /* Atomically add scratch[tid] to the query buffer. */
         tmp = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid), "");
         LLVMValueRef args[] = {
            tmp,           ngg_get_query_buf(ctx),
            offset,        LLVMConstInt(ctx->ac.i32, 16, false), /* soffset */
            ctx->ac.i32_0,                                       /* cachepolicy */
         };
         ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32", ctx->ac.i32, args, 5,
                            0);
      }
      ac_build_endif(&ctx->ac, 5110);
      ac_build_endif(&ctx->ac, 5109);
   }

   /* Determine vertex liveness.
    *
    * Vertex `tid` is live when any of the primitives ending at vertex
    * tid + i (0 <= i < verts_per_prim, tid + i < num_emit_threads) has
    * bit 0 of its stream 0 flag byte set.
    */
   LLVMValueRef vertliveptr = ac_build_alloca(&ctx->ac, ctx->ac.i1, "vertexlive");

   tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
   ac_build_ifcc(&ctx->ac, tmp, 5120);
   {
      for (unsigned i = 0; i < verts_per_prim; ++i) {
         const LLVMValueRef primidx =
            LLVMBuildAdd(builder, tid, LLVMConstInt(ctx->ac.i32, i, false), "");

         /* For i == 0 the enclosing if already checked the bound. */
         if (i > 0) {
            tmp = LLVMBuildICmp(builder, LLVMIntULT, primidx, num_emit_threads, "");
            ac_build_ifcc(&ctx->ac, tmp, 5121 + i);
         }

         /* Load primitive liveness */
         tmp = ngg_gs_vertex_ptr(ctx, primidx);
         tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), "");
         const LLVMValueRef primlive = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");

         /* OR the primitive's liveness into the vertex's liveness and store
          * it back (the two calls are joined by a comma operator).
          */
         tmp = LLVMBuildLoad(builder, vertliveptr, "");
         tmp = LLVMBuildOr(builder, tmp, primlive, ""), LLVMBuildStore(builder, tmp, vertliveptr);

         if (i > 0)
            ac_build_endif(&ctx->ac, 5121 + i);
      }
   }
   ac_build_endif(&ctx->ac, 5120);

   /* Workgroup scan (addition) over the liveness bits. Both the total
    * (result_reduce) and the exclusive prefix (result_exclusive) are
    * requested; the scan uses LDS scratch starting at dword 0.
    */
   LLVMValueRef vertlive = LLVMBuildLoad(builder, vertliveptr, "");
   struct ac_wg_scan vertlive_scan = {};
   vertlive_scan.op = nir_op_iadd;
   vertlive_scan.enable_reduce = true;
   vertlive_scan.enable_exclusive = true;
   vertlive_scan.src = vertlive;
   vertlive_scan.scratch = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ctx->ac.i32_0);
   vertlive_scan.waveidx = get_wave_id_in_tg(ctx);
   vertlive_scan.numwaves = get_tgsize(ctx);
   vertlive_scan.maxwaves = 8;

   ac_build_wg_scan(&ctx->ac, &vertlive_scan);

   /* Skip all exports (including index exports) when possible. At least on
    * early gfx10 revisions this is also to avoid hangs.
    */
   LLVMValueRef have_exports =
      LLVMBuildICmp(builder, LLVMIntNE, vertlive_scan.result_reduce, ctx->ac.i32_0, "");
   num_emit_threads = LLVMBuildSelect(builder, have_exports, num_emit_threads, ctx->ac.i32_0, "");

   /* Allocate export space. Send this message as early as possible, to
    * hide the latency of the SQ <-> SPI roundtrip.
    *
    * Note: We could consider compacting primitives for export as well.
    *       PA processes 1 non-null prim / clock, but it fetches 4 DW of
    *       prim data per clock and skips null primitives at no additional
    *       cost. So compacting primitives can only be beneficial when
    *       there are 4 or more contiguous null primitives in the export
    *       (in the common case of single-dword prim exports).
    */
   ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), vertlive_scan.result_reduce,
                                 num_emit_threads);

   /* Setup the reverse vertex compaction permutation. We re-use stream 1
    * of the primitive liveness flags, relying on the fact that each
    * threadgroup can have at most 256 threads.
    *
    * Each live vertex writes its own index (truncated to 8 bits) into the
    * slot given by its exclusive scan result, so slot k ends up naming the
    * k'th live vertex.
    */
   ac_build_ifcc(&ctx->ac, vertlive, 5130);
   {
      tmp = ngg_gs_vertex_ptr(ctx, vertlive_scan.result_exclusive);
      tmp2 = LLVMBuildTrunc(builder, tid, ctx->ac.i8, "");
      LLVMBuildStore(builder, tmp2, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1));
   }
   ac_build_endif(&ctx->ac, 5130);

   ac_build_s_barrier(&ctx->ac);

   /* Export primitive data */
   tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
   ac_build_ifcc(&ctx->ac, tmp, 5140);
   {
      LLVMValueRef flags;
      struct ac_ngg_prim prim = {};
      prim.num_vertices = verts_per_prim;

      /* The primitive is null unless bit 0 of the stream 0 flag is set. */
      tmp = ngg_gs_vertex_ptr(ctx, tid);
      flags = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), "");
      prim.isnull = LLVMBuildNot(builder, LLVMBuildTrunc(builder, flags, ctx->ac.i1, ""), "");

      /* Vertex indices are expressed in compacted numbering: the exclusive
       * scan result of this thread and the verts_per_prim - 1 values below.
       */
      for (unsigned i = 0; i < verts_per_prim; ++i) {
         prim.index[i] = LLVMBuildSub(builder, vertlive_scan.result_exclusive,
                                      LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false), "");
         prim.edgeflag[i] = ctx->ac.i1false;
      }

      /* Geometry shaders output triangle strips, but NGG expects triangles. */
      if (verts_per_prim == 3) {
         /* Bit 1 of the flag byte is the odd-primitive bit. */
         LLVMValueRef is_odd = LLVMBuildLShr(builder, flags, ctx->ac.i8_1, "");
         is_odd = LLVMBuildTrunc(builder, is_odd, ctx->ac.i1, "");
         /* True when the 2-bit field at bit 4 of vs_state_bits is zero. */
         LLVMValueRef flatshade_first = LLVMBuildICmp(
            builder, LLVMIntEQ, si_unpack_param(ctx, ctx->vs_state_bits, 4, 2), ctx->ac.i32_0, "");

         ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd, flatshade_first, prim.index);
      }

      ac_build_export_prim(&ctx->ac, &prim);
   }
   ac_build_endif(&ctx->ac, 5140);

   /* Export position and parameter data */
   tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, vertlive_scan.result_reduce, "");
   ac_build_ifcc(&ctx->ac, tmp, 5145);
   {
      struct si_shader_output_values outputs[PIPE_MAX_SHADER_OUTPUTS];

      /* Slot `tid` holds, in its stream 1 flag byte, the index of the
       * vertex this thread has to export (see the compaction above).
       */
      tmp = ngg_gs_vertex_ptr(ctx, tid);
      tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1), "");
      tmp = LLVMBuildZExt(builder, tmp, ctx->ac.i32, "");
      const LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tmp);

      /* Read back every channel of every output from the vertex record. */
      unsigned out_idx = 0;
      for (unsigned i = 0; i < info->num_outputs; i++) {
         outputs[i].semantic_name = info->output_semantic_name[i];
         outputs[i].semantic_index = info->output_semantic_index[i];

         for (unsigned j = 0; j < 4; j++, out_idx++) {
            tmp = ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx);
            tmp = LLVMBuildLoad(builder, tmp, "");
            outputs[i].values[j] = ac_to_float(&ctx->ac, tmp);
            outputs[i].vertex_stream[j] = (info->output_streams[i] >> (2 * j)) & 3;
         }
      }

      si_llvm_build_vs_exports(ctx, outputs, info->num_outputs);
   }
   ac_build_endif(&ctx->ac, 5145);
}
1871
/**
 * Clamp the number of GS primitives to what a given number of ES vertices
 * can supply.
 *
 * The first primitive consumes \p min_verts_per_prim vertices. Each further
 * primitive is assumed to need one additional vertex, or two additional
 * vertices when \p use_adjacency is set.
 *
 * \param max_gsprims         in/out: primitive limit; only ever lowered
 * \param max_esverts         number of ES vertices available
 * \param min_verts_per_prim  vertices required by the first primitive
 * \param use_adjacency       whether the input primitive type has adjacency
 *
 * If \p max_esverts is smaller than \p min_verts_per_prim, not even one
 * primitive fits and *max_gsprims is set to 0. (The unsigned subtraction
 * would otherwise wrap around and disable the clamp.)
 */
static void clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts,
                                     unsigned min_verts_per_prim, bool use_adjacency)
{
   if (max_esverts < min_verts_per_prim) {
      *max_gsprims = 0;
      return;
   }

   unsigned max_reuse = max_esverts - min_verts_per_prim;
   if (use_adjacency)
      max_reuse /= 2;

   /* Equivalent to min(*max_gsprims, 1 + max_reuse), written so that the
    * addition cannot overflow.
    */
   if (*max_gsprims > max_reuse)
      *max_gsprims = max_reuse + 1;
}
1880
/**
 * Determine subgroup information like maximum number of vertices and prims.
 *
 * This happens before the shader is uploaded, since LDS relocations during
 * upload depend on the subgroup size.
 *
 * The results are written to shader->ngg (hw_max_esverts, max_gsprims,
 * max_out_verts, prim_amp_factor, max_vert_out_per_gs_instance,
 * ngg_emit_size) and to shader->gs_info.esgs_ring_size.
 *
 * \return true if the computed limits satisfy the conditions that are also
 *         checked by the assertions in this function, false otherwise
 */
bool gfx10_ngg_calculate_subgroup_info(struct si_shader *shader)
{
   const struct si_shader_selector *gs_sel = shader->selector;
   /* Without a previous stage, the same selector acts as the ES. */
   const struct si_shader_selector *es_sel =
      shader->previous_stage_sel ? shader->previous_stage_sel : gs_sel;
   const enum pipe_shader_type gs_type = gs_sel->type;
   const unsigned gs_num_invocations = MAX2(gs_sel->gs_num_invocations, 1);
   const unsigned input_prim = si_get_input_prim(gs_sel);
   const bool use_adjacency =
      input_prim >= PIPE_PRIM_LINES_ADJACENCY && input_prim <= PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY;
   const unsigned max_verts_per_prim = u_vertices_per_prim(input_prim);
   const unsigned min_verts_per_prim = gs_type == PIPE_SHADER_GEOMETRY ? max_verts_per_prim : 1;

   /* All these are in dwords: */
   /* We can't allow using the whole LDS, because GS waves compete with
    * other shader stages for LDS space.
    *
    * TODO: We should really take the shader's internal LDS use into
    *       account. The linker will fail if the size is greater than
    *       8K dwords.
    */
   const unsigned max_lds_size = 8 * 1024 - 768;
   const unsigned target_lds_size = max_lds_size;
   unsigned esvert_lds_size = 0;
   unsigned gsprim_lds_size = 0;

   /* All these are per subgroup: */
   bool max_vert_out_per_gs_instance = false;
   unsigned max_gsprims_base = 128; /* default prim group size clamp */
   unsigned max_esverts_base = 128;

   if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) {
      max_gsprims_base = 128 / 3;
      max_esverts_base = max_gsprims_base * 3;
   } else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) {
      max_gsprims_base = 126;
      max_esverts_base = 128;
   }

   /* Hardware has the following non-natural restrictions on the value
    * of GE_CNTL.VERT_GRP_SIZE based on the primitive type of
    * the draw:
    *  - at most 252 for any line input primitive type
    *  - at most 251 for any quad input primitive type
    *  - at most 251 for triangle strips with adjacency (this happens to
    *    be the natural limit for triangle *lists* with adjacency)
    */
   max_esverts_base = MIN2(max_esverts_base, 251 + max_verts_per_prim - 1);

   if (gs_type == PIPE_SHADER_GEOMETRY) {
      bool force_multi_cycling = false;
      unsigned max_out_verts_per_gsprim = gs_sel->gs_max_out_vertices * gs_num_invocations;

      /* Entered a second time (with force_multi_cycling set) when the
       * per-primitive LDS size of the first attempt exceeds the target.
       */
retry_select_mode:
      if (max_out_verts_per_gsprim <= 256 && !force_multi_cycling) {
         /* The guard avoids a division by zero when the shader declares
          * no output vertices.
          */
         if (max_out_verts_per_gsprim) {
            max_gsprims_base = MIN2(max_gsprims_base, 256 / max_out_verts_per_gsprim);
         }
      } else {
         /* Use special multi-cycling mode in which each GS
          * instance gets its own subgroup. Does not work with
          * tessellation. */
         max_vert_out_per_gs_instance = true;
         max_gsprims_base = 1;
         max_out_verts_per_gsprim = gs_sel->gs_max_out_vertices;
      }

      /* esgs_itemsize and gsvs_vertex_size are divided by 4 to convert
       * them to dwords; the "+ 1" is one extra dword per output vertex.
       */
      esvert_lds_size = es_sel->esgs_itemsize / 4;
      gsprim_lds_size = (gs_sel->gsvs_vertex_size / 4 + 1) * max_out_verts_per_gsprim;

      if (gsprim_lds_size > target_lds_size && !force_multi_cycling) {
         if (gs_sel->tess_turns_off_ngg || es_sel->type != PIPE_SHADER_TESS_EVAL) {
            force_multi_cycling = true;
            goto retry_select_mode;
         }
      }
   } else {
      /* VS and TES. */
      /* LDS size for passing data from ES to GS. */
      esvert_lds_size = ngg_nogs_vertex_size(shader);
   }

   unsigned max_gsprims = max_gsprims_base;
   unsigned max_esverts = max_esverts_base;

   /* Neither kind of data may exceed the LDS target on its own. */
   if (esvert_lds_size)
      max_esverts = MIN2(max_esverts, target_lds_size / esvert_lds_size);
   if (gsprim_lds_size)
      max_gsprims = MIN2(max_gsprims, target_lds_size / gsprim_lds_size);

   /* Keep the two limits proportionate to each other. */
   max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
   clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency);
   assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);

   if (esvert_lds_size || gsprim_lds_size) {
      /* Now that we have a rough proportionality between esverts
       * and gsprims based on the primitive type, scale both of them
       * down simultaneously based on required LDS space.
       *
       * We could be smarter about this if we knew how much vertex
       * reuse to expect.
       */
      unsigned lds_total = max_esverts * esvert_lds_size + max_gsprims * gsprim_lds_size;
      if (lds_total > target_lds_size) {
         max_esverts = max_esverts * target_lds_size / lds_total;
         max_gsprims = max_gsprims * target_lds_size / lds_total;

         max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
         clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency);
         assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
      }
   }

   /* Round up towards full wave sizes for better ALU utilization. */
   if (!max_vert_out_per_gs_instance) {
      const unsigned wavesize = gs_sel->screen->ge_wave_size;
      unsigned orig_max_esverts;
      unsigned orig_max_gsprims;
      /* Rounding one limit up can force the other one down again, so
       * repeat until neither value changes.
       */
      do {
         orig_max_esverts = max_esverts;
         orig_max_gsprims = max_gsprims;

         max_esverts = align(max_esverts, wavesize);
         max_esverts = MIN2(max_esverts, max_esverts_base);
         if (esvert_lds_size)
            max_esverts =
               MIN2(max_esverts, (max_lds_size - max_gsprims * gsprim_lds_size) / esvert_lds_size);
         max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);

         max_gsprims = align(max_gsprims, wavesize);
         max_gsprims = MIN2(max_gsprims, max_gsprims_base);
         if (gsprim_lds_size)
            max_gsprims =
               MIN2(max_gsprims, (max_lds_size - max_esverts * esvert_lds_size) / gsprim_lds_size);
         clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency);
         assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
      } while (orig_max_esverts != max_esverts || orig_max_gsprims != max_gsprims);
   }

   /* Hardware restriction: minimum value of max_esverts */
   max_esverts = MAX2(max_esverts, 23 + max_verts_per_prim);

   /* Number of vertices the subgroup can output:
    *  - multi-cycling mode: those of a single GS instance
    *  - other geometry shaders: all invocations of all primitives
    *  - VS/TES: one per ES vertex
    */
   unsigned max_out_vertices =
      max_vert_out_per_gs_instance
         ? gs_sel->gs_max_out_vertices
         : gs_type == PIPE_SHADER_GEOMETRY
              ? max_gsprims * gs_num_invocations * gs_sel->gs_max_out_vertices
              : max_esverts;
   assert(max_out_vertices <= 256);

   unsigned prim_amp_factor = 1;
   if (gs_type == PIPE_SHADER_GEOMETRY) {
      /* Number of output primitives per GS input primitive after
       * GS instancing. */
      prim_amp_factor = gs_sel->gs_max_out_vertices;
   }

   /* The GE only checks against the maximum number of ES verts after
    * allocating a full GS primitive. So we need to ensure that whenever
    * this check passes, there is enough space for a full primitive without
    * vertex reuse.
    */
   shader->ngg.hw_max_esverts = max_esverts - max_verts_per_prim + 1;
   shader->ngg.max_gsprims = max_gsprims;
   shader->ngg.max_out_verts = max_out_vertices;
   shader->ngg.prim_amp_factor = prim_amp_factor;
   shader->ngg.max_vert_out_per_gs_instance = max_vert_out_per_gs_instance;

   /* esgs_ring_size is in bytes (4 bytes per dword); ngg_emit_size stays
    * in dwords.
    */
   shader->gs_info.esgs_ring_size = 4 * max_esverts * esvert_lds_size;
   shader->ngg.ngg_emit_size = max_gsprims * gsprim_lds_size;

   assert(shader->ngg.hw_max_esverts >= 24); /* HW limitation */

   /* If asserts are disabled, we use the same conditions to return false
    *
    * NOTE(review): max_esverts was raised to at least
    * 23 + max_verts_per_prim above, so the max_esverts and hw_max_esverts
    * conditions can no longer fail at this point; only max_gsprims and
    * max_out_vertices can still cause a false return.
    */
   return max_esverts >= max_verts_per_prim && max_gsprims >= 1 &&
          max_out_vertices <= 256 &&
          shader->ngg.hw_max_esverts >= 24;
}