radeonsi: change PIPE_SHADER to MESA_SHADER (si_shader_context::type)
mesa.git: src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
1 /*
2 * Copyright 2017 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 */
23
24 #include "ac_llvm_cull.h"
25 #include "si_pipe.h"
26 #include "si_shader_internal.h"
27 #include "sid.h"
28 #include "util/u_memory.h"
29 #include "util/u_prim.h"
30
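/* Return the wave index within the threadgroup, from bits [24..27] of the
 * merged_wave_info SGPR.
 */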
31 static LLVMValueRef get_wave_id_in_tg(struct si_shader_context *ctx)
32 {
33 return si_unpack_param(ctx, ctx->merged_wave_info, 24, 4);
34 }
35
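/* Return the number of waves in the threadgroup, from bits [28..31] of the
 * merged_wave_info SGPR.
 */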
36 static LLVMValueRef get_tgsize(struct si_shader_context *ctx)
37 {
38 return si_unpack_param(ctx, ctx->merged_wave_info, 28, 4);
39 }
40
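/* Return the thread index within the threadgroup: wave_id * wave_size + lane. */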
41 static LLVMValueRef get_thread_id_in_tg(struct si_shader_context *ctx)
42 {
43 LLVMBuilderRef builder = ctx->ac.builder;
44 LLVMValueRef tmp;
45 tmp = LLVMBuildMul(builder, get_wave_id_in_tg(ctx),
46 LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), "");
47 return LLVMBuildAdd(builder, tmp, ac_get_thread_id(&ctx->ac), "");
48 }
49
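/* Threadgroup vertex count for GS_ALLOC_REQ, from bits [12..20] of gs_tg_info. */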
50 static LLVMValueRef ngg_get_vtx_cnt(struct si_shader_context *ctx)
51 {
52 return si_unpack_param(ctx, ctx->gs_tg_info, 12, 9);
53 }
54
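/* Threadgroup primitive count for GS_ALLOC_REQ, from bits [22..30] of gs_tg_info. */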
55 static LLVMValueRef ngg_get_prim_cnt(struct si_shader_context *ctx)
56 {
57 return si_unpack_param(ctx, ctx->gs_tg_info, 22, 9);
58 }
59
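/* Ordered wave ID (bits [0..11] of gs_tg_info), used by the GDS ordered add
 * that advances the streamout buffer offsets.
 */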
60 static LLVMValueRef ngg_get_ordered_id(struct si_shader_context *ctx)
61 {
62 return si_unpack_param(ctx, ctx->gs_tg_info, 0, 12);
63 }
64
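/* Load the descriptor of the GS query results buffer from the RW buffers list. */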
65 static LLVMValueRef ngg_get_query_buf(struct si_shader_context *ctx)
66 {
67 LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
68
69 return ac_build_load_to_sgpr(&ctx->ac, buf_ptr,
70 LLVMConstInt(ctx->ac.i32, GFX10_GS_QUERY_BUF, false));
71 }
72
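/* Return the initial edge flag of the index-th vertex of the current primitive.
 * For VS it is passed in bit (8 + index) of the gs_invocation_id VGPR; TES has
 * no initial edge flags.
 */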
73 static LLVMValueRef ngg_get_initial_edgeflag(struct si_shader_context *ctx, unsigned index)
74 {
75 if (ctx->stage == MESA_SHADER_VERTEX) {
76 LLVMValueRef tmp;
77 tmp = LLVMBuildLShr(ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args.gs_invocation_id),
78 LLVMConstInt(ctx->ac.i32, 8 + index, false), "");
79 return LLVMBuildTrunc(ctx->ac.builder, tmp, ctx->ac.i1, "");
80 }
81 return ctx->ac.i1false;
82 }
83
84 /**
85 * Return the number of vertices as a constant in \p num_vertices,
86 * and return a more precise value as LLVMValueRef from the function.
87 */
88 static LLVMValueRef ngg_get_vertices_per_prim(struct si_shader_context *ctx, unsigned *num_vertices)
89 {
90 const struct si_shader_info *info = &ctx->shader->selector->info;
91
92 if (ctx->stage == MESA_SHADER_VERTEX) {
93 if (info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]) {
94 /* Blits always use axis-aligned rectangles with 3 vertices. */
95 *num_vertices = 3;
96 return LLVMConstInt(ctx->ac.i32, 3, 0);
97 } else {
98 /* We always build up all three indices for the prim export
99 * independent of the primitive type. The additional garbage
100 * data shouldn't hurt. This number doesn't matter with
101 * NGG passthrough.
102 */
103 *num_vertices = 3;
104
105 /* Extract OUTPRIM field. */
106 LLVMValueRef num = si_unpack_param(ctx, ctx->vs_state_bits, 2, 2);
107 return LLVMBuildAdd(ctx->ac.builder, num, ctx->ac.i32_1, "");
108 }
109 } else {
110 assert(ctx->stage == MESA_SHADER_TESS_EVAL);
111
112 if (info->properties[TGSI_PROPERTY_TES_POINT_MODE])
113 *num_vertices = 1;
114 else if (info->properties[TGSI_PROPERTY_TES_PRIM_MODE] == PIPE_PRIM_LINES)
115 *num_vertices = 2;
116 else
117 *num_vertices = 3;
118
119 return LLVMConstInt(ctx->ac.i32, *num_vertices, false);
120 }
121 }
122
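/* Return true if the primitive export can be done early (at the top of the
 * shader): only for non-GS NGG shaders that don't write edge flags.
 */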
123 bool gfx10_ngg_export_prim_early(struct si_shader *shader)
124 {
125 struct si_shader_selector *sel = shader->selector;
126
127 assert(shader->key.as_ngg && !shader->key.as_es);
128
129 return sel->info.stage != MESA_SHADER_GEOMETRY && !sel->info.writes_edgeflag;
130 }
131
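/* Send the GS_ALLOC_REQ message with this threadgroup's vertex and primitive counts. */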
132 void gfx10_ngg_build_sendmsg_gs_alloc_req(struct si_shader_context *ctx)
133 {
134 ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), ngg_get_vtx_cnt(ctx),
135 ngg_get_prim_cnt(ctx));
136 }
137
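/* Export one primitive per GS thread. With NGG passthrough or culling, the packed
 * passthrough encoding is exported directly (with user edge flags ANDed in if the
 * shader writes them); otherwise the three vertex indices and edge flags are
 * assembled from the GS input VGPRs.
 */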
138 void gfx10_ngg_build_export_prim(struct si_shader_context *ctx, LLVMValueRef user_edgeflags[3],
139 LLVMValueRef prim_passthrough)
140 {
141 LLVMBuilderRef builder = ctx->ac.builder;
142
143 if (gfx10_is_ngg_passthrough(ctx->shader) || ctx->shader->key.opt.ngg_culling) {
144 ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 6001);
145 {
146 struct ac_ngg_prim prim = {};
147
148 if (prim_passthrough)
149 prim.passthrough = prim_passthrough;
150 else
151 prim.passthrough = ac_get_arg(&ctx->ac, ctx->gs_vtx01_offset);
152
153 /* This is only used with NGG culling, which returns the NGG
154 * passthrough prim export encoding.
155 */
156 if (ctx->shader->selector->info.writes_edgeflag) {
157 unsigned all_bits_no_edgeflags = ~SI_NGG_PRIM_EDGE_FLAG_BITS;
158 LLVMValueRef edgeflags = LLVMConstInt(ctx->ac.i32, all_bits_no_edgeflags, 0);
159
160 unsigned num_vertices;
161 ngg_get_vertices_per_prim(ctx, &num_vertices);
162
163 for (unsigned i = 0; i < num_vertices; i++) {
164 unsigned shift = 9 + i * 10;
165 LLVMValueRef edge;
166
167 edge = LLVMBuildLoad(builder, user_edgeflags[i], "");
168 edge = LLVMBuildZExt(builder, edge, ctx->ac.i32, "");
169 edge = LLVMBuildShl(builder, edge, LLVMConstInt(ctx->ac.i32, shift, 0), "");
170 edgeflags = LLVMBuildOr(builder, edgeflags, edge, "");
171 }
172 prim.passthrough = LLVMBuildAnd(builder, prim.passthrough, edgeflags, "");
173 }
174
175 ac_build_export_prim(&ctx->ac, &prim);
176 }
177 ac_build_endif(&ctx->ac, 6001);
178 return;
179 }
180
181 ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 6001);
182 {
183 struct ac_ngg_prim prim = {};
184
185 ngg_get_vertices_per_prim(ctx, &prim.num_vertices);
186
187 prim.isnull = ctx->ac.i1false;
188 prim.index[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16);
189 prim.index[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16);
190 prim.index[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16);
191
192 for (unsigned i = 0; i < prim.num_vertices; ++i) {
193 prim.edgeflag[i] = ngg_get_initial_edgeflag(ctx, i);
194
195 if (ctx->shader->selector->info.writes_edgeflag) {
196 LLVMValueRef edge;
197
198 edge = LLVMBuildLoad(ctx->ac.builder, user_edgeflags[i], "");
199 edge = LLVMBuildAnd(ctx->ac.builder, prim.edgeflag[i], edge, "");
200 prim.edgeflag[i] = edge;
201 }
202 }
203
204 ac_build_export_prim(&ctx->ac, &prim);
205 }
206 ac_build_endif(&ctx->ac, 6001);
207 }
208
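/* Store the streamout outputs of one vertex, read from its LDS slot at vertexptr,
 * into the enabled streamout buffers at wg_offset_dw[buffer] + offset_vtx * stride.
 */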
209 static void build_streamout_vertex(struct si_shader_context *ctx, LLVMValueRef *so_buffer,
210 LLVMValueRef *wg_offset_dw, unsigned stream,
211 LLVMValueRef offset_vtx, LLVMValueRef vertexptr)
212 {
213 struct si_shader_info *info = &ctx->shader->selector->info;
214 struct pipe_stream_output_info *so = &ctx->shader->selector->so;
215 LLVMBuilderRef builder = ctx->ac.builder;
216 LLVMValueRef offset[4] = {};
217 LLVMValueRef tmp;
218
219 for (unsigned buffer = 0; buffer < 4; ++buffer) {
220 if (!wg_offset_dw[buffer])
221 continue;
222
223 tmp = LLVMBuildMul(builder, offset_vtx, LLVMConstInt(ctx->ac.i32, so->stride[buffer], false),
224 "");
225 tmp = LLVMBuildAdd(builder, wg_offset_dw[buffer], tmp, "");
226 offset[buffer] = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->ac.i32, 2, false), "");
227 }
228
229 for (unsigned i = 0; i < so->num_outputs; ++i) {
230 if (so->output[i].stream != stream)
231 continue;
232
233 unsigned reg = so->output[i].register_index;
234 struct si_shader_output_values out;
235 out.semantic_name = info->output_semantic_name[reg];
236 out.semantic_index = info->output_semantic_index[reg];
237
238 for (unsigned comp = 0; comp < 4; comp++) {
239 tmp = ac_build_gep0(&ctx->ac, vertexptr, LLVMConstInt(ctx->ac.i32, 4 * reg + comp, false));
240 out.values[comp] = LLVMBuildLoad(builder, tmp, "");
241 out.vertex_stream[comp] = (info->output_streams[reg] >> (2 * comp)) & 3;
242 }
243
244 si_llvm_streamout_store_output(ctx, so_buffer, offset, &so->output[i], &out);
245 }
246 }
247
248 struct ngg_streamout {
249 LLVMValueRef num_vertices;
250
251 /* per-thread data */
252 LLVMValueRef prim_enable[4]; /* i1 per stream */
253 LLVMValueRef vertices[3]; /* [N x i32] addrspace(LDS)* */
254
255 /* Output */
256 LLVMValueRef emit[4]; /* per-stream emitted primitives (only valid for used streams) */
257 };
258
259 /**
260 * Build streamout logic.
261 *
262 * Implies a barrier.
263 *
264 * Writes number of emitted primitives to gs_ngg_scratch[4:8].
265 *
266 * Clobbers gs_ngg_scratch[8:].
267 */
268 static void build_streamout(struct si_shader_context *ctx, struct ngg_streamout *nggso)
269 {
270 struct si_shader_info *info = &ctx->shader->selector->info;
271 struct pipe_stream_output_info *so = &ctx->shader->selector->so;
272 LLVMBuilderRef builder = ctx->ac.builder;
273 LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
274 LLVMValueRef tid = get_thread_id_in_tg(ctx);
275 LLVMValueRef tmp, tmp2;
276 LLVMValueRef i32_2 = LLVMConstInt(ctx->ac.i32, 2, false);
277 LLVMValueRef i32_4 = LLVMConstInt(ctx->ac.i32, 4, false);
278 LLVMValueRef i32_8 = LLVMConstInt(ctx->ac.i32, 8, false);
279 LLVMValueRef so_buffer[4] = {};
280 unsigned max_num_vertices = 1 + (nggso->vertices[1] ? 1 : 0) + (nggso->vertices[2] ? 1 : 0);
281 LLVMValueRef prim_stride_dw[4] = {};
282 LLVMValueRef prim_stride_dw_vgpr = LLVMGetUndef(ctx->ac.i32);
283 int stream_for_buffer[4] = {-1, -1, -1, -1};
284 unsigned bufmask_for_stream[4] = {};
285 bool isgs = ctx->stage == MESA_SHADER_GEOMETRY;
286 unsigned scratch_emit_base = isgs ? 4 : 0;
287 LLVMValueRef scratch_emit_basev = isgs ? i32_4 : ctx->ac.i32_0;
288 unsigned scratch_offset_base = isgs ? 8 : 4;
289 LLVMValueRef scratch_offset_basev = isgs ? i32_8 : i32_4;
290
291 ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size", 256);
292
293 /* Determine the mapping of streamout buffers to vertex streams. */
294 for (unsigned i = 0; i < so->num_outputs; ++i) {
295 unsigned buf = so->output[i].output_buffer;
296 unsigned stream = so->output[i].stream;
297 assert(stream_for_buffer[buf] < 0 || stream_for_buffer[buf] == stream);
298 stream_for_buffer[buf] = stream;
299 bufmask_for_stream[stream] |= 1 << buf;
300 }
301
302 for (unsigned buffer = 0; buffer < 4; ++buffer) {
303 if (stream_for_buffer[buffer] == -1)
304 continue;
305
306 assert(so->stride[buffer]);
307
308 tmp = LLVMConstInt(ctx->ac.i32, so->stride[buffer], false);
309 prim_stride_dw[buffer] = LLVMBuildMul(builder, tmp, nggso->num_vertices, "");
310 prim_stride_dw_vgpr =
311 ac_build_writelane(&ctx->ac, prim_stride_dw_vgpr, prim_stride_dw[buffer],
312 LLVMConstInt(ctx->ac.i32, buffer, false));
313
314 so_buffer[buffer] = ac_build_load_to_sgpr(
315 &ctx->ac, buf_ptr, LLVMConstInt(ctx->ac.i32, SI_VS_STREAMOUT_BUF0 + buffer, false));
316 }
317
318 tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, "");
319 ac_build_ifcc(&ctx->ac, tmp, 5200);
320 {
321 LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS);
322 LLVMValueRef gdsbase = LLVMBuildIntToPtr(builder, ctx->ac.i32_0, gdsptr, "");
323
324 /* Advance the streamout offsets in GDS. */
325 LLVMValueRef offsets_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
326 LLVMValueRef generated_by_stream_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
327
328 tmp = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), i32_4, "");
329 ac_build_ifcc(&ctx->ac, tmp, 5210);
330 {
331 if (isgs) {
332 tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid);
333 tmp = LLVMBuildLoad(builder, tmp, "");
334 } else {
335 tmp = ac_build_writelane(&ctx->ac, ctx->ac.i32_0, ngg_get_prim_cnt(ctx), ctx->ac.i32_0);
336 }
337 LLVMBuildStore(builder, tmp, generated_by_stream_vgpr);
338
339 unsigned swizzle[4];
340 int unused_stream = -1;
341 for (unsigned stream = 0; stream < 4; ++stream) {
342 if (!info->num_stream_output_components[stream]) {
343 unused_stream = stream;
344 break;
345 }
346 }
347 for (unsigned buffer = 0; buffer < 4; ++buffer) {
348 if (stream_for_buffer[buffer] >= 0) {
349 swizzle[buffer] = stream_for_buffer[buffer];
350 } else {
351 assert(unused_stream >= 0);
352 swizzle[buffer] = unused_stream;
353 }
354 }
355
356 tmp = ac_build_quad_swizzle(&ctx->ac, tmp, swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
357 tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, "");
358
359 LLVMValueRef args[] = {
360 LLVMBuildIntToPtr(builder, ngg_get_ordered_id(ctx), gdsptr, ""),
361 tmp,
362 ctx->ac.i32_0, // ordering
363 ctx->ac.i32_0, // scope
364 ctx->ac.i1false, // isVolatile
365 LLVMConstInt(ctx->ac.i32, 4 << 24, false), // OA index
366 ctx->ac.i1true, // wave release
367 ctx->ac.i1true, // wave done
368 };
369 tmp = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ds.ordered.add", ctx->ac.i32, args,
370 ARRAY_SIZE(args), 0);
371
372 /* Keep offsets in a VGPR for quick retrieval via readlane by
373 * the first wave for bounds checking, and also store in LDS
374 * for retrieval by all waves later. */
375 LLVMBuildStore(builder, tmp, offsets_vgpr);
376
377 tmp2 = LLVMBuildAdd(builder, ac_get_thread_id(&ctx->ac), scratch_offset_basev, "");
378 tmp2 = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tmp2);
379 LLVMBuildStore(builder, tmp, tmp2);
380 }
381 ac_build_endif(&ctx->ac, 5210);
382
383 /* Determine the max emit per buffer. This is done via the SALU, in part
384 * because LLVM can't generate divide-by-multiply if we try to do this
385 * via VALU with one lane per buffer.
386 */
387 LLVMValueRef max_emit[4] = {};
388 for (unsigned buffer = 0; buffer < 4; ++buffer) {
389 if (stream_for_buffer[buffer] == -1)
390 continue;
391
392 LLVMValueRef bufsize_dw = LLVMBuildLShr(
393 builder, LLVMBuildExtractElement(builder, so_buffer[buffer], i32_2, ""), i32_2, "");
394
395 tmp = LLVMBuildLoad(builder, offsets_vgpr, "");
396 LLVMValueRef offset_dw =
397 ac_build_readlane(&ctx->ac, tmp, LLVMConstInt(ctx->ac.i32, buffer, false));
398
399 tmp = LLVMBuildSub(builder, bufsize_dw, offset_dw, "");
400 tmp = LLVMBuildUDiv(builder, tmp, prim_stride_dw[buffer], "");
401
402 tmp2 = LLVMBuildICmp(builder, LLVMIntULT, bufsize_dw, offset_dw, "");
403 max_emit[buffer] = LLVMBuildSelect(builder, tmp2, ctx->ac.i32_0, tmp, "");
404 }
405
406 /* Determine the number of emitted primitives per stream and fixup the
407 * GDS counter if necessary.
408 *
409 * This is complicated by the fact that a single stream can emit to
410 * multiple buffers (but luckily not vice versa).
411 */
412 LLVMValueRef emit_vgpr = ctx->ac.i32_0;
413
414 for (unsigned stream = 0; stream < 4; ++stream) {
415 if (!info->num_stream_output_components[stream])
416 continue;
417
418 tmp = LLVMBuildLoad(builder, generated_by_stream_vgpr, "");
419 LLVMValueRef generated =
420 ac_build_readlane(&ctx->ac, tmp, LLVMConstInt(ctx->ac.i32, stream, false));
421
422 LLVMValueRef emit = generated;
423 for (unsigned buffer = 0; buffer < 4; ++buffer) {
424 if (stream_for_buffer[buffer] == stream)
425 emit = ac_build_umin(&ctx->ac, emit, max_emit[buffer]);
426 }
427
428 emit_vgpr =
429 ac_build_writelane(&ctx->ac, emit_vgpr, emit, LLVMConstInt(ctx->ac.i32, stream, false));
430
431 /* Fixup the offset using a plain GDS atomic if we overflowed. */
432 tmp = LLVMBuildICmp(builder, LLVMIntULT, emit, generated, "");
433 ac_build_ifcc(&ctx->ac, tmp, 5221); /* scalar branch */
434 tmp = LLVMBuildLShr(builder, LLVMConstInt(ctx->ac.i32, bufmask_for_stream[stream], false),
435 ac_get_thread_id(&ctx->ac), "");
436 tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
437 ac_build_ifcc(&ctx->ac, tmp, 5222);
438 {
439 tmp = LLVMBuildSub(builder, generated, emit, "");
440 tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, "");
441 tmp2 = LLVMBuildGEP(builder, gdsbase, &tid, 1, "");
442 LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpSub, tmp2, tmp,
443 LLVMAtomicOrderingMonotonic, false);
444 }
445 ac_build_endif(&ctx->ac, 5222);
446 ac_build_endif(&ctx->ac, 5221);
447 }
448
449 tmp = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), i32_4, "");
450 ac_build_ifcc(&ctx->ac, tmp, 5225);
451 {
452 tmp = LLVMBuildAdd(builder, ac_get_thread_id(&ctx->ac), scratch_emit_basev, "");
453 tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tmp);
454 LLVMBuildStore(builder, emit_vgpr, tmp);
455 }
456 ac_build_endif(&ctx->ac, 5225);
457 }
458 ac_build_endif(&ctx->ac, 5200);
459
460 /* Determine the workgroup-relative per-thread / primitive offset into
461 * the streamout buffers */
462 struct ac_wg_scan primemit_scan[4] = {};
463
464 if (isgs) {
465 for (unsigned stream = 0; stream < 4; ++stream) {
466 if (!info->num_stream_output_components[stream])
467 continue;
468
469 primemit_scan[stream].enable_exclusive = true;
470 primemit_scan[stream].op = nir_op_iadd;
471 primemit_scan[stream].src = nggso->prim_enable[stream];
472 primemit_scan[stream].scratch = ac_build_gep0(
473 &ctx->ac, ctx->gs_ngg_scratch, LLVMConstInt(ctx->ac.i32, 12 + 8 * stream, false));
474 primemit_scan[stream].waveidx = get_wave_id_in_tg(ctx);
475 primemit_scan[stream].numwaves = get_tgsize(ctx);
476 primemit_scan[stream].maxwaves = 8;
477 ac_build_wg_scan_top(&ctx->ac, &primemit_scan[stream]);
478 }
479 }
480
481 ac_build_s_barrier(&ctx->ac);
482
483 /* Fetch the per-buffer offsets and per-stream emit counts in all waves. */
484 LLVMValueRef wgoffset_dw[4] = {};
485
486 {
487 LLVMValueRef scratch_vgpr;
488
489 tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ac_get_thread_id(&ctx->ac));
490 scratch_vgpr = LLVMBuildLoad(builder, tmp, "");
491
492 for (unsigned buffer = 0; buffer < 4; ++buffer) {
493 if (stream_for_buffer[buffer] >= 0) {
494 wgoffset_dw[buffer] =
495 ac_build_readlane(&ctx->ac, scratch_vgpr,
496 LLVMConstInt(ctx->ac.i32, scratch_offset_base + buffer, false));
497 }
498 }
499
500 for (unsigned stream = 0; stream < 4; ++stream) {
501 if (info->num_stream_output_components[stream]) {
502 nggso->emit[stream] =
503 ac_build_readlane(&ctx->ac, scratch_vgpr,
504 LLVMConstInt(ctx->ac.i32, scratch_emit_base + stream, false));
505 }
506 }
507 }
508
509 /* Write out primitive data */
510 for (unsigned stream = 0; stream < 4; ++stream) {
511 if (!info->num_stream_output_components[stream])
512 continue;
513
514 if (isgs) {
515 ac_build_wg_scan_bottom(&ctx->ac, &primemit_scan[stream]);
516 } else {
517 primemit_scan[stream].result_exclusive = tid;
518 }
519
520 tmp = LLVMBuildICmp(builder, LLVMIntULT, primemit_scan[stream].result_exclusive,
521 nggso->emit[stream], "");
522 tmp = LLVMBuildAnd(builder, tmp, nggso->prim_enable[stream], "");
523 ac_build_ifcc(&ctx->ac, tmp, 5240);
524 {
525 LLVMValueRef offset_vtx =
526 LLVMBuildMul(builder, primemit_scan[stream].result_exclusive, nggso->num_vertices, "");
527
528 for (unsigned i = 0; i < max_num_vertices; ++i) {
529 tmp = LLVMBuildICmp(builder, LLVMIntULT, LLVMConstInt(ctx->ac.i32, i, false),
530 nggso->num_vertices, "");
531 ac_build_ifcc(&ctx->ac, tmp, 5241);
532 build_streamout_vertex(ctx, so_buffer, wgoffset_dw, stream, offset_vtx,
533 nggso->vertices[i]);
534 ac_build_endif(&ctx->ac, 5241);
535 offset_vtx = LLVMBuildAdd(builder, offset_vtx, ctx->ac.i32_1, "");
536 }
537 }
538 ac_build_endif(&ctx->ac, 5240);
539 }
540 }
541
542 /* LDS layout of ES vertex data for NGG culling. */
543 enum
544 {
545 /* Byte 0: Boolean ES thread accepted (unculled) flag, and later the old
546 * ES thread ID. After vertex compaction, compacted ES threads
547 * store the old thread ID here to copy input VGPRs from uncompacted
548 * ES threads.
549 * Byte 1: New ES thread ID, loaded by GS to prepare the prim export value.
550 * Byte 2: TES rel patch ID
551 * Byte 3: Unused
552 */
553 lds_byte0_accept_flag = 0,
554 lds_byte0_old_thread_id = 0,
555 lds_byte1_new_thread_id,
556 lds_byte2_tes_rel_patch_id,
557 lds_byte3_unused,
558
559 lds_packed_data = 0, /* lds_byteN_... */
560
561 lds_pos_x,
562 lds_pos_y,
563 lds_pos_z,
564 lds_pos_w,
565 lds_pos_x_div_w,
566 lds_pos_y_div_w,
567 /* If VS: */
568 lds_vertex_id,
569 lds_instance_id, /* optional */
570 /* If TES: */
571 lds_tes_u = lds_vertex_id,
572 lds_tes_v = lds_instance_id,
573 lds_tes_patch_id, /* optional */
574 };
575
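/* Return an i8 LDS pointer to the byte_index-th byte of the dword pointed to by ptr. */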
576 static LLVMValueRef si_build_gep_i8(struct si_shader_context *ctx, LLVMValueRef ptr,
577 unsigned byte_index)
578 {
579 assert(byte_index < 4);
580 LLVMTypeRef pi8 = LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_LDS);
581 LLVMValueRef index = LLVMConstInt(ctx->ac.i32, byte_index, 0);
582
583 return LLVMBuildGEP(ctx->ac.builder, LLVMBuildPointerCast(ctx->ac.builder, ptr, pi8, ""), &index,
584 1, "");
585 }
586
587 static unsigned ngg_nogs_vertex_size(struct si_shader *shader)
588 {
589 unsigned lds_vertex_size = 0;
590
591 /* The edgeflag is always stored in the last element that's also
592 * used for padding to reduce LDS bank conflicts. */
593 if (shader->selector->so.num_outputs)
594 lds_vertex_size = 4 * shader->selector->info.num_outputs + 1;
595 if (shader->selector->info.writes_edgeflag)
596 lds_vertex_size = MAX2(lds_vertex_size, 1);
597
598 /* LDS size for passing data from GS to ES.
599 * GS stores Primitive IDs into LDS at the address corresponding
600 * to the ES thread of the provoking vertex. All ES threads
601 * load and export PrimitiveID for their thread.
602 */
603 if (shader->selector->info.stage == MESA_SHADER_VERTEX && shader->key.mono.u.vs_export_prim_id)
604 lds_vertex_size = MAX2(lds_vertex_size, 1);
605
606 if (shader->key.opt.ngg_culling) {
607 if (shader->selector->info.stage == MESA_SHADER_VERTEX) {
608 STATIC_ASSERT(lds_instance_id + 1 == 9);
609 lds_vertex_size = MAX2(lds_vertex_size, 9);
610 } else {
611 assert(shader->selector->info.stage == MESA_SHADER_TESS_EVAL);
612
613 if (shader->selector->info.uses_primid || shader->key.mono.u.vs_export_prim_id) {
614 STATIC_ASSERT(lds_tes_patch_id + 2 == 11);
615 lds_vertex_size = MAX2(lds_vertex_size, 11);
616 } else {
617 STATIC_ASSERT(lds_tes_v + 1 == 9);
618 lds_vertex_size = MAX2(lds_vertex_size, 9);
619 }
620 }
621 }
622
623 return lds_vertex_size;
624 }
625
626 /**
627 * Returns an `[N x i32] addrspace(LDS)*` pointing at contiguous LDS storage
628 * for the vertex outputs.
629 */
630 static LLVMValueRef ngg_nogs_vertex_ptr(struct si_shader_context *ctx, LLVMValueRef vtxid)
631 {
632 /* The extra dword is used to avoid LDS bank conflicts. */
633 unsigned vertex_size = ngg_nogs_vertex_size(ctx->shader);
634 LLVMTypeRef ai32 = LLVMArrayType(ctx->ac.i32, vertex_size);
635 LLVMTypeRef pai32 = LLVMPointerType(ai32, AC_ADDR_SPACE_LDS);
636 LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, ctx->esgs_ring, pai32, "");
637 return LLVMBuildGEP(ctx->ac.builder, tmp, &vtxid, 1, "");
638 }
639
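/* Insert the four components of a v4i32 argument into consecutive elements of the
 * return value, starting at return_index.
 */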
640 static LLVMValueRef si_insert_input_v4i32(struct si_shader_context *ctx, LLVMValueRef ret,
641 struct ac_arg param, unsigned return_index)
642 {
643 LLVMValueRef v = ac_get_arg(&ctx->ac, param);
644
645 for (unsigned i = 0; i < 4; i++) {
646 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, ac_llvm_extract_elem(&ctx->ac, v, i),
647 return_index + i, "");
648 }
649 return ret;
650 }
651
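/* Load two consecutive i64 thread bitmasks from LDS (4 dwords starting at dw_offset)
 * and also return the total number of set bits in both masks.
 */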
652 static void load_bitmasks_2x64(struct si_shader_context *ctx, LLVMValueRef lds_ptr,
653 unsigned dw_offset, LLVMValueRef mask[2],
654 LLVMValueRef *total_bitcount)
655 {
656 LLVMBuilderRef builder = ctx->ac.builder;
657 LLVMValueRef ptr64 = LLVMBuildPointerCast(
658 builder, lds_ptr, LLVMPointerType(LLVMArrayType(ctx->ac.i64, 2), AC_ADDR_SPACE_LDS), "");
659 for (unsigned i = 0; i < 2; i++) {
660 LLVMValueRef index = LLVMConstInt(ctx->ac.i32, dw_offset / 2 + i, 0);
661 mask[i] = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ptr64, index), "");
662 }
663
664 /* We get better code if we don't use the 128-bit bitcount. */
665 *total_bitcount = LLVMBuildAdd(builder, ac_build_bit_count(&ctx->ac, mask[0]),
666 ac_build_bit_count(&ctx->ac, mask[1]), "");
667 }
668
669 /**
670 * Given a total thread count, update total and per-wave thread counts in input SGPRs
671 * and return the per-wave thread count.
672 *
673 * \param new_num_threads Total thread count on the input, per-wave thread count on the output.
674 * \param tg_info tg_info SGPR value
675 * \param tg_info_num_bits the bit size of thread count field in tg_info
676 * \param tg_info_shift the bit offset of the thread count field in tg_info
677 * \param wave_info merged_wave_info SGPR value
678 * \param wave_info_num_bits the bit size of thread count field in merged_wave_info
679 * \param wave_info_shift the bit offset of the thread count field in merged_wave_info
680 */
681 static void update_thread_counts(struct si_shader_context *ctx, LLVMValueRef *new_num_threads,
682 LLVMValueRef *tg_info, unsigned tg_info_num_bits,
683 unsigned tg_info_shift, LLVMValueRef *wave_info,
684 unsigned wave_info_num_bits, unsigned wave_info_shift)
685 {
686 LLVMBuilderRef builder = ctx->ac.builder;
687
688 /* Update the total thread count. */
689 unsigned tg_info_mask = ~(u_bit_consecutive(0, tg_info_num_bits) << tg_info_shift);
690 *tg_info = LLVMBuildAnd(builder, *tg_info, LLVMConstInt(ctx->ac.i32, tg_info_mask, 0), "");
691 *tg_info = LLVMBuildOr(
692 builder, *tg_info,
693 LLVMBuildShl(builder, *new_num_threads, LLVMConstInt(ctx->ac.i32, tg_info_shift, 0), ""), "");
694
695 /* Update the per-wave thread count. */
696 LLVMValueRef prev_threads = LLVMBuildMul(builder, get_wave_id_in_tg(ctx),
697 LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, 0), "");
698 *new_num_threads = LLVMBuildSub(builder, *new_num_threads, prev_threads, "");
699 *new_num_threads = ac_build_imax(&ctx->ac, *new_num_threads, ctx->ac.i32_0);
700 *new_num_threads =
701 ac_build_imin(&ctx->ac, *new_num_threads, LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, 0));
702 unsigned wave_info_mask = ~(u_bit_consecutive(0, wave_info_num_bits) << wave_info_shift);
703 *wave_info = LLVMBuildAnd(builder, *wave_info, LLVMConstInt(ctx->ac.i32, wave_info_mask, 0), "");
704 *wave_info = LLVMBuildOr(
705 builder, *wave_info,
706 LLVMBuildShl(builder, *new_num_threads, LLVMConstInt(ctx->ac.i32, wave_info_shift, 0), ""),
707 "");
708 }
709
710 /**
711 * Cull primitives for NGG VS or TES, then compact vertices, which happens
712 * before the VS or TES main function. Return values for the main function.
713 * Also return the position, which is passed to the shader as an input,
714 * so that we don't compute it twice.
715 */
716 void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi, unsigned max_outputs,
717 LLVMValueRef *addrs)
718 {
719 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
720 struct si_shader *shader = ctx->shader;
721 struct si_shader_selector *sel = shader->selector;
722 struct si_shader_info *info = &sel->info;
723 LLVMBuilderRef builder = ctx->ac.builder;
724 unsigned max_waves = ctx->ac.wave_size == 64 ? 2 : 4;
725 LLVMValueRef ngg_scratch = ctx->gs_ngg_scratch;
726
727 if (ctx->ac.wave_size == 64) {
728 ngg_scratch = LLVMBuildPointerCast(builder, ngg_scratch,
729 LLVMPointerType(LLVMArrayType(ctx->ac.i64, max_waves),
730 AC_ADDR_SPACE_LDS), "");
731 }
732
733 assert(shader->key.opt.ngg_culling);
734 assert(shader->key.as_ngg);
735 assert(sel->info.stage == MESA_SHADER_VERTEX ||
736 (sel->info.stage == MESA_SHADER_TESS_EVAL && !shader->key.as_es));
737
738 LLVMValueRef position[4] = {};
739 for (unsigned i = 0; i < info->num_outputs; i++) {
740 switch (info->output_semantic_name[i]) {
741 case TGSI_SEMANTIC_POSITION:
742 for (unsigned j = 0; j < 4; j++) {
743 position[j] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + j], "");
744 }
745 break;
746 }
747 }
748 assert(position[0]);
749
750 /* Store Position.XYZW into LDS. */
751 LLVMValueRef es_vtxptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));
752 for (unsigned chan = 0; chan < 4; chan++) {
753 LLVMBuildStore(
754 builder, ac_to_integer(&ctx->ac, position[chan]),
755 ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_pos_x + chan, 0)));
756 }
757 /* Store Position.XY / W into LDS. */
758 for (unsigned chan = 0; chan < 2; chan++) {
759 LLVMValueRef val = ac_build_fdiv(&ctx->ac, position[chan], position[3]);
760 LLVMBuildStore(
761 builder, ac_to_integer(&ctx->ac, val),
762 ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_pos_x_div_w + chan, 0)));
763 }
764
765 /* Store VertexID and InstanceID. ES threads will have to load them
766 * from LDS after vertex compaction and use them instead of their own
767 * system values.
768 */
769 bool uses_instance_id = false;
770 bool uses_tes_prim_id = false;
771 LLVMValueRef packed_data = ctx->ac.i32_0;
772
773 if (ctx->stage == MESA_SHADER_VERTEX) {
774 uses_instance_id = sel->info.uses_instanceid ||
775 shader->key.part.vs.prolog.instance_divisor_is_one ||
776 shader->key.part.vs.prolog.instance_divisor_is_fetched;
777
778 LLVMBuildStore(
779 builder, ctx->abi.vertex_id,
780 ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_vertex_id, 0)));
781 if (uses_instance_id) {
782 LLVMBuildStore(
783 builder, ctx->abi.instance_id,
784 ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_instance_id, 0)));
785 }
786 } else {
787 uses_tes_prim_id = sel->info.uses_primid || shader->key.mono.u.vs_export_prim_id;
788
789 assert(ctx->stage == MESA_SHADER_TESS_EVAL);
790 LLVMBuildStore(builder, ac_to_integer(&ctx->ac, ac_get_arg(&ctx->ac, ctx->tes_u)),
791 ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_tes_u, 0)));
792 LLVMBuildStore(builder, ac_to_integer(&ctx->ac, ac_get_arg(&ctx->ac, ctx->tes_v)),
793 ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_tes_v, 0)));
794 packed_data = LLVMBuildShl(builder, ac_get_arg(&ctx->ac, ctx->tes_rel_patch_id),
795 LLVMConstInt(ctx->ac.i32, lds_byte2_tes_rel_patch_id * 8, 0), "");
796 if (uses_tes_prim_id) {
797 LLVMBuildStore(
798 builder, ac_get_arg(&ctx->ac, ctx->args.tes_patch_id),
799 ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_tes_patch_id, 0)));
800 }
801 }
802 /* Initialize the packed data. */
803 LLVMBuildStore(
804 builder, packed_data,
805 ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_packed_data, 0)));
806 ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
807
808 LLVMValueRef tid = ac_get_thread_id(&ctx->ac);
809
810 /* Initialize all but the first element of ngg_scratch to 0, because we may have fewer
811 * than the maximum number of waves, but we always read all values. This is where
812 * the thread bitmasks of unculled threads will be stored.
813 *
814 * ngg_scratch layout: iN_wavemask esmask[0..n]
815 */
816 ac_build_ifcc(&ctx->ac,
817 LLVMBuildICmp(builder, LLVMIntULT, get_thread_id_in_tg(ctx),
818 LLVMConstInt(ctx->ac.i32, max_waves - 1, 0), ""),
819 16101);
820 {
821 LLVMValueRef index = LLVMBuildAdd(builder, tid, ctx->ac.i32_1, "");
822 LLVMBuildStore(builder, LLVMConstInt(ctx->ac.iN_wavemask, 0, 0),
823 ac_build_gep0(&ctx->ac, ngg_scratch, index));
824 }
825 ac_build_endif(&ctx->ac, 16101);
826 ac_build_s_barrier(&ctx->ac);
827
828 /* The hardware requires that there are no holes between unculled vertices,
829 * which means we have to pack ES threads, i.e. reduce the ES thread count
830 * and move ES input VGPRs to lower threads. The upside is that varyings
831 * are only fetched and computed for unculled vertices.
832 *
833 * Vertex compaction in GS threads:
834 *
835 * Part 1: Compute the surviving vertex mask in GS threads:
836 * - Compute 4 32-bit surviving vertex masks in LDS. (max 4 waves)
837 * - In GS, notify ES threads whether the vertex survived.
838 * - Barrier
839 * - ES threads will create the mask and store it in LDS.
840 * - Barrier
841 * - Each GS thread loads the vertex masks from LDS.
842 *
843 * Part 2: Compact ES threads in GS threads:
844 * - Compute the prefix sum for all 3 vertices from the masks. These are the new
845 * thread IDs for each vertex within the primitive.
846 * - Write the value of the old thread ID into the LDS address of the new thread ID.
847 * The ES thread will load the old thread ID and use it to load the position, VertexID,
848 * and InstanceID.
849 * - Update vertex indices and null flag in the GS input VGPRs.
850 * - Barrier
851 *
852 * Part 3: Update input GPRs
853 * - For all waves, update per-wave thread counts in input SGPRs.
854 * - In ES threads, update the ES input VGPRs (VertexID, InstanceID, TES inputs).
855 */
856
857 LLVMValueRef vtxindex[3];
858 if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL) {
859 /* For the GS fast launch, the VS prolog simply puts the Vertex IDs
860 * into these VGPRs.
861 */
862 vtxindex[0] = ac_get_arg(&ctx->ac, ctx->gs_vtx01_offset);
863 vtxindex[1] = ac_get_arg(&ctx->ac, ctx->gs_vtx23_offset);
864 vtxindex[2] = ac_get_arg(&ctx->ac, ctx->gs_vtx45_offset);
865 } else {
866 vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16);
867 vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16);
868 vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16);
869 }
870 LLVMValueRef gs_vtxptr[] = {
871 ngg_nogs_vertex_ptr(ctx, vtxindex[0]),
872 ngg_nogs_vertex_ptr(ctx, vtxindex[1]),
873 ngg_nogs_vertex_ptr(ctx, vtxindex[2]),
874 };
875 es_vtxptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));
876
877 LLVMValueRef gs_accepted = ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
878
879 /* Do culling in GS threads. */
880 ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 16002);
881 {
882 /* Load positions. */
883 LLVMValueRef pos[3][4] = {};
884 for (unsigned vtx = 0; vtx < 3; vtx++) {
885 for (unsigned chan = 0; chan < 4; chan++) {
886 unsigned index;
887 if (chan == 0 || chan == 1)
888 index = lds_pos_x_div_w + chan;
889 else if (chan == 3)
890 index = lds_pos_w;
891 else
892 continue;
893
894 LLVMValueRef addr =
895 ac_build_gep0(&ctx->ac, gs_vtxptr[vtx], LLVMConstInt(ctx->ac.i32, index, 0));
896 pos[vtx][chan] = LLVMBuildLoad(builder, addr, "");
897 pos[vtx][chan] = ac_to_float(&ctx->ac, pos[vtx][chan]);
898 }
899 }
900
901 /* Load the viewport state for small prim culling. */
902 LLVMValueRef vp = ac_build_load_invariant(
903 &ctx->ac, ac_get_arg(&ctx->ac, ctx->small_prim_cull_info), ctx->ac.i32_0);
904 vp = LLVMBuildBitCast(builder, vp, ctx->ac.v4f32, "");
905 LLVMValueRef vp_scale[2], vp_translate[2];
906 vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0);
907 vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1);
908 vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2);
909 vp_translate[1] = ac_llvm_extract_elem(&ctx->ac, vp, 3);
910
911 /* Get the small prim filter precision. */
912 LLVMValueRef small_prim_precision = si_unpack_param(ctx, ctx->vs_state_bits, 7, 4);
913 small_prim_precision =
914 LLVMBuildOr(builder, small_prim_precision, LLVMConstInt(ctx->ac.i32, 0x70, 0), "");
915 small_prim_precision =
916 LLVMBuildShl(builder, small_prim_precision, LLVMConstInt(ctx->ac.i32, 23, 0), "");
917 small_prim_precision = LLVMBuildBitCast(builder, small_prim_precision, ctx->ac.f32, "");
918
919 /* Execute culling code. */
920 struct ac_cull_options options = {};
921 options.cull_front = shader->key.opt.ngg_culling & SI_NGG_CULL_FRONT_FACE;
922 options.cull_back = shader->key.opt.ngg_culling & SI_NGG_CULL_BACK_FACE;
923 options.cull_view_xy = shader->key.opt.ngg_culling & SI_NGG_CULL_VIEW_SMALLPRIMS;
924 options.cull_small_prims = options.cull_view_xy;
925 options.cull_zero_area = options.cull_front || options.cull_back;
926 options.cull_w = true;
927
928 /* Tell ES threads whether their vertex survived. */
929 ac_build_ifcc(&ctx->ac,
930 ac_cull_triangle(&ctx->ac, pos, ctx->ac.i1true, vp_scale, vp_translate,
931 small_prim_precision, &options),
932 16003);
933 {
934 LLVMBuildStore(builder, ctx->ac.i32_1, gs_accepted);
935 for (unsigned vtx = 0; vtx < 3; vtx++) {
936 LLVMBuildStore(builder, ctx->ac.i8_1,
937 si_build_gep_i8(ctx, gs_vtxptr[vtx], lds_byte0_accept_flag));
938 }
939 }
940 ac_build_endif(&ctx->ac, 16003);
941 }
942 ac_build_endif(&ctx->ac, 16002);
943 ac_build_s_barrier(&ctx->ac);
944
945 gs_accepted = LLVMBuildLoad(builder, gs_accepted, "");
946
947 LLVMValueRef es_accepted = ac_build_alloca(&ctx->ac, ctx->ac.i1, "");
948
949 /* Convert the per-vertex flag to a thread bitmask in ES threads and store it in LDS. */
950 ac_build_ifcc(&ctx->ac, si_is_es_thread(ctx), 16007);
951 {
952 LLVMValueRef es_accepted_flag =
953 LLVMBuildLoad(builder, si_build_gep_i8(ctx, es_vtxptr, lds_byte0_accept_flag), "");
954
955 LLVMValueRef es_accepted_bool =
956 LLVMBuildICmp(builder, LLVMIntNE, es_accepted_flag, ctx->ac.i8_0, "");
957 LLVMValueRef es_mask = ac_get_i1_sgpr_mask(&ctx->ac, es_accepted_bool);
958
959 LLVMBuildStore(builder, es_accepted_bool, es_accepted);
960
961 ac_build_ifcc(&ctx->ac, LLVMBuildICmp(builder, LLVMIntEQ, tid, ctx->ac.i32_0, ""), 16008);
962 {
963 LLVMBuildStore(builder, es_mask,
964 ac_build_gep0(&ctx->ac, ngg_scratch, get_wave_id_in_tg(ctx)));
965 }
966 ac_build_endif(&ctx->ac, 16008);
967 }
968 ac_build_endif(&ctx->ac, 16007);
969 ac_build_s_barrier(&ctx->ac);
970
971 /* Load the vertex masks and compute the new ES thread count. */
972 LLVMValueRef es_mask[2], new_num_es_threads, kill_wave;
973 load_bitmasks_2x64(ctx, ngg_scratch, 0, es_mask, &new_num_es_threads);
974 new_num_es_threads = ac_build_readlane_no_opt_barrier(&ctx->ac, new_num_es_threads, NULL);
975
976 /* ES threads compute their prefix sum, which is the new ES thread ID.
977 * Then they write the value of the old thread ID into the LDS address
978 * of the new thread ID. It will be used to load input VGPRs from
979 * the old thread's LDS location.
980 */
981 ac_build_ifcc(&ctx->ac, LLVMBuildLoad(builder, es_accepted, ""), 16009);
982 {
983 LLVMValueRef old_id = get_thread_id_in_tg(ctx);
984 LLVMValueRef new_id = ac_prefix_bitcount_2x64(&ctx->ac, es_mask, old_id);
985
986 LLVMBuildStore(
987 builder, LLVMBuildTrunc(builder, old_id, ctx->ac.i8, ""),
988 si_build_gep_i8(ctx, ngg_nogs_vertex_ptr(ctx, new_id), lds_byte0_old_thread_id));
989 LLVMBuildStore(builder, LLVMBuildTrunc(builder, new_id, ctx->ac.i8, ""),
990 si_build_gep_i8(ctx, es_vtxptr, lds_byte1_new_thread_id));
991 }
992 ac_build_endif(&ctx->ac, 16009);
993
994 /* Kill waves that have inactive threads. */
995 kill_wave = LLVMBuildICmp(builder, LLVMIntULE,
996 ac_build_imax(&ctx->ac, new_num_es_threads, ngg_get_prim_cnt(ctx)),
997 LLVMBuildMul(builder, get_wave_id_in_tg(ctx),
998 LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, 0), ""),
999 "");
1000 ac_build_ifcc(&ctx->ac, kill_wave, 19202);
1001 {
1002 /* If we are killing wave 0, send the message that there are no primitives
1003 * in this threadgroup.
1004 */
1005 ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), ctx->ac.i32_0, ctx->ac.i32_0);
1006 ac_build_s_endpgm(&ctx->ac);
1007 }
1008 ac_build_endif(&ctx->ac, 19202);
1009 ac_build_s_barrier(&ctx->ac);
1010
1011 /* Send the final vertex and primitive counts. */
1012 ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), new_num_es_threads,
1013 ngg_get_prim_cnt(ctx));
1014
1015 /* Update thread counts in SGPRs. */
1016 LLVMValueRef new_gs_tg_info = ac_get_arg(&ctx->ac, ctx->gs_tg_info);
1017 LLVMValueRef new_merged_wave_info = ac_get_arg(&ctx->ac, ctx->merged_wave_info);
1018
1019 /* This also converts the thread count from the total count to the per-wave count. */
1020 update_thread_counts(ctx, &new_num_es_threads, &new_gs_tg_info, 9, 12, &new_merged_wave_info, 8,
1021 0);
1022
1023 /* Update vertex indices in VGPR0 (same format as NGG passthrough). */
1024 LLVMValueRef new_vgpr0 = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
1025
1026 /* Set the null flag at the beginning (culled), and then
1027 * overwrite it for accepted primitives.
1028 */
1029 LLVMBuildStore(builder, LLVMConstInt(ctx->ac.i32, 1u << 31, 0), new_vgpr0);
1030
1031 /* Get vertex indices after vertex compaction. */
1032 ac_build_ifcc(&ctx->ac, LLVMBuildTrunc(builder, gs_accepted, ctx->ac.i1, ""), 16011);
1033 {
1034 struct ac_ngg_prim prim = {};
1035 prim.num_vertices = 3;
1036 prim.isnull = ctx->ac.i1false;
1037
1038 for (unsigned vtx = 0; vtx < 3; vtx++) {
1039 prim.index[vtx] = LLVMBuildLoad(
1040 builder, si_build_gep_i8(ctx, gs_vtxptr[vtx], lds_byte1_new_thread_id), "");
1041 prim.index[vtx] = LLVMBuildZExt(builder, prim.index[vtx], ctx->ac.i32, "");
1042 prim.edgeflag[vtx] = ngg_get_initial_edgeflag(ctx, vtx);
1043 }
1044
1045 /* Set the new GS input VGPR. */
1046 LLVMBuildStore(builder, ac_pack_prim_export(&ctx->ac, &prim), new_vgpr0);
1047 }
1048 ac_build_endif(&ctx->ac, 16011);
1049
1050 if (gfx10_ngg_export_prim_early(shader))
1051 gfx10_ngg_build_export_prim(ctx, NULL, LLVMBuildLoad(builder, new_vgpr0, ""));
1052
1053 /* Set the new ES input VGPRs. */
1054 LLVMValueRef es_data[4];
1055 LLVMValueRef old_thread_id = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
1056
1057 for (unsigned i = 0; i < 4; i++)
1058 es_data[i] = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
1059
1060 ac_build_ifcc(&ctx->ac, LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, tid, new_num_es_threads, ""),
1061 16012);
1062 {
1063 LLVMValueRef old_id, old_es_vtxptr, tmp;
1064
1065 /* Load ES input VGPRs from the ES thread before compaction. */
1066 old_id = LLVMBuildLoad(builder, si_build_gep_i8(ctx, es_vtxptr, lds_byte0_old_thread_id), "");
1067 old_id = LLVMBuildZExt(builder, old_id, ctx->ac.i32, "");
1068
1069 LLVMBuildStore(builder, old_id, old_thread_id);
1070 old_es_vtxptr = ngg_nogs_vertex_ptr(ctx, old_id);
1071
1072 for (unsigned i = 0; i < 2; i++) {
1073 tmp = LLVMBuildLoad(
1074 builder,
1075 ac_build_gep0(&ctx->ac, old_es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_vertex_id + i, 0)),
1076 "");
1077 LLVMBuildStore(builder, tmp, es_data[i]);
1078 }
1079
1080 if (ctx->stage == MESA_SHADER_TESS_EVAL) {
1081 tmp = LLVMBuildLoad(builder,
1082 si_build_gep_i8(ctx, old_es_vtxptr, lds_byte2_tes_rel_patch_id), "");
1083 tmp = LLVMBuildZExt(builder, tmp, ctx->ac.i32, "");
1084 LLVMBuildStore(builder, tmp, es_data[2]);
1085
1086 if (uses_tes_prim_id) {
1087 tmp = LLVMBuildLoad(builder,
1088 ac_build_gep0(&ctx->ac, old_es_vtxptr,
1089 LLVMConstInt(ctx->ac.i32, lds_tes_patch_id, 0)),
1090 "");
1091 LLVMBuildStore(builder, tmp, es_data[3]);
1092 }
1093 }
1094 }
1095 ac_build_endif(&ctx->ac, 16012);
1096
1097 /* Return values for the main function. */
1098 LLVMValueRef ret = ctx->return_value;
1099 LLVMValueRef val;
1100
1101 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, new_gs_tg_info, 2, "");
1102 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, new_merged_wave_info, 3, "");
1103 if (ctx->stage == MESA_SHADER_TESS_EVAL)
1104 ret = si_insert_input_ret(ctx, ret, ctx->tcs_offchip_offset, 4);
1105
1106 ret = si_insert_input_ptr(ctx, ret, ctx->rw_buffers, 8 + SI_SGPR_RW_BUFFERS);
1107 ret = si_insert_input_ptr(ctx, ret, ctx->bindless_samplers_and_images,
1108 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
1109 ret = si_insert_input_ptr(ctx, ret, ctx->const_and_shader_buffers,
1110 8 + SI_SGPR_CONST_AND_SHADER_BUFFERS);
1111 ret = si_insert_input_ptr(ctx, ret, ctx->samplers_and_images, 8 + SI_SGPR_SAMPLERS_AND_IMAGES);
1112 ret = si_insert_input_ptr(ctx, ret, ctx->vs_state_bits, 8 + SI_SGPR_VS_STATE_BITS);
1113
1114 if (ctx->stage == MESA_SHADER_VERTEX) {
1115 ret = si_insert_input_ptr(ctx, ret, ctx->args.base_vertex, 8 + SI_SGPR_BASE_VERTEX);
1116 ret = si_insert_input_ptr(ctx, ret, ctx->args.start_instance, 8 + SI_SGPR_START_INSTANCE);
1117 ret = si_insert_input_ptr(ctx, ret, ctx->args.draw_id, 8 + SI_SGPR_DRAWID);
1118 ret = si_insert_input_ptr(ctx, ret, ctx->vertex_buffers, 8 + SI_VS_NUM_USER_SGPR);
1119
1120 for (unsigned i = 0; i < shader->selector->num_vbos_in_user_sgprs; i++) {
1121 ret = si_insert_input_v4i32(ctx, ret, ctx->vb_descriptors[i],
1122 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST + i * 4);
1123 }
1124 } else {
1125 assert(ctx->stage == MESA_SHADER_TESS_EVAL);
1126 ret = si_insert_input_ptr(ctx, ret, ctx->tcs_offchip_layout, 8 + SI_SGPR_TES_OFFCHIP_LAYOUT);
1127 ret = si_insert_input_ptr(ctx, ret, ctx->tes_offchip_addr, 8 + SI_SGPR_TES_OFFCHIP_ADDR);
1128 }
1129
1130 unsigned vgpr;
1131 if (ctx->stage == MESA_SHADER_VERTEX) {
1132 if (shader->selector->num_vbos_in_user_sgprs) {
1133 vgpr = 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST + shader->selector->num_vbos_in_user_sgprs * 4;
1134 } else {
1135 vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR + 1;
1136 }
1137 } else {
1138 vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR;
1139 }
1140
1141 val = LLVMBuildLoad(builder, new_vgpr0, "");
1142 ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++, "");
1143 vgpr++; /* gs_vtx23_offset */
1144
1145 ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_prim_id, vgpr++);
1146 ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_invocation_id, vgpr++);
1147 vgpr++; /* gs_vtx45_offset */
1148
1149 if (ctx->stage == MESA_SHADER_VERTEX) {
1150 val = LLVMBuildLoad(builder, es_data[0], "");
1151 ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++,
1152 ""); /* VGPR5 - VertexID */
1153 vgpr += 2;
1154 if (uses_instance_id) {
1155 val = LLVMBuildLoad(builder, es_data[1], "");
1156 ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++,
1157 ""); /* VGPR8 - InstanceID */
1158 } else {
1159 vgpr++;
1160 }
1161 } else {
1162 assert(ctx->stage == MESA_SHADER_TESS_EVAL);
1163 unsigned num_vgprs = uses_tes_prim_id ? 4 : 3;
1164 for (unsigned i = 0; i < num_vgprs; i++) {
1165 val = LLVMBuildLoad(builder, es_data[i], "");
1166 ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++, "");
1167 }
1168 if (num_vgprs == 3)
1169 vgpr++;
1170 }
1171 /* Return the old thread ID. */
1172 val = LLVMBuildLoad(builder, old_thread_id, "");
1173 ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++, "");
1174
1175 /* These two also use LDS. */
1176 if (sel->info.writes_edgeflag ||
1177 (ctx->stage == MESA_SHADER_VERTEX && shader->key.mono.u.vs_export_prim_id))
1178 ac_build_s_barrier(&ctx->ac);
1179
1180 ctx->return_value = ret;
1181 }
1182
1183 /**
1184 * Emit the epilogue of an API VS or TES shader compiled as ESGS shader.
1185 */
1186 void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs)
1187 {
1188 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1189 struct si_shader_selector *sel = ctx->shader->selector;
1190 struct si_shader_info *info = &sel->info;
1191 struct si_shader_output_values outputs[PIPE_MAX_SHADER_OUTPUTS];
1192 LLVMBuilderRef builder = ctx->ac.builder;
1193 LLVMValueRef tmp, tmp2;
1194
1195 assert(!ctx->shader->is_gs_copy_shader);
1196 assert(info->num_outputs <= max_outputs);
1197
1198 LLVMValueRef vertex_ptr = NULL;
1199
1200 if (sel->so.num_outputs || sel->info.writes_edgeflag)
1201 vertex_ptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));
1202
1203 for (unsigned i = 0; i < info->num_outputs; i++) {
1204 outputs[i].semantic_name = info->output_semantic_name[i];
1205 outputs[i].semantic_index = info->output_semantic_index[i];
1206
1207 for (unsigned j = 0; j < 4; j++) {
1208 outputs[i].vertex_stream[j] = (info->output_streams[i] >> (2 * j)) & 3;
1209
1210 /* TODO: we may store more outputs than streamout needs,
1211 * but streamout performance isn't that important.
1212 */
1213 if (sel->so.num_outputs) {
1214 tmp = ac_build_gep0(&ctx->ac, vertex_ptr, LLVMConstInt(ctx->ac.i32, 4 * i + j, false));
1215 tmp2 = LLVMBuildLoad(builder, addrs[4 * i + j], "");
1216 tmp2 = ac_to_integer(&ctx->ac, tmp2);
1217 LLVMBuildStore(builder, tmp2, tmp);
1218 }
1219 }
1220
1221 /* Store the edgeflag at the end (if streamout is enabled) */
1222 if (info->output_semantic_name[i] == TGSI_SEMANTIC_EDGEFLAG && sel->info.writes_edgeflag) {
1223 LLVMValueRef edgeflag = LLVMBuildLoad(builder, addrs[4 * i], "");
1224 /* The output is a float, but the hw expects a 1-bit integer. */
1225 edgeflag = LLVMBuildFPToUI(ctx->ac.builder, edgeflag, ctx->ac.i32, "");
1226 edgeflag = ac_build_umin(&ctx->ac, edgeflag, ctx->ac.i32_1);
1227
1228 tmp = LLVMConstInt(ctx->ac.i32, ngg_nogs_vertex_size(ctx->shader) - 1, 0);
1229 tmp = ac_build_gep0(&ctx->ac, vertex_ptr, tmp);
1230 LLVMBuildStore(builder, edgeflag, tmp);
1231 }
1232 }
1233
1234 bool unterminated_es_if_block =
1235 !sel->so.num_outputs && !sel->info.writes_edgeflag &&
1236 !ctx->screen->use_ngg_streamout && /* no query buffer */
1237 (ctx->stage != MESA_SHADER_VERTEX || !ctx->shader->key.mono.u.vs_export_prim_id);
1238
1239 if (!unterminated_es_if_block)
1240 ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
1241
1242 LLVMValueRef is_gs_thread = si_is_gs_thread(ctx);
1243 LLVMValueRef is_es_thread = si_is_es_thread(ctx);
1244 LLVMValueRef vtxindex[3];
1245
1246 if (ctx->shader->key.opt.ngg_culling) {
1247 vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 9);
1248 vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 10, 9);
1249 vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 20, 9);
1250 } else {
1251 vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16);
1252 vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16);
1253 vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16);
1254 }
1255
1256 /* Determine the number of vertices per primitive. */
1257 unsigned num_vertices;
1258 LLVMValueRef num_vertices_val = ngg_get_vertices_per_prim(ctx, &num_vertices);
1259
1260 /* Streamout */
1261 LLVMValueRef emitted_prims = NULL;
1262
1263 if (sel->so.num_outputs) {
1264 assert(!unterminated_es_if_block);
1265
1266 struct ngg_streamout nggso = {};
1267 nggso.num_vertices = num_vertices_val;
1268 nggso.prim_enable[0] = is_gs_thread;
1269
1270 for (unsigned i = 0; i < num_vertices; ++i)
1271 nggso.vertices[i] = ngg_nogs_vertex_ptr(ctx, vtxindex[i]);
1272
1273 build_streamout(ctx, &nggso);
1274 emitted_prims = nggso.emit[0];
1275 }
1276
1277 LLVMValueRef user_edgeflags[3] = {};
1278
1279 if (sel->info.writes_edgeflag) {
1280 assert(!unterminated_es_if_block);
1281
1282 /* Streamout already inserted the barrier, so don't insert it again. */
1283 if (!sel->so.num_outputs)
1284 ac_build_s_barrier(&ctx->ac);
1285
1286 ac_build_ifcc(&ctx->ac, is_gs_thread, 5400);
1287 /* Load edge flags from ES threads and store them into VGPRs in GS threads. */
1288 for (unsigned i = 0; i < num_vertices; i++) {
1289 tmp = ngg_nogs_vertex_ptr(ctx, vtxindex[i]);
1290 tmp2 = LLVMConstInt(ctx->ac.i32, ngg_nogs_vertex_size(ctx->shader) - 1, 0);
1291 tmp = ac_build_gep0(&ctx->ac, tmp, tmp2);
1292 tmp = LLVMBuildLoad(builder, tmp, "");
1293 tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
1294
1295 user_edgeflags[i] = ac_build_alloca_undef(&ctx->ac, ctx->ac.i1, "");
1296 LLVMBuildStore(builder, tmp, user_edgeflags[i]);
1297 }
1298 ac_build_endif(&ctx->ac, 5400);
1299 }
1300
1301 /* Copy Primitive IDs from GS threads to the LDS address corresponding
1302 * to the ES thread of the provoking vertex.
1303 */
1304 if (ctx->stage == MESA_SHADER_VERTEX && ctx->shader->key.mono.u.vs_export_prim_id) {
1305 assert(!unterminated_es_if_block);
1306
1307 /* Streamout and edge flags use LDS. Make it idle, so that we can reuse it. */
1308 if (sel->so.num_outputs || sel->info.writes_edgeflag)
1309 ac_build_s_barrier(&ctx->ac);
1310
1311 ac_build_ifcc(&ctx->ac, is_gs_thread, 5400);
1312 /* Extract the PROVOKING_VTX_INDEX field. */
1313 LLVMValueRef provoking_vtx_in_prim = si_unpack_param(ctx, ctx->vs_state_bits, 4, 2);
1314
1315 /* provoking_vtx_index = vtxindex[provoking_vtx_in_prim]; */
1316 LLVMValueRef indices = ac_build_gather_values(&ctx->ac, vtxindex, 3);
1317 LLVMValueRef provoking_vtx_index =
1318 LLVMBuildExtractElement(builder, indices, provoking_vtx_in_prim, "");
1319 LLVMValueRef vertex_ptr = ngg_nogs_vertex_ptr(ctx, provoking_vtx_index);
1320
1321 LLVMBuildStore(builder, ac_get_arg(&ctx->ac, ctx->args.gs_prim_id),
1322 ac_build_gep0(&ctx->ac, vertex_ptr, ctx->ac.i32_0));
1323 ac_build_endif(&ctx->ac, 5400);
1324 }
1325
1326 /* Update query buffer */
1327 if (ctx->screen->use_ngg_streamout && !info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD]) {
1328 assert(!unterminated_es_if_block);
1329
1330 tmp = si_unpack_param(ctx, ctx->vs_state_bits, 6, 1);
1331 tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
1332 ac_build_ifcc(&ctx->ac, tmp, 5029); /* if (STREAMOUT_QUERY_ENABLED) */
1333 tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, "");
1334 ac_build_ifcc(&ctx->ac, tmp, 5030);
1335 tmp = LLVMBuildICmp(builder, LLVMIntULE, ac_get_thread_id(&ctx->ac),
1336 sel->so.num_outputs ? ctx->ac.i32_1 : ctx->ac.i32_0, "");
1337 ac_build_ifcc(&ctx->ac, tmp, 5031);
1338 {
1339 LLVMValueRef args[] = {
1340 ngg_get_prim_cnt(ctx),
1341 ngg_get_query_buf(ctx),
1342 LLVMConstInt(ctx->ac.i32, 16, false), /* offset of stream[0].generated_primitives */
1343 ctx->ac.i32_0, /* soffset */
1344 ctx->ac.i32_0, /* cachepolicy */
1345 };
1346
1347 if (sel->so.num_outputs) {
1348 args[0] = ac_build_writelane(&ctx->ac, args[0], emitted_prims, ctx->ac.i32_1);
1349 args[2] = ac_build_writelane(&ctx->ac, args[2], LLVMConstInt(ctx->ac.i32, 24, false),
1350 ctx->ac.i32_1);
1351 }
1352
1353 /* TODO: should this be 64-bit atomics? */
1354 ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32", ctx->ac.i32, args, 5,
1355 0);
1356 }
1357 ac_build_endif(&ctx->ac, 5031);
1358 ac_build_endif(&ctx->ac, 5030);
1359 ac_build_endif(&ctx->ac, 5029);
1360 }
1361
1362 /* Build the primitive export. */
1363 if (!gfx10_ngg_export_prim_early(ctx->shader)) {
1364 assert(!unterminated_es_if_block);
1365 gfx10_ngg_build_export_prim(ctx, user_edgeflags, NULL);
1366 }
1367
1368 /* Export per-vertex data (positions and parameters). */
1369 if (!unterminated_es_if_block)
1370 ac_build_ifcc(&ctx->ac, is_es_thread, 6002);
1371 {
1372 unsigned i;
1373
1374 /* Unconditionally (re-)load the values for proper SSA form. */
1375 for (i = 0; i < info->num_outputs; i++) {
1376 /* If the NGG cull shader part computed the position, don't
1377 * use the position from the current shader part. Instead,
1378 * load it from LDS.
1379 */
1380 if (info->output_semantic_name[i] == TGSI_SEMANTIC_POSITION &&
1381 ctx->shader->key.opt.ngg_culling) {
1382 vertex_ptr = ngg_nogs_vertex_ptr(ctx, ac_get_arg(&ctx->ac, ctx->ngg_old_thread_id));
1383
1384 for (unsigned j = 0; j < 4; j++) {
1385 tmp = LLVMConstInt(ctx->ac.i32, lds_pos_x + j, 0);
1386 tmp = ac_build_gep0(&ctx->ac, vertex_ptr, tmp);
1387 tmp = LLVMBuildLoad(builder, tmp, "");
1388 outputs[i].values[j] = ac_to_float(&ctx->ac, tmp);
1389 }
1390 } else {
1391 for (unsigned j = 0; j < 4; j++) {
1392 outputs[i].values[j] = LLVMBuildLoad(builder, addrs[4 * i + j], "");
1393 }
1394 }
1395 }
1396
1397 if (ctx->shader->key.mono.u.vs_export_prim_id) {
1398 outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID;
1399 outputs[i].semantic_index = 0;
1400
1401 if (ctx->stage == MESA_SHADER_VERTEX) {
1402 /* Wait for GS stores to finish. */
1403 ac_build_s_barrier(&ctx->ac);
1404
1405 tmp = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));
1406 tmp = ac_build_gep0(&ctx->ac, tmp, ctx->ac.i32_0);
1407 outputs[i].values[0] = LLVMBuildLoad(builder, tmp, "");
1408 } else {
1409 assert(ctx->stage == MESA_SHADER_TESS_EVAL);
1410 outputs[i].values[0] = si_get_primitive_id(ctx, 0);
1411 }
1412
1413 outputs[i].values[0] = ac_to_float(&ctx->ac, outputs[i].values[0]);
1414 for (unsigned j = 1; j < 4; j++)
1415 outputs[i].values[j] = LLVMGetUndef(ctx->ac.f32);
1416
1417 memset(outputs[i].vertex_stream, 0, sizeof(outputs[i].vertex_stream));
1418 i++;
1419 }
1420
1421 si_llvm_build_vs_exports(ctx, outputs, i);
1422 }
1423 ac_build_endif(&ctx->ac, 6002);
1424 }
1425
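/* View the GS emit ring in LDS as an array of per-vertex structs:
 * {[4 * num_outputs x i32] output values, [4 x i8] per-stream primitive flags}.
 */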
1426 static LLVMValueRef ngg_gs_get_vertex_storage(struct si_shader_context *ctx)
1427 {
1428 const struct si_shader_selector *sel = ctx->shader->selector;
1429 const struct si_shader_info *info = &sel->info;
1430
1431 LLVMTypeRef elements[2] = {
1432 LLVMArrayType(ctx->ac.i32, 4 * info->num_outputs),
1433 LLVMArrayType(ctx->ac.i8, 4),
1434 };
1435 LLVMTypeRef type = LLVMStructTypeInContext(ctx->ac.context, elements, 2, false);
1436 type = LLVMPointerType(LLVMArrayType(type, 0), AC_ADDR_SPACE_LDS);
1437 return LLVMBuildBitCast(ctx->ac.builder, ctx->gs_ngg_emit, type, "");
1438 }
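/* Roughly, the LDS record built above corresponds to this C-level sketch
 * (illustrative only; the real type is assembled through the LLVM API):
 *
 *    struct ngg_gs_lds_vertex {
 *       uint32_t outputs[4 * num_outputs]; // all output channels, AoS
 *       uint8_t primflag[4];               // one flag byte per vertex stream
 *    };
 *
 * ctx->gs_ngg_emit is then viewed as an unbounded LDS array of these records.
 */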
1439
1440 /**
1441 * Return a pointer to the LDS storage reserved for the N'th vertex, where N
1442 * is in emit order; that is:
1443 * - during the epilogue, N is the threadidx (relative to the entire threadgroup)
1444 * - during vertex emit, i.e. while the API GS shader invocation is running,
1445 * N = threadidx * gs_max_out_vertices + emitidx
1446 *
1447 * Goals of the LDS memory layout:
1448 * 1. Eliminate bank conflicts on write for geometry shaders that have all emits
1449 * in uniform control flow
1450 * 2. Eliminate bank conflicts on read for export if, additionally, there is no
1451 * culling
1452 * 3. Agnostic to the number of waves (since we don't know it before compiling)
1453 * 4. Allow coalescing of LDS instructions (ds_write_b128 etc.)
1454 * 5. Avoid wasting memory.
1455 *
1456 * We use an AoS layout due to point 4 (this also helps point 3). In an AoS
1457 * layout, elimination of bank conflicts requires that each vertex occupy an
1458 * odd number of dwords. We use the additional dword to store the output stream
1459 * index as well as a flag to indicate whether this vertex ends a primitive
1460 * for rasterization.
1461 *
1462 * Swizzling is required to satisfy points 1 and 2 simultaneously.
1463 *
1464 * Vertices are stored in export order (gsthread * gs_max_out_vertices + emitidx).
1465 * Indices are swizzled in groups of 32, which ensures point 1 without
1466 * disturbing point 2.
1467 *
1468 * \return an LDS pointer to type {[N x i32], [4 x i8]}
1469 */
1470 static LLVMValueRef ngg_gs_vertex_ptr(struct si_shader_context *ctx, LLVMValueRef vertexidx)
1471 {
1472 struct si_shader_selector *sel = ctx->shader->selector;
1473 LLVMBuilderRef builder = ctx->ac.builder;
1474 LLVMValueRef storage = ngg_gs_get_vertex_storage(ctx);
1475
1476 /* gs_max_out_vertices = 2^(write_stride_2exp) * some odd number */
1477 unsigned write_stride_2exp = ffs(sel->gs_max_out_vertices) - 1;
1478 if (write_stride_2exp) {
1479 LLVMValueRef row = LLVMBuildLShr(builder, vertexidx, LLVMConstInt(ctx->ac.i32, 5, false), "");
1480 LLVMValueRef swizzle = LLVMBuildAnd(
1481 builder, row, LLVMConstInt(ctx->ac.i32, (1u << write_stride_2exp) - 1, false), "");
1482 vertexidx = LLVMBuildXor(builder, vertexidx, swizzle, "");
1483 }
1484
1485 return ac_build_gep0(&ctx->ac, storage, vertexidx);
1486 }
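/* Worked example of the swizzle above (illustrative values): with
 * gs_max_out_vertices == 4, write_stride_2exp == 2. For vertexidx == 33 we
 * get row == 33 >> 5 == 1 and swizzle == 1 & 3 == 1, so the index becomes
 * 33 ^ 1 == 32. Each group of 32 vertices is thus XORed with a different
 * small constant, which is what satisfies points 1 and 2 of the layout
 * comment above.
 */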
1487
1488 static LLVMValueRef ngg_gs_emit_vertex_ptr(struct si_shader_context *ctx, LLVMValueRef gsthread,
1489 LLVMValueRef emitidx)
1490 {
1491 struct si_shader_selector *sel = ctx->shader->selector;
1492 LLVMBuilderRef builder = ctx->ac.builder;
1493 LLVMValueRef tmp;
1494
1495 tmp = LLVMConstInt(ctx->ac.i32, sel->gs_max_out_vertices, false);
1496 tmp = LLVMBuildMul(builder, tmp, gsthread, "");
1497 const LLVMValueRef vertexidx = LLVMBuildAdd(builder, tmp, emitidx, "");
1498 return ngg_gs_vertex_ptr(ctx, vertexidx);
1499 }
1500
1501 static LLVMValueRef ngg_gs_get_emit_output_ptr(struct si_shader_context *ctx,
1502 LLVMValueRef vertexptr, unsigned out_idx)
1503 {
1504 LLVMValueRef gep_idx[3] = {
1505 ctx->ac.i32_0, /* implied C-style array */
1506 ctx->ac.i32_0, /* first struct entry */
1507 LLVMConstInt(ctx->ac.i32, out_idx, false),
1508 };
1509 return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, "");
1510 }
1511
1512 static LLVMValueRef ngg_gs_get_emit_primflag_ptr(struct si_shader_context *ctx,
1513 LLVMValueRef vertexptr, unsigned stream)
1514 {
1515 LLVMValueRef gep_idx[3] = {
1516 ctx->ac.i32_0, /* implied C-style array */
1517 ctx->ac.i32_1, /* second struct entry */
1518 LLVMConstInt(ctx->ac.i32, stream, false),
1519 };
1520 return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, "");
1521 }
1522
1523 void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx, unsigned stream, LLVMValueRef *addrs)
1524 {
1525 const struct si_shader_selector *sel = ctx->shader->selector;
1526 const struct si_shader_info *info = &sel->info;
1527 LLVMBuilderRef builder = ctx->ac.builder;
1528 LLVMValueRef tmp;
1529 const LLVMValueRef vertexidx = LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], "");
1530
1531 /* If this thread has already emitted the declared maximum number of
1532 * vertices, skip the write: excessive vertex emissions are not
1533 * supposed to have any effect.
1534 */
1535 const LLVMValueRef can_emit =
1536 LLVMBuildICmp(builder, LLVMIntULT, vertexidx,
1537 LLVMConstInt(ctx->ac.i32, sel->gs_max_out_vertices, false), "");
1538
1539 tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");
1540 tmp = LLVMBuildSelect(builder, can_emit, tmp, vertexidx, "");
1541 LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]);
1542
1543 ac_build_ifcc(&ctx->ac, can_emit, 9001);
1544
1545 const LLVMValueRef vertexptr = ngg_gs_emit_vertex_ptr(ctx, get_thread_id_in_tg(ctx), vertexidx);
1546 unsigned out_idx = 0;
1547 for (unsigned i = 0; i < info->num_outputs; i++) {
1548 for (unsigned chan = 0; chan < 4; chan++, out_idx++) {
1549 if (!(info->output_usagemask[i] & (1 << chan)) ||
1550 ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
1551 continue;
1552
1553 LLVMValueRef out_val = LLVMBuildLoad(builder, addrs[4 * i + chan], "");
1554 out_val = ac_to_integer(&ctx->ac, out_val);
1555 LLVMBuildStore(builder, out_val, ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx));
1556 }
1557 }
1558 assert(out_idx * 4 == sel->gsvs_vertex_size);
1559
1560 /* Determine and store whether this vertex completed a primitive. */
1561 const LLVMValueRef curverts = LLVMBuildLoad(builder, ctx->gs_curprim_verts[stream], "");
1562
1563 tmp = LLVMConstInt(ctx->ac.i32, u_vertices_per_prim(sel->gs_output_prim) - 1, false);
1564 const LLVMValueRef iscompleteprim = LLVMBuildICmp(builder, LLVMIntUGE, curverts, tmp, "");
1565
1566 /* Since the geometry shader emits triangle strips, we need to
1567 * track which primitive is odd and swap vertex indices to get
1568 * the correct vertex order.
1569 */
1570 LLVMValueRef is_odd = ctx->ac.i1false;
1571 if (stream == 0 && u_vertices_per_prim(sel->gs_output_prim) == 3) {
1572 tmp = LLVMBuildAnd(builder, curverts, ctx->ac.i32_1, "");
1573 is_odd = LLVMBuildICmp(builder, LLVMIntEQ, tmp, ctx->ac.i32_1, "");
1574 }
1575
1576 tmp = LLVMBuildAdd(builder, curverts, ctx->ac.i32_1, "");
1577 LLVMBuildStore(builder, tmp, ctx->gs_curprim_verts[stream]);
1578
1579 /* The per-vertex primitive flag encoding:
1580 * bit 0: whether this vertex finishes a primitive
1581 * bit 1: whether the primitive is odd (if we are emitting triangle strips)
1582 */
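/* Example (illustrative): for a triangle strip on stream 0, the vertex
 * emitted when curverts == 2 completes primitive 0, so it stores 0x1; the
 * vertex emitted when curverts == 3 completes primitive 1 (odd), so it
 * stores 0x3; the first two vertices of the strip store 0x0.
 */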
1583 tmp = LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i8, "");
1584 tmp = LLVMBuildOr(
1585 builder, tmp,
1586 LLVMBuildShl(builder, LLVMBuildZExt(builder, is_odd, ctx->ac.i8, ""), ctx->ac.i8_1, ""), "");
1587 LLVMBuildStore(builder, tmp, ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream));
1588
1589 tmp = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], "");
1590 tmp = LLVMBuildAdd(builder, tmp, LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i32, ""), "");
1591 LLVMBuildStore(builder, tmp, ctx->gs_generated_prims[stream]);
1592
1593 ac_build_endif(&ctx->ac, 9001);
1594 }
1595
1596 void gfx10_ngg_gs_emit_prologue(struct si_shader_context *ctx)
1597 {
1598 /* Zero out the part of LDS scratch that is used to accumulate the
1599 * per-stream generated primitive count.
1600 */
1601 LLVMBuilderRef builder = ctx->ac.builder;
1602 LLVMValueRef scratchptr = ctx->gs_ngg_scratch;
1603 LLVMValueRef tid = get_thread_id_in_tg(ctx);
1604 LLVMValueRef tmp;
1605
1606 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, LLVMConstInt(ctx->ac.i32, 4, false), "");
1607 ac_build_ifcc(&ctx->ac, tmp, 5090);
1608 {
1609 LLVMValueRef ptr = ac_build_gep0(&ctx->ac, scratchptr, tid);
1610 LLVMBuildStore(builder, ctx->ac.i32_0, ptr);
1611 }
1612 ac_build_endif(&ctx->ac, 5090);
1613
1614 ac_build_s_barrier(&ctx->ac);
1615 }
1616
1617 void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx)
1618 {
1619 const struct si_shader_selector *sel = ctx->shader->selector;
1620 const struct si_shader_info *info = &sel->info;
1621 const unsigned verts_per_prim = u_vertices_per_prim(sel->gs_output_prim);
1622 LLVMBuilderRef builder = ctx->ac.builder;
1623 LLVMValueRef i8_0 = LLVMConstInt(ctx->ac.i8, 0, false);
1624 LLVMValueRef tmp, tmp2;
1625
1626 /* Zero out remaining (non-emitted) primitive flags.
1627 *
1628 * Note: Alternatively, we could pass the relevant gs_next_vertex to
1629 * the emit threads via LDS. This is likely worse in the expected
1630 * typical case where each GS thread emits the full set of
1631 * vertices.
1632 */
1633 for (unsigned stream = 0; stream < 4; ++stream) {
1634 if (!info->num_stream_output_components[stream])
1635 continue;
1636
1637 const LLVMValueRef gsthread = get_thread_id_in_tg(ctx);
1638
1639 ac_build_bgnloop(&ctx->ac, 5100);
1640
1641 const LLVMValueRef vertexidx = LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], "");
1642 tmp = LLVMBuildICmp(builder, LLVMIntUGE, vertexidx,
1643 LLVMConstInt(ctx->ac.i32, sel->gs_max_out_vertices, false), "");
1644 ac_build_ifcc(&ctx->ac, tmp, 5101);
1645 ac_build_break(&ctx->ac);
1646 ac_build_endif(&ctx->ac, 5101);
1647
1648 tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");
1649 LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]);
1650
1651 tmp = ngg_gs_emit_vertex_ptr(ctx, gsthread, vertexidx);
1652 LLVMBuildStore(builder, i8_0, ngg_gs_get_emit_primflag_ptr(ctx, tmp, stream));
1653
1654 ac_build_endloop(&ctx->ac, 5100);
1655 }
1656
1657 /* Accumulate the generated-primitive counts across the entire threadgroup. */
1658 for (unsigned stream = 0; stream < 4; ++stream) {
1659 if (!info->num_stream_output_components[stream])
1660 continue;
1661
1662 LLVMValueRef numprims = LLVMBuildLoad(builder, ctx->gs_generated_prims[stream], "");
1663 numprims = ac_build_reduce(&ctx->ac, numprims, nir_op_iadd, ctx->ac.wave_size);
1664
1665 tmp = LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(&ctx->ac), ctx->ac.i32_0, "");
1666 ac_build_ifcc(&ctx->ac, tmp, 5105);
1667 {
1668 LLVMBuildAtomicRMW(
1669 builder, LLVMAtomicRMWBinOpAdd,
1670 ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, LLVMConstInt(ctx->ac.i32, stream, false)),
1671 numprims, LLVMAtomicOrderingMonotonic, false);
1672 }
1673 ac_build_endif(&ctx->ac, 5105);
1674 }
1675
1676 ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
1677
1678 ac_build_s_barrier(&ctx->ac);
1679
1680 const LLVMValueRef tid = get_thread_id_in_tg(ctx);
1681 LLVMValueRef num_emit_threads = ngg_get_prim_cnt(ctx);
1682
1683 /* Streamout */
1684 if (sel->so.num_outputs) {
1685 struct ngg_streamout nggso = {};
1686
1687 nggso.num_vertices = LLVMConstInt(ctx->ac.i32, verts_per_prim, false);
1688
1689 LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tid);
1690 for (unsigned stream = 0; stream < 4; ++stream) {
1691 if (!info->num_stream_output_components[stream])
1692 continue;
1693
1694 tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream), "");
1695 tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
1696 tmp2 = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
1697 nggso.prim_enable[stream] = LLVMBuildAnd(builder, tmp, tmp2, "");
1698 }
1699
1700 for (unsigned i = 0; i < verts_per_prim; ++i) {
1701 tmp = LLVMBuildSub(builder, tid, LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false),
1702 "");
1703 tmp = ngg_gs_vertex_ptr(ctx, tmp);
1704 nggso.vertices[i] = ac_build_gep0(&ctx->ac, tmp, ctx->ac.i32_0);
1705 }
1706
1707 build_streamout(ctx, &nggso);
1708 }
1709
1710 /* Write shader query data. */
1711 if (ctx->screen->use_ngg_streamout) {
1712 tmp = si_unpack_param(ctx, ctx->vs_state_bits, 6, 1);
1713 tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
1714 ac_build_ifcc(&ctx->ac, tmp, 5109); /* if (STREAMOUT_QUERY_ENABLED) */
1715 unsigned num_query_comps = sel->so.num_outputs ? 8 : 4;
1716 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid,
1717 LLVMConstInt(ctx->ac.i32, num_query_comps, false), "");
1718 ac_build_ifcc(&ctx->ac, tmp, 5110);
1719 {
1720 LLVMValueRef offset;
1721 tmp = tid;
1722 if (sel->so.num_outputs)
1723 tmp = LLVMBuildAnd(builder, tmp, LLVMConstInt(ctx->ac.i32, 3, false), "");
1724 offset = LLVMBuildNUWMul(builder, tmp, LLVMConstInt(ctx->ac.i32, 32, false), "");
1725 if (sel->so.num_outputs) {
1726 tmp = LLVMBuildLShr(builder, tid, LLVMConstInt(ctx->ac.i32, 2, false), "");
1727 tmp = LLVMBuildNUWMul(builder, tmp, LLVMConstInt(ctx->ac.i32, 8, false), "");
1728 offset = LLVMBuildAdd(builder, offset, tmp, "");
1729 }
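/* Illustrative arithmetic: with streamout enabled, tid == 5 yields
 * offset == (5 & 3) * 32 + (5 >> 2) * 8 == 40; the constant 16 passed as
 * soffset below is added on top of this by the buffer atomic.
 */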
1730
1731 tmp = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid), "");
1732 LLVMValueRef args[] = {
1733 tmp, ngg_get_query_buf(ctx),
1734 offset, LLVMConstInt(ctx->ac.i32, 16, false), /* soffset */
1735 ctx->ac.i32_0, /* cachepolicy */
1736 };
1737 ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.raw.buffer.atomic.add.i32", ctx->ac.i32, args, 5,
1738 0);
1739 }
1740 ac_build_endif(&ctx->ac, 5110);
1741 ac_build_endif(&ctx->ac, 5109);
1742 }
1743
1744 /* Determine vertex liveness. */
1745 LLVMValueRef vertliveptr = ac_build_alloca(&ctx->ac, ctx->ac.i1, "vertexlive");
1746
1747 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
1748 ac_build_ifcc(&ctx->ac, tmp, 5120);
1749 {
1750 for (unsigned i = 0; i < verts_per_prim; ++i) {
1751 const LLVMValueRef primidx =
1752 LLVMBuildAdd(builder, tid, LLVMConstInt(ctx->ac.i32, i, false), "");
1753
1754 if (i > 0) {
1755 tmp = LLVMBuildICmp(builder, LLVMIntULT, primidx, num_emit_threads, "");
1756 ac_build_ifcc(&ctx->ac, tmp, 5121 + i);
1757 }
1758
1759 /* Load primitive liveness */
1760 tmp = ngg_gs_vertex_ptr(ctx, primidx);
1761 tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), "");
1762 const LLVMValueRef primlive = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
1763
1764 tmp = LLVMBuildLoad(builder, vertliveptr, "");
1765 tmp = LLVMBuildOr(builder, tmp, primlive, ""); LLVMBuildStore(builder, tmp, vertliveptr);
1766
1767 if (i > 0)
1768 ac_build_endif(&ctx->ac, 5121 + i);
1769 }
1770 }
1771 ac_build_endif(&ctx->ac, 5120);
1772
1773 /* Exclusive scan addition (plus the reduced total) across the entire threadgroup. */
1774 LLVMValueRef vertlive = LLVMBuildLoad(builder, vertliveptr, "");
1775 struct ac_wg_scan vertlive_scan = {};
1776 vertlive_scan.op = nir_op_iadd;
1777 vertlive_scan.enable_reduce = true;
1778 vertlive_scan.enable_exclusive = true;
1779 vertlive_scan.src = vertlive;
1780 vertlive_scan.scratch = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ctx->ac.i32_0);
1781 vertlive_scan.waveidx = get_wave_id_in_tg(ctx);
1782 vertlive_scan.numwaves = get_tgsize(ctx);
1783 vertlive_scan.maxwaves = 8;
1784
1785 ac_build_wg_scan(&ctx->ac, &vertlive_scan);
1786
1787 /* Skip all exports (including index exports) when possible. At least on
1788 * early gfx10 revisions this is also needed to avoid hangs.
1789 */
1790 LLVMValueRef have_exports =
1791 LLVMBuildICmp(builder, LLVMIntNE, vertlive_scan.result_reduce, ctx->ac.i32_0, "");
1792 num_emit_threads = LLVMBuildSelect(builder, have_exports, num_emit_threads, ctx->ac.i32_0, "");
1793
1794 /* Allocate export space. Send this message as early as possible, to
1795 * hide the latency of the SQ <-> SPI roundtrip.
1796 *
1797 * Note: We could consider compacting primitives for export as well.
1798 * PA processes 1 non-null prim / clock, but it fetches 4 DW of
1799 * prim data per clock and skips null primitives at no additional
1800 * cost. So compacting primitives can only be beneficial when
1801 * there are 4 or more contiguous null primitives in the export
1802 * (in the common case of single-dword prim exports).
1803 */
1804 ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), vertlive_scan.result_reduce,
1805 num_emit_threads);
1806
1807 /* Set up the reverse vertex compaction permutation. We reuse stream 1
1808 * of the primitive liveness flags, relying on the fact that each
1809 * threadgroup can have at most 256 threads. */
1810 ac_build_ifcc(&ctx->ac, vertlive, 5130);
1811 {
1812 tmp = ngg_gs_vertex_ptr(ctx, vertlive_scan.result_exclusive);
1813 tmp2 = LLVMBuildTrunc(builder, tid, ctx->ac.i8, "");
1814 LLVMBuildStore(builder, tmp2, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1));
1815 }
1816 ac_build_endif(&ctx->ac, 5130);
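/* Illustrative example of the compaction permutation: if the live vertices
 * belong to threads 0, 2 and 5, their exclusive scan results are 0, 1 and 2,
 * so the i8 values 0, 2 and 5 are stored at compacted slots 0, 1 and 2. The
 * vertex export block further down reads this byte back (stream 1 of the
 * primflags) to find which original thread produced compacted vertex N.
 */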
1817
1818 ac_build_s_barrier(&ctx->ac);
1819
1820 /* Export primitive data */
1821 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
1822 ac_build_ifcc(&ctx->ac, tmp, 5140);
1823 {
1824 LLVMValueRef flags;
1825 struct ac_ngg_prim prim = {};
1826 prim.num_vertices = verts_per_prim;
1827
1828 tmp = ngg_gs_vertex_ptr(ctx, tid);
1829 flags = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), "");
1830 prim.isnull = LLVMBuildNot(builder, LLVMBuildTrunc(builder, flags, ctx->ac.i1, ""), "");
1831
1832 for (unsigned i = 0; i < verts_per_prim; ++i) {
1833 prim.index[i] = LLVMBuildSub(builder, vertlive_scan.result_exclusive,
1834 LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false), "");
1835 prim.edgeflag[i] = ctx->ac.i1false;
1836 }
1837
1838 /* Geometry shaders output triangle strips, but NGG expects triangles. */
1839 if (verts_per_prim == 3) {
1840 LLVMValueRef is_odd = LLVMBuildLShr(builder, flags, ctx->ac.i8_1, "");
1841 is_odd = LLVMBuildTrunc(builder, is_odd, ctx->ac.i1, "");
1842 LLVMValueRef flatshade_first = LLVMBuildICmp(
1843 builder, LLVMIntEQ, si_unpack_param(ctx, ctx->vs_state_bits, 4, 2), ctx->ac.i32_0, "");
1844
1845 ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd, flatshade_first, prim.index);
1846 }
1847
1848 ac_build_export_prim(&ctx->ac, &prim);
1849 }
1850 ac_build_endif(&ctx->ac, 5140);
1851
1852 /* Export position and parameter data */
1853 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, vertlive_scan.result_reduce, "");
1854 ac_build_ifcc(&ctx->ac, tmp, 5145);
1855 {
1856 struct si_shader_output_values outputs[PIPE_MAX_SHADER_OUTPUTS];
1857
1858 tmp = ngg_gs_vertex_ptr(ctx, tid);
1859 tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1), "");
1860 tmp = LLVMBuildZExt(builder, tmp, ctx->ac.i32, "");
1861 const LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tmp);
1862
1863 unsigned out_idx = 0;
1864 for (unsigned i = 0; i < info->num_outputs; i++) {
1865 outputs[i].semantic_name = info->output_semantic_name[i];
1866 outputs[i].semantic_index = info->output_semantic_index[i];
1867
1868 for (unsigned j = 0; j < 4; j++, out_idx++) {
1869 tmp = ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx);
1870 tmp = LLVMBuildLoad(builder, tmp, "");
1871 outputs[i].values[j] = ac_to_float(&ctx->ac, tmp);
1872 outputs[i].vertex_stream[j] = (info->output_streams[i] >> (2 * j)) & 3;
1873 }
1874 }
1875
1876 si_llvm_build_vs_exports(ctx, outputs, info->num_outputs);
1877 }
1878 ac_build_endif(&ctx->ac, 5145);
1879 }
1880
1881 static void clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts,
1882 unsigned min_verts_per_prim, bool use_adjacency)
1883 {
1884 unsigned max_reuse = max_esverts - min_verts_per_prim;
1885 if (use_adjacency)
1886 max_reuse /= 2;
1887 *max_gsprims = MIN2(*max_gsprims, 1 + max_reuse);
1888 }
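/* Illustrative numbers: with max_esverts == 128 and min_verts_per_prim == 3,
 * max_reuse == 125 without adjacency, clamping *max_gsprims to at most 126;
 * with adjacency the reuse budget halves to 62, clamping it to at most 63.
 */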
1889
1890 unsigned gfx10_ngg_get_scratch_dw_size(struct si_shader *shader)
1891 {
1892 const struct si_shader_selector *sel = shader->selector;
1893
1894 if (sel->info.stage == MESA_SHADER_GEOMETRY && sel->so.num_outputs)
1895 return 44;
1896
1897 return 8;
1898 }
1899
1900 /**
1901 * Determine subgroup information like maximum number of vertices and prims.
1902 *
1903 * This happens before the shader is uploaded, since LDS relocations during
1904 * upload depend on the subgroup size.
1905 */
1906 bool gfx10_ngg_calculate_subgroup_info(struct si_shader *shader)
1907 {
1908 const struct si_shader_selector *gs_sel = shader->selector;
1909 const struct si_shader_selector *es_sel =
1910 shader->previous_stage_sel ? shader->previous_stage_sel : gs_sel;
1911 const gl_shader_stage gs_stage = gs_sel->info.stage;
1912 const unsigned gs_num_invocations = MAX2(gs_sel->gs_num_invocations, 1);
1913 const unsigned input_prim = si_get_input_prim(gs_sel);
1914 const bool use_adjacency =
1915 input_prim >= PIPE_PRIM_LINES_ADJACENCY && input_prim <= PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY;
1916 const unsigned max_verts_per_prim = u_vertices_per_prim(input_prim);
1917 const unsigned min_verts_per_prim = gs_stage == MESA_SHADER_GEOMETRY ? max_verts_per_prim : 1;
1918
1919 /* All these are in dwords: */
1920 /* GE can only use 8K dwords (32KB) of LDS per workgroup.
1921 */
1922 const unsigned max_lds_size = 8 * 1024 - gfx10_ngg_get_scratch_dw_size(shader);
1923 const unsigned target_lds_size = max_lds_size;
1924 unsigned esvert_lds_size = 0;
1925 unsigned gsprim_lds_size = 0;
1926
1927 /* All these are per subgroup: */
1928 const unsigned min_esverts = gs_sel->screen->info.chip_class >= GFX10_3 ? 29 : 24;
1929 bool max_vert_out_per_gs_instance = false;
1930 unsigned max_gsprims_base = 128; /* default prim group size clamp */
1931 unsigned max_esverts_base = 128;
1932
1933 if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) {
1934 max_gsprims_base = 128 / 3;
1935 max_esverts_base = max_gsprims_base * 3;
1936 } else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) {
1937 max_gsprims_base = 126;
1938 max_esverts_base = 128;
1939 }
1940
1941 /* Hardware has the following non-natural restrictions on the value
1942 * of GE_CNTL.VERT_GRP_SIZE based on the primitive type of
1943 * the draw:
1944 * - at most 252 for any line input primitive type
1945 * - at most 251 for any quad input primitive type
1946 * - at most 251 for triangle strips with adjacency (this happens to
1947 * be the natural limit for triangle *lists* with adjacency)
1948 */
1949 max_esverts_base = MIN2(max_esverts_base, 251 + max_verts_per_prim - 1);
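/* Since hw_max_esverts is derived as max_esverts - max_verts_per_prim + 1
 * (see the end of gfx10_ngg_calculate_subgroup_info), this clamp is assumed
 * to keep the effective VERT_GRP_SIZE at 251 or below for every input
 * primitive type, which conservatively covers the three limits listed above.
 */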
1950
1951 if (gs_stage == MESA_SHADER_GEOMETRY) {
1952 bool force_multi_cycling = false;
1953 unsigned max_out_verts_per_gsprim = gs_sel->gs_max_out_vertices * gs_num_invocations;
1954
1955 retry_select_mode:
1956 if (max_out_verts_per_gsprim <= 256 && !force_multi_cycling) {
1957 if (max_out_verts_per_gsprim) {
1958 max_gsprims_base = MIN2(max_gsprims_base, 256 / max_out_verts_per_gsprim);
1959 }
1960 } else {
1961 /* Use special multi-cycling mode in which each GS
1962 * instance gets its own subgroup. Does not work with
1963 * tessellation. */
1964 max_vert_out_per_gs_instance = true;
1965 max_gsprims_base = 1;
1966 max_out_verts_per_gsprim = gs_sel->gs_max_out_vertices;
1967 }
1968
1969 esvert_lds_size = es_sel->esgs_itemsize / 4;
1970 gsprim_lds_size = (gs_sel->gsvs_vertex_size / 4 + 1) * max_out_verts_per_gsprim;
1971
1972 if (gsprim_lds_size > target_lds_size && !force_multi_cycling) {
1973 if (gs_sel->tess_turns_off_ngg || es_sel->info.stage != MESA_SHADER_TESS_EVAL) {
1974 force_multi_cycling = true;
1975 goto retry_select_mode;
1976 }
1977 }
1978 } else {
1979 /* VS and TES. */
1980 /* LDS size for passing data from ES to GS. */
1981 esvert_lds_size = ngg_nogs_vertex_size(shader);
1982 }
1983
1984 unsigned max_gsprims = max_gsprims_base;
1985 unsigned max_esverts = max_esverts_base;
1986
1987 if (esvert_lds_size)
1988 max_esverts = MIN2(max_esverts, target_lds_size / esvert_lds_size);
1989 if (gsprim_lds_size)
1990 max_gsprims = MIN2(max_gsprims, target_lds_size / gsprim_lds_size);
1991
1992 max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
1993 clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency);
1994 assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
1995
1996 if (esvert_lds_size || gsprim_lds_size) {
1997 /* Now that we have a rough proportionality between esverts
1998 * and gsprims based on the primitive type, scale both of them
1999 * down simultaneously based on required LDS space.
2000 *
2001 * We could be smarter about this if we knew how much vertex
2002 * reuse to expect.
2003 */
2004 unsigned lds_total = max_esverts * esvert_lds_size + max_gsprims * gsprim_lds_size;
2005 if (lds_total > target_lds_size) {
2006 max_esverts = max_esverts * target_lds_size / lds_total;
2007 max_gsprims = max_gsprims * target_lds_size / lds_total;
2008
2009 max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
2010 clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency);
2011 assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
2012 }
2013 }
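/* Worked example (illustrative values only): suppose at this point
 * target_lds_size == 8184 dwords, esvert_lds_size == 16, gsprim_lds_size
 * == 100, max_esverts == 128 and max_gsprims == 64. Then lds_total ==
 * 2048 + 6400 == 8448 > 8184, so both limits are scaled down:
 * max_esverts becomes 128 * 8184 / 8448 == 124 and max_gsprims becomes
 * 64 * 8184 / 8448 == 62 before the re-clamping above.
 */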
2014
2015 /* Round up towards full wave sizes for better ALU utilization. */
2016 if (!max_vert_out_per_gs_instance) {
2017 const unsigned wavesize = si_get_shader_wave_size(shader);
2018 unsigned orig_max_esverts;
2019 unsigned orig_max_gsprims;
2020 do {
2021 orig_max_esverts = max_esverts;
2022 orig_max_gsprims = max_gsprims;
2023
2024 max_esverts = align(max_esverts, wavesize);
2025 max_esverts = MIN2(max_esverts, max_esverts_base);
2026 if (esvert_lds_size)
2027 max_esverts =
2028 MIN2(max_esverts, (max_lds_size - max_gsprims * gsprim_lds_size) / esvert_lds_size);
2029 max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
2030 /* Hardware restriction: minimum value of max_esverts */
2031 max_esverts = MAX2(max_esverts, min_esverts - 1 + max_verts_per_prim);
2032
2033 max_gsprims = align(max_gsprims, wavesize);
2034 max_gsprims = MIN2(max_gsprims, max_gsprims_base);
2035 if (gsprim_lds_size) {
2036 /* Don't count unusable vertices toward the LDS size. Those are vertices above
2037 * the maximum number of vertices that can occur in the workgroup,
2038 * which is e.g. max_gsprims * 3 for triangles.
2039 */
2040 unsigned usable_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
2041 max_gsprims =
2042 MIN2(max_gsprims, (max_lds_size - usable_esverts * esvert_lds_size) / gsprim_lds_size);
2043 }
2044 clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency);
2045 assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
2046 } while (orig_max_esverts != max_esverts || orig_max_gsprims != max_gsprims);
2047
2048 /* Verify the restriction. */
2049 assert(max_esverts >= min_esverts - 1 + max_verts_per_prim);
2050 } else {
2051 /* Hardware restriction: minimum value of max_esverts */
2052 max_esverts = MAX2(max_esverts, min_esverts - 1 + max_verts_per_prim);
2053 }
2054
2055 unsigned max_out_vertices =
2056 max_vert_out_per_gs_instance
2057 ? gs_sel->gs_max_out_vertices
2058 : gs_stage == MESA_SHADER_GEOMETRY
2059 ? max_gsprims * gs_num_invocations * gs_sel->gs_max_out_vertices
2060 : max_esverts;
2061 assert(max_out_vertices <= 256);
2062
2063 unsigned prim_amp_factor = 1;
2064 if (gs_stage == MESA_SHADER_GEOMETRY) {
2065 /* Number of output primitives per GS input primitive after
2066 * GS instancing. */
2067 prim_amp_factor = gs_sel->gs_max_out_vertices;
2068 }
2069
2070 /* The GE only checks against the maximum number of ES verts after
2071 * allocating a full GS primitive. So we need to ensure that whenever
2072 * this check passes, there is enough space for a full primitive without
2073 * vertex reuse.
2074 */
2075 shader->ngg.hw_max_esverts = max_esverts - max_verts_per_prim + 1;
2076 shader->ngg.max_gsprims = max_gsprims;
2077 shader->ngg.max_out_verts = max_out_vertices;
2078 shader->ngg.prim_amp_factor = prim_amp_factor;
2079 shader->ngg.max_vert_out_per_gs_instance = max_vert_out_per_gs_instance;
2080
2081 /* Don't count unusable vertices. */
2082 shader->gs_info.esgs_ring_size = MIN2(max_esverts, max_gsprims * max_verts_per_prim) *
2083 esvert_lds_size;
2084 shader->ngg.ngg_emit_size = max_gsprims * gsprim_lds_size;
2085
2086 assert(shader->ngg.hw_max_esverts >= min_esverts); /* HW limitation */
2087
2088 /* If asserts are disabled, report failure via the return value using the same conditions. */
2089 return max_esverts >= max_verts_per_prim && max_gsprims >= 1 &&
2090 max_out_vertices <= 256 &&
2091 shader->ngg.hw_max_esverts >= min_esverts;
2092 }