radeonsi/gfx10: generate geometry shaders for NGG
[mesa.git] / src / gallium / drivers / radeonsi / gfx10_shader_ngg.c
1 /*
2 * Copyright 2017 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 */
23
24 #include "si_pipe.h"
25 #include "si_shader_internal.h"
26
27 #include "sid.h"
28
29 #include "util/u_memory.h"
30 #include "util/u_prim.h"
31
32 static LLVMValueRef get_wave_id_in_tg(struct si_shader_context *ctx)
33 {
34 return si_unpack_param(ctx, ctx->param_merged_wave_info, 24, 4);
35 }
36
37 static LLVMValueRef get_tgsize(struct si_shader_context *ctx)
38 {
39 return si_unpack_param(ctx, ctx->param_merged_wave_info, 28, 4);
40 }
41
42 static LLVMValueRef get_thread_id_in_tg(struct si_shader_context *ctx)
43 {
44 LLVMBuilderRef builder = ctx->ac.builder;
45 LLVMValueRef tmp;
46 tmp = LLVMBuildMul(builder, get_wave_id_in_tg(ctx),
47 LLVMConstInt(ctx->ac.i32, 64, false), "");
48 return LLVMBuildAdd(builder, tmp, ac_get_thread_id(&ctx->ac), "");
49 }
50
51 static LLVMValueRef ngg_get_vtx_cnt(struct si_shader_context *ctx)
52 {
53 return ac_build_bfe(&ctx->ac, ctx->gs_tg_info,
54 LLVMConstInt(ctx->ac.i32, 12, false),
55 LLVMConstInt(ctx->ac.i32, 9, false),
56 false);
57 }
58
59 static LLVMValueRef ngg_get_prim_cnt(struct si_shader_context *ctx)
60 {
61 return ac_build_bfe(&ctx->ac, ctx->gs_tg_info,
62 LLVMConstInt(ctx->ac.i32, 22, false),
63 LLVMConstInt(ctx->ac.i32, 9, false),
64 false);
65 }
66
67 /* Send GS Alloc Req message from the first wave of the group to SPI.
68 * Message payload is:
69 * - bits 0..10: vertices in group
70 * - bits 12..22: primitives in group
71 */
72 static void build_sendmsg_gs_alloc_req(struct si_shader_context *ctx,
73 LLVMValueRef vtx_cnt,
74 LLVMValueRef prim_cnt)
75 {
76 LLVMBuilderRef builder = ctx->ac.builder;
77 LLVMValueRef tmp;
78
79 tmp = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, "");
80 ac_build_ifcc(&ctx->ac, tmp, 5020);
81
82 tmp = LLVMBuildShl(builder, prim_cnt, LLVMConstInt(ctx->ac.i32, 12, false),"");
83 tmp = LLVMBuildOr(builder, tmp, vtx_cnt, "");
84 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_ALLOC_REQ, tmp);
85
86 ac_build_endif(&ctx->ac, 5020);
87 }
88
/* Data for one primitive export; see build_export_prim for how these
 * fields are packed into the export dword. */
struct ngg_prim {
	unsigned num_vertices;     /* vertices per primitive; only index[0..num_vertices-1] are used */
	LLVMValueRef isnull;       /* i1: true = null primitive (skipped by PA) */
	LLVMValueRef index[3];     /* i32 vertex indices */
	LLVMValueRef edgeflag[3];  /* i1 edge flags, one per vertex */
};
95
/* Emit a primitive export, packing the vertex indices, edge flags and the
 * null-primitive bit of \p prim into a single dword:
 * bits 10*i..10*i+8 = index i, bit 10*i+9 = edge flag i, bit 31 = isnull.
 */
static void build_export_prim(struct si_shader_context *ctx,
			      const struct ngg_prim *prim)
{
	LLVMBuilderRef builder = ctx->ac.builder;
	struct ac_export_args args;
	LLVMValueRef tmp;

	/* Bit 31: null primitive (skip). */
	tmp = LLVMBuildZExt(builder, prim->isnull, ctx->ac.i32, "");
	args.out[0] = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->ac.i32, 31, false), "");

	for (unsigned i = 0; i < prim->num_vertices; ++i) {
		/* Bits 10*i .. 10*i+8: vertex index i. */
		tmp = LLVMBuildShl(builder, prim->index[i],
				   LLVMConstInt(ctx->ac.i32, 10 * i, false), "");
		args.out[0] = LLVMBuildOr(builder, args.out[0], tmp, "");
		/* Bit 10*i+9: edge flag i. */
		tmp = LLVMBuildZExt(builder, prim->edgeflag[i], ctx->ac.i32, "");
		tmp = LLVMBuildShl(builder, tmp,
				   LLVMConstInt(ctx->ac.i32, 10 * i + 9, false), "");
		args.out[0] = LLVMBuildOr(builder, args.out[0], tmp, "");
	}

	/* Single-dword export: only channel 0 is meaningful. */
	args.out[0] = LLVMBuildBitCast(builder, args.out[0], ctx->ac.f32, "");
	args.out[1] = LLVMGetUndef(ctx->ac.f32);
	args.out[2] = LLVMGetUndef(ctx->ac.f32);
	args.out[3] = LLVMGetUndef(ctx->ac.f32);

	args.target = V_008DFC_SQ_EXP_PRIM;
	args.enabled_channels = 1;
	args.done = true;
	args.valid_mask = false;
	args.compr = false;

	ac_build_export(&ctx->ac, &args);
}
129
/**
 * Emit the epilogue of an API VS or TES shader compiled as ESGS shader.
 *
 * Sends the GS_ALLOC_REQ message, exports primitive connectivity data from
 * GS threads, and exports positions/parameters from ES threads.
 *
 * \param abi          shader ABI; must wrap an si_shader_context
 * \param max_outputs  upper bound on the number of output slots in \p addrs
 * \param addrs        output value pointers, 4 channels per output slot
 */
void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi,
			     unsigned max_outputs,
			     LLVMValueRef *addrs)
{
	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	struct si_shader_output_values *outputs = NULL;
	LLVMBuilderRef builder = ctx->ac.builder;
	struct lp_build_if_state if_state;
	LLVMValueRef tmp;

	assert(!ctx->shader->is_gs_copy_shader);
	assert(info->num_outputs <= max_outputs);

	/* +1 slot reserved for the optional PRIMID output appended below. */
	outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));

	for (unsigned i = 0; i < info->num_outputs; i++) {
		outputs[i].semantic_name = info->output_semantic_name[i];
		outputs[i].semantic_index = info->output_semantic_index[i];

		/* This is used only by streamout. */
		for (unsigned j = 0; j < 4; j++) {
			outputs[i].values[j] =
				LLVMBuildLoad(builder,
					      addrs[4 * i + j],
					      "");
			outputs[i].vertex_stream[j] =
				(info->output_streams[i] >> (2 * j)) & 3;
		}
	}

	/* Leave the conditional wrapping the merged ES part; the code below
	 * runs for all threads of the merged shader. */
	lp_build_endif(&ctx->merged_wrap_if_state);

	/* Per-wave primitive/vertex counts come from the merged wave info SGPR. */
	LLVMValueRef prims_in_wave = si_unpack_param(ctx, ctx->param_merged_wave_info, 8, 8);
	LLVMValueRef vtx_in_wave = si_unpack_param(ctx, ctx->param_merged_wave_info, 0, 8);
	LLVMValueRef is_gs_thread = LLVMBuildICmp(builder, LLVMIntULT,
						  ac_get_thread_id(&ctx->ac), prims_in_wave, "");
	LLVMValueRef is_es_thread = LLVMBuildICmp(builder, LLVMIntULT,
						  ac_get_thread_id(&ctx->ac), vtx_in_wave, "");
	/* Vertex indices of this thread's primitive, packed in GS VGPRs. */
	LLVMValueRef vtxindex[] = {
		si_unpack_param(ctx, ctx->param_gs_vtx01_offset, 0, 16),
		si_unpack_param(ctx, ctx->param_gs_vtx01_offset, 16, 16),
		si_unpack_param(ctx, ctx->param_gs_vtx23_offset, 0, 16),
	};

	/* Determine the number of vertices per primitive. */
	unsigned num_vertices;
	/* NOTE(review): num_vertices_val is computed but not consumed in this
	 * function yet — presumably reserved for the culling TODO below. */
	LLVMValueRef num_vertices_val;

	if (ctx->type == PIPE_SHADER_VERTEX) {
		if (info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS]) {
			/* Blits always use axis-aligned rectangles with 3 vertices. */
			num_vertices = 3;
			num_vertices_val = LLVMConstInt(ctx->i32, 3, 0);
		} else {
			/* Extract OUTPRIM field. */
			tmp = si_unpack_param(ctx, ctx->param_vs_state_bits, 2, 2);
			num_vertices_val = LLVMBuildAdd(builder, tmp, ctx->i32_1, "");
			num_vertices = 3; /* TODO: optimize for points & lines */
		}
	} else {
		assert(ctx->type == PIPE_SHADER_TESS_EVAL);

		if (info->properties[TGSI_PROPERTY_TES_POINT_MODE])
			num_vertices = 1;
		else if (info->properties[TGSI_PROPERTY_TES_PRIM_MODE] == PIPE_PRIM_LINES)
			num_vertices = 2;
		else
			num_vertices = 3;

		num_vertices_val = LLVMConstInt(ctx->i32, num_vertices, false);
	}

	/* TODO: streamout */

	/* TODO: primitive culling */

	/* Request export space for the whole threadgroup (wave 0 only sends). */
	build_sendmsg_gs_alloc_req(ctx, ngg_get_vtx_cnt(ctx), ngg_get_prim_cnt(ctx));

	/* Export primitive data to the index buffer. Format is:
	 * - bits 0..8: index 0
	 * - bit 9: edge flag 0
	 * - bits 10..18: index 1
	 * - bit 19: edge flag 1
	 * - bits 20..28: index 2
	 * - bit 29: edge flag 2
	 * - bit 31: null primitive (skip)
	 *
	 * For the first version, we will always build up all three indices
	 * independent of the primitive type. The additional garbage data
	 * shouldn't hurt.
	 *
	 * TODO: culling depends on the primitive type, so can have some
	 * interaction here.
	 */
	lp_build_if(&if_state, &ctx->gallivm, is_gs_thread);
	{
		struct ngg_prim prim = {};

		prim.num_vertices = num_vertices;
		prim.isnull = ctx->ac.i1false;
		memcpy(prim.index, vtxindex, sizeof(vtxindex[0]) * 3);

		/* Edge flags arrive in bits 8..10 of gs_invocation_id. */
		for (unsigned i = 0; i < num_vertices; ++i) {
			tmp = LLVMBuildLShr(builder, ctx->abi.gs_invocation_id,
					    LLVMConstInt(ctx->ac.i32, 8 + i, false), "");
			prim.edgeflag[i] = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
		}

		build_export_prim(ctx, &prim);
	}
	lp_build_endif(&if_state);

	/* Export per-vertex data (positions and parameters). */
	lp_build_if(&if_state, &ctx->gallivm, is_es_thread);
	{
		unsigned i;

		/* Unconditionally (re-)load the values for proper SSA form. */
		for (i = 0; i < info->num_outputs; i++) {
			for (unsigned j = 0; j < 4; j++) {
				outputs[i].values[j] =
					LLVMBuildLoad(builder,
						      addrs[4 * i + j],
						      "");
			}
		}

		/* TODO: Vertex shaders have to get PrimitiveID from GS VGPRs. */
		if (ctx->type == PIPE_SHADER_TESS_EVAL &&
		    ctx->shader->key.mono.u.vs_export_prim_id) {
			/* Append PRIMID as an extra output (uses the +1 slot). */
			outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID;
			outputs[i].semantic_index = 0;
			outputs[i].values[0] = ac_to_float(&ctx->ac, si_get_primitive_id(ctx, 0));
			for (unsigned j = 1; j < 4; j++)
				outputs[i].values[j] = LLVMGetUndef(ctx->f32);

			memset(outputs[i].vertex_stream, 0,
			       sizeof(outputs[i].vertex_stream));
			i++;
		}

		si_llvm_export_vs(ctx, outputs, i);
	}
	lp_build_endif(&if_state);

	FREE(outputs);
}
281
282 static LLVMValueRef
283 ngg_gs_get_vertex_storage(struct si_shader_context *ctx)
284 {
285 const struct si_shader_selector *sel = ctx->shader->selector;
286 const struct tgsi_shader_info *info = &sel->info;
287
288 LLVMTypeRef elements[2] = {
289 LLVMArrayType(ctx->ac.i32, 4 * info->num_outputs),
290 LLVMArrayType(ctx->ac.i8, 4),
291 };
292 LLVMTypeRef type = LLVMStructTypeInContext(ctx->ac.context, elements, 2, false);
293 type = LLVMPointerType(LLVMArrayType(type, 0), AC_ADDR_SPACE_LDS);
294 return LLVMBuildBitCast(ctx->ac.builder, ctx->gs_ngg_emit, type, "");
295 }
296
297 /**
298 * Return a pointer to the LDS storage reserved for the N'th vertex, where N
299 * is in emit order; that is:
300 * - during the epilogue, N is the threadidx (relative to the entire threadgroup)
301 * - during vertex emit, i.e. while the API GS shader invocation is running,
302 * N = threadidx * gs_max_out_vertices + emitidx
303 *
304 * Goals of the LDS memory layout:
305 * 1. Eliminate bank conflicts on write for geometry shaders that have all emits
306 * in uniform control flow
307 * 2. Eliminate bank conflicts on read for export if, additionally, there is no
308 * culling
309 * 3. Agnostic to the number of waves (since we don't know it before compiling)
310 * 4. Allow coalescing of LDS instructions (ds_write_b128 etc.)
311 * 5. Avoid wasting memory.
312 *
313 * We use an AoS layout due to point 4 (this also helps point 3). In an AoS
314 * layout, elimination of bank conflicts requires that each vertex occupy an
315 * odd number of dwords. We use the additional dword to store the output stream
316 * index as well as a flag to indicate whether this vertex ends a primitive
317 * for rasterization.
318 *
319 * Swizzling is required to satisfy points 1 and 2 simultaneously.
320 *
321 * Vertices are stored in export order (gsthread * gs_max_out_vertices + emitidx).
322 * Indices are swizzled in groups of 32, which ensures point 1 without
323 * disturbing point 2.
324 *
325 * \return an LDS pointer to type {[N x i32], [4 x i8]}
326 */
327 static LLVMValueRef
328 ngg_gs_vertex_ptr(struct si_shader_context *ctx, LLVMValueRef vertexidx)
329 {
330 struct si_shader_selector *sel = ctx->shader->selector;
331 LLVMBuilderRef builder = ctx->ac.builder;
332 LLVMValueRef storage = ngg_gs_get_vertex_storage(ctx);
333
334 /* gs_max_out_vertices = 2^(write_stride_2exp) * some odd number */
335 unsigned write_stride_2exp = ffs(sel->gs_max_out_vertices) - 1;
336 if (write_stride_2exp) {
337 LLVMValueRef row =
338 LLVMBuildLShr(builder, vertexidx,
339 LLVMConstInt(ctx->ac.i32, 5, false), "");
340 LLVMValueRef swizzle =
341 LLVMBuildAnd(builder, row,
342 LLVMConstInt(ctx->ac.i32, (1u << write_stride_2exp) - 1,
343 false), "");
344 vertexidx = LLVMBuildXor(builder, vertexidx, swizzle, "");
345 }
346
347 return ac_build_gep0(&ctx->ac, storage, vertexidx);
348 }
349
350 static LLVMValueRef
351 ngg_gs_emit_vertex_ptr(struct si_shader_context *ctx, LLVMValueRef gsthread,
352 LLVMValueRef emitidx)
353 {
354 struct si_shader_selector *sel = ctx->shader->selector;
355 LLVMBuilderRef builder = ctx->ac.builder;
356 LLVMValueRef tmp;
357
358 tmp = LLVMConstInt(ctx->ac.i32, sel->gs_max_out_vertices, false);
359 tmp = LLVMBuildMul(builder, tmp, gsthread, "");
360 const LLVMValueRef vertexidx = LLVMBuildAdd(builder, tmp, emitidx, "");
361 return ngg_gs_vertex_ptr(ctx, vertexidx);
362 }
363
/* NGG implementation of the GS EMIT instruction: store one vertex for
 * \p stream into this thread's LDS vertex storage and record whether it
 * completes a primitive.
 *
 * \param stream  vertex stream index (0..3)
 * \param addrs   output value pointers, 4 channels per output slot
 */
void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx,
			      unsigned stream,
			      LLVMValueRef *addrs)
{
	const struct si_shader_selector *sel = ctx->shader->selector;
	const struct tgsi_shader_info *info = &sel->info;
	LLVMBuilderRef builder = ctx->ac.builder;
	struct lp_build_if_state if_state;
	LLVMValueRef tmp;
	const LLVMValueRef vertexidx =
		LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], "");

	/* If this thread has already emitted the declared maximum number of
	 * vertices, skip the write: excessive vertex emissions are not
	 * supposed to have any effect.
	 */
	const LLVMValueRef can_emit =
		LLVMBuildICmp(builder, LLVMIntULT, vertexidx,
			      LLVMConstInt(ctx->i32, sel->gs_max_out_vertices, false), "");

	/* Advance the per-stream emit counter only when the emit is allowed. */
	tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");
	tmp = LLVMBuildSelect(builder, can_emit, tmp, vertexidx, "");
	LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]);

	lp_build_if(&if_state, &ctx->gallivm, can_emit);

	/* Store the enabled output channels of this stream into the vertex's
	 * dword array (first struct member of the LDS record). */
	const LLVMValueRef vertexptr =
		ngg_gs_emit_vertex_ptr(ctx, get_thread_id_in_tg(ctx), vertexidx);
	unsigned out_idx = 0;
	for (unsigned i = 0; i < info->num_outputs; i++) {
		/* out_idx advances for every channel so the LDS layout stays
		 * fixed regardless of which channels are written. */
		for (unsigned chan = 0; chan < 4; chan++, out_idx++) {
			if (!(info->output_usagemask[i] & (1 << chan)) ||
			    ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
				continue;

			LLVMValueRef out_val = LLVMBuildLoad(builder, addrs[4 * i + chan], "");
			LLVMValueRef gep_idx[3] = {
				ctx->ac.i32_0, /* implied C-style array */
				ctx->ac.i32_0, /* first entry of struct */
				LLVMConstInt(ctx->ac.i32, out_idx, false),
			};
			LLVMValueRef ptr = LLVMBuildGEP(builder, vertexptr, gep_idx, 3, "");

			out_val = ac_to_integer(&ctx->ac, out_val);
			LLVMBuildStore(builder, out_val, ptr);
		}
	}
	assert(out_idx * 4 == sel->gsvs_vertex_size);

	/* Determine and store whether this vertex completed a primitive. */
	const LLVMValueRef curverts = LLVMBuildLoad(builder, ctx->gs_curprim_verts[stream], "");

	tmp = LLVMConstInt(ctx->ac.i32, u_vertices_per_prim(sel->gs_output_prim) - 1, false);
	const LLVMValueRef iscompleteprim =
		LLVMBuildICmp(builder, LLVMIntUGE, curverts, tmp, "");

	tmp = LLVMBuildAdd(builder, curverts, ctx->ac.i32_1, "");
	LLVMBuildStore(builder, tmp, ctx->gs_curprim_verts[stream]);

	/* Primitive flag byte for this stream (second struct member). */
	LLVMValueRef gep_idx[3] = {
		ctx->ac.i32_0, /* implied C-style array */
		ctx->ac.i32_1, /* second struct entry */
		LLVMConstInt(ctx->ac.i32, stream, false),
	};
	const LLVMValueRef primflagptr =
		LLVMBuildGEP(builder, vertexptr, gep_idx, 3, "");

	tmp = LLVMBuildZExt(builder, iscompleteprim, ctx->ac.i8, "");
	LLVMBuildStore(builder, tmp, primflagptr);

	lp_build_endif(&if_state);
}
436
437 void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx)
438 {
439 const struct si_shader_selector *sel = ctx->shader->selector;
440 const struct tgsi_shader_info *info = &sel->info;
441 const unsigned verts_per_prim = u_vertices_per_prim(sel->gs_output_prim);
442 LLVMBuilderRef builder = ctx->ac.builder;
443 LLVMValueRef i8_0 = LLVMConstInt(ctx->ac.i8, 0, false);
444 LLVMValueRef tmp, tmp2;
445
446 /* Zero out remaining (non-emitted) primitive flags.
447 *
448 * Note: Alternatively, we could pass the relevant gs_next_vertex to
449 * the emit threads via LDS. This is likely worse in the expected
450 * typical case where each GS thread emits the full set of
451 * vertices.
452 */
453 for (unsigned stream = 0; stream < 4; ++stream) {
454 if (!info->num_stream_output_components[stream])
455 continue;
456
457 const LLVMValueRef gsthread = get_thread_id_in_tg(ctx);
458
459 ac_build_bgnloop(&ctx->ac, 5100);
460
461 const LLVMValueRef vertexidx =
462 LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], "");
463 tmp = LLVMBuildICmp(builder, LLVMIntUGE, vertexidx,
464 LLVMConstInt(ctx->ac.i32, sel->gs_max_out_vertices, false), "");
465 ac_build_ifcc(&ctx->ac, tmp, 5101);
466 ac_build_break(&ctx->ac);
467 ac_build_endif(&ctx->ac, 5101);
468
469 tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");
470 LLVMBuildStore(builder, tmp, ctx->gs_next_vertex[stream]);
471
472 tmp = ngg_gs_emit_vertex_ptr(ctx, gsthread, vertexidx);
473 LLVMValueRef gep_idx[3] = {
474 ctx->ac.i32_0, /* implied C-style array */
475 ctx->ac.i32_1, /* second entry of struct */
476 LLVMConstInt(ctx->ac.i32, stream, false),
477 };
478 tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, "");
479 LLVMBuildStore(builder, i8_0, tmp);
480
481 ac_build_endloop(&ctx->ac, 5100);
482 }
483
484 lp_build_endif(&ctx->merged_wrap_if_state);
485
486 ac_build_s_barrier(&ctx->ac);
487
488 const LLVMValueRef tid = get_thread_id_in_tg(ctx);
489 LLVMValueRef num_emit_threads = ngg_get_prim_cnt(ctx);
490
491 /* TODO: streamout */
492
493 /* TODO: culling */
494
495 /* Determine vertex liveness. */
496 LLVMValueRef vertliveptr = lp_build_alloca(&ctx->gallivm, ctx->ac.i1, "vertexlive");
497
498 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
499 ac_build_ifcc(&ctx->ac, tmp, 5120);
500 {
501 for (unsigned i = 0; i < verts_per_prim; ++i) {
502 const LLVMValueRef primidx =
503 LLVMBuildAdd(builder, tid,
504 LLVMConstInt(ctx->ac.i32, i, false), "");
505
506 if (i > 0) {
507 tmp = LLVMBuildICmp(builder, LLVMIntULT, primidx, num_emit_threads, "");
508 ac_build_ifcc(&ctx->ac, tmp, 5121 + i);
509 }
510
511 /* Load primitive liveness */
512 tmp = ngg_gs_vertex_ptr(ctx, primidx);
513 LLVMValueRef gep_idx[3] = {
514 ctx->ac.i32_0, /* implicit C-style array */
515 ctx->ac.i32_1, /* second value of struct */
516 ctx->ac.i32_0, /* stream 0 */
517 };
518 tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, "");
519 tmp = LLVMBuildLoad(builder, tmp, "");
520 const LLVMValueRef primlive =
521 LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
522
523 tmp = LLVMBuildLoad(builder, vertliveptr, "");
524 tmp = LLVMBuildOr(builder, tmp, primlive, ""),
525 LLVMBuildStore(builder, tmp, vertliveptr);
526
527 if (i > 0)
528 ac_build_endif(&ctx->ac, 5121 + i);
529 }
530 }
531 ac_build_endif(&ctx->ac, 5120);
532
533 /* Inclusive scan addition across the current wave. */
534 LLVMValueRef vertlive = LLVMBuildLoad(builder, vertliveptr, "");
535 struct ac_wg_scan vertlive_scan = {};
536 vertlive_scan.op = nir_op_iadd;
537 vertlive_scan.enable_reduce = true;
538 vertlive_scan.enable_exclusive = true;
539 vertlive_scan.src = vertlive;
540 vertlive_scan.scratch = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ctx->i32_0);
541 vertlive_scan.waveidx = get_wave_id_in_tg(ctx);
542 vertlive_scan.numwaves = get_tgsize(ctx);
543 vertlive_scan.maxwaves = 8;
544
545 ac_build_wg_scan(&ctx->ac, &vertlive_scan);
546
547 /* Skip all exports (including index exports) when possible. At least on
548 * early gfx10 revisions this is also to avoid hangs.
549 */
550 LLVMValueRef have_exports =
551 LLVMBuildICmp(builder, LLVMIntNE, vertlive_scan.result_reduce, ctx->ac.i32_0, "");
552 num_emit_threads =
553 LLVMBuildSelect(builder, have_exports, num_emit_threads, ctx->ac.i32_0, "");
554
555 /* Allocate export space. Send this message as early as possible, to
556 * hide the latency of the SQ <-> SPI roundtrip.
557 *
558 * Note: We could consider compacting primitives for export as well.
559 * PA processes 1 non-null prim / clock, but it fetches 4 DW of
560 * prim data per clock and skips null primitives at no additional
561 * cost. So compacting primitives can only be beneficial when
562 * there are 4 or more contiguous null primitives in the export
563 * (in the common case of single-dword prim exports).
564 */
565 build_sendmsg_gs_alloc_req(ctx, vertlive_scan.result_reduce, num_emit_threads);
566
567 /* Setup the reverse vertex compaction permutation. We re-use stream 1
568 * of the primitive liveness flags, relying on the fact that each
569 * threadgroup can have at most 256 threads. */
570 ac_build_ifcc(&ctx->ac, vertlive, 5130);
571 {
572 tmp = ngg_gs_vertex_ptr(ctx, vertlive_scan.result_exclusive);
573 LLVMValueRef gep_idx[3] = {
574 ctx->ac.i32_0, /* implicit C-style array */
575 ctx->ac.i32_1, /* second value of struct */
576 ctx->ac.i32_1, /* stream 1 */
577 };
578 tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, "");
579 tmp2 = LLVMBuildTrunc(builder, tid, ctx->ac.i8, "");
580 LLVMBuildStore(builder, tmp2, tmp);
581 }
582 ac_build_endif(&ctx->ac, 5130);
583
584 ac_build_s_barrier(&ctx->ac);
585
586 /* Export primitive data */
587 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
588 ac_build_ifcc(&ctx->ac, tmp, 5140);
589 {
590 struct ngg_prim prim = {};
591 prim.num_vertices = verts_per_prim;
592
593 tmp = ngg_gs_vertex_ptr(ctx, tid);
594 LLVMValueRef gep_idx[3] = {
595 ctx->ac.i32_0, /* implicit C-style array */
596 ctx->ac.i32_1, /* second value of struct */
597 ctx->ac.i32_0, /* primflag */
598 };
599 tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, "");
600 tmp = LLVMBuildLoad(builder, tmp, "");
601 prim.isnull = LLVMBuildICmp(builder, LLVMIntEQ, tmp,
602 LLVMConstInt(ctx->ac.i8, 0, false), "");
603
604 for (unsigned i = 0; i < verts_per_prim; ++i) {
605 prim.index[i] = LLVMBuildSub(builder, vertlive_scan.result_exclusive,
606 LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false), "");
607 prim.edgeflag[i] = ctx->ac.i1false;
608 }
609
610 build_export_prim(ctx, &prim);
611 }
612 ac_build_endif(&ctx->ac, 5140);
613
614 /* Export position and parameter data */
615 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, vertlive_scan.result_reduce, "");
616 ac_build_ifcc(&ctx->ac, tmp, 5145);
617 {
618 struct si_shader_output_values *outputs = NULL;
619 outputs = MALLOC(info->num_outputs * sizeof(outputs[0]));
620
621 tmp = ngg_gs_vertex_ptr(ctx, tid);
622 LLVMValueRef gep_idx[3] = {
623 ctx->ac.i32_0, /* implicit C-style array */
624 ctx->ac.i32_1, /* second value of struct */
625 ctx->ac.i32_1, /* stream 1: source data index */
626 };
627 tmp = LLVMBuildGEP(builder, tmp, gep_idx, 3, "");
628 tmp = LLVMBuildLoad(builder, tmp, "");
629 tmp = LLVMBuildZExt(builder, tmp, ctx->ac.i32, "");
630 const LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tmp);
631
632 unsigned out_idx = 0;
633 gep_idx[1] = ctx->ac.i32_0;
634 for (unsigned i = 0; i < info->num_outputs; i++) {
635 outputs[i].semantic_name = info->output_semantic_name[i];
636 outputs[i].semantic_index = info->output_semantic_index[i];
637
638 for (unsigned j = 0; j < 4; j++, out_idx++) {
639 gep_idx[2] = LLVMConstInt(ctx->ac.i32, out_idx, false);
640 tmp = LLVMBuildGEP(builder, vertexptr, gep_idx, 3, "");
641 tmp = LLVMBuildLoad(builder, tmp, "");
642 outputs[i].values[j] = ac_to_float(&ctx->ac, tmp);
643 outputs[i].vertex_stream[j] =
644 (info->output_streams[i] >> (2 * j)) & 3;
645 }
646 }
647
648 si_llvm_export_vs(ctx, outputs, info->num_outputs);
649
650 FREE(outputs);
651 }
652 ac_build_endif(&ctx->ac, 5145);
653 }